
gemm_kernel_2x4_sse3.S 35 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
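/* Overview (inferred from the code below): a double-precision GEMM kernel
   with 2x4 register blocking for 32-bit x86 using SSE3 (movddup).  The
   PENTIUM4/PENTIUMM blocks select the prefetch strategy, and the
   TRMMKERNEL/LEFT/TRANSA conditionals apparently let the same body be
   built as the corresponding TRMM kernel. */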
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 16
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 24 + STACK + ARGS(%esp)
  47. #define ARG_B 28 + STACK + ARGS(%esp)
  48. #define C 32 + STACK + ARGS(%esp)
  49. #define ARG_LDC 36 + STACK + ARGS(%esp)
  50. #define OFFSET 40 + STACK + ARGS(%esp)
  51. #define J 0 + STACK(%esp)
  52. #define BX 4 + STACK(%esp)
  53. #define KK 8 + STACK(%esp)
  54. #define KKK 12 + STACK(%esp)
  55. #ifdef PENTIUM4
  56. #define PREFETCH_R (8 * 4)
  57. #define PREFETCH prefetcht1
  58. #define PREFETCHSIZE 84
  59. #endif
  60. #ifdef PENTIUMM
  61. #define PREFETCH_R (8 * 4)
  62. #define PREFETCH prefetcht1
  63. #define PREFETCHSIZE 84
  64. #endif
  65. #define AA %edx
  66. #define BB %ecx
  67. #define LDC %ebp
  68. #define B %edi
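/* Register roles: AA (%edx) walks the packed A panel, BB (%ecx) walks the
   packed B panel, B (%edi) holds the start of the current B panel, and
   LDC (%ebp) is the leading dimension of C scaled to bytes (see the
   leal below).  In the 2x4 path a tile of C is accumulated in
   %xmm4-%xmm7, one register per B column. */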
  69. #define KERNEL1(address) \
  70. mulpd %xmm0, %xmm2; \
  71. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
  72. addpd %xmm2, %xmm4; \
  73. movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  74. mulpd %xmm0, %xmm2; \
  75. addpd %xmm2, %xmm5; \
  76. movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  77. mulpd %xmm0, %xmm2; \
  78. addpd %xmm2, %xmm6; \
  79. movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  80. mulpd %xmm0, %xmm2; \
  81. movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
  82. addpd %xmm2, %xmm7; \
  83. movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2
  84. #define KERNEL2(address) \
  85. mulpd %xmm0, %xmm2; \
  86. addpd %xmm2, %xmm4; \
  87. movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  88. mulpd %xmm0, %xmm2; \
  89. addpd %xmm2, %xmm5; \
  90. movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  91. mulpd %xmm0, %xmm2; \
  92. addpd %xmm2, %xmm6; \
  93. movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  94. mulpd %xmm0, %xmm2; \
  95. movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
  96. addpd %xmm2, %xmm7; \
  97. movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2
  98. #define KERNEL3(address) \
  99. mulpd %xmm0, %xmm3; \
  100. addpd %xmm3, %xmm4; \
  101. movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  102. mulpd %xmm0, %xmm3; \
  103. addpd %xmm3, %xmm5; \
  104. movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  105. mulpd %xmm0, %xmm3; \
  106. addpd %xmm3, %xmm6; \
  107. movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  108. mulpd %xmm0, %xmm3; \
  109. movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
  110. addpd %xmm3, %xmm7; \
  111. movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3
  112. #define KERNEL4(address) \
  113. mulpd %xmm0, %xmm3; \
  114. addpd %xmm3, %xmm4; \
  115. movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  116. mulpd %xmm0, %xmm3; \
  117. addpd %xmm3, %xmm5; \
  118. movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  119. mulpd %xmm0, %xmm3; \
  120. addpd %xmm3, %xmm6; \
  121. movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  122. mulpd %xmm0, %xmm3; \
  123. movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
  124. addpd %xmm3, %xmm7; \
  125. movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3
  126. #define KERNEL5(address) \
  127. mulpd %xmm1, %xmm2; \
  128. addpd %xmm2, %xmm4; \
  129. movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  130. mulpd %xmm1, %xmm2; \
  131. addpd %xmm2, %xmm5; \
  132. movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  133. mulpd %xmm1, %xmm2; \
  134. addpd %xmm2, %xmm6; \
  135. movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  136. mulpd %xmm1, %xmm2; \
  137. movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
  138. addpd %xmm2, %xmm7
  139. #define KERNEL6(address) \
  140. movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  141. mulpd %xmm1, %xmm2; \
  142. addpd %xmm2, %xmm4; \
  143. movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  144. mulpd %xmm1, %xmm2; \
  145. addpd %xmm2, %xmm5; \
  146. movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  147. mulpd %xmm1, %xmm2; \
  148. addpd %xmm2, %xmm6; \
  149. movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
  150. mulpd %xmm1, %xmm2; \
  151. movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
  152. addpd %xmm2, %xmm7; \
  153. movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2
  154. #define KERNEL7(address) \
  155. mulpd %xmm1, %xmm3; \
  156. addpd %xmm3, %xmm4; \
  157. movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  158. mulpd %xmm1, %xmm3; \
  159. addpd %xmm3, %xmm5; \
  160. movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  161. mulpd %xmm1, %xmm3; \
  162. addpd %xmm3, %xmm6; \
  163. movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  164. mulpd %xmm1, %xmm3; \
  165. movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
  166. addpd %xmm3, %xmm7; \
  167. movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3
  168. #define KERNEL8(address) \
  169. mulpd %xmm1, %xmm3; \
  170. addpd %xmm3, %xmm4; \
  171. movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  172. mulpd %xmm1, %xmm3; \
  173. addpd %xmm3, %xmm5; \
  174. movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  175. mulpd %xmm1, %xmm3; \
  176. addpd %xmm3, %xmm6; \
  177. movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
  178. mulpd %xmm1, %xmm3; \
  179. movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
  180. addpd %xmm3, %xmm7; \
  181. movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3
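/* KERNEL1-KERNEL8 unroll eight k iterations of the 2x4 micro-kernel: each
   macro multiplies one two-element column of A (held in %xmm0 or %xmm1)
   by four movddup-broadcast B values and accumulates into %xmm4-%xmm7.
   The (address) argument is a displacement into the A/B panels, so the
   CORE_PRESCOTT path below can chain the macros without updating the
   AA/BB pointers until the end of the chain. */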
  182. PROLOGUE
  183. subl $ARGS, %esp
  184. pushl %ebp
  185. pushl %edi
  186. pushl %esi
  187. pushl %ebx
  188. PROFCODE
  189. movl ARG_B, B
  190. movl ARG_LDC, LDC
  191. #ifdef TRMMKERNEL
  192. movl OFFSET, %eax
  193. #ifndef LEFT
  194. negl %eax
  195. #endif
  196. movl %eax, KK
  197. #endif
  198. leal (, LDC, SIZE), LDC
  199. movl N, %eax
  200. sarl $2, %eax
  201. movl %eax, J
  202. jle .L30
  203. ALIGN_2
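/* .L10: outer loop over groups of four B columns (J = N >> 2).  .L11
   processes two A rows at a time (2x4 tiles) and .L20 picks up an odd
   remaining row; the N & 2 and N & 1 column remainders are handled at
   .L30 and .L60. */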
  204. .L10:
  205. #if defined(TRMMKERNEL) && defined(LEFT)
  206. movl OFFSET, %eax
  207. movl %eax, KK
  208. #endif
  209. movl K, %eax
  210. sall $BASE_SHIFT + 2, %eax
  211. leal (B, %eax), %eax
  212. movl %eax, BX
  213. movl C, %esi # coffset = c
  214. movl A, AA # aoffset = a
  215. movl M, %ebx
  216. sarl $1, %ebx # i = (m >> 1)
  217. jle .L20
  218. ALIGN_4
  219. .L11:
  220. #if !defined(TRMMKERNEL) || \
  221. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  222. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  223. movl B, BB
  224. #else
  225. movl KK, %eax
  226. leal (, %eax, SIZE), %eax
  227. leal (AA, %eax, 2), AA
  228. leal (B, %eax, 4), BB
  229. #endif
  230. movl BX, %eax
  231. prefetcht2 0 * SIZE(%eax)
  232. subl $-4 * SIZE, BX
  233. movapd 0 * SIZE(AA), %xmm0
  234. pxor %xmm4, %xmm4
  235. movapd 8 * SIZE(AA), %xmm1
  236. pxor %xmm5, %xmm5
  237. movddup 0 * SIZE(BB), %xmm2
  238. pxor %xmm6, %xmm6
  239. movddup 8 * SIZE(BB), %xmm3
  240. pxor %xmm7, %xmm7
  241. leal (LDC, LDC, 2), %eax
  242. #ifdef PENTIUM4
  243. prefetchnta 3 * SIZE(%esi)
  244. prefetchnta 3 * SIZE(%esi, LDC, 1)
  245. prefetchnta 3 * SIZE(%esi, LDC, 2)
  246. prefetchnta 3 * SIZE(%esi, %eax, 1)
  247. #endif
  248. #ifndef TRMMKERNEL
  249. movl K, %eax
  250. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  251. movl K, %eax
  252. subl KK, %eax
  253. movl %eax, KKK
  254. #else
  255. movl KK, %eax
  256. #ifdef LEFT
  257. addl $2, %eax
  258. #else
  259. addl $4, %eax
  260. #endif
  261. movl %eax, KKK
  262. #endif
  263. #ifdef CORE_PRESCOTT
  264. andl $-8, %eax
  265. sall $4, %eax
  266. je .L15
  267. .L1X:
  268. KERNEL1(16 * 0)
  269. KERNEL2(16 * 0)
  270. KERNEL3(16 * 0)
  271. KERNEL4(16 * 0)
  272. KERNEL5(16 * 0)
  273. KERNEL6(16 * 0)
  274. KERNEL7(16 * 0)
  275. KERNEL8(16 * 0)
  276. cmpl $128 * 1, %eax
  277. jle .L12
  278. KERNEL1(16 * 1)
  279. KERNEL2(16 * 1)
  280. KERNEL3(16 * 1)
  281. KERNEL4(16 * 1)
  282. KERNEL5(16 * 1)
  283. KERNEL6(16 * 1)
  284. KERNEL7(16 * 1)
  285. KERNEL8(16 * 1)
  286. cmpl $128 * 2, %eax
  287. jle .L12
  288. KERNEL1(16 * 2)
  289. KERNEL2(16 * 2)
  290. KERNEL3(16 * 2)
  291. KERNEL4(16 * 2)
  292. KERNEL5(16 * 2)
  293. KERNEL6(16 * 2)
  294. KERNEL7(16 * 2)
  295. KERNEL8(16 * 2)
  296. cmpl $128 * 3, %eax
  297. jle .L12
  298. KERNEL1(16 * 3)
  299. KERNEL2(16 * 3)
  300. KERNEL3(16 * 3)
  301. KERNEL4(16 * 3)
  302. KERNEL5(16 * 3)
  303. KERNEL6(16 * 3)
  304. KERNEL7(16 * 3)
  305. KERNEL8(16 * 3)
  306. cmpl $128 * 4, %eax
  307. jle .L12
  308. KERNEL1(16 * 4)
  309. KERNEL2(16 * 4)
  310. KERNEL3(16 * 4)
  311. KERNEL4(16 * 4)
  312. KERNEL5(16 * 4)
  313. KERNEL6(16 * 4)
  314. KERNEL7(16 * 4)
  315. KERNEL8(16 * 4)
  316. cmpl $128 * 5, %eax
  317. jle .L12
  318. KERNEL1(16 * 5)
  319. KERNEL2(16 * 5)
  320. KERNEL3(16 * 5)
  321. KERNEL4(16 * 5)
  322. KERNEL5(16 * 5)
  323. KERNEL6(16 * 5)
  324. KERNEL7(16 * 5)
  325. KERNEL8(16 * 5)
  326. cmpl $128 * 6, %eax
  327. jle .L12
  328. KERNEL1(16 * 6)
  329. KERNEL2(16 * 6)
  330. KERNEL3(16 * 6)
  331. KERNEL4(16 * 6)
  332. KERNEL5(16 * 6)
  333. KERNEL6(16 * 6)
  334. KERNEL7(16 * 6)
  335. KERNEL8(16 * 6)
  336. cmpl $128 * 7, %eax
  337. jle .L12
  338. KERNEL1(16 * 7)
  339. KERNEL2(16 * 7)
  340. KERNEL3(16 * 7)
  341. KERNEL4(16 * 7)
  342. KERNEL5(16 * 7)
  343. KERNEL6(16 * 7)
  344. KERNEL7(16 * 7)
  345. KERNEL8(16 * 7)
  346. #if 1
  347. cmpl $128 * 8, %eax
  348. jle .L12
  349. KERNEL1(16 * 8)
  350. KERNEL2(16 * 8)
  351. KERNEL3(16 * 8)
  352. KERNEL4(16 * 8)
  353. KERNEL5(16 * 8)
  354. KERNEL6(16 * 8)
  355. KERNEL7(16 * 8)
  356. KERNEL8(16 * 8)
  357. cmpl $128 * 9, %eax
  358. jle .L12
  359. KERNEL1(16 * 9)
  360. KERNEL2(16 * 9)
  361. KERNEL3(16 * 9)
  362. KERNEL4(16 * 9)
  363. KERNEL5(16 * 9)
  364. KERNEL6(16 * 9)
  365. KERNEL7(16 * 9)
  366. KERNEL8(16 * 9)
  367. cmpl $128 * 10, %eax
  368. jle .L12
  369. KERNEL1(16 * 10)
  370. KERNEL2(16 * 10)
  371. KERNEL3(16 * 10)
  372. KERNEL4(16 * 10)
  373. KERNEL5(16 * 10)
  374. KERNEL6(16 * 10)
  375. KERNEL7(16 * 10)
  376. KERNEL8(16 * 10)
  377. cmpl $128 * 11, %eax
  378. jle .L12
  379. KERNEL1(16 * 11)
  380. KERNEL2(16 * 11)
  381. KERNEL3(16 * 11)
  382. KERNEL4(16 * 11)
  383. KERNEL5(16 * 11)
  384. KERNEL6(16 * 11)
  385. KERNEL7(16 * 11)
  386. KERNEL8(16 * 11)
  387. cmpl $128 * 12, %eax
  388. jle .L12
  389. KERNEL1(16 * 12)
  390. KERNEL2(16 * 12)
  391. KERNEL3(16 * 12)
  392. KERNEL4(16 * 12)
  393. KERNEL5(16 * 12)
  394. KERNEL6(16 * 12)
  395. KERNEL7(16 * 12)
  396. KERNEL8(16 * 12)
  397. cmpl $128 * 13, %eax
  398. jle .L12
  399. KERNEL1(16 * 13)
  400. KERNEL2(16 * 13)
  401. KERNEL3(16 * 13)
  402. KERNEL4(16 * 13)
  403. KERNEL5(16 * 13)
  404. KERNEL6(16 * 13)
  405. KERNEL7(16 * 13)
  406. KERNEL8(16 * 13)
  407. cmpl $128 * 14, %eax
  408. jle .L12
  409. KERNEL1(16 * 14)
  410. KERNEL2(16 * 14)
  411. KERNEL3(16 * 14)
  412. KERNEL4(16 * 14)
  413. KERNEL5(16 * 14)
  414. KERNEL6(16 * 14)
  415. KERNEL7(16 * 14)
  416. KERNEL8(16 * 14)
  417. cmpl $128 * 15, %eax
  418. jle .L12
  419. KERNEL1(16 * 15)
  420. KERNEL2(16 * 15)
  421. KERNEL3(16 * 15)
  422. KERNEL4(16 * 15)
  423. KERNEL5(16 * 15)
  424. KERNEL6(16 * 15)
  425. KERNEL7(16 * 15)
  426. KERNEL8(16 * 15)
  427. #else
  428. addl $32 * 4 * SIZE, AA
  429. addl $32 * 8 * SIZE, BB
  430. subl $128 * 8, %eax
  431. jg .L1X
  432. #endif
  433. .L12:
  434. leal (AA, %eax, 1), AA # * 16
  435. leal (BB, %eax, 2), BB # * 64
  436. #else
  437. sarl $3, %eax
  438. je .L15
  439. ALIGN_4
  440. .L12:
  441. mulpd %xmm0, %xmm2
  442. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  443. addpd %xmm2, %xmm4
  444. movddup 1 * SIZE(BB), %xmm2
  445. mulpd %xmm0, %xmm2
  446. addpd %xmm2, %xmm5
  447. movddup 2 * SIZE(BB), %xmm2
  448. mulpd %xmm0, %xmm2
  449. addpd %xmm2, %xmm6
  450. movddup 3 * SIZE(BB), %xmm2
  451. mulpd %xmm0, %xmm2
  452. movapd 2 * SIZE(AA), %xmm0
  453. addpd %xmm2, %xmm7
  454. movddup 4 * SIZE(BB), %xmm2
  455. mulpd %xmm0, %xmm2
  456. addpd %xmm2, %xmm4
  457. movddup 5 * SIZE(BB), %xmm2
  458. mulpd %xmm0, %xmm2
  459. addpd %xmm2, %xmm5
  460. movddup 6 * SIZE(BB), %xmm2
  461. mulpd %xmm0, %xmm2
  462. addpd %xmm2, %xmm6
  463. movddup 7 * SIZE(BB), %xmm2
  464. mulpd %xmm0, %xmm2
  465. movapd 4 * SIZE(AA), %xmm0
  466. addpd %xmm2, %xmm7
  467. movddup 16 * SIZE(BB), %xmm2
  468. mulpd %xmm0, %xmm3
  469. addpd %xmm3, %xmm4
  470. movddup 9 * SIZE(BB), %xmm3
  471. mulpd %xmm0, %xmm3
  472. addpd %xmm3, %xmm5
  473. movddup 10 * SIZE(BB), %xmm3
  474. mulpd %xmm0, %xmm3
  475. addpd %xmm3, %xmm6
  476. movddup 11 * SIZE(BB), %xmm3
  477. mulpd %xmm0, %xmm3
  478. movapd 6 * SIZE(AA), %xmm0
  479. addpd %xmm3, %xmm7
  480. movddup 12 * SIZE(BB), %xmm3
  481. mulpd %xmm0, %xmm3
  482. addpd %xmm3, %xmm4
  483. movddup 13 * SIZE(BB), %xmm3
  484. mulpd %xmm0, %xmm3
  485. addpd %xmm3, %xmm5
  486. movddup 14 * SIZE(BB), %xmm3
  487. mulpd %xmm0, %xmm3
  488. addpd %xmm3, %xmm6
  489. movddup 15 * SIZE(BB), %xmm3
  490. mulpd %xmm0, %xmm3
  491. movapd 16 * SIZE(AA), %xmm0
  492. addpd %xmm3, %xmm7
  493. movddup 24 * SIZE(BB), %xmm3
  494. mulpd %xmm1, %xmm2
  495. addpd %xmm2, %xmm4
  496. movddup 17 * SIZE(BB), %xmm2
  497. mulpd %xmm1, %xmm2
  498. addpd %xmm2, %xmm5
  499. movddup 18 * SIZE(BB), %xmm2
  500. mulpd %xmm1, %xmm2
  501. addpd %xmm2, %xmm6
  502. movddup 19 * SIZE(BB), %xmm2
  503. mulpd %xmm1, %xmm2
  504. movapd 10 * SIZE(AA), %xmm1
  505. addpd %xmm2, %xmm7
  506. movddup 20 * SIZE(BB), %xmm2
  507. mulpd %xmm1, %xmm2
  508. addpd %xmm2, %xmm4
  509. movddup 21 * SIZE(BB), %xmm2
  510. mulpd %xmm1, %xmm2
  511. addpd %xmm2, %xmm5
  512. movddup 22 * SIZE(BB), %xmm2
  513. mulpd %xmm1, %xmm2
  514. addpd %xmm2, %xmm6
  515. movddup 23 * SIZE(BB), %xmm2
  516. mulpd %xmm1, %xmm2
  517. movapd 12 * SIZE(AA), %xmm1
  518. addpd %xmm2, %xmm7
  519. movddup 32 * SIZE(BB), %xmm2
  520. mulpd %xmm1, %xmm3
  521. addpd %xmm3, %xmm4
  522. movddup 25 * SIZE(BB), %xmm3
  523. mulpd %xmm1, %xmm3
  524. addpd %xmm3, %xmm5
  525. movddup 26 * SIZE(BB), %xmm3
  526. mulpd %xmm1, %xmm3
  527. addpd %xmm3, %xmm6
  528. movddup 27 * SIZE(BB), %xmm3
  529. mulpd %xmm1, %xmm3
  530. movapd 14 * SIZE(AA), %xmm1
  531. addpd %xmm3, %xmm7
  532. movddup 28 * SIZE(BB), %xmm3
  533. mulpd %xmm1, %xmm3
  534. addpd %xmm3, %xmm4
  535. movddup 29 * SIZE(BB), %xmm3
  536. mulpd %xmm1, %xmm3
  537. addpd %xmm3, %xmm5
  538. movddup 30 * SIZE(BB), %xmm3
  539. mulpd %xmm1, %xmm3
  540. addpd %xmm3, %xmm6
  541. movddup 31 * SIZE(BB), %xmm3
  542. mulpd %xmm1, %xmm3
  543. movapd 24 * SIZE(AA), %xmm1
  544. addpd %xmm3, %xmm7
  545. movddup 40 * SIZE(BB), %xmm3
  546. addl $32 * SIZE, BB
  547. addl $16 * SIZE, AA
  548. decl %eax
  549. jne .L12
  550. ALIGN_4
  551. #endif
  552. .L15:
  553. #ifndef TRMMKERNEL
  554. movl K, %eax
  555. #else
  556. movl KKK, %eax
  557. #endif
  558. movddup ALPHA, %xmm3
  559. andl $7, %eax # if (k & 7)
  560. BRANCH
  561. je .L18
  562. ALIGN_3
  563. .L16:
  564. mulpd %xmm0, %xmm2
  565. addpd %xmm2, %xmm4
  566. movddup 1 * SIZE(BB), %xmm2
  567. mulpd %xmm0, %xmm2
  568. addpd %xmm2, %xmm5
  569. movddup 2 * SIZE(BB), %xmm2
  570. mulpd %xmm0, %xmm2
  571. addpd %xmm2, %xmm6
  572. movddup 3 * SIZE(BB), %xmm2
  573. mulpd %xmm0, %xmm2
  574. movapd 2 * SIZE(AA), %xmm0
  575. addpd %xmm2, %xmm7
  576. movddup 4 * SIZE(BB), %xmm2
  577. addl $2 * SIZE, AA
  578. addl $4 * SIZE, BB
  579. decl %eax
  580. jg .L16
  581. ALIGN_4
  582. .L18:
  583. SHUFPD_2 %xmm0, %xmm0
  584. SHUFPD_2 %xmm1, %xmm1
  585. SHUFPD_2 %xmm2, %xmm2
  586. SHUFPD_2 %xmm3, %xmm3
  587. mulpd %xmm3, %xmm4
  588. mulpd %xmm3, %xmm5
  589. mulpd %xmm3, %xmm6
  590. mulpd %xmm3, %xmm7
  591. movl %esi, %eax
  592. orl LDC, %eax
  593. testl $15, %eax
  594. NOBRANCH
  595. jne .L18x
  596. leal (LDC, LDC, 2), %eax
  597. #ifndef TRMMKERNEL
  598. movapd 0 * SIZE(%esi), %xmm0
  599. movapd 0 * SIZE(%esi, LDC, 1), %xmm1
  600. movapd 0 * SIZE(%esi, LDC, 2), %xmm2
  601. movapd 0 * SIZE(%esi, %eax, 1), %xmm3
  602. addpd %xmm0, %xmm4
  603. addpd %xmm1, %xmm5
  604. addpd %xmm2, %xmm6
  605. addpd %xmm3, %xmm7
  606. #endif
  607. movapd %xmm4, 0 * SIZE(%esi)
  608. movapd %xmm5, 0 * SIZE(%esi, LDC, 1)
  609. movapd %xmm6, 0 * SIZE(%esi, LDC, 2)
  610. movapd %xmm7, 0 * SIZE(%esi, %eax, 1)
  611. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  612. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  613. movl K, %eax
  614. subl KKK, %eax
  615. leal (,%eax, SIZE), %eax
  616. leal (AA, %eax, 2), AA
  617. leal (BB, %eax, 4), BB
  618. #endif
  619. #if defined(TRMMKERNEL) && defined(LEFT)
  620. addl $2, KK
  621. #endif
  622. addl $2 * SIZE, %esi # coffset += 2
  623. decl %ebx # i --
  624. jg .L11
  625. jmp .L20
  626. ALIGN_4
  627. .L18x:
  628. leal (LDC, LDC, 2), %eax
  629. #ifndef TRMMKERNEL
  630. movsd 0 * SIZE(%esi), %xmm0
  631. movhpd 1 * SIZE(%esi), %xmm0
  632. movsd 0 * SIZE(%esi, LDC, 1), %xmm1
  633. movhpd 1 * SIZE(%esi, LDC, 1), %xmm1
  634. movsd 0 * SIZE(%esi, LDC, 2), %xmm2
  635. movhpd 1 * SIZE(%esi, LDC, 2), %xmm2
  636. movsd 0 * SIZE(%esi, %eax, 1), %xmm3
  637. movhpd 1 * SIZE(%esi, %eax, 1), %xmm3
  638. addpd %xmm0, %xmm4
  639. addpd %xmm1, %xmm5
  640. addpd %xmm2, %xmm6
  641. addpd %xmm3, %xmm7
  642. #endif
  643. movsd %xmm4, 0 * SIZE(%esi)
  644. movhpd %xmm4, 1 * SIZE(%esi)
  645. movsd %xmm5, 0 * SIZE(%esi, LDC, 1)
  646. movhpd %xmm5, 1 * SIZE(%esi, LDC, 1)
  647. movsd %xmm6, 0 * SIZE(%esi, LDC, 2)
  648. movhpd %xmm6, 1 * SIZE(%esi, LDC, 2)
  649. movsd %xmm7, 0 * SIZE(%esi, %eax, 1)
  650. movhpd %xmm7, 1 * SIZE(%esi, %eax, 1)
  651. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  652. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  653. movl K, %eax
  654. subl KKK, %eax
  655. leal (,%eax, SIZE), %eax
  656. leal (AA, %eax, 2), AA
  657. leal (BB, %eax, 4), BB
  658. #endif
  659. #if defined(TRMMKERNEL) && defined(LEFT)
  660. addl $2, KK
  661. #endif
  662. addl $2 * SIZE, %esi # coffset += 2
  663. decl %ebx # i --
  664. jg .L11
  665. ALIGN_3
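/* .L20: odd-row remainder (M & 1) against the current four B columns; a
   single A value is broadcast per step and a 1x4 strip of C accumulates
   in %xmm4-%xmm7 before being folded together at .L28. */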
  666. .L20:
  667. movl M, %ebx
  668. testl $1, %ebx # i = (m & 1)
  669. jle .L29
  670. #if !defined(TRMMKERNEL) || \
  671. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  672. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  673. movl B, BB
  674. #else
  675. movl KK, %eax
  676. leal (, %eax, SIZE), %eax
  677. leal (AA, %eax, 1), AA
  678. leal (B, %eax, 4), BB
  679. #endif
  680. movddup 0 * SIZE(AA), %xmm0
  681. pxor %xmm4, %xmm4
  682. movddup 8 * SIZE(AA), %xmm1
  683. pxor %xmm5, %xmm5
  684. movapd 0 * SIZE(BB), %xmm2
  685. pxor %xmm6, %xmm6
  686. movapd 8 * SIZE(BB), %xmm3
  687. pxor %xmm7, %xmm7
  688. #ifndef TRMMKERNEL
  689. movl K, %eax
  690. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  691. movl K, %eax
  692. subl KK, %eax
  693. movl %eax, KKK
  694. #else
  695. movl KK, %eax
  696. #ifdef LEFT
  697. addl $1, %eax
  698. #else
  699. addl $4, %eax
  700. #endif
  701. movl %eax, KKK
  702. #endif
  703. sarl $4, %eax
  704. je .L25
  705. ALIGN_4
  706. .L22:
  707. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  708. mulpd %xmm0, %xmm2
  709. mulpd 2 * SIZE(BB), %xmm0
  710. addpd %xmm2, %xmm4
  711. movapd 4 * SIZE(BB), %xmm2
  712. addpd %xmm0, %xmm5
  713. movddup 1 * SIZE(AA), %xmm0
  714. mulpd %xmm0, %xmm2
  715. mulpd 6 * SIZE(BB), %xmm0
  716. addpd %xmm2, %xmm6
  717. movapd 16 * SIZE(BB), %xmm2
  718. addpd %xmm0, %xmm7
  719. movddup 2 * SIZE(AA), %xmm0
  720. mulpd %xmm0, %xmm3
  721. mulpd 10 * SIZE(BB), %xmm0
  722. addpd %xmm3, %xmm4
  723. movapd 12 * SIZE(BB), %xmm3
  724. addpd %xmm0, %xmm5
  725. movddup 3 * SIZE(AA), %xmm0
  726. mulpd %xmm0, %xmm3
  727. mulpd 14 * SIZE(BB), %xmm0
  728. addpd %xmm3, %xmm6
  729. movapd 24 * SIZE(BB), %xmm3
  730. addpd %xmm0, %xmm7
  731. movddup 4 * SIZE(AA), %xmm0
  732. mulpd %xmm0, %xmm2
  733. mulpd 18 * SIZE(BB), %xmm0
  734. addpd %xmm2, %xmm4
  735. movapd 20 * SIZE(BB), %xmm2
  736. addpd %xmm0, %xmm5
  737. movddup 5 * SIZE(AA), %xmm0
  738. mulpd %xmm0, %xmm2
  739. mulpd 22 * SIZE(BB), %xmm0
  740. addpd %xmm2, %xmm6
  741. movapd 32 * SIZE(BB), %xmm2
  742. addpd %xmm0, %xmm7
  743. movddup 6 * SIZE(AA), %xmm0
  744. mulpd %xmm0, %xmm3
  745. mulpd 26 * SIZE(BB), %xmm0
  746. addpd %xmm3, %xmm4
  747. movapd 28 * SIZE(BB), %xmm3
  748. addpd %xmm0, %xmm5
  749. movddup 7 * SIZE(AA), %xmm0
  750. mulpd %xmm0, %xmm3
  751. mulpd 30 * SIZE(BB), %xmm0
  752. addpd %xmm3, %xmm6
  753. movapd 40 * SIZE(BB), %xmm3
  754. addpd %xmm0, %xmm7
  755. movddup 16 * SIZE(AA), %xmm0
  756. mulpd %xmm1, %xmm2
  757. mulpd 34 * SIZE(BB), %xmm1
  758. addpd %xmm2, %xmm4
  759. movapd 36 * SIZE(BB), %xmm2
  760. addpd %xmm1, %xmm5
  761. movddup 9 * SIZE(AA), %xmm1
  762. mulpd %xmm1, %xmm2
  763. mulpd 38 * SIZE(BB), %xmm1
  764. addpd %xmm2, %xmm6
  765. movapd 48 * SIZE(BB), %xmm2
  766. addpd %xmm1, %xmm7
  767. movddup 10 * SIZE(AA), %xmm1
  768. mulpd %xmm1, %xmm3
  769. mulpd 42 * SIZE(BB), %xmm1
  770. addpd %xmm3, %xmm4
  771. movapd 44 * SIZE(BB), %xmm3
  772. addpd %xmm1, %xmm5
  773. movddup 11 * SIZE(AA), %xmm1
  774. mulpd %xmm1, %xmm3
  775. mulpd 46 * SIZE(BB), %xmm1
  776. addpd %xmm3, %xmm6
  777. movapd 56 * SIZE(BB), %xmm3
  778. addpd %xmm1, %xmm7
  779. movddup 12 * SIZE(AA), %xmm1
  780. mulpd %xmm1, %xmm2
  781. mulpd 50 * SIZE(BB), %xmm1
  782. addpd %xmm2, %xmm4
  783. movapd 52 * SIZE(BB), %xmm2
  784. addpd %xmm1, %xmm5
  785. movddup 13 * SIZE(AA), %xmm1
  786. mulpd %xmm1, %xmm2
  787. mulpd 54 * SIZE(BB), %xmm1
  788. addpd %xmm2, %xmm6
  789. movapd 64 * SIZE(BB), %xmm2
  790. addpd %xmm1, %xmm7
  791. movddup 14 * SIZE(AA), %xmm1
  792. mulpd %xmm1, %xmm3
  793. mulpd 58 * SIZE(BB), %xmm1
  794. addpd %xmm3, %xmm4
  795. movapd 60 * SIZE(BB), %xmm3
  796. addpd %xmm1, %xmm5
  797. movddup 15 * SIZE(AA), %xmm1
  798. mulpd %xmm1, %xmm3
  799. mulpd 62 * SIZE(BB), %xmm1
  800. addpd %xmm3, %xmm6
  801. movapd 72 * SIZE(BB), %xmm3
  802. addpd %xmm1, %xmm7
  803. movddup 24 * SIZE(AA), %xmm1
  804. addl $16 * SIZE, AA
  805. addl $64 * SIZE, BB
  806. decl %eax
  807. jne .L22
  808. ALIGN_4
  809. .L25:
  810. #ifndef TRMMKERNEL
  811. movl K, %eax
  812. #else
  813. movl KKK, %eax
  814. #endif
  815. movddup ALPHA, %xmm3
  816. andl $15, %eax # if (k & 15)
  817. BRANCH
  818. je .L28
  819. .L26:
  820. mulpd %xmm0, %xmm2
  821. mulpd 2 * SIZE(BB), %xmm0
  822. addpd %xmm2, %xmm4
  823. movapd 4 * SIZE(BB), %xmm2
  824. addpd %xmm0, %xmm5
  825. movddup 1 * SIZE(AA), %xmm0
  826. addl $1 * SIZE, AA
  827. addl $4 * SIZE, BB
  828. decl %eax
  829. jg .L26
  830. ALIGN_4
  831. .L28:
  832. leal (%esi, LDC, 1), %eax
  833. addpd %xmm6, %xmm4
  834. addpd %xmm7, %xmm5
  835. mulpd %xmm3, %xmm4
  836. mulpd %xmm3, %xmm5
  837. #ifndef TRMMKERNEL
  838. #ifdef PENTIUM4
  839. SHUFPD_2 %xmm0, %xmm0
  840. SHUFPD_2 %xmm1, %xmm1
  841. #endif
  842. movsd 0 * SIZE(%esi), %xmm0
  843. movhpd 0 * SIZE(%eax), %xmm0
  844. movsd 0 * SIZE(%esi, LDC, 2), %xmm1
  845. movhpd 0 * SIZE(%eax, LDC, 2), %xmm1
  846. addpd %xmm0, %xmm4
  847. addpd %xmm1, %xmm5
  848. #endif
  849. movsd %xmm4, 0 * SIZE(%esi)
  850. movhpd %xmm4, 0 * SIZE(%eax)
  851. movsd %xmm5, 0 * SIZE(%esi, LDC, 2)
  852. movhpd %xmm5, 0 * SIZE(%eax, LDC, 2)
  853. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  854. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  855. movl K, %eax
  856. subl KKK, %eax
  857. leal (,%eax, SIZE), %eax
  858. leal (AA, %eax, 1), AA
  859. leal (BB, %eax, 4), BB
  860. #endif
  861. #if defined(TRMMKERNEL) && defined(LEFT)
  862. addl $1, KK
  863. #endif
  864. ALIGN_4
  865. .L29:
  866. #if defined(TRMMKERNEL) && !defined(LEFT)
  867. addl $4, KK
  868. #endif
  869. leal (, LDC, 4), %eax
  870. movl BB, B
  871. addl %eax, C # c += 4 * ldc
  872. decl J # j --
  873. jg .L10
  874. ALIGN_4
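/* .L30: remaining two columns of B (N & 2).  .L41 computes 2x2 tiles and
   .L50 the final 1x2 row remainder. */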
  875. .L30:
  876. testl $2, N
  877. je .L60
  878. movl C, %esi # coffset = c
  879. movl A, AA # aoffset = a
  880. #if defined(TRMMKERNEL) && defined(LEFT)
  881. movl OFFSET, %eax
  882. movl %eax, KK
  883. #endif
  884. movl M, %ebx
  885. sarl $1, %ebx # i = (m >> 1)
  886. jle .L50
  887. ALIGN_4
  888. .L41:
  889. #if !defined(TRMMKERNEL) || \
  890. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  891. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  892. movl B, BB
  893. #else
  894. movl KK, %eax
  895. leal (, %eax, SIZE), %eax
  896. leal (AA, %eax, 2), AA
  897. leal (B, %eax, 2), BB
  898. #endif
  899. movapd 0 * SIZE(AA), %xmm0
  900. pxor %xmm4, %xmm4
  901. movapd 8 * SIZE(AA), %xmm1
  902. pxor %xmm5, %xmm5
  903. movddup 0 * SIZE(BB), %xmm2
  904. pxor %xmm6, %xmm6
  905. movddup 8 * SIZE(BB), %xmm3
  906. pxor %xmm7, %xmm7
  907. #ifdef HAVE_3DNOW
  908. prefetchw 2 * SIZE(%esi)
  909. prefetchw 2 * SIZE(%esi, LDC)
  910. #endif
  911. #ifdef PENTIUM4
  912. prefetchnta 3 * SIZE(%esi)
  913. prefetchnta 3 * SIZE(%esi, LDC)
  914. #endif
  915. #ifndef TRMMKERNEL
  916. movl K, %eax
  917. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  918. movl K, %eax
  919. subl KK, %eax
  920. movl %eax, KKK
  921. #else
  922. movl KK, %eax
  923. #ifdef LEFT
  924. addl $2, %eax
  925. #else
  926. addl $2, %eax
  927. #endif
  928. movl %eax, KKK
  929. #endif
  930. sarl $3, %eax
  931. je .L45
  932. ALIGN_4
  933. .L42:
  934. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  935. mulpd %xmm0, %xmm2
  936. addpd %xmm2, %xmm4
  937. movddup 1 * SIZE(BB), %xmm2
  938. mulpd %xmm0, %xmm2
  939. movapd 2 * SIZE(AA), %xmm0
  940. addpd %xmm2, %xmm5
  941. movddup 2 * SIZE(BB), %xmm2
  942. mulpd %xmm0, %xmm2
  943. addpd %xmm2, %xmm6
  944. movddup 3 * SIZE(BB), %xmm2
  945. mulpd %xmm0, %xmm2
  946. movapd 4 * SIZE(AA), %xmm0
  947. addpd %xmm2, %xmm7
  948. movddup 4 * SIZE(BB), %xmm2
  949. mulpd %xmm0, %xmm2
  950. addpd %xmm2, %xmm4
  951. movddup 5 * SIZE(BB), %xmm2
  952. mulpd %xmm0, %xmm2
  953. movapd 6 * SIZE(AA), %xmm0
  954. addpd %xmm2, %xmm5
  955. movddup 6 * SIZE(BB), %xmm2
  956. mulpd %xmm0, %xmm2
  957. addpd %xmm2, %xmm6
  958. movddup 7 * SIZE(BB), %xmm2
  959. mulpd %xmm0, %xmm2
  960. movapd 16 * SIZE(AA), %xmm0
  961. addpd %xmm2, %xmm7
  962. movddup 16 * SIZE(BB), %xmm2
  963. mulpd %xmm1, %xmm3
  964. addpd %xmm3, %xmm4
  965. movddup 9 * SIZE(BB), %xmm3
  966. mulpd %xmm1, %xmm3
  967. movapd 10 * SIZE(AA), %xmm1
  968. addpd %xmm3, %xmm5
  969. movddup 10 * SIZE(BB), %xmm3
  970. mulpd %xmm1, %xmm3
  971. addpd %xmm3, %xmm6
  972. movddup 11 * SIZE(BB), %xmm3
  973. mulpd %xmm1, %xmm3
  974. movapd 12 * SIZE(AA), %xmm1
  975. addpd %xmm3, %xmm7
  976. movddup 12 * SIZE(BB), %xmm3
  977. mulpd %xmm1, %xmm3
  978. addpd %xmm3, %xmm4
  979. movddup 13 * SIZE(BB), %xmm3
  980. mulpd %xmm1, %xmm3
  981. movapd 14 * SIZE(AA), %xmm1
  982. addpd %xmm3, %xmm5
  983. movddup 14 * SIZE(BB), %xmm3
  984. mulpd %xmm1, %xmm3
  985. addpd %xmm3, %xmm6
  986. movddup 15 * SIZE(BB), %xmm3
  987. mulpd %xmm1, %xmm3
  988. movapd 24 * SIZE(AA), %xmm1
  989. addpd %xmm3, %xmm7
  990. movddup 24 * SIZE(BB), %xmm3
  991. addl $16 * SIZE, AA
  992. addl $16 * SIZE, BB
  993. decl %eax
  994. jne .L42
  995. ALIGN_4
  996. .L45:
  997. #ifndef TRMMKERNEL
  998. movl K, %eax
  999. #else
  1000. movl KKK, %eax
  1001. #endif
  1002. movddup ALPHA, %xmm3
  1003. andl $7, %eax # if (k & 7)
  1004. BRANCH
  1005. je .L48
  1006. ALIGN_3
  1007. .L46:
  1008. mulpd %xmm0, %xmm2
  1009. addpd %xmm2, %xmm4
  1010. movddup 1 * SIZE(BB), %xmm2
  1011. mulpd %xmm0, %xmm2
  1012. movapd 2 * SIZE(AA), %xmm0
  1013. addpd %xmm2, %xmm5
  1014. movddup 2 * SIZE(BB), %xmm2
  1015. addl $2 * SIZE, AA
  1016. addl $2 * SIZE, BB
  1017. decl %eax
  1018. jg .L46
  1019. ALIGN_4
  1020. .L48:
  1021. addpd %xmm6, %xmm4
  1022. addpd %xmm7, %xmm5
  1023. mulpd %xmm3, %xmm4
  1024. mulpd %xmm3, %xmm5
  1025. #ifndef TRMMKERNEL
  1026. #ifdef PENTIUM4
  1027. SHUFPD_2 %xmm0, %xmm0
  1028. SHUFPD_2 %xmm1, %xmm1
  1029. #endif
  1030. movsd 0 * SIZE(%esi), %xmm0
  1031. movhpd 1 * SIZE(%esi), %xmm0
  1032. movsd 0 * SIZE(%esi, LDC, 1), %xmm1
  1033. movhpd 1 * SIZE(%esi, LDC, 1), %xmm1
  1034. addpd %xmm0, %xmm4
  1035. addpd %xmm1, %xmm5
  1036. #endif
  1037. movsd %xmm4, 0 * SIZE(%esi)
  1038. movhpd %xmm4, 1 * SIZE(%esi)
  1039. movsd %xmm5, 0 * SIZE(%esi, LDC, 1)
  1040. movhpd %xmm5, 1 * SIZE(%esi, LDC, 1)
  1041. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1042. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1043. movl K, %eax
  1044. subl KKK, %eax
  1045. leal (,%eax, SIZE), %eax
  1046. leal (AA, %eax, 2), AA
  1047. leal (BB, %eax, 2), BB
  1048. #endif
  1049. #if defined(TRMMKERNEL) && defined(LEFT)
  1050. addl $2, KK
  1051. #endif
  1052. addl $2 * SIZE, %esi # coffset += 2
  1053. decl %ebx # i --
  1054. jg .L41
  1055. ALIGN_4
  1056. .L50:
  1057. movl M, %ebx
  1058. testl $1, %ebx # i = (m & 1)
  1059. jle .L59
  1060. #if !defined(TRMMKERNEL) || \
  1061. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1062. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1063. movl B, BB
  1064. #else
  1065. movl KK, %eax
  1066. leal (, %eax, SIZE), %eax
  1067. leal (AA, %eax, 1), AA
  1068. leal (B, %eax, 2), BB
  1069. #endif
  1070. movddup 0 * SIZE(AA), %xmm0
  1071. pxor %xmm4, %xmm4
  1072. movddup 8 * SIZE(AA), %xmm1
  1073. pxor %xmm5, %xmm5
  1074. movapd 0 * SIZE(BB), %xmm2
  1075. pxor %xmm6, %xmm6
  1076. movapd 8 * SIZE(BB), %xmm3
  1077. pxor %xmm7, %xmm7
  1078. #ifndef TRMMKERNEL
  1079. movl K, %eax
  1080. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1081. movl K, %eax
  1082. subl KK, %eax
  1083. movl %eax, KKK
  1084. #else
  1085. movl KK, %eax
  1086. #ifdef LEFT
  1087. addl $1, %eax
  1088. #else
  1089. addl $2, %eax
  1090. #endif
  1091. movl %eax, KKK
  1092. #endif
  1093. sarl $4, %eax
  1094. je .L55
  1095. ALIGN_4
  1096. .L52:
  1097. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1098. mulpd %xmm0, %xmm2
  1099. movddup 1 * SIZE(AA), %xmm0
  1100. addpd %xmm2, %xmm4
  1101. mulpd 2 * SIZE(BB), %xmm0
  1102. movapd 16 * SIZE(BB), %xmm2
  1103. addpd %xmm0, %xmm5
  1104. movddup 2 * SIZE(AA), %xmm0
  1105. mulpd 4 * SIZE(BB), %xmm0
  1106. addpd %xmm0, %xmm6
  1107. movddup 3 * SIZE(AA), %xmm0
  1108. mulpd 6 * SIZE(BB), %xmm0
  1109. addpd %xmm0, %xmm7
  1110. movddup 4 * SIZE(AA), %xmm0
  1111. mulpd %xmm0, %xmm3
  1112. movddup 5 * SIZE(AA), %xmm0
  1113. addpd %xmm3, %xmm4
  1114. mulpd 10 * SIZE(BB), %xmm0
  1115. movapd 24 * SIZE(BB), %xmm3
  1116. addpd %xmm0, %xmm5
  1117. movddup 6 * SIZE(AA), %xmm0
  1118. mulpd 12 * SIZE(BB), %xmm0
  1119. addpd %xmm0, %xmm6
  1120. movddup 7 * SIZE(AA), %xmm0
  1121. mulpd 14 * SIZE(BB), %xmm0
  1122. addpd %xmm0, %xmm7
  1123. movddup 16 * SIZE(AA), %xmm0
  1124. mulpd %xmm1, %xmm2
  1125. movddup 9 * SIZE(AA), %xmm1
  1126. addpd %xmm2, %xmm4
  1127. mulpd 18 * SIZE(BB), %xmm1
  1128. movapd 32 * SIZE(BB), %xmm2
  1129. addpd %xmm1, %xmm5
  1130. movddup 10 * SIZE(AA), %xmm1
  1131. mulpd 20 * SIZE(BB), %xmm1
  1132. addpd %xmm1, %xmm6
  1133. movddup 11 * SIZE(AA), %xmm1
  1134. mulpd 22 * SIZE(BB), %xmm1
  1135. addpd %xmm1, %xmm7
  1136. movddup 12 * SIZE(AA), %xmm1
  1137. mulpd %xmm1, %xmm3
  1138. movddup 13 * SIZE(AA), %xmm1
  1139. addpd %xmm3, %xmm4
  1140. mulpd 26 * SIZE(BB), %xmm1
  1141. movapd 40 * SIZE(BB), %xmm3
  1142. addpd %xmm1, %xmm5
  1143. movddup 14 * SIZE(AA), %xmm1
  1144. mulpd 28 * SIZE(BB), %xmm1
  1145. addpd %xmm1, %xmm6
  1146. movddup 15 * SIZE(AA), %xmm1
  1147. mulpd 30 * SIZE(BB), %xmm1
  1148. addpd %xmm1, %xmm7
  1149. movddup 24 * SIZE(AA), %xmm1
  1150. addl $16 * SIZE, AA
  1151. addl $32 * SIZE, BB
  1152. decl %eax
  1153. jne .L52
  1154. ALIGN_4
  1155. .L55:
  1156. #ifndef TRMMKERNEL
  1157. movl K, %eax
  1158. #else
  1159. movl KKK, %eax
  1160. #endif
  1161. movddup ALPHA, %xmm3
  1162. andl $15, %eax # if (k & 15)
  1163. BRANCH
  1164. je .L58
  1165. .L56:
  1166. mulpd %xmm0, %xmm2
  1167. movddup 1 * SIZE(AA), %xmm0
  1168. addpd %xmm2, %xmm4
  1169. movapd 2 * SIZE(BB), %xmm2
  1170. addl $1 * SIZE, AA
  1171. addl $2 * SIZE, BB
  1172. decl %eax
  1173. jg .L56
  1174. ALIGN_4
  1175. .L58:
  1176. addpd %xmm5, %xmm4
  1177. addpd %xmm7, %xmm6
  1178. addpd %xmm6, %xmm4
  1179. mulpd %xmm3, %xmm4
  1180. #ifndef TRMMKERNEL
  1181. #ifdef PENTIUM4
  1182. SHUFPD_2 %xmm0, %xmm0
  1183. #endif
  1184. movsd 0 * SIZE(%esi), %xmm0
  1185. movhpd 0 * SIZE(%esi, LDC, 1), %xmm0
  1186. addpd %xmm0, %xmm4
  1187. #endif
  1188. movsd %xmm4, 0 * SIZE(%esi)
  1189. movhpd %xmm4, 0 * SIZE(%esi, LDC, 1)
  1190. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1191. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1192. movl K, %eax
  1193. subl KKK, %eax
  1194. leal (,%eax, SIZE), %eax
  1195. leal (AA, %eax, 1), AA
  1196. leal (BB, %eax, 2), BB
  1197. #endif
  1198. #if defined(TRMMKERNEL) && defined(LEFT)
  1199. addl $1, KK
  1200. #endif
  1201. ALIGN_4
  1202. .L59:
  1203. #if defined(TRMMKERNEL) && !defined(LEFT)
  1204. addl $2, KK
  1205. #endif
  1206. leal (, LDC, 2), %eax
  1207. movl BB, B
  1208. addl %eax, C # c += 2 * ldc
  1209. ALIGN_4
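/* .L60: last single column of B (N & 1).  .L71 computes 2x1 tiles and
   .L80 the final scalar 1x1 element. */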
  1210. .L60:
  1211. testl $1, N
  1212. je .L999
  1213. movl C, %esi # coffset = c
  1214. movl A, AA # aoffset = a
  1215. #if defined(TRMMKERNEL) && defined(LEFT)
  1216. movl OFFSET, %eax
  1217. movl %eax, KK
  1218. #endif
  1219. movl M, %ebx
  1220. sarl $1, %ebx # i = (m >> 1)
  1221. jle .L80
  1222. ALIGN_4
  1223. .L71:
  1224. #if !defined(TRMMKERNEL) || \
  1225. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1226. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1227. movl B, BB
  1228. #else
  1229. movl KK, %eax
  1230. leal (, %eax, SIZE), %eax
  1231. leal (AA, %eax, 2), AA
  1232. leal (B, %eax, 1), BB
  1233. #endif
  1234. movapd 0 * SIZE(AA), %xmm0
  1235. pxor %xmm4, %xmm4
  1236. movapd 8 * SIZE(AA), %xmm1
  1237. pxor %xmm5, %xmm5
  1238. movddup 0 * SIZE(BB), %xmm2
  1239. pxor %xmm6, %xmm6
  1240. movddup 4 * SIZE(BB), %xmm3
  1241. pxor %xmm7, %xmm7
  1242. #ifdef PENTIUM4
  1243. prefetchnta 3 * SIZE(%esi)
  1244. #endif
  1245. #ifndef TRMMKERNEL
  1246. movl K, %eax
  1247. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1248. movl K, %eax
  1249. subl KK, %eax
  1250. movl %eax, KKK
  1251. #else
  1252. movl KK, %eax
  1253. #ifdef LEFT
  1254. addl $2, %eax
  1255. #else
  1256. addl $1, %eax
  1257. #endif
  1258. movl %eax, KKK
  1259. #endif
  1260. sarl $3, %eax
  1261. je .L75
  1262. ALIGN_4
  1263. .L72:
  1264. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1265. mulpd %xmm2, %xmm0
  1266. movddup 1 * SIZE(BB), %xmm2
  1267. addpd %xmm0, %xmm4
  1268. movapd 16 * SIZE(AA), %xmm0
  1269. mulpd 2 * SIZE(AA), %xmm2
  1270. addpd %xmm2, %xmm5
  1271. movddup 2 * SIZE(BB), %xmm2
  1272. mulpd 4 * SIZE(AA), %xmm2
  1273. addpd %xmm2, %xmm6
  1274. movddup 3 * SIZE(BB), %xmm2
  1275. mulpd 6 * SIZE(AA), %xmm2
  1276. addpd %xmm2, %xmm7
  1277. movddup 8 * SIZE(BB), %xmm2
  1278. mulpd %xmm3, %xmm1
  1279. movddup 5 * SIZE(BB), %xmm3
  1280. addpd %xmm1, %xmm4
  1281. movapd 24 * SIZE(AA), %xmm1
  1282. mulpd 10 * SIZE(AA), %xmm3
  1283. addpd %xmm3, %xmm5
  1284. movddup 6 * SIZE(BB), %xmm3
  1285. mulpd 12 * SIZE(AA), %xmm3
  1286. addpd %xmm3, %xmm6
  1287. movddup 7 * SIZE(BB), %xmm3
  1288. mulpd 14 * SIZE(AA), %xmm3
  1289. addpd %xmm3, %xmm7
  1290. movddup 12 * SIZE(BB), %xmm3
  1291. addl $16 * SIZE, AA
  1292. addl $ 8 * SIZE, BB
  1293. decl %eax
  1294. jne .L72
  1295. ALIGN_4
  1296. .L75:
  1297. #ifndef TRMMKERNEL
  1298. movl K, %eax
  1299. #else
  1300. movl KKK, %eax
  1301. #endif
  1302. movddup ALPHA, %xmm3
  1303. andl $7, %eax # if (k & 7)
  1304. BRANCH
  1305. je .L78
  1306. ALIGN_3
  1307. .L76:
  1308. mulpd %xmm2, %xmm0
  1309. movddup 1 * SIZE(BB), %xmm2
  1310. addpd %xmm0, %xmm4
  1311. movapd 2 * SIZE(AA), %xmm0
  1312. addl $2 * SIZE, AA
  1313. addl $1 * SIZE, BB
  1314. decl %eax
  1315. jg .L76
  1316. ALIGN_4
  1317. .L78:
  1318. addpd %xmm5, %xmm4
  1319. addpd %xmm7, %xmm6
  1320. addpd %xmm6, %xmm4
  1321. mulpd %xmm3, %xmm4
  1322. #ifndef TRMMKERNEL
  1323. #ifdef PENTIUM4
  1324. SHUFPD_2 %xmm0, %xmm0
  1325. #endif
  1326. movsd 0 * SIZE(%esi), %xmm0
  1327. movhpd 1 * SIZE(%esi), %xmm0
  1328. addpd %xmm0, %xmm4
  1329. #endif
  1330. movsd %xmm4, 0 * SIZE(%esi)
  1331. movhpd %xmm4, 1 * SIZE(%esi)
  1332. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1333. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1334. movl K, %eax
  1335. subl KKK, %eax
  1336. leal (,%eax, SIZE), %eax
  1337. leal (AA, %eax, 2), AA
  1338. leal (BB, %eax, 1), BB
  1339. #endif
  1340. #if defined(TRMMKERNEL) && defined(LEFT)
  1341. addl $2, KK
  1342. #endif
  1343. addl $2 * SIZE, %esi # coffset += 2
  1344. decl %ebx # i --
  1345. jg .L71
  1346. ALIGN_4
  1347. .L80:
  1348. movl M, %ebx
  1349. testl $1, %ebx # i = (m & 1)
  1350. jle .L999
  1351. #if !defined(TRMMKERNEL) || \
  1352. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1353. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1354. movl B, BB
  1355. #else
  1356. movl KK, %eax
  1357. leal (, %eax, SIZE), %eax
  1358. leal (AA, %eax, 1), AA
  1359. leal (B, %eax, 1), BB
  1360. #endif
  1361. movapd 0 * SIZE(AA), %xmm0
  1362. pxor %xmm4, %xmm4
  1363. movapd 8 * SIZE(AA), %xmm1
  1364. pxor %xmm5, %xmm5
  1365. movapd 0 * SIZE(BB), %xmm2
  1366. pxor %xmm6, %xmm6
  1367. movapd 8 * SIZE(BB), %xmm3
  1368. pxor %xmm7, %xmm7
  1369. #ifndef TRMMKERNEL
  1370. movl K, %eax
  1371. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1372. movl K, %eax
  1373. subl KK, %eax
  1374. movl %eax, KKK
  1375. #else
  1376. movl KK, %eax
  1377. #ifdef LEFT
  1378. addl $1, %eax
  1379. #else
  1380. addl $1, %eax
  1381. #endif
  1382. movl %eax, KKK
  1383. #endif
  1384. sarl $4, %eax
  1385. je .L85
  1386. ALIGN_4
  1387. .L82:
  1388. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1389. mulpd %xmm0, %xmm2
  1390. movapd 2 * SIZE(AA), %xmm0
  1391. addpd %xmm2, %xmm4
  1392. mulpd 2 * SIZE(BB), %xmm0
  1393. movapd 16 * SIZE(BB), %xmm2
  1394. addpd %xmm0, %xmm5
  1395. movapd 4 * SIZE(AA), %xmm0
  1396. mulpd 4 * SIZE(BB), %xmm0
  1397. addpd %xmm0, %xmm6
  1398. movapd 6 * SIZE(AA), %xmm0
  1399. mulpd 6 * SIZE(BB), %xmm0
  1400. addpd %xmm0, %xmm7
  1401. movapd 16 * SIZE(AA), %xmm0
  1402. mulpd %xmm1, %xmm3
  1403. movapd 10 * SIZE(AA), %xmm1
  1404. addpd %xmm3, %xmm4
  1405. mulpd 10 * SIZE(BB), %xmm1
  1406. movapd 24 * SIZE(BB), %xmm3
  1407. addpd %xmm1, %xmm5
  1408. movapd 12 * SIZE(AA), %xmm1
  1409. mulpd 12 * SIZE(BB), %xmm1
  1410. addpd %xmm1, %xmm6
  1411. movapd 14 * SIZE(AA), %xmm1
  1412. mulpd 14 * SIZE(BB), %xmm1
  1413. addpd %xmm1, %xmm7
  1414. movapd 24 * SIZE(AA), %xmm1
  1415. addl $16 * SIZE, AA
  1416. addl $16 * SIZE, BB
  1417. decl %eax
  1418. jne .L82
  1419. ALIGN_4
  1420. .L85:
  1421. #ifndef TRMMKERNEL
  1422. movl K, %eax
  1423. #else
  1424. movl KKK, %eax
  1425. #endif
  1426. movddup ALPHA, %xmm3
  1427. andl $15, %eax # if (k & 15)
  1428. BRANCH
  1429. je .L88
  1430. .L86:
  1431. mulsd %xmm0, %xmm2
  1432. movsd 1 * SIZE(AA), %xmm0
  1433. addsd %xmm2, %xmm4
  1434. movsd 1 * SIZE(BB), %xmm2
  1435. addl $1 * SIZE, AA
  1436. addl $1 * SIZE, BB
  1437. decl %eax
  1438. jg .L86
  1439. ALIGN_4
  1440. .L88:
  1441. addpd %xmm5, %xmm4
  1442. addpd %xmm7, %xmm6
  1443. addpd %xmm6, %xmm4
  1444. haddpd %xmm4, %xmm4
  1445. mulsd %xmm3, %xmm4
  1446. #ifndef TRMMKERNEL
  1447. #ifdef PENTIUM4
  1448. SHUFPD_2 %xmm0, %xmm0
  1449. #endif
  1450. movsd 0 * SIZE(%esi), %xmm0
  1451. addsd %xmm0, %xmm4
  1452. #endif
  1453. movsd %xmm4, 0 * SIZE(%esi)
  1454. ALIGN_4
  1455. .L999:
  1456. popl %ebx
  1457. popl %esi
  1458. popl %edi
  1459. popl %ebp
  1460. addl $ARGS, %esp
  1461. ret
  1462. EPILOGUE