
gemm_kernel_4x4_barcelona.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16
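
/*
 * i386 cdecl entry: %esi holds the stack pointer saved right after the
 * four register pushes in PROLOGUE, so STACK (16) skips those saves and
 * the OLD_* macros below address the incoming arguments.
 */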
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
#define OLD_ALPHA 16 + STACK(%esi)
#define OLD_A 20 + STACK(%esi)
#define OLD_B 24 + STACK(%esi)
#define OLD_C 28 + STACK(%esi)
#define OLD_LDC 32 + STACK(%esi)
#define STACK_OFFT 36 + STACK(%esi)

#define ALPHA 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
#define A 28(%esp)
#define C 32(%esp)
#define J 36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET 44(%esp)
#define KK 48(%esp)
#define KKK 52(%esp)
#define BUFFER 128(%esp)
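
/* Locals live on the 1024-byte-aligned stack set up in PROLOGUE;
   BUFFER receives the expanded (broadcast) copy of the current B panel. */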

#define PREFETCH prefetch
#define PREFETCHSIZE (16 * 17 + 0)
#define RPREFETCHSIZE (16 * 9 + 0)
#define WPREFETCHSIZE (16 * 9 + 0)

#define AA %edx
#define BB %ecx
#define LDC %ebp
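
/*
 * AA walks the packed A panel, BB the expanded B buffer, and LDC is the
 * leading dimension of C in bytes. Each KERNELn macro below is one step
 * of the 8x-unrolled 4x4 micro-kernel: %xmm0/%xmm1 carry four elements
 * of A, %xmm2/%xmm3 the pre-broadcast B values, and the four columns of
 * the C tile accumulate in %xmm4..%xmm7.
 */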

#define KERNEL1(address) \
	mulps %xmm0, %xmm2; \
	PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
	addps %xmm2, %xmm4; \
	movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm0, %xmm2; \
	addps %xmm2, %xmm5; \
	movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm0, %xmm2; \
	mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps %xmm2, %xmm6; \
	movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps %xmm0, %xmm7; \
	movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
	mulps %xmm0, %xmm3; \
	addps %xmm3, %xmm4; \
	movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm0, %xmm3; \
	addps %xmm3, %xmm5; \
	movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm0, %xmm3; \
	mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps %xmm3, %xmm6; \
	movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps %xmm0, %xmm7; \
	movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
	mulps %xmm0, %xmm2; \
	addps %xmm2, %xmm4; \
	movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm0, %xmm2; \
	addps %xmm2, %xmm5; \
	movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm0, %xmm2; \
	mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps %xmm2, %xmm6; \
	movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps %xmm0, %xmm7; \
	movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
	mulps %xmm0, %xmm3; \
	addps %xmm3, %xmm4; \
	movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm0, %xmm3; \
	addps %xmm3, %xmm5; \
	movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm0, %xmm3; \
	mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps %xmm3, %xmm6; \
	movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps %xmm0, %xmm7; \
	movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
	PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \
	mulps %xmm1, %xmm2; \
	addps %xmm2, %xmm4; \
	movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm1, %xmm2; \
	addps %xmm2, %xmm5; \
	movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm1, %xmm2; \
	mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps %xmm2, %xmm6; \
	movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps %xmm1, %xmm7; \
	movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
	mulps %xmm1, %xmm3; \
	addps %xmm3, %xmm4; \
	movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm1, %xmm3; \
	addps %xmm3, %xmm5; \
	movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm1, %xmm3; \
	mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps %xmm3, %xmm6; \
	movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps %xmm1, %xmm7; \
	movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
	mulps %xmm1, %xmm2; \
	addps %xmm2, %xmm4; \
	movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm1, %xmm2; \
	addps %xmm2, %xmm5; \
	movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm1, %xmm2; \
	mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps %xmm2, %xmm6; \
	movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps %xmm1, %xmm7; \
	movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
	mulps %xmm1, %xmm3; \
	addps %xmm3, %xmm4; \
	movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm1, %xmm3; \
	addps %xmm3, %xmm5; \
	movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm1, %xmm3; \
	mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps %xmm3, %xmm6; \
	movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps %xmm1, %xmm7; \
	movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
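
/* Entry: save callee-saved registers, switch to a 1 KB-aligned local
   stack area, and spill the scalar arguments into locals. */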

PROLOGUE

	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx

	PROFCODE

	movl %esp, %esi	# save old stack
	subl $128 + LOCAL_BUFFER_SIZE, %esp
	movl OLD_M, %ebx
	andl $-1024, %esp	# align stack

	STACK_TOUCHING

	movl OLD_N, %eax
	movl OLD_K, %ecx
	movl OLD_A, %edx
	movss OLD_ALPHA, %xmm3

#ifdef TRMMKERNEL
	movss STACK_OFFT, %xmm4
#endif

	movl %ebx, M
	movl %eax, N
	movl %ecx, K
	movl %edx, A
	movl %esi, OLD_STACK
	shufps $0, %xmm3, %xmm3

	movl OLD_B, %edi
	movl OLD_C, %ebx
	movaps %xmm3, ALPHA

	movl %ebx, C
	movl OLD_LDC, LDC

#ifdef TRMMKERNEL
	movss %xmm4, OFFSET
	movss %xmm4, KK
#ifndef LEFT
	negl KK
#endif
#endif

	leal (, LDC, SIZE), LDC

	sarl $2, %eax
	movl %eax, J
	jle .L40
	ALIGN_4

.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

/* Copying to Sub Buffer */
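/* Each scalar of the four-column B panel is broadcast (pshufd) into a
   full 4-wide vector and stored to BUFFER, so the micro-kernel can read
   B with aligned movaps loads; .L02 expands two k-steps per iteration
   and .L05 picks up an odd K. */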
	leal BUFFER, %ecx
	movl K, %eax
	sarl $1, %eax
	jle .L05
	ALIGN_4

.L02:
	prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)
	movaps 0 * SIZE(%edi), %xmm3
	movaps 4 * SIZE(%edi), %xmm7
	prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx)
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm2
	pshufd $0xff, %xmm3, %xmm3
	prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx)
	pshufd $0x00, %xmm7, %xmm4
	pshufd $0x55, %xmm7, %xmm5
	pshufd $0xaa, %xmm7, %xmm6
	pshufd $0xff, %xmm7, %xmm7
	movaps %xmm0, 0 * SIZE(%ecx)
	movaps %xmm1, 4 * SIZE(%ecx)
	movaps %xmm2, 8 * SIZE(%ecx)
	movaps %xmm3, 12 * SIZE(%ecx)
	movaps %xmm4, 16 * SIZE(%ecx)
	movaps %xmm5, 20 * SIZE(%ecx)
	movaps %xmm6, 24 * SIZE(%ecx)
	movaps %xmm7, 28 * SIZE(%ecx)
	addl $ 8 * SIZE, %edi
	subl $-32 * SIZE, %ecx
	decl %eax
	jne .L02
	ALIGN_2

.L05:
	movl K, %eax
	andl $1, %eax
	BRANCH
	jle .L10

	movaps 0 * SIZE(%edi), %xmm3
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm2
	pshufd $0xff, %xmm3, %xmm3
	movaps %xmm0, 0 * SIZE(%ecx)
	movaps %xmm1, 4 * SIZE(%ecx)
	movaps %xmm2, 8 * SIZE(%ecx)
	movaps %xmm3, 12 * SIZE(%ecx)
	addl $4 * SIZE, %edi
	ALIGN_4

.L10:
	movl C, %esi	# coffset = c
	movl A, %edx	# aoffset = a
	movl M, %ebx
	sarl $2, %ebx	# i = (m >> 2)
	jle .L20
	ALIGN_4

.L11:
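/* 4x4 tile: four rows of C across the four panel columns; the k loop
   below is unrolled eight deep via the KERNEL macros. */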
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 8), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 8), BB
#endif

	movaps 0 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movaps 16 * SIZE(AA), %xmm1
	xorps %xmm5, %xmm5
	movaps 0 * SIZE(BB), %xmm2
	xorps %xmm6, %xmm6
	movaps 16 * SIZE(BB), %xmm3
	xorps %xmm7, %xmm7

	leal (%esi, LDC, 2), %eax
	prefetchw 3 * SIZE(%esi)
	prefetchw 3 * SIZE(%esi, LDC)
	prefetchw 3 * SIZE(%eax)
	prefetchw 3 * SIZE(%eax, LDC)

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $4, %eax
#else
	addl $4, %eax
#endif
	movl %eax, KKK
#endif
	andl $-8, %eax
	sall $4, %eax
	je .L15
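/* %eax = (k & ~7) * 16. The chain below runs up to eight groups of
   eight KERNEL steps; each cmpl $128 * n / jle pair leaves the chain
   once the scaled count is spent, and .L12 then advances AA by %eax
   bytes and BB by 4 * %eax bytes past the data already consumed. */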
.L1X:
	KERNEL1(32 * 0)
	KERNEL2(32 * 0)
	KERNEL3(32 * 0)
	KERNEL4(32 * 0)
	KERNEL5(32 * 0)
	KERNEL6(32 * 0)
	KERNEL7(32 * 0)
	KERNEL8(32 * 0)
	cmpl $128 * 1, %eax
	jle .L12
	KERNEL1(32 * 1)
	KERNEL2(32 * 1)
	KERNEL3(32 * 1)
	KERNEL4(32 * 1)
	KERNEL5(32 * 1)
	KERNEL6(32 * 1)
	KERNEL7(32 * 1)
	KERNEL8(32 * 1)
	cmpl $128 * 2, %eax
	jle .L12
	KERNEL1(32 * 2)
	KERNEL2(32 * 2)
	KERNEL3(32 * 2)
	KERNEL4(32 * 2)
	KERNEL5(32 * 2)
	KERNEL6(32 * 2)
	KERNEL7(32 * 2)
	KERNEL8(32 * 2)
	cmpl $128 * 3, %eax
	jle .L12
	KERNEL1(32 * 3)
	KERNEL2(32 * 3)
	KERNEL3(32 * 3)
	KERNEL4(32 * 3)
	KERNEL5(32 * 3)
	KERNEL6(32 * 3)
	KERNEL7(32 * 3)
	KERNEL8(32 * 3)
	cmpl $128 * 4, %eax
	jle .L12
	KERNEL1(32 * 4)
	KERNEL2(32 * 4)
	KERNEL3(32 * 4)
	KERNEL4(32 * 4)
	KERNEL5(32 * 4)
	KERNEL6(32 * 4)
	KERNEL7(32 * 4)
	KERNEL8(32 * 4)
	cmpl $128 * 5, %eax
	jle .L12
	KERNEL1(32 * 5)
	KERNEL2(32 * 5)
	KERNEL3(32 * 5)
	KERNEL4(32 * 5)
	KERNEL5(32 * 5)
	KERNEL6(32 * 5)
	KERNEL7(32 * 5)
	KERNEL8(32 * 5)
	cmpl $128 * 6, %eax
	jle .L12
	KERNEL1(32 * 6)
	KERNEL2(32 * 6)
	KERNEL3(32 * 6)
	KERNEL4(32 * 6)
	KERNEL5(32 * 6)
	KERNEL6(32 * 6)
	KERNEL7(32 * 6)
	KERNEL8(32 * 6)
	cmpl $128 * 7, %eax
	jle .L12
	KERNEL1(32 * 7)
	KERNEL2(32 * 7)
	KERNEL3(32 * 7)
	KERNEL4(32 * 7)
	KERNEL5(32 * 7)
	KERNEL6(32 * 7)
	KERNEL7(32 * 7)
	KERNEL8(32 * 7)

	addl $128 * 8 * SIZE, BB
	addl $128 * 2 * SIZE, AA
	subl $128 * 8, %eax
	jg .L1X
	jmp .L15

.L12:
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 4), BB
	ALIGN_4

.L15:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax	# if (k & 7)
	BRANCH
	je .L18
	ALIGN_4

.L16:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm5
	movaps 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	mulps 12 * SIZE(BB), %xmm0
	addps %xmm2, %xmm6
	movaps 16 * SIZE(BB), %xmm2
	addps %xmm0, %xmm7
	movaps 4 * SIZE(AA), %xmm0
	addl $ 4 * SIZE, AA
	addl $16 * SIZE, BB
	decl %eax
	jg .L16
	ALIGN_4

.L18:
	leal (LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
	shufps $0xe4, %xmm0, %xmm0
	shufps $0xe4, %xmm1, %xmm1
	shufps $0xe4, %xmm2, %xmm2
	shufps $0xe4, %xmm3, %xmm3

	mulps %xmm3, %xmm4
	movsd 0 * SIZE(%esi), %xmm0
	movhps 2 * SIZE(%esi), %xmm0
	mulps %xmm3, %xmm5
	movsd 0 * SIZE(%esi, LDC, 1), %xmm1
	movhps 2 * SIZE(%esi, LDC, 1), %xmm1
	mulps %xmm3, %xmm6
	movsd 0 * SIZE(%esi, LDC, 2), %xmm2
	movhps 2 * SIZE(%esi, LDC, 2), %xmm2
	mulps %xmm3, %xmm7
	movsd 0 * SIZE(%esi, %eax, 1), %xmm3
	movhps 2 * SIZE(%esi, %eax, 1), %xmm3

	addps %xmm0, %xmm4
	addps %xmm1, %xmm5
	addps %xmm2, %xmm6
	addps %xmm3, %xmm7
#else
	mulps %xmm3, %xmm4
	mulps %xmm3, %xmm5
	mulps %xmm3, %xmm6
	mulps %xmm3, %xmm7
#endif
	movsd %xmm4, 0 * SIZE(%esi)
	movhps %xmm4, 2 * SIZE(%esi)
	movsd %xmm5, 0 * SIZE(%esi, LDC, 1)
	movhps %xmm5, 2 * SIZE(%esi, LDC, 1)
	movsd %xmm6, 0 * SIZE(%esi, LDC, 2)
	movhps %xmm6, 2 * SIZE(%esi, LDC, 2)
	movsd %xmm7, 0 * SIZE(%esi, %eax, 1)
	movhps %xmm7, 2 * SIZE(%esi, %eax, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, 8), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $4, KK
#endif

	addl $4 * SIZE, %esi	# coffset += 4
	decl %ebx	# i --
	jg .L11
	ALIGN_4

.L20:
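/* M & 2: two remaining rows against the four-column panel; movsd moves
   two floats at a time. */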
	testl $2, M
	je .L30

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 8), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 8), BB
#endif

	movsd 0 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movsd 8 * SIZE(AA), %xmm1
	xorps %xmm5, %xmm5
	movsd 0 * SIZE(BB), %xmm2
	xorps %xmm6, %xmm6
	movsd 16 * SIZE(BB), %xmm3
	xorps %xmm7, %xmm7

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $4, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L25
	ALIGN_4

.L22:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movsd 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm5
	movsd 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm6
	movsd 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movsd 32 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm4
	movsd 20 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm5
	movsd 24 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm6
	movsd 28 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 4 * SIZE(AA), %xmm0
	addps %xmm3, %xmm7
	movsd 48 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movsd 36 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm5
	movsd 40 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm6
	movsd 44 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 6 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movsd 64 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm4
	movsd 52 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm5
	movsd 56 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm6
	movsd 60 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 16 * SIZE(AA), %xmm0
	addps %xmm3, %xmm7
	movsd 80 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm4
	movsd 68 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm5
	movsd 72 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm6
	movsd 76 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 10 * SIZE(AA), %xmm1
	addps %xmm2, %xmm7
	movsd 96 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm4
	movsd 84 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm5
	movsd 88 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm6
	movsd 92 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 12 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movsd 112 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm4
	movsd 100 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm5
	movsd 104 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm6
	movsd 108 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 14 * SIZE(AA), %xmm1
	addps %xmm2, %xmm7
	movsd 128 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm4
	movsd 116 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm5
	movsd 120 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm6
	movsd 124 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 24 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movsd 144 * SIZE(BB), %xmm3
	addl $ 16 * SIZE, AA
	addl $128 * SIZE, BB
	decl %eax
	jne .L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax	# if (k & 7)
	BRANCH
	je .L28
	ALIGN_4

.L26:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm5
	movsd 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm6
	movsd 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movsd 16 * SIZE(BB), %xmm2
	addl $ 2 * SIZE, AA
	addl $16 * SIZE, BB
	decl %eax
	jg .L26
	ALIGN_4

.L28:
	leal (LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
	mulps %xmm3, %xmm4
	movsd 0 * SIZE(%esi), %xmm0
	mulps %xmm3, %xmm5
	movsd 0 * SIZE(%esi, LDC, 1), %xmm1
	mulps %xmm3, %xmm6
	movsd 0 * SIZE(%esi, LDC, 2), %xmm2
	mulps %xmm3, %xmm7
	movsd 0 * SIZE(%esi, %eax, 1), %xmm3

	addps %xmm0, %xmm4
	addps %xmm1, %xmm5
	addps %xmm2, %xmm6
	addps %xmm3, %xmm7
#else
	mulps %xmm3, %xmm4
	mulps %xmm3, %xmm5
	mulps %xmm3, %xmm6
	mulps %xmm3, %xmm7
#endif
	movsd %xmm4, 0 * SIZE(%esi)
	movsd %xmm5, 0 * SIZE(%esi, LDC, 1)
	movsd %xmm6, 0 * SIZE(%esi, LDC, 2)
	movsd %xmm7, 0 * SIZE(%esi, %eax, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, 8), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, %esi	# coffset += 2
	ALIGN_4

.L30:
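/* M & 1: one remaining row against the four-column panel (scalar SSE). */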
	testl $1, M
	je .L39

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 4), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 8), BB
	leal (BB, %eax, 8), BB
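	/* together the two leas above advance BB by 16 * %eax bytes,
	   i.e. 16 expanded floats of B per k-step */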
#endif

	movss 0 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movss 4 * SIZE(AA), %xmm1
	xorps %xmm5, %xmm5
	movss 0 * SIZE(BB), %xmm2
	xorps %xmm6, %xmm6
	movss 16 * SIZE(BB), %xmm3
	xorps %xmm7, %xmm7

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $4, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L35
	ALIGN_4

.L32:
	mulss %xmm0, %xmm2
	addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movss 4 * SIZE(BB), %xmm2
	mulss %xmm0, %xmm2
	addss %xmm2, %xmm5
	movss 8 * SIZE(BB), %xmm2
	mulss %xmm0, %xmm2
	mulss 12 * SIZE(BB), %xmm0
	addss %xmm2, %xmm6
	movss 32 * SIZE(BB), %xmm2
	addss %xmm0, %xmm7
	movss 1 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm3
	addss %xmm3, %xmm4
	movss 20 * SIZE(BB), %xmm3
	mulss %xmm0, %xmm3
	addss %xmm3, %xmm5
	movss 24 * SIZE(BB), %xmm3
	mulss %xmm0, %xmm3
	mulss 28 * SIZE(BB), %xmm0
	addss %xmm3, %xmm6
	movss 48 * SIZE(BB), %xmm3
	addss %xmm0, %xmm7
	movss 2 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm2
	addss %xmm2, %xmm4
	movss 36 * SIZE(BB), %xmm2
	mulss %xmm0, %xmm2
	addss %xmm2, %xmm5
	movss 40 * SIZE(BB), %xmm2
	mulss %xmm0, %xmm2
	mulss 44 * SIZE(BB), %xmm0
	addss %xmm2, %xmm6
	movss 64 * SIZE(BB), %xmm2
	addss %xmm0, %xmm7
	movss 3 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm3
	addss %xmm3, %xmm4
	movss 52 * SIZE(BB), %xmm3
	mulss %xmm0, %xmm3
	addss %xmm3, %xmm5
	movss 56 * SIZE(BB), %xmm3
	mulss %xmm0, %xmm3
	mulss 60 * SIZE(BB), %xmm0
	addss %xmm3, %xmm6
	movss 80 * SIZE(BB), %xmm3
	addss %xmm0, %xmm7
	movss 8 * SIZE(AA), %xmm0
	mulss %xmm1, %xmm2
	addss %xmm2, %xmm4
	movss 68 * SIZE(BB), %xmm2
	mulss %xmm1, %xmm2
	addss %xmm2, %xmm5
	movss 72 * SIZE(BB), %xmm2
	mulss %xmm1, %xmm2
	mulss 76 * SIZE(BB), %xmm1
	addss %xmm2, %xmm6
	movss 96 * SIZE(BB), %xmm2
	addss %xmm1, %xmm7
	movss 5 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm3
	addss %xmm3, %xmm4
	movss 84 * SIZE(BB), %xmm3
	mulss %xmm1, %xmm3
	addss %xmm3, %xmm5
	movss 88 * SIZE(BB), %xmm3
	mulss %xmm1, %xmm3
	mulss 92 * SIZE(BB), %xmm1
	addss %xmm3, %xmm6
	movss 112 * SIZE(BB), %xmm3
	addss %xmm1, %xmm7
	movss 6 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm2
	addss %xmm2, %xmm4
	movss 100 * SIZE(BB), %xmm2
	mulss %xmm1, %xmm2
	addss %xmm2, %xmm5
	movss 104 * SIZE(BB), %xmm2
	mulss %xmm1, %xmm2
	mulss 108 * SIZE(BB), %xmm1
	addss %xmm2, %xmm6
	movss 128 * SIZE(BB), %xmm2
	addss %xmm1, %xmm7
	movss 7 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm3
	addss %xmm3, %xmm4
	movss 116 * SIZE(BB), %xmm3
	mulss %xmm1, %xmm3
	addss %xmm3, %xmm5
	movss 120 * SIZE(BB), %xmm3
	mulss %xmm1, %xmm3
	mulss 124 * SIZE(BB), %xmm1
	addss %xmm3, %xmm6
	movss 144 * SIZE(BB), %xmm3
	addss %xmm1, %xmm7
	movss 12 * SIZE(AA), %xmm1
	addl $ 8 * SIZE, AA
	addl $128 * SIZE, BB
	decl %eax
	jne .L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax	# if (k & 7)
	BRANCH
	je .L38
	ALIGN_4

.L36:
	mulss %xmm0, %xmm2
	addss %xmm2, %xmm4
	movss 4 * SIZE(BB), %xmm2
	mulss %xmm0, %xmm2
	addss %xmm2, %xmm5
	movss 8 * SIZE(BB), %xmm2
	mulss %xmm0, %xmm2
	mulss 12 * SIZE(BB), %xmm0
	addss %xmm2, %xmm6
	movss 16 * SIZE(BB), %xmm2
	addss %xmm0, %xmm7
	movss 1 * SIZE(AA), %xmm0
	addl $ 1 * SIZE, AA
	addl $16 * SIZE, BB
	decl %eax
	jg .L36
	ALIGN_4

.L38:
	leal (LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
	mulss %xmm3, %xmm4
	movss 0 * SIZE(%esi), %xmm0
	mulss %xmm3, %xmm5
	movss 0 * SIZE(%esi, LDC, 1), %xmm1
	mulss %xmm3, %xmm6
	movss 0 * SIZE(%esi, LDC, 2), %xmm2
	mulss %xmm3, %xmm7
	movss 0 * SIZE(%esi, %eax, 1), %xmm3

	addss %xmm0, %xmm4
	addss %xmm1, %xmm5
	addss %xmm2, %xmm6
	addss %xmm3, %xmm7
#else
	mulss %xmm3, %xmm4
	mulss %xmm3, %xmm5
	mulss %xmm3, %xmm6
	mulss %xmm3, %xmm7
#endif
	movss %xmm4, 0 * SIZE(%esi)
	movss %xmm5, 0 * SIZE(%esi, LDC, 1)
	movss %xmm6, 0 * SIZE(%esi, LDC, 2)
	movss %xmm7, 0 * SIZE(%esi, %eax, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, 4), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 8), BB
	leal (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $1, KK
#endif
	ALIGN_4

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl $4, KK
#endif

	leal (, LDC, 4), %eax
	addl %eax, C	# c += 4 * ldc
	decl J	# j --
	jg .L01
	ALIGN_4

.L40:
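/* N & 2: expand the remaining two columns of B into BUFFER, then sweep
   M again with 4x2, 2x2 and 1x2 tiles. */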
	testl $2, N
	je .L80

#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl K, %eax
	leal BUFFER, %ecx
	sarl $2, %eax
	jle .L45
	ALIGN_4

.L42:
	prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)
	movaps 0 * SIZE(%edi), %xmm3
	movaps 4 * SIZE(%edi), %xmm7
	prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx)
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm2
	pshufd $0xff, %xmm3, %xmm3
	prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx)
	pshufd $0x00, %xmm7, %xmm4
	pshufd $0x55, %xmm7, %xmm5
	pshufd $0xaa, %xmm7, %xmm6
	pshufd $0xff, %xmm7, %xmm7
	movaps %xmm0, 0 * SIZE(%ecx)
	movaps %xmm1, 4 * SIZE(%ecx)
	movaps %xmm2, 8 * SIZE(%ecx)
	movaps %xmm3, 12 * SIZE(%ecx)
	movaps %xmm4, 16 * SIZE(%ecx)
	movaps %xmm5, 20 * SIZE(%ecx)
	movaps %xmm6, 24 * SIZE(%ecx)
	movaps %xmm7, 28 * SIZE(%ecx)
	addl $ 8 * SIZE, %edi
	subl $-32 * SIZE, %ecx
	decl %eax
	jne .L42
	ALIGN_4

.L45:
	movl K, %eax
	andl $3, %eax
	BRANCH
	jle .L50
	ALIGN_4

.L46:
	movsd 0 * SIZE(%edi), %xmm3
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	movaps %xmm0, 0 * SIZE(%ecx)
	movaps %xmm1, 4 * SIZE(%ecx)
	addl $2 * SIZE, %edi
	addl $8 * SIZE, %ecx
	decl %eax
	jne .L46
	ALIGN_4

.L50:
	movl C, %esi	# coffset = c
	movl A, %edx	# aoffset = a
	movl M, %ebx
	sarl $2, %ebx	# i = (m >> 2)
	jle .L60
	ALIGN_4

.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 8), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 4), BB
#endif

	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

	movaps 0 * SIZE(AA), %xmm0
	movaps 16 * SIZE(AA), %xmm1
	movaps 0 * SIZE(BB), %xmm2
	movaps 16 * SIZE(BB), %xmm3

	prefetchw 3 * SIZE(%esi)
	prefetchw 3 * SIZE(%esi, LDC)

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $4, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L55
	ALIGN_4

.L52:
	mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulps 4 * SIZE(BB), %xmm0
	addps %xmm2, %xmm4
	movaps 8 * SIZE(BB), %xmm2
	addps %xmm0, %xmm5
	movaps 4 * SIZE(AA), %xmm0
	mulps %xmm0, %xmm2
	mulps 12 * SIZE(BB), %xmm0
	addps %xmm2, %xmm4
	movaps 32 * SIZE(BB), %xmm2
	addps %xmm0, %xmm5
	movaps 8 * SIZE(AA), %xmm0
	mulps %xmm0, %xmm3
	mulps 20 * SIZE(BB), %xmm0
	addps %xmm3, %xmm4
	movaps 24 * SIZE(BB), %xmm3
	addps %xmm0, %xmm5
	movaps 12 * SIZE(AA), %xmm0
	mulps %xmm0, %xmm3
	mulps 28 * SIZE(BB), %xmm0
	addps %xmm3, %xmm4
	movaps 48 * SIZE(BB), %xmm3
	addps %xmm0, %xmm5
	movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
	mulps %xmm1, %xmm2
	mulps 36 * SIZE(BB), %xmm1
	addps %xmm2, %xmm4
	movaps 40 * SIZE(BB), %xmm2
	addps %xmm1, %xmm5
	movaps 20 * SIZE(AA), %xmm1
	mulps %xmm1, %xmm2
	mulps 44 * SIZE(BB), %xmm1
	addps %xmm2, %xmm4
	movaps 64 * SIZE(BB), %xmm2
	addps %xmm1, %xmm5
	movaps 24 * SIZE(AA), %xmm1
	mulps %xmm1, %xmm3
	mulps 52 * SIZE(BB), %xmm1
	addps %xmm3, %xmm4
	movaps 56 * SIZE(BB), %xmm3
	addps %xmm1, %xmm5
	movaps 28 * SIZE(AA), %xmm1
	mulps %xmm1, %xmm3
	mulps 60 * SIZE(BB), %xmm1
	addps %xmm3, %xmm4
	movaps 80 * SIZE(BB), %xmm3
	addps %xmm1, %xmm5
	movaps 48 * SIZE(AA), %xmm1
	addl $32 * SIZE, AA
	addl $64 * SIZE, BB
	decl %eax
	jne .L52
	ALIGN_4

.L55:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax	# if (k & 7)
	BRANCH
	je .L58
	ALIGN_4

.L56:
	mulps %xmm0, %xmm2
	mulps 4 * SIZE(BB), %xmm0
	addps %xmm2, %xmm4
	movaps 8 * SIZE(BB), %xmm2
	addps %xmm0, %xmm5
	movaps 4 * SIZE(AA), %xmm0
	addl $4 * SIZE, AA
	addl $8 * SIZE, BB
	decl %eax
	jg .L56
	ALIGN_4

.L58:
	mulps %xmm3, %xmm4
	mulps %xmm3, %xmm5

#ifndef TRMMKERNEL
	movsd 0 * SIZE(%esi), %xmm0
	movhps 2 * SIZE(%esi), %xmm0
	movsd 0 * SIZE(%esi, LDC, 1), %xmm1
	movhps 2 * SIZE(%esi, LDC, 1), %xmm1

	addps %xmm0, %xmm4
	addps %xmm1, %xmm5
#endif
	movsd %xmm4, 0 * SIZE(%esi)
	movhps %xmm4, 2 * SIZE(%esi)
	movsd %xmm5, 0 * SIZE(%esi, LDC, 1)
	movhps %xmm5, 2 * SIZE(%esi, LDC, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, 8), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $4, KK
#endif

	addl $4 * SIZE, %esi	# coffset += 4
	decl %ebx	# i --
	jg .L51
	ALIGN_4

.L60:
	testl $2, M
	je .L70

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 8), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 4), BB
#endif

	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

	movsd 0 * SIZE(AA), %xmm0
	movsd 8 * SIZE(AA), %xmm1
	movsd 0 * SIZE(BB), %xmm2
	movsd 16 * SIZE(BB), %xmm3

	leal (LDC, LDC, 2), %eax

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L65
	ALIGN_4

.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movsd 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm6
	movsd 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movsd 32 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm4
	movsd 20 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 6 * SIZE(AA), %xmm0
	addps %xmm3, %xmm5
	movsd 24 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm6
	movsd 28 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 16 * SIZE(AA), %xmm0
	addps %xmm3, %xmm7
	movsd 48 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm4
	movsd 36 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 10 * SIZE(AA), %xmm1
	addps %xmm2, %xmm5
	movsd 40 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm6
	movsd 44 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 12 * SIZE(AA), %xmm1
	addps %xmm2, %xmm7
	movsd 64 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm4
	movsd 52 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 14 * SIZE(AA), %xmm1
	addps %xmm3, %xmm5
	movsd 56 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm6
	movsd 60 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 24 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movsd 80 * SIZE(BB), %xmm3
	addl $16 * SIZE, AA
	addl $64 * SIZE, BB
	decl %eax
	jne .L62
	ALIGN_4

.L65:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax	# if (k & 7)
	BRANCH
	je .L68
	ALIGN_4

.L66:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movsd 8 * SIZE(BB), %xmm2
	addl $2 * SIZE, AA
	addl $8 * SIZE, BB
	decl %eax
	jg .L66
	ALIGN_4

.L68:
	addps %xmm6, %xmm4
	addps %xmm7, %xmm5

	mulps %xmm3, %xmm4
	mulps %xmm3, %xmm5

#ifndef TRMMKERNEL
	movsd 0 * SIZE(%esi), %xmm0
	movsd 0 * SIZE(%esi, LDC, 1), %xmm1

	addps %xmm0, %xmm4
	addps %xmm1, %xmm5
#endif
	movsd %xmm4, 0 * SIZE(%esi)
	movsd %xmm5, 0 * SIZE(%esi, LDC, 1)

	addl $2 * SIZE, %esi	# coffset += 2

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, 8), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif
	ALIGN_4

.L70:
	testl $1, M
	je .L79

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 4), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 8), BB
#endif

	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

	movss 0 * SIZE(AA), %xmm0
	movss 4 * SIZE(AA), %xmm1
	movss 0 * SIZE(BB), %xmm2
	movss 16 * SIZE(BB), %xmm3

	leal (LDC, LDC, 2), %eax

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L75
	ALIGN_4

.L72:
	mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulss 4 * SIZE(BB), %xmm0
	addss %xmm2, %xmm4
	movss 8 * SIZE(BB), %xmm2
	addss %xmm0, %xmm5
	movss 1 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm2
	mulss 12 * SIZE(BB), %xmm0
	addss %xmm2, %xmm6
	movss 32 * SIZE(BB), %xmm2
	addss %xmm0, %xmm7
	movss 2 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm3
	mulss 20 * SIZE(BB), %xmm0
	addss %xmm3, %xmm4
	movss 24 * SIZE(BB), %xmm3
	addss %xmm0, %xmm5
	movss 3 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm3
	mulss 28 * SIZE(BB), %xmm0
	addss %xmm3, %xmm6
	movss 48 * SIZE(BB), %xmm3
	addss %xmm0, %xmm7
	movss 8 * SIZE(AA), %xmm0
	mulss %xmm1, %xmm2
	mulss 36 * SIZE(BB), %xmm1
	addss %xmm2, %xmm4
	movss 40 * SIZE(BB), %xmm2
	addss %xmm1, %xmm5
	movss 5 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm2
	mulss 44 * SIZE(BB), %xmm1
	addss %xmm2, %xmm6
	movss 64 * SIZE(BB), %xmm2
	addss %xmm1, %xmm7
	movss 6 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm3
	mulss 52 * SIZE(BB), %xmm1
	addss %xmm3, %xmm4
	movss 56 * SIZE(BB), %xmm3
	addss %xmm1, %xmm5
	movss 7 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm3
	mulss 60 * SIZE(BB), %xmm1
	addss %xmm3, %xmm6
	movss 80 * SIZE(BB), %xmm3
	addss %xmm1, %xmm7
	movss 12 * SIZE(AA), %xmm1
	addl $ 8 * SIZE, AA
	addl $64 * SIZE, BB
	decl %eax
	jne .L72
	ALIGN_4

.L75:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax	# if (k & 7)
	BRANCH
	je .L78
	ALIGN_4

.L76:
	mulss %xmm0, %xmm2
	mulss 4 * SIZE(BB), %xmm0
	addss %xmm2, %xmm4
	movss 8 * SIZE(BB), %xmm2
	addss %xmm0, %xmm5
	movss 1 * SIZE(AA), %xmm0
	addl $ 1 * SIZE, AA
	addl $ 8 * SIZE, BB
	decl %eax
	jg .L76
	ALIGN_4

.L78:
	addss %xmm6, %xmm4
	addss %xmm7, %xmm5

	mulss %xmm3, %xmm4
	mulss %xmm3, %xmm5

#ifndef TRMMKERNEL
	movss 0 * SIZE(%esi), %xmm0
	movss 0 * SIZE(%esi, LDC, 1), %xmm1

	addss %xmm0, %xmm4
	addss %xmm1, %xmm5
#endif
	movss %xmm4, 0 * SIZE(%esi)
	movss %xmm5, 0 * SIZE(%esi, LDC, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, 4), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $1, KK
#endif
	ALIGN_4

.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl $2, KK
#endif
	leal (, LDC, 2), %eax
	addl %eax, C
	ALIGN_4

.L80:
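/* N & 1: expand the last column of B, then sweep M with 4x1, 2x1 and
   1x1 tiles. */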
	testl $1, N
	je .L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	leal BUFFER, %ecx
	movl K, %eax
	sarl $3, %eax
	jle .L85
	ALIGN_4

.L82:
	prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)
	movups 0 * SIZE(%edi), %xmm3
	movups 4 * SIZE(%edi), %xmm7
	prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx)
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm2
	pshufd $0xff, %xmm3, %xmm3
	prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx)
	pshufd $0x00, %xmm7, %xmm4
	pshufd $0x55, %xmm7, %xmm5
	pshufd $0xaa, %xmm7, %xmm6
	pshufd $0xff, %xmm7, %xmm7
	movaps %xmm0, 0 * SIZE(%ecx)
	movaps %xmm1, 4 * SIZE(%ecx)
	movaps %xmm2, 8 * SIZE(%ecx)
	movaps %xmm3, 12 * SIZE(%ecx)
	movaps %xmm4, 16 * SIZE(%ecx)
	movaps %xmm5, 20 * SIZE(%ecx)
	movaps %xmm6, 24 * SIZE(%ecx)
	movaps %xmm7, 28 * SIZE(%ecx)
	addl $ 8 * SIZE, %edi
	subl $-32 * SIZE, %ecx
	decl %eax
	jne .L82
	ALIGN_4

.L85:
	movl K, %eax
	andl $7, %eax
	BRANCH
	jle .L90
	ALIGN_4

.L86:
	movss 0 * SIZE(%edi), %xmm3
	pshufd $0x00, %xmm3, %xmm0
	movaps %xmm0, 0 * SIZE(%ecx)
	addl $1 * SIZE, %edi
	addl $4 * SIZE, %ecx
	decl %eax
	jne .L86
	ALIGN_4

.L90:
	movl C, %esi	# coffset = c
	movl A, %edx	# aoffset = a
	movl M, %ebx
	sarl $2, %ebx	# i = (m >> 2)
	jle .L100
	ALIGN_4

.L91:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 8), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 2), BB
#endif

	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

	movaps 0 * SIZE(AA), %xmm0
	movaps 16 * SIZE(AA), %xmm1
	movaps 0 * SIZE(BB), %xmm2
	movaps 16 * SIZE(BB), %xmm3

	prefetchw 3 * SIZE(%esi)

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $4, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L95
	ALIGN_4

.L92:
	mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movaps 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm4
	movaps 32 * SIZE(BB), %xmm2
	mulps 4 * SIZE(BB), %xmm0
	addps %xmm0, %xmm5
	movaps 8 * SIZE(AA), %xmm0
	mulps 8 * SIZE(BB), %xmm0
	addps %xmm0, %xmm6
	movaps 12 * SIZE(AA), %xmm0
	mulps 12 * SIZE(BB), %xmm0
	addps %xmm0, %xmm7
	movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
	mulps %xmm1, %xmm3
	movaps 20 * SIZE(AA), %xmm1
	addps %xmm3, %xmm4
	movaps 48 * SIZE(BB), %xmm3
	mulps 20 * SIZE(BB), %xmm1
	addps %xmm1, %xmm5
	movaps 24 * SIZE(AA), %xmm1
	mulps 24 * SIZE(BB), %xmm1
	addps %xmm1, %xmm6
	movaps 28 * SIZE(AA), %xmm1
	mulps 28 * SIZE(BB), %xmm1
	addps %xmm1, %xmm7
	movaps 48 * SIZE(AA), %xmm1
	addl $32 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L92
	ALIGN_4

.L95:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax	# if (k & 7)
	BRANCH
	je .L98
	ALIGN_4

.L96:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(AA), %xmm0
	movaps 4 * SIZE(BB), %xmm2
	addl $4 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L96
	ALIGN_4

.L98:
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	addps %xmm6, %xmm4

	mulps %xmm3, %xmm4

#ifndef TRMMKERNEL
	movsd 0 * SIZE(%esi), %xmm0
	movhps 2 * SIZE(%esi), %xmm0
	addps %xmm0, %xmm4
#endif
	movsd %xmm4, 0 * SIZE(%esi)
	movhps %xmm4, 2 * SIZE(%esi)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, 8), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $4, KK
#endif

	addl $4 * SIZE, %esi	# coffset += 4
	decl %ebx	# i --
	jg .L91
	ALIGN_4

.L100:
	testl $2, M
	je .L110

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 8), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 2), BB
#endif

	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

	movsd 0 * SIZE(AA), %xmm0
	movsd 8 * SIZE(AA), %xmm1
	movsd 0 * SIZE(BB), %xmm2
	movsd 16 * SIZE(BB), %xmm3

	leal (LDC, LDC, 2), %eax

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L105
	ALIGN_4

.L102:
	mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movsd 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 6 * SIZE(AA), %xmm0
	addps %xmm2, %xmm6
	movsd 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 16 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movsd 32 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	movsd 10 * SIZE(AA), %xmm1
	addps %xmm3, %xmm4
	movsd 20 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 12 * SIZE(AA), %xmm1
	addps %xmm3, %xmm5
	movsd 24 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 14 * SIZE(AA), %xmm1
	addps %xmm3, %xmm6
	movsd 28 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 24 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movsd 48 * SIZE(BB), %xmm3
	addl $16 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L102
	ALIGN_4

.L105:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax	# if (k & 7)
	BRANCH
	je .L108
	ALIGN_4

.L106:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movsd 2 * SIZE(AA), %xmm0
	movsd 4 * SIZE(BB), %xmm2
	addl $2 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L106
	ALIGN_4

.L108:
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	addps %xmm6, %xmm4

	mulps %xmm3, %xmm4

#ifndef TRMMKERNEL
	movsd 0 * SIZE(%esi), %xmm0
	addps %xmm0, %xmm4
#endif
	movsd %xmm4, 0 * SIZE(%esi)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, 8), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, %esi	# coffset += 2
	ALIGN_4

.L110:
	testl $1, M
	je .L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 4), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 4), BB
#endif

	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

	movss 0 * SIZE(AA), %xmm0
	movss 4 * SIZE(AA), %xmm1
	movss 0 * SIZE(BB), %xmm2
	movss 16 * SIZE(BB), %xmm3

	leal (LDC, LDC, 2), %eax

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L115
	ALIGN_4

.L112:
	mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movss 1 * SIZE(AA), %xmm0
	addss %xmm2, %xmm4
	movss 32 * SIZE(BB), %xmm2
	mulss 4 * SIZE(BB), %xmm0
	addss %xmm0, %xmm5
	movss 2 * SIZE(AA), %xmm0
	mulss 8 * SIZE(BB), %xmm0
	addss %xmm0, %xmm6
	movss 3 * SIZE(AA), %xmm0
	mulss 12 * SIZE(BB), %xmm0
	addss %xmm0, %xmm7
	movss 8 * SIZE(AA), %xmm0
	mulss %xmm1, %xmm3
	movss 5 * SIZE(AA), %xmm1
	addss %xmm3, %xmm4
	movss 48 * SIZE(BB), %xmm3
	mulss 20 * SIZE(BB), %xmm1
	addss %xmm1, %xmm5
	movss 6 * SIZE(AA), %xmm1
	mulss 24 * SIZE(BB), %xmm1
	addss %xmm1, %xmm6
	movss 7 * SIZE(AA), %xmm1
	mulss 28 * SIZE(BB), %xmm1
	addss %xmm1, %xmm7
	movss 12 * SIZE(AA), %xmm1
	addl $ 8 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L112
	ALIGN_4

.L115:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax	# if (k & 7)
	BRANCH
	je .L118
	ALIGN_4

.L116:
	mulss %xmm0, %xmm2
	movss 1 * SIZE(AA), %xmm0
	addss %xmm2, %xmm4
	movss 4 * SIZE(BB), %xmm2
	addl $ 1 * SIZE, AA
	addl $ 4 * SIZE, BB
	decl %eax
	jg .L116
	ALIGN_4

.L118:
	addss %xmm5, %xmm4
	addss %xmm7, %xmm6
	addss %xmm6, %xmm4

	mulss %xmm3, %xmm4

#ifndef TRMMKERNEL
	movss 0 * SIZE(%esi), %xmm0
	addss %xmm0, %xmm4
#endif
	movss %xmm4, 0 * SIZE(%esi)
	ALIGN_4

.L999:
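	/* restore the caller's stack pointer and callee-saved registers */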
	movl OLD_STACK, %esp

	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret

EPILOGUE