
gemm_kernel_4x8_nehalem.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
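
/* SGEMM kernel for Nehalem (SSE), 4x8 register blocking:
   C := alpha * A * B + C, with A packed in 4-row panels and B in
   8-column panels.  When TRMMKERNEL is defined the same body is
   built as the TRMM kernel and the load-and-add of C is skipped. */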
#define ASSEMBLER
#include "common.h"

#define OLD_M	%rdi
#define OLD_N	%rsi
#define OLD_K	%rdx

#define M	%r13
#define N	%r14
#define K	%r15

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%rbp

#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define CO1	%rbx
#define CO2	%rdx
#define BB	%r12
#define PREA	%r10

#ifndef WINDOWS_ABI
#define STACKSIZE 128
#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)
#define ALPHA 48(%rsp)
#define J 56(%rsp)
#define OFFSET 64(%rsp)
#define KK 72(%rsp)
#define KKK 80(%rsp)
#else
#define STACKSIZE 512
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#define ALPHA 224(%rsp)
#define J 232(%rsp)
#define OFFSET 240(%rsp)
#define KK 248(%rsp)
#define KKK 256(%rsp)
#endif

#define PREFETCHSIZE 8
#define PREFETCH prefetcht0

	PROLOGUE
	PROFCODE

	subq $STACKSIZE, %rsp

	movq %rbx, 0(%rsp)
	movq %rbp, 8(%rsp)
	movq %r12, 16(%rsp)
	movq %r13, 24(%rsp)
	movq %r14, 32(%rsp)
	movq %r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq %rdi, 48(%rsp)
	movq %rsi, 56(%rsp)
	movups %xmm6, 64(%rsp)
	movups %xmm7, 80(%rsp)
	movups %xmm8, 96(%rsp)
	movups %xmm9, 112(%rsp)
	movups %xmm10, 128(%rsp)
	movups %xmm11, 144(%rsp)
	movups %xmm12, 160(%rsp)
	movups %xmm13, 176(%rsp)
	movups %xmm14, 192(%rsp)
	movups %xmm15, 208(%rsp)

	movq ARG1, OLD_M
	movq ARG2, OLD_N
	movq ARG3, OLD_K
	movq OLD_A, A
	movq OLD_B, B
	movq OLD_C, C
	movq OLD_LDC, LDC
#ifdef TRMMKERNEL
	movq OLD_OFFSET, %r11
#endif
	movaps %xmm3, %xmm0
#else
	movq OLD_LDC, LDC
#ifdef TRMMKERNEL
	movq OLD_OFFSET, %r11
#endif
#endif

	unpcklps %xmm0, %xmm0
	movlps %xmm0, ALPHA

	subq $-32 * SIZE, A
	subq $-32 * SIZE, B

	movq OLD_M, M
	movq OLD_N, N
	movq OLD_K, K

	salq $BASE_SHIFT, LDC

#ifdef TRMMKERNEL
	movq %r11, OFFSET
#ifndef LEFT
	negq %r11
#endif
	movq %r11, KK
#endif
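
/* Outer loop: J = N >> 3 panels of eight columns; the N & 4,
   N & 2 and N & 1 tails are handled from .L40, .L70 and .L100. */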
	movq N, J
	sarq $3, J
	NOBRANCH
	jle .L40
	ALIGN_4

.L10:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq OFFSET, %rax
	movq %rax, KK
#endif
	movq C, CO1
	leaq (C, LDC, 4), CO2
	movq A, AO

	movq K, %rax
	salq $BASE_SHIFT + 3, %rax
	leaq (B, %rax), BB

	movq M, I
	sarq $2, I
	NOBRANCH
	jle .L20
	ALIGN_4

.L11:
	prefetcht2 -32 * SIZE(BB)
	subq $-16 * SIZE, BB

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 8), BO
#endif

	leaq (LDC, LDC, 2), %rax

	xorps %xmm1, %xmm1
	xorps %xmm2, %xmm2
	xorps %xmm3, %xmm3
	PADDING
	xorps %xmm4, %xmm4

	PADDING
	xorps %xmm8, %xmm8
	prefetcht0 3 * SIZE(CO1)
	xorps %xmm9, %xmm9
	prefetcht0 7 * SIZE(CO1, LDC, 1)
	PADDING
	xorps %xmm10, %xmm10
	prefetcht0 3 * SIZE(CO1, LDC, 2)
	PADDING
	xorps %xmm11, %xmm11
	prefetcht0 7 * SIZE(CO1, %rax, 1)

	movaps -32 * SIZE(AO), %xmm0

	PADDING
	xorps %xmm12, %xmm12
	prefetcht0 3 * SIZE(CO2)
	xorps %xmm13, %xmm13
	prefetcht0 7 * SIZE(CO2, LDC, 1)
	xorps %xmm14, %xmm14
	prefetcht0 3 * SIZE(CO2, LDC, 2)
	xorps %xmm15, %xmm15
	prefetcht0 7 * SIZE(CO2, %rax, 1)

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $4, %rax
#else
	addq $8, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L15
	ALIGN_3
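
/* Inner K loop, unrolled four times.  Per k step: four elements of
   A (xmm0/xmm7) meet eight of B; pshufd $0x39 rotates each B quad
   so xmm8-xmm15 accumulate the diagonals of the 4x8 product.      */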
.L12:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

	addps %xmm1, %xmm12
	movaps -32 * SIZE(BO), %xmm1
	addps %xmm2, %xmm13
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	pshufd $0x39, %xmm2, %xmm5
	mulps %xmm0, %xmm2

	addps %xmm3, %xmm14
	addps %xmm4, %xmm15
	pshufd $0x39, %xmm5, %xmm6
	mulps %xmm0, %xmm5
	mulps %xmm0, %xmm6

	addps %xmm1, %xmm8
	movaps -28 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm0, %xmm2

	addps %xmm5, %xmm10
	addps %xmm6, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	movaps -28 * SIZE(AO), %xmm7
	mulps %xmm0, %xmm3
	mulps %xmm0, %xmm4

	addps %xmm1, %xmm12
	movaps -24 * SIZE(BO), %xmm1
	addps %xmm2, %xmm13
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm7, %xmm1
	pshufd $0x39, %xmm2, %xmm5
	mulps %xmm7, %xmm2

	addps %xmm3, %xmm14
	addps %xmm4, %xmm15
	pshufd $0x39, %xmm5, %xmm6
	mulps %xmm7, %xmm5
	mulps %xmm7, %xmm6

	addps %xmm1, %xmm8
	movaps -20 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm7, %xmm1
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm7, %xmm2

	addps %xmm5, %xmm10
	addps %xmm6, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	movaps -24 * SIZE(AO), %xmm0
	mulps %xmm7, %xmm3
	mulps %xmm7, %xmm4

	addps %xmm1, %xmm12
	movaps -16 * SIZE(BO), %xmm1
	addps %xmm2, %xmm13
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	pshufd $0x39, %xmm2, %xmm5
	mulps %xmm0, %xmm2

	addps %xmm3, %xmm14
	addps %xmm4, %xmm15
	pshufd $0x39, %xmm5, %xmm6
	mulps %xmm0, %xmm5
	mulps %xmm0, %xmm6

	addps %xmm1, %xmm8
	movaps -12 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm0, %xmm2

	addps %xmm5, %xmm10
	addps %xmm6, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	movaps -20 * SIZE(AO), %xmm7
	mulps %xmm0, %xmm3
	mulps %xmm0, %xmm4

	addps %xmm1, %xmm12
	movaps -8 * SIZE(BO), %xmm1
	addps %xmm2, %xmm13
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm7, %xmm1
	pshufd $0x39, %xmm2, %xmm5
	mulps %xmm7, %xmm2

	addps %xmm3, %xmm14
	addps %xmm4, %xmm15
	pshufd $0x39, %xmm5, %xmm6
	mulps %xmm7, %xmm5
	mulps %xmm7, %xmm6

	addps %xmm1, %xmm8
	movaps -4 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	subq $-32 * SIZE, BO
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm7, %xmm1
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm7, %xmm2

	addps %xmm5, %xmm10
	addps %xmm6, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	mulps %xmm7, %xmm3
	movaps -16 * SIZE(AO), %xmm0
	mulps %xmm7, %xmm4

	subq $-16 * SIZE, AO
	decq %rax
	BRANCH
	jg .L12
	ALIGN_3

.L15:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L18
	ALIGN_3

.L16:
	addps %xmm1, %xmm12
	movaps -32 * SIZE(BO), %xmm1
	addps %xmm2, %xmm13
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	pshufd $0x39, %xmm2, %xmm5
	mulps %xmm0, %xmm2

	addps %xmm3, %xmm14
	addps %xmm4, %xmm15
	pshufd $0x39, %xmm5, %xmm6
	mulps %xmm0, %xmm5
	mulps %xmm0, %xmm6

	addps %xmm1, %xmm8
	movaps -28 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm0, %xmm2

	addps %xmm5, %xmm10
	addps %xmm6, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	mulps %xmm0, %xmm3
	mulps %xmm0, %xmm4
	movaps -28 * SIZE(AO), %xmm0

	addq $4 * SIZE, AO
	addq $8 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L16
	ALIGN_3

.L18:
	addps %xmm1, %xmm12
	addps %xmm2, %xmm13
	addps %xmm3, %xmm14
	addps %xmm4, %xmm15
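
/* The shufps $0xd8 passes realign the diagonal accumulators so each
   register holds one four-element column of the 4x8 C tile, which
   is then scaled by alpha and stored (added to C unless TRMM).     */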
	movaps %xmm9, %xmm4
	shufps $0xd8, %xmm8, %xmm9
	shufps $0xd8, %xmm11, %xmm8
	shufps $0xd8, %xmm10, %xmm11
	shufps $0xd8, %xmm4, %xmm10

	movaps %xmm8, %xmm4
	shufps $0xd8, %xmm10, %xmm8
	shufps $0xd8, %xmm4, %xmm10
	movaps %xmm9, %xmm5
	shufps $0xd8, %xmm11, %xmm9
	shufps $0xd8, %xmm5, %xmm11

	movaps %xmm13, %xmm4
	shufps $0xd8, %xmm12, %xmm13
	shufps $0xd8, %xmm15, %xmm12
	shufps $0xd8, %xmm14, %xmm15
	shufps $0xd8, %xmm4, %xmm14

	movaps %xmm12, %xmm4
	shufps $0xd8, %xmm14, %xmm12
	shufps $0xd8, %xmm4, %xmm14
	movaps %xmm13, %xmm5
	shufps $0xd8, %xmm15, %xmm13
	shufps $0xd8, %xmm5, %xmm15

	mulps %xmm7, %xmm8
	mulps %xmm7, %xmm9
	mulps %xmm7, %xmm10
	mulps %xmm7, %xmm11

	mulps %xmm7, %xmm12
	mulps %xmm7, %xmm13
	mulps %xmm7, %xmm14
	mulps %xmm7, %xmm15

	leaq (LDC, LDC, 2), %rax

#ifndef TRMMKERNEL
	movups (CO1), %xmm0
	movups (CO1, LDC, 1), %xmm1
	movups (CO1, LDC, 2), %xmm2
	movups (CO1, %rax, 1), %xmm3

	movups (CO2), %xmm4
	movups (CO2, LDC, 1), %xmm5
	movups (CO2, LDC, 2), %xmm6
	movups (CO2, %rax, 1), %xmm7

	addps %xmm0, %xmm8
	addps %xmm1, %xmm9
	addps %xmm2, %xmm10
	addps %xmm3, %xmm11

	addps %xmm4, %xmm12
	addps %xmm5, %xmm13
	addps %xmm6, %xmm14
	addps %xmm7, %xmm15
#endif

	movups %xmm8, (CO1)
	movups %xmm9, (CO1, LDC, 1)
	movups %xmm10, (CO1, LDC, 2)
	movups %xmm11, (CO1, %rax, 1)

	movups %xmm12, (CO2)
	movups %xmm13, (CO2, LDC, 1)
	movups %xmm14, (CO2, LDC, 2)
	movups %xmm15, (CO2, %rax, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $4, KK
#endif

	addq $4 * SIZE, CO1
	addq $4 * SIZE, CO2
	decq I
	BRANCH
	jg .L11
	ALIGN_4
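
/* M & 2: two remaining rows against the eight-column panel. */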
.L20:
	testq $2, M
	BRANCH
	jle .L30

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 8), BO
#endif

	xorps %xmm1, %xmm1
	movddup -32 * SIZE(AO), %xmm0
	xorps %xmm2, %xmm2
	movaps -32 * SIZE(BO), %xmm5
	xorps %xmm3, %xmm3
	xorps %xmm4, %xmm4

	xorps %xmm8, %xmm8
	xorps %xmm9, %xmm9
	xorps %xmm10, %xmm10
	xorps %xmm11, %xmm11

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $2, %rax
#else
	addq $8, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L25
	ALIGN_3

.L22:
	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	mulps %xmm0, %xmm2
	movaps -28 * SIZE(BO), %xmm5

	addps %xmm3, %xmm10
	pshufd $0x50, %xmm5, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm11
	pshufd $0xfa, %xmm5, %xmm4
	mulps %xmm0, %xmm4
	movaps -24 * SIZE(BO), %xmm5
	movddup -30 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	mulps %xmm0, %xmm2
	movaps -20 * SIZE(BO), %xmm5

	addps %xmm3, %xmm10
	pshufd $0x50, %xmm5, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm11
	pshufd $0xfa, %xmm5, %xmm4
	mulps %xmm0, %xmm4
	movaps -16 * SIZE(BO), %xmm5
	movddup -28 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	mulps %xmm0, %xmm2
	movaps -12 * SIZE(BO), %xmm5

	addps %xmm3, %xmm10
	pshufd $0x50, %xmm5, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm11
	pshufd $0xfa, %xmm5, %xmm4
	mulps %xmm0, %xmm4
	movaps -8 * SIZE(BO), %xmm5
	movddup -26 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	mulps %xmm0, %xmm2
	movaps -4 * SIZE(BO), %xmm5

	addps %xmm3, %xmm10
	pshufd $0x50, %xmm5, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm11
	pshufd $0xfa, %xmm5, %xmm4
	mulps %xmm0, %xmm4
	movaps 0 * SIZE(BO), %xmm5
	movddup -24 * SIZE(AO), %xmm0

	subq $-32 * SIZE, BO
	subq $ -8 * SIZE, AO
	subq $1, %rax
	BRANCH
	jg .L22
	ALIGN_3

.L25:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L28
	ALIGN_3

.L26:
	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	mulps %xmm0, %xmm2
	movaps -28 * SIZE(BO), %xmm5

	addps %xmm3, %xmm10
	pshufd $0x50, %xmm5, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm11
	pshufd $0xfa, %xmm5, %xmm4
	mulps %xmm0, %xmm4
	movaps -24 * SIZE(BO), %xmm5
	movddup -30 * SIZE(AO), %xmm0

	addq $2 * SIZE, AO
	addq $8 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L26
	ALIGN_3

.L28:
	addps %xmm1, %xmm8
	addps %xmm2, %xmm9
	addps %xmm3, %xmm10
	addps %xmm4, %xmm11

	mulps %xmm7, %xmm8
	mulps %xmm7, %xmm9
	mulps %xmm7, %xmm10
	mulps %xmm7, %xmm11

	leaq (LDC, LDC, 2), %rax

#ifndef TRMMKERNEL
	movsd (CO1), %xmm0
	movhps (CO1, LDC, 1), %xmm0
	movsd (CO1, LDC, 2), %xmm1
	movhps (CO1, %rax, 1), %xmm1
	movsd (CO2), %xmm2
	movhps (CO2, LDC, 1), %xmm2
	movsd (CO2, LDC, 2), %xmm3
	movhps (CO2, %rax, 1), %xmm3

	addps %xmm0, %xmm8
	addps %xmm1, %xmm9
	addps %xmm2, %xmm10
	addps %xmm3, %xmm11
#endif

	movsd %xmm8, (CO1)
	movhps %xmm8, (CO1, LDC, 1)
	movsd %xmm9, (CO1, LDC, 2)
	movhps %xmm9, (CO1, %rax, 1)
	movsd %xmm10, (CO2)
	movhps %xmm10, (CO2, LDC, 1)
	movsd %xmm11, (CO2, LDC, 2)
	movhps %xmm11, (CO2, %rax, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $2, KK
#endif

	addq $2 * SIZE, CO1
	addq $2 * SIZE, CO2
	ALIGN_4
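
/* M & 1: one remaining row against the eight-column panel. */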
.L30:
	testq $1, M
	BRANCH
	jle .L39

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 8), BO
#endif

	xorps %xmm2, %xmm2
	movsd -32 * SIZE(AO), %xmm0
	xorps %xmm3, %xmm3

	xorps %xmm8, %xmm8
	xorps %xmm12, %xmm12

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $1, %rax
#else
	addq $8, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L35
	ALIGN_3

.L32:
	pshufd $0x00, %xmm0, %xmm1
	addps %xmm2, %xmm8
	movaps -32 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm3, %xmm12
	movaps -28 * SIZE(BO), %xmm3
	mulps %xmm1, %xmm3

	pshufd $0x55, %xmm0, %xmm1
	movsd -30 * SIZE(AO), %xmm0
	addps %xmm2, %xmm8
	movaps -24 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm3, %xmm12
	movaps -20 * SIZE(BO), %xmm3
	mulps %xmm1, %xmm3

	pshufd $0x00, %xmm0, %xmm1
	addps %xmm2, %xmm8
	movaps -16 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm3, %xmm12
	movaps -12 * SIZE(BO), %xmm3
	mulps %xmm1, %xmm3

	pshufd $0x55, %xmm0, %xmm1
	movsd -28 * SIZE(AO), %xmm0
	addps %xmm2, %xmm8
	movaps -8 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm3, %xmm12
	movaps -4 * SIZE(BO), %xmm3
	mulps %xmm1, %xmm3

	subq $-32 * SIZE, BO
	subq $ -4 * SIZE, AO
	subq $1, %rax
	BRANCH
	jg .L32
	ALIGN_3

.L35:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L38
	ALIGN_3

.L36:
	pshufd $0x00, %xmm0, %xmm1
	movss -31 * SIZE(AO), %xmm0
	addps %xmm2, %xmm8
	movaps -32 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm3, %xmm12
	movaps -28 * SIZE(BO), %xmm3
	mulps %xmm1, %xmm3

	addq $1 * SIZE, AO
	addq $8 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L36
	ALIGN_3

.L38:
	addps %xmm2, %xmm8
	addps %xmm3, %xmm12

	mulps %xmm7, %xmm8
	mulps %xmm7, %xmm12

	pshufd $0xff, %xmm8, %xmm11
	pshufd $0xaa, %xmm8, %xmm10
	pshufd $0x55, %xmm8, %xmm9
	pshufd $0x00, %xmm8, %xmm8

	pshufd $0xff, %xmm12, %xmm15
	pshufd $0xaa, %xmm12, %xmm14
	pshufd $0x55, %xmm12, %xmm13
	pshufd $0x00, %xmm12, %xmm12

	leaq (LDC, LDC, 2), %rax

#ifndef TRMMKERNEL
	addss (CO1), %xmm8
	addss (CO1, LDC, 1), %xmm9
	addss (CO1, LDC, 2), %xmm10
	addss (CO1, %rax, 1), %xmm11

	addss (CO2), %xmm12
	addss (CO2, LDC, 1), %xmm13
	addss (CO2, LDC, 2), %xmm14
	addss (CO2, %rax, 1), %xmm15
#endif

	movss %xmm8, (CO1)
	movss %xmm9, (CO1, LDC, 1)
	movss %xmm10, (CO1, LDC, 2)
	movss %xmm11, (CO1, %rax, 1)

	movss %xmm12, (CO2)
	movss %xmm13, (CO2, LDC, 1)
	movss %xmm14, (CO2, LDC, 2)
	movss %xmm15, (CO2, %rax, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $1, KK
#endif
	ALIGN_4

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq $8, KK
#endif

	movq BO, B
	leaq (C, LDC, 8), C
	subq $1, J
	BRANCH
	jg .L10
	ALIGN_4
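
/* N & 4: four-column panels, with the same 4/2/1-row subdivision
   of M as above.                                                 */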
.L40:
	testq $4, N
	jle .L70

#if defined(TRMMKERNEL) && defined(LEFT)
	movq OFFSET, %rax
	movq %rax, KK
#endif

	movq C, CO1
	leaq (C, LDC, 2), CO2
	movq A, AO

	movq M, I
	sarq $2, I
	NOBRANCH
	jle .L50
	ALIGN_4

.L41:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 4), BO
#endif

	xorps %xmm1, %xmm1
	movaps -32 * SIZE(AO), %xmm0
	xorps %xmm2, %xmm2
	xorps %xmm3, %xmm3
	xorps %xmm4, %xmm4

	xorps %xmm8, %xmm8
	prefetcht2 4 * SIZE(CO1)
	xorps %xmm9, %xmm9
	prefetcht2 4 * SIZE(CO1, LDC, 1)
	xorps %xmm10, %xmm10
	prefetcht2 4 * SIZE(CO2)
	xorps %xmm11, %xmm11
	prefetcht2 4 * SIZE(CO2, LDC, 1)

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $4, %rax
#else
	addq $4, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L45
	ALIGN_3

.L42:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

	addps %xmm1, %xmm8
	movaps -32 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	addps %xmm3, %xmm10
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	addps %xmm4, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	mulps %xmm0, %xmm3
	mulps %xmm0, %xmm4
	movaps -28 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	movaps -28 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	addps %xmm3, %xmm10
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	addps %xmm4, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	mulps %xmm0, %xmm3
	mulps %xmm0, %xmm4
	movaps -24 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	movaps -24 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	addps %xmm3, %xmm10
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	addps %xmm4, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	mulps %xmm0, %xmm3
	mulps %xmm0, %xmm4
	movaps -20 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	movaps -20 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	addps %xmm3, %xmm10
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	addps %xmm4, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	mulps %xmm0, %xmm3
	mulps %xmm0, %xmm4
	movaps -16 * SIZE(AO), %xmm0

	subq $-16 * SIZE, AO
	subq $-16 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L42
	ALIGN_3

.L45:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L48
	ALIGN_3

.L46:
	addps %xmm1, %xmm8
	movaps -32 * SIZE(BO), %xmm1
	addps %xmm2, %xmm9
	pshufd $0x39, %xmm1, %xmm2
	mulps %xmm0, %xmm1
	addps %xmm3, %xmm10
	pshufd $0x39, %xmm2, %xmm3
	mulps %xmm0, %xmm2
	addps %xmm4, %xmm11
	pshufd $0x39, %xmm3, %xmm4
	mulps %xmm0, %xmm3
	mulps %xmm0, %xmm4
	movaps -28 * SIZE(AO), %xmm0

	addq $4 * SIZE, AO
	addq $4 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L46
	ALIGN_3

.L48:
	addps %xmm1, %xmm8
	addps %xmm2, %xmm9
	addps %xmm3, %xmm10
	addps %xmm4, %xmm11

	movaps %xmm9, %xmm4
	shufps $0xd8, %xmm8, %xmm9
	shufps $0xd8, %xmm11, %xmm8
	shufps $0xd8, %xmm10, %xmm11
	shufps $0xd8, %xmm4, %xmm10

	movaps %xmm8, %xmm4
	shufps $0xd8, %xmm10, %xmm8
	shufps $0xd8, %xmm4, %xmm10
	movaps %xmm9, %xmm5
	shufps $0xd8, %xmm11, %xmm9
	shufps $0xd8, %xmm5, %xmm11

	mulps %xmm7, %xmm8
	mulps %xmm7, %xmm9
	mulps %xmm7, %xmm10
	mulps %xmm7, %xmm11

#ifndef TRMMKERNEL
	movsd 0 * SIZE(CO1), %xmm0
	movhps 2 * SIZE(CO1), %xmm0
	movsd 0 * SIZE(CO1, LDC, 1), %xmm1
	movhps 2 * SIZE(CO1, LDC, 1), %xmm1
	movsd 0 * SIZE(CO2), %xmm2
	movhps 2 * SIZE(CO2), %xmm2
	movsd 0 * SIZE(CO2, LDC, 1), %xmm3
	movhps 2 * SIZE(CO2, LDC, 1), %xmm3

	addps %xmm0, %xmm8
	addps %xmm1, %xmm9
	addps %xmm2, %xmm10
	addps %xmm3, %xmm11
#endif

	movsd %xmm8, 0 * SIZE(CO1)
	movhps %xmm8, 2 * SIZE(CO1)
	movsd %xmm9, 0 * SIZE(CO1, LDC, 1)
	movhps %xmm9, 2 * SIZE(CO1, LDC, 1)
	movsd %xmm10, 0 * SIZE(CO2)
	movhps %xmm10, 2 * SIZE(CO2)
	movsd %xmm11, 0 * SIZE(CO2, LDC, 1)
	movhps %xmm11, 2 * SIZE(CO2, LDC, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $4, KK
#endif

	addq $4 * SIZE, CO1
	addq $4 * SIZE, CO2
	decq I
	BRANCH
	jg .L41
	ALIGN_4

.L50:
	testq $2, M
	BRANCH
	jle .L60

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 4), BO
#endif

	xorps %xmm1, %xmm1
	movddup -32 * SIZE(AO), %xmm0
	xorps %xmm2, %xmm2
	movaps -32 * SIZE(BO), %xmm5

	xorps %xmm8, %xmm8
	xorps %xmm9, %xmm9

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $2, %rax
#else
	addq $4, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L55
	ALIGN_3

.L52:
	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	movaps -28 * SIZE(BO), %xmm5
	mulps %xmm0, %xmm2
	movddup -30 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	movaps -24 * SIZE(BO), %xmm5
	mulps %xmm0, %xmm2
	movddup -28 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	movaps -20 * SIZE(BO), %xmm5
	mulps %xmm0, %xmm2
	movddup -26 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	movaps -16 * SIZE(BO), %xmm5
	mulps %xmm0, %xmm2
	movddup -24 * SIZE(AO), %xmm0

	subq $-16 * SIZE, BO
	subq $ -8 * SIZE, AO
	subq $1, %rax
	BRANCH
	jg .L52
	ALIGN_3

.L55:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L58
	ALIGN_3

.L56:
	addps %xmm1, %xmm8
	pshufd $0x50, %xmm5, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0xfa, %xmm5, %xmm2
	movaps -28 * SIZE(BO), %xmm5
	mulps %xmm0, %xmm2
	movddup -30 * SIZE(AO), %xmm0

	addq $2 * SIZE, AO
	addq $4 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L56
	ALIGN_3

.L58:
	addps %xmm1, %xmm8
	addps %xmm2, %xmm9

	mulps %xmm7, %xmm8
	mulps %xmm7, %xmm9

#ifndef TRMMKERNEL
	movsd (CO1), %xmm0
	movhps (CO1, LDC, 1), %xmm0
	movsd (CO2), %xmm1
	movhps (CO2, LDC, 1), %xmm1

	addps %xmm0, %xmm8
	addps %xmm1, %xmm9
#endif

	movsd %xmm8, (CO1)
	movhps %xmm8, (CO1, LDC, 1)
	movsd %xmm9, (CO2)
	movhps %xmm9, (CO2, LDC, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $2, KK
#endif

	addq $2 * SIZE, CO1
	addq $2 * SIZE, CO2
	ALIGN_4

.L60:
	testq $1, M
	BRANCH
	jle .L69

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 4), BO
#endif

	xorps %xmm2, %xmm2
	movsd -32 * SIZE(AO), %xmm0

	xorps %xmm8, %xmm8
	xorps %xmm9, %xmm9

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $1, %rax
#else
	addq $4, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L65
	ALIGN_3

.L62:
	pshufd $0x00, %xmm0, %xmm1
	addps %xmm2, %xmm8
	movaps -32 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	pshufd $0x55, %xmm0, %xmm1
	movsd -30 * SIZE(AO), %xmm0
	addps %xmm2, %xmm9
	movaps -28 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	pshufd $0x00, %xmm0, %xmm1
	addps %xmm2, %xmm8
	movaps -24 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	pshufd $0x55, %xmm0, %xmm1
	movsd -28 * SIZE(AO), %xmm0
	addps %xmm2, %xmm9
	movaps -20 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	subq $-16 * SIZE, BO
	subq $ -4 * SIZE, AO
	subq $1, %rax
	BRANCH
	jg .L62

	addps %xmm9, %xmm8
	ALIGN_3

.L65:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L68
	ALIGN_3

.L66:
	pshufd $0x00, %xmm0, %xmm1
	movss -31 * SIZE(AO), %xmm0
	addps %xmm2, %xmm8
	movaps -32 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	addq $1 * SIZE, AO
	addq $4 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L66
	ALIGN_3

.L68:
	addps %xmm2, %xmm8
	mulps %xmm7, %xmm8

	pshufd $0xff, %xmm8, %xmm11
	pshufd $0xaa, %xmm8, %xmm10
	pshufd $0x55, %xmm8, %xmm9
	pshufd $0x00, %xmm8, %xmm8

#ifndef TRMMKERNEL
	addss (CO1), %xmm8
	addss (CO1, LDC, 1), %xmm9
	addss (CO2), %xmm10
	addss (CO2, LDC, 1), %xmm11
#endif

	movss %xmm8, (CO1)
	movss %xmm9, (CO1, LDC, 1)
	movss %xmm10, (CO2)
	movss %xmm11, (CO2, LDC, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $1, KK
#endif
	ALIGN_4

.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq $4, KK
#endif

	movq BO, B
	leaq (C, LDC, 4), C
	ALIGN_4
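
/* N & 2: two-column panels. */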
.L70:
	testq $2, N
	jle .L100

#if defined(TRMMKERNEL) && defined(LEFT)
	movq OFFSET, %rax
	movq %rax, KK
#endif

	movq C, CO1
	leaq (C, LDC), CO2
	movq A, AO

	movq M, I
	sarq $2, I
	NOBRANCH
	jle .L80
	ALIGN_4

.L71:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 2), BO
#endif

	xorps %xmm1, %xmm1
	movaps -32 * SIZE(AO), %xmm0
	xorps %xmm2, %xmm2
	movsd -32 * SIZE(BO), %xmm3

	xorps %xmm8, %xmm8
	prefetcht2 4 * SIZE(CO1)
	xorps %xmm9, %xmm9
	prefetcht2 4 * SIZE(CO2)

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $4, %rax
#else
	addq $2, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L75
	ALIGN_3

.L72:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0x55, %xmm3, %xmm2
	movsd -30 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm2
	movaps -28 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0x55, %xmm3, %xmm2
	movsd -28 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm2
	movaps -24 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0x55, %xmm3, %xmm2
	movsd -26 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm2
	movaps -20 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0x55, %xmm3, %xmm2
	movsd -24 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm2
	movaps -16 * SIZE(AO), %xmm0

	subq $-16 * SIZE, AO
	subq $ -8 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L72
	ALIGN_3

.L75:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L78
	ALIGN_3

.L76:
	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	mulps %xmm0, %xmm1
	addps %xmm2, %xmm9
	pshufd $0x55, %xmm3, %xmm2
	movsd -30 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm2
	movaps -28 * SIZE(AO), %xmm0

	addq $4 * SIZE, AO
	addq $2 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L76
	ALIGN_3

.L78:
	addps %xmm1, %xmm8
	addps %xmm2, %xmm9

	mulps %xmm7, %xmm8
	mulps %xmm7, %xmm9

#ifndef TRMMKERNEL
	movsd 0 * SIZE(CO1), %xmm0
	movhps 2 * SIZE(CO1), %xmm0
	movsd 0 * SIZE(CO2), %xmm1
	movhps 2 * SIZE(CO2), %xmm1

	addps %xmm0, %xmm8
	addps %xmm1, %xmm9
#endif

	movsd %xmm8, 0 * SIZE(CO1)
	movhps %xmm8, 2 * SIZE(CO1)
	movsd %xmm9, 0 * SIZE(CO2)
	movhps %xmm9, 2 * SIZE(CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $4, KK
#endif

	addq $4 * SIZE, CO1
	addq $4 * SIZE, CO2
	decq I
	BRANCH
	jg .L71
	ALIGN_4

.L80:
	testq $2, M
	BRANCH
	jle .L90

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 2), BO
#endif

	xorps %xmm1, %xmm1
	movddup -32 * SIZE(AO), %xmm0
	xorps %xmm2, %xmm2
	movsd -32 * SIZE(BO), %xmm5

	xorps %xmm8, %xmm8
	xorps %xmm9, %xmm9

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $2, %rax
#else
	addq $2, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L85
	ALIGN_3

.L82:
	addps %xmm1, %xmm8
	movsd -32 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movddup -30 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	movsd -30 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movddup -28 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	movsd -28 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movddup -26 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	movsd -26 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movddup -24 * SIZE(AO), %xmm0

	subq $-8 * SIZE, BO
	subq $-8 * SIZE, AO
	subq $1, %rax
	BRANCH
	jg .L82
	ALIGN_3

.L85:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L88
	ALIGN_3

.L86:
	addps %xmm1, %xmm8
	movsd -32 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movddup -30 * SIZE(AO), %xmm0

	addq $2 * SIZE, AO
	addq $2 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L86
	ALIGN_3

.L88:
	addps %xmm1, %xmm8
	mulps %xmm7, %xmm8

#ifndef TRMMKERNEL
	movsd (CO1), %xmm0
	movhps (CO2), %xmm0

	addps %xmm0, %xmm8
#endif

	movsd %xmm8, (CO1)
	movhps %xmm8, (CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $2, KK
#endif

	addq $2 * SIZE, CO1
	addq $2 * SIZE, CO2
	ALIGN_4

.L90:
	testq $1, M
	BRANCH
	jle .L99

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 2), BO
#endif

	xorps %xmm2, %xmm2
	movsd -32 * SIZE(AO), %xmm0

	xorps %xmm8, %xmm8
	xorps %xmm9, %xmm9

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $1, %rax
#else
	addq $2, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L95
	ALIGN_3

.L92:
	pshufd $0x00, %xmm0, %xmm1
	addps %xmm2, %xmm8
	movsd -32 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	pshufd $0x55, %xmm0, %xmm1
	movsd -30 * SIZE(AO), %xmm0
	addps %xmm2, %xmm9
	movsd -30 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	pshufd $0x00, %xmm0, %xmm1
	addps %xmm2, %xmm8
	movsd -28 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	pshufd $0x55, %xmm0, %xmm1
	movsd -28 * SIZE(AO), %xmm0
	addps %xmm2, %xmm9
	movsd -26 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	subq $-4 * SIZE, AO
	subq $-8 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L92

	addps %xmm9, %xmm8
	ALIGN_3

.L95:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L98
	ALIGN_3

.L96:
	pshufd $0x00, %xmm0, %xmm1
	movss -31 * SIZE(AO), %xmm0
	addps %xmm2, %xmm8
	movsd -32 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm2

	addq $1 * SIZE, AO
	addq $2 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L96
	ALIGN_3

.L98:
	addps %xmm2, %xmm8
	mulps %xmm7, %xmm8

	pshufd $0x55, %xmm8, %xmm9
	pshufd $0x00, %xmm8, %xmm8

#ifndef TRMMKERNEL
	addss (CO1), %xmm8
	addss (CO2), %xmm9
#endif

	movss %xmm8, (CO1)
	movss %xmm9, (CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $1, KK
#endif
	ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq $2, KK
#endif

	movq BO, B
	leaq (C, LDC, 2), C
	ALIGN_4
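
/* N & 1: last single column. */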
.L100:
	testq $1, N
	jle .L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movq OFFSET, %rax
	movq %rax, KK
#endif

	movq C, CO1
	movq A, AO

	movq M, I
	sarq $2, I
	NOBRANCH
	jle .L110
	ALIGN_4

.L101:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 1), BO
#endif

	xorps %xmm1, %xmm1
	movaps -32 * SIZE(AO), %xmm0
	movsd -32 * SIZE(BO), %xmm3

	xorps %xmm8, %xmm8
	prefetcht2 4 * SIZE(CO1)

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $4, %rax
#else
	addq $1, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L105
	ALIGN_3

.L102:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	movss -31 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm1
	movaps -28 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	movss -30 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm1
	movaps -24 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	movss -29 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm1
	movaps -20 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	movss -28 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm1
	movaps -16 * SIZE(AO), %xmm0

	subq $-16 * SIZE, AO
	subq $ -4 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L102
	ALIGN_3

.L105:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L108
	ALIGN_3

.L106:
	addps %xmm1, %xmm8
	pshufd $0x00, %xmm3, %xmm1
	movss -31 * SIZE(BO), %xmm3
	mulps %xmm0, %xmm1
	movaps -28 * SIZE(AO), %xmm0

	addq $4 * SIZE, AO
	addq $1 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L106
	ALIGN_3

.L108:
	addps %xmm1, %xmm8
	mulps %xmm7, %xmm8

#ifndef TRMMKERNEL
	movsd 0 * SIZE(CO1), %xmm0
	movhps 2 * SIZE(CO1), %xmm0

	addps %xmm0, %xmm8
#endif

	movsd %xmm8, 0 * SIZE(CO1)
	movhps %xmm8, 2 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $4, KK
#endif

	addq $4 * SIZE, CO1
	decq I
	BRANCH
	jg .L101
	ALIGN_4

.L110:
	testq $2, M
	BRANCH
	jle .L120

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 1), BO
#endif

	xorps %xmm1, %xmm1
	movddup -32 * SIZE(AO), %xmm0
	xorps %xmm8, %xmm8

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $2, %rax
#else
	addq $1, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L115
	ALIGN_3

.L112:
	addps %xmm1, %xmm8
	movss -32 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movsd -30 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	movss -31 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movsd -28 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	movss -30 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movsd -26 * SIZE(AO), %xmm0

	addps %xmm1, %xmm8
	movss -29 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movsd -24 * SIZE(AO), %xmm0

	subq $-4 * SIZE, BO
	subq $-8 * SIZE, AO
	subq $1, %rax
	BRANCH
	jg .L112
	ALIGN_3

.L115:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L118
	ALIGN_3

.L116:
	addps %xmm1, %xmm8
	movss -32 * SIZE(BO), %xmm1
	unpcklps %xmm1, %xmm1
	mulps %xmm0, %xmm1
	movsd -30 * SIZE(AO), %xmm0

	addq $2 * SIZE, AO
	addq $1 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L116
	ALIGN_3

.L118:
	addps %xmm1, %xmm8
	mulps %xmm7, %xmm8

#ifndef TRMMKERNEL
	movsd (CO1), %xmm0
	addps %xmm0, %xmm8
#endif

	movsd %xmm8, (CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $2, KK
#endif

	addq $2 * SIZE, CO1
	ALIGN_4

.L120:
	testq $1, M
	BRANCH
	jle .L999

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 1), BO
#endif

	xorps %xmm2, %xmm2
	movss -32 * SIZE(AO), %xmm0
	xorps %xmm8, %xmm8

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $1, %rax
#else
	addq $1, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L125
	ALIGN_3

.L122:
	addss %xmm2, %xmm8
	movss -32 * SIZE(BO), %xmm2
	mulss %xmm0, %xmm2
	movss -31 * SIZE(AO), %xmm0

	addss %xmm2, %xmm8
	movss -31 * SIZE(BO), %xmm2
	mulss %xmm0, %xmm2
	movss -30 * SIZE(AO), %xmm0

	addss %xmm2, %xmm8
	movss -30 * SIZE(BO), %xmm2
	mulss %xmm0, %xmm2
	movss -29 * SIZE(AO), %xmm0

	addss %xmm2, %xmm8
	movss -29 * SIZE(BO), %xmm2
	mulss %xmm0, %xmm2
	movss -28 * SIZE(AO), %xmm0

	subq $-4 * SIZE, AO
	subq $-4 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L122
	ALIGN_3

.L125:
	movddup ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax		# if (k & 3)
	BRANCH
	je .L128
	ALIGN_3

.L126:
	addss %xmm2, %xmm8
	movss -32 * SIZE(BO), %xmm2
	mulss %xmm0, %xmm2
	movss -31 * SIZE(AO), %xmm0

	addq $1 * SIZE, AO
	addq $1 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L126
	ALIGN_3

.L128:
	addps %xmm2, %xmm8
	mulps %xmm7, %xmm8

#ifndef TRMMKERNEL
	addss (CO1), %xmm8
#endif

	movss %xmm8, (CO1)
	ALIGN_4
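
/* Epilogue: restore callee-saved registers (plus %rdi, %rsi and
   xmm6-xmm15 under WINDOWS_ABI) and return.                      */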
.L999:
	movq 0(%rsp), %rbx
	movq 8(%rsp), %rbp
	movq 16(%rsp), %r12
	movq 24(%rsp), %r13
	movq 32(%rsp), %r14
	movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq 48(%rsp), %rdi
	movq 56(%rsp), %rsi
	movups 64(%rsp), %xmm6
	movups 80(%rsp), %xmm7
	movups 96(%rsp), %xmm8
	movups 112(%rsp), %xmm9
	movups 128(%rsp), %xmm10
	movups 144(%rsp), %xmm11
	movups 160(%rsp), %xmm12
	movups 176(%rsp), %xmm13
	movups 192(%rsp), %xmm14
	movups 208(%rsp), %xmm15
#endif

	addq $STACKSIZE, %rsp
	ret

	EPILOGUE