
zgemm_kernel_2x2_sse3.S 36 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
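/* zgemm_kernel_2x2_sse3.S: double-complex GEMM/TRMM micro-kernel for x86-64
   with 2x2 register blocking, written for SSE3 (movddup/addsubpd).  It
   computes a C += alpha * A * B update on packed A/B panels; the
   NN/NR/RN/RR-family defines below select the conjugation variant, and
   TRMMKERNEL enables the triangular (ZTRMM) offset handling. */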
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define M %rdi
  41. #define N %rsi
  42. #define K %rdx
  43. #define A %rcx
  44. #define B %r8
  45. #define C %r9
  46. #define LDC %r10
  47. #define I %r11
  48. #define J %r12
  49. #define AO %r13
  50. #define BO %r14
  51. #define CO1 %r15
  52. #define CO2 %rbx
  53. #define BB %rbp
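/* Register roles: M, N, K are the problem dimensions; A and B point to the
   packed input panels, C to the output matrix with leading dimension LDC.
   I and J are the row/column loop counters, AO/BO/CO1/CO2 the running
   panel and output pointers, and BB a prefetch pointer into B. */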
  54. #ifndef WINDOWS_ABI
  55. #define STACKSIZE 128
  56. #define OLD_LDC 8 + STACKSIZE(%rsp)
  57. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  58. #define ALPHA_R 48(%rsp)
  59. #define ALPHA_I 56(%rsp)
  60. #define OFFSET 64(%rsp)
  61. #define KKK 72(%rsp)
  62. #define KK 80(%rsp)
  63. #else
  64. #define STACKSIZE 512
  65. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  66. #define OLD_A 48 + STACKSIZE(%rsp)
  67. #define OLD_B 56 + STACKSIZE(%rsp)
  68. #define OLD_C 64 + STACKSIZE(%rsp)
  69. #define OLD_LDC 72 + STACKSIZE(%rsp)
  70. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  71. #define ALPHA_R 224(%rsp)
  72. #define ALPHA_I 232(%rsp)
  73. #define OFFSET 240(%rsp)
  74. #define KKK 248(%rsp)
  75. #define KK 256(%rsp)
  76. #endif
  77. #define PREFETCH prefetcht1
  78. #define PREFETCHSIZE (16 * 12 + 3)
  79. #define PREFETCH_R (4 * 4 + 0)
  80. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  81. #define ADD1 addpd
  82. #define ADD2 addpd
  83. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  84. #define ADD1 addpd
  85. #define ADD2 subpd
  86. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  87. #define ADD1 subpd
  88. #define ADD2 addpd
  89. #else
  90. #define ADD1 subpd
  91. #define ADD2 subpd
  92. #endif
  93. #define ADDSUB subpd
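/* ADD1/ADD2 above carry the conjugation signs for the real and imaginary
   partial products.  KERNEL1..KERNEL16 unroll eight k-iterations of the
   2x2 block: odd-numbered macros accumulate the first complex element of
   the A pair into xmm0-xmm3, even-numbered ones the second element into
   xmm4-xmm7, with each B value broadcast one double at a time by movddup. */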
  94. #define KERNEL1(address) \
  95. mulpd %xmm8, %xmm9;\
  96. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\
  97. ADD1 %xmm9, %xmm0;\
  98. movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  99. mulpd %xmm8, %xmm9;\
  100. ADD2 %xmm9, %xmm1;\
  101. movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  102. mulpd %xmm8, %xmm9;\
  103. ADD1 %xmm9, %xmm2;\
  104. movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  105. mulpd %xmm8, %xmm9;\
  106. movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
  107. ADD2 %xmm9, %xmm3;\
  108. movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  109. #define KERNEL2(address) \
  110. mulpd %xmm8, %xmm9;\
  111. ADD1 %xmm9, %xmm4;\
  112. movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  113. mulpd %xmm8, %xmm9;\
  114. ADD2 %xmm9, %xmm5;\
  115. movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  116. mulpd %xmm8, %xmm9;\
  117. ADD1 %xmm9, %xmm6;\
  118. movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  119. mulpd %xmm8, %xmm9;\
  120. movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
  121. ADD2 %xmm9, %xmm7;\
  122. movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  123. #define KERNEL3(address) \
  124. mulpd %xmm8, %xmm9;\
  125. ADD1 %xmm9, %xmm0;\
  126. movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  127. mulpd %xmm8, %xmm9;\
  128. ADD2 %xmm9, %xmm1;\
  129. movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  130. mulpd %xmm8, %xmm9;\
  131. ADD1 %xmm9, %xmm2;\
  132. movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  133. mulpd %xmm8, %xmm9;\
  134. movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
  135. ADD2 %xmm9, %xmm3;\
  136. movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  137. #define KERNEL4(address) \
  138. mulpd %xmm8, %xmm9;\
  139. ADD1 %xmm9, %xmm4;\
  140. movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  141. mulpd %xmm8, %xmm9;\
  142. ADD2 %xmm9, %xmm5;\
  143. movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  144. mulpd %xmm8, %xmm9;\
  145. ADD1 %xmm9, %xmm6;\
  146. movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  147. mulpd %xmm8, %xmm9;\
  148. movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
  149. ADD2 %xmm9, %xmm7;\
  150. movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  151. #define KERNEL5(address) \
  152. mulpd %xmm10, %xmm11;\
  153. ADD1 %xmm11, %xmm0;\
  154. movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  155. mulpd %xmm10, %xmm11;\
  156. ADD2 %xmm11, %xmm1;\
  157. movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  158. mulpd %xmm10, %xmm11;\
  159. ADD1 %xmm11, %xmm2;\
  160. movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  161. mulpd %xmm10, %xmm11;\
  162. movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
  163. ADD2 %xmm11, %xmm3;\
  164. movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  165. #define KERNEL6(address) \
  166. mulpd %xmm10, %xmm11;\
  167. ADD1 %xmm11, %xmm4;\
  168. movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  169. mulpd %xmm10, %xmm11;\
  170. ADD2 %xmm11, %xmm5;\
  171. movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  172. mulpd %xmm10, %xmm11;\
  173. ADD1 %xmm11, %xmm6;\
  174. movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  175. mulpd %xmm10, %xmm11;\
  176. movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
  177. ADD2 %xmm11, %xmm7;\
  178. movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  179. #define KERNEL7(address) \
  180. mulpd %xmm10, %xmm11;\
  181. ADD1 %xmm11, %xmm0;\
  182. movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  183. mulpd %xmm10, %xmm11;\
  184. ADD2 %xmm11, %xmm1;\
  185. movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  186. mulpd %xmm10, %xmm11;\
  187. ADD1 %xmm11, %xmm2;\
  188. movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  189. mulpd %xmm10, %xmm11;\
  190. movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
  191. ADD2 %xmm11, %xmm3;\
  192. movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  193. #define KERNEL8(address) \
  194. mulpd %xmm10, %xmm11;\
  195. ADD1 %xmm11, %xmm4;\
  196. movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  197. mulpd %xmm10, %xmm11;\
  198. ADD2 %xmm11, %xmm5;\
  199. movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  200. mulpd %xmm10, %xmm11;\
  201. ADD1 %xmm11, %xmm6;\
  202. movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  203. mulpd %xmm10, %xmm11;\
  204. movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
  205. ADD2 %xmm11, %xmm7;\
  206. movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  207. #define KERNEL9(address) \
  208. mulpd %xmm12, %xmm13;\
  209. PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\
  210. ADD1 %xmm13, %xmm0;\
  211. movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  212. mulpd %xmm12, %xmm13;\
  213. ADD2 %xmm13, %xmm1;\
  214. movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  215. mulpd %xmm12, %xmm13;\
  216. ADD1 %xmm13, %xmm2;\
  217. movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  218. mulpd %xmm12, %xmm13;\
  219. movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
  220. ADD2 %xmm13, %xmm3;\
  221. movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  222. #define KERNEL10(address) \
  223. mulpd %xmm12, %xmm13;\
  224. ADD1 %xmm13, %xmm4;\
  225. movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  226. mulpd %xmm12, %xmm13;\
  227. ADD2 %xmm13, %xmm5;\
  228. movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  229. mulpd %xmm12, %xmm13;\
  230. ADD1 %xmm13, %xmm6;\
  231. movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  232. mulpd %xmm12, %xmm13;\
  233. movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
  234. ADD2 %xmm13, %xmm7;\
  235. movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  236. #define KERNEL11(address) \
  237. mulpd %xmm12, %xmm13;\
  238. ADD1 %xmm13, %xmm0;\
  239. movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  240. mulpd %xmm12, %xmm13;\
  241. ADD2 %xmm13, %xmm1;\
  242. movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  243. mulpd %xmm12, %xmm13;\
  244. ADD1 %xmm13, %xmm2;\
  245. movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  246. mulpd %xmm12, %xmm13;\
  247. movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
  248. ADD2 %xmm13, %xmm3;\
  249. movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  250. #define KERNEL12(address) \
  251. mulpd %xmm12, %xmm13;\
  252. ADD1 %xmm13, %xmm4;\
  253. movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  254. mulpd %xmm12, %xmm13;\
  255. ADD2 %xmm13, %xmm5;\
  256. movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  257. mulpd %xmm12, %xmm13;\
  258. ADD1 %xmm13, %xmm6;\
  259. movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  260. mulpd %xmm12, %xmm13;\
  261. movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
  262. ADD2 %xmm13, %xmm7;\
  263. movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  264. #define KERNEL13(address) \
  265. mulpd %xmm14, %xmm15;\
  266. ADD1 %xmm15, %xmm0;\
  267. movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  268. mulpd %xmm14, %xmm15;\
  269. ADD2 %xmm15, %xmm1;\
  270. movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  271. mulpd %xmm14, %xmm15;\
  272. ADD1 %xmm15, %xmm2;\
  273. movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  274. mulpd %xmm14, %xmm15;\
  275. movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
  276. ADD2 %xmm15, %xmm3;\
  277. movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  278. #define KERNEL14(address) \
  279. mulpd %xmm14, %xmm15;\
  280. ADD1 %xmm15, %xmm4;\
  281. movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  282. mulpd %xmm14, %xmm15;\
  283. ADD2 %xmm15, %xmm5;\
  284. movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  285. mulpd %xmm14, %xmm15;\
  286. ADD1 %xmm15, %xmm6;\
  287. movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  288. mulpd %xmm14, %xmm15;\
  289. movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
  290. ADD2 %xmm15, %xmm7;\
  291. movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  292. #define KERNEL15(address) \
  293. mulpd %xmm14, %xmm15;\
  294. ADD1 %xmm15, %xmm0;\
  295. movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  296. mulpd %xmm14, %xmm15;\
  297. ADD2 %xmm15, %xmm1;\
  298. movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  299. mulpd %xmm14, %xmm15;\
  300. ADD1 %xmm15, %xmm2;\
  301. movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  302. mulpd %xmm14, %xmm15;\
  303. movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
  304. ADD2 %xmm15, %xmm3;\
  305. movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  306. #define KERNEL16(address) \
  307. mulpd %xmm14, %xmm15;\
  308. ADD1 %xmm15, %xmm4;\
  309. movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  310. mulpd %xmm14, %xmm15;\
  311. ADD2 %xmm15, %xmm5;\
  312. movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  313. mulpd %xmm14, %xmm15;\
  314. ADD1 %xmm15, %xmm6;\
  315. movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  316. mulpd %xmm14, %xmm15;\
  317. movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
  318. ADD2 %xmm15, %xmm7;\
  319. movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15
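/* Prologue: reserve the stack frame, save the callee-saved GPRs (plus
   rdi/rsi and xmm6-xmm15 under WINDOWS_ABI), pick up alpha and the
   optional TRMM offset, and scale LDC from elements to bytes. */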
  320. PROLOGUE
  321. PROFCODE
  322. subq $STACKSIZE, %rsp
  323. movq %rbx, 0(%rsp)
  324. movq %rbp, 8(%rsp)
  325. movq %r12, 16(%rsp)
  326. movq %r13, 24(%rsp)
  327. movq %r14, 32(%rsp)
  328. movq %r15, 40(%rsp)
  329. #ifdef WINDOWS_ABI
  330. movq %rdi, 48(%rsp)
  331. movq %rsi, 56(%rsp)
  332. movups %xmm6, 64(%rsp)
  333. movups %xmm7, 80(%rsp)
  334. movups %xmm8, 96(%rsp)
  335. movups %xmm9, 112(%rsp)
  336. movups %xmm10, 128(%rsp)
  337. movups %xmm11, 144(%rsp)
  338. movups %xmm12, 160(%rsp)
  339. movups %xmm13, 176(%rsp)
  340. movups %xmm14, 192(%rsp)
  341. movups %xmm15, 208(%rsp)
  342. movq ARG1, M
  343. movq ARG2, N
  344. movq ARG3, K
  345. movq OLD_A, A
  346. movq OLD_B, B
  347. movq OLD_C, C
  348. movq OLD_LDC, LDC
  349. #ifdef TRMMKERNEL
  350. movsd OLD_OFFSET, %xmm4
  351. #endif
  352. movaps %xmm3, %xmm0
  353. movsd OLD_ALPHA_I, %xmm1
  354. #else
  355. movq OLD_LDC, LDC
  356. #ifdef TRMMKERNEL
  357. movsd OLD_OFFSET, %xmm4
  358. #endif
  359. #endif
  360. movsd %xmm0, ALPHA_R
  361. movsd %xmm1, ALPHA_I
  362. #ifdef TRMMKERNEL
  363. movsd %xmm4, OFFSET
  364. movsd %xmm4, KK
  365. #ifndef LEFT
  366. negq KK
  367. #endif
  368. #endif
  369. salq $ZBASE_SHIFT, LDC
  370. movq N, J
  371. sarq $1, J # j = (n >> 1)
  372. jle .L100
  373. ALIGN_4
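/* .L01: outer loop over pairs of C columns (J = N >> 1).  CO1/CO2 track
   the two output columns and BB is used to prefetch the upcoming packed
   B data. */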
  374. .L01:
  375. #if defined(TRMMKERNEL) && defined(LEFT)
  376. movq OFFSET, %rax
  377. movq %rax, KK
  378. #endif
  379. movq C, CO1 # coffset1 = c
  380. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  381. movq A, AO # aoffset = a
  382. movq K, %rax
  383. salq $ZBASE_SHIFT + 1, %rax
  384. leaq (B, %rax), BB
  385. movq M, I
  386. sarq $1, I # i = (m >> 1)
  387. jle .L30
  388. ALIGN_4
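/* .L10: loop over pairs of C rows (I = M >> 1).  The eight accumulators
   xmm0-xmm7 are cleared, then the unrolled k loop (.L1X) steps through
   KERNEL1..KERNEL16 in blocks of eight iterations, with the k & 7 tail
   handled at .L13. */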
  389. .L10:
  390. #if !defined(TRMMKERNEL) || \
  391. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  392. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  393. movq B, BO
  394. #else
  395. movq KK, %rax
  396. leaq (, %rax, SIZE), %rax
  397. leaq (AO, %rax, 4), AO
  398. leaq (B, %rax, 4), BO
  399. #endif
  400. prefetcht0 0 * SIZE(BB)
  401. subq $-8 * SIZE, BB
  402. movapd 0 * SIZE(AO), %xmm8
  403. pxor %xmm0, %xmm0
  404. movddup 0 * SIZE(BO), %xmm9
  405. pxor %xmm1, %xmm1
  406. movapd 8 * SIZE(AO), %xmm10
  407. pxor %xmm2, %xmm2
  408. movddup 8 * SIZE(BO), %xmm11
  409. pxor %xmm3, %xmm3
  410. movapd 16 * SIZE(AO), %xmm12
  411. pxor %xmm4, %xmm4
  412. movddup 16 * SIZE(BO), %xmm13
  413. pxor %xmm5, %xmm5
  414. movapd 24 * SIZE(AO), %xmm14
  415. pxor %xmm6, %xmm6
  416. movddup 24 * SIZE(BO), %xmm15
  417. pxor %xmm7, %xmm7
  418. prefetchnta 3 * SIZE(CO1)
  419. prefetchnta 3 * SIZE(CO2)
  420. #ifndef TRMMKERNEL
  421. movq K, %rax
  422. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  423. movq K, %rax
  424. subq KK, %rax
  425. movq %rax, KKK
  426. #else
  427. movq KK, %rax
  428. #ifdef LEFT
  429. addq $2, %rax
  430. #else
  431. addq $2, %rax
  432. #endif
  433. movq %rax, KKK
  434. #endif
  435. andq $-8, %rax
  436. salq $4, %rax
  437. je .L12
  438. .L1X:
  439. KERNEL1 (16 * 0)
  440. KERNEL2 (16 * 0)
  441. KERNEL3 (16 * 0)
  442. KERNEL4 (16 * 0)
  443. KERNEL5 (16 * 0)
  444. KERNEL6 (16 * 0)
  445. KERNEL7 (16 * 0)
  446. KERNEL8 (16 * 0)
  447. KERNEL9 (16 * 0)
  448. KERNEL10(16 * 0)
  449. KERNEL11(16 * 0)
  450. KERNEL12(16 * 0)
  451. KERNEL13(16 * 0)
  452. KERNEL14(16 * 0)
  453. KERNEL15(16 * 0)
  454. KERNEL16(16 * 0)
  455. cmpq $128 * 1, %rax
  456. NOBRANCH
  457. jle .L11
  458. KERNEL1 (16 * 1)
  459. KERNEL2 (16 * 1)
  460. KERNEL3 (16 * 1)
  461. KERNEL4 (16 * 1)
  462. KERNEL5 (16 * 1)
  463. KERNEL6 (16 * 1)
  464. KERNEL7 (16 * 1)
  465. KERNEL8 (16 * 1)
  466. KERNEL9 (16 * 1)
  467. KERNEL10(16 * 1)
  468. KERNEL11(16 * 1)
  469. KERNEL12(16 * 1)
  470. KERNEL13(16 * 1)
  471. KERNEL14(16 * 1)
  472. KERNEL15(16 * 1)
  473. KERNEL16(16 * 1)
  474. cmpq $128 * 2, %rax
  475. NOBRANCH
  476. jle .L11
  477. KERNEL1 (16 * 2)
  478. KERNEL2 (16 * 2)
  479. KERNEL3 (16 * 2)
  480. KERNEL4 (16 * 2)
  481. KERNEL5 (16 * 2)
  482. KERNEL6 (16 * 2)
  483. KERNEL7 (16 * 2)
  484. KERNEL8 (16 * 2)
  485. KERNEL9 (16 * 2)
  486. KERNEL10(16 * 2)
  487. KERNEL11(16 * 2)
  488. KERNEL12(16 * 2)
  489. KERNEL13(16 * 2)
  490. KERNEL14(16 * 2)
  491. KERNEL15(16 * 2)
  492. KERNEL16(16 * 2)
  493. cmpq $128 * 3, %rax
  494. NOBRANCH
  495. jle .L11
  496. KERNEL1 (16 * 3)
  497. KERNEL2 (16 * 3)
  498. KERNEL3 (16 * 3)
  499. KERNEL4 (16 * 3)
  500. KERNEL5 (16 * 3)
  501. KERNEL6 (16 * 3)
  502. KERNEL7 (16 * 3)
  503. KERNEL8 (16 * 3)
  504. KERNEL9 (16 * 3)
  505. KERNEL10(16 * 3)
  506. KERNEL11(16 * 3)
  507. KERNEL12(16 * 3)
  508. KERNEL13(16 * 3)
  509. KERNEL14(16 * 3)
  510. KERNEL15(16 * 3)
  511. KERNEL16(16 * 3)
  512. cmpq $128 * 4, %rax
  513. NOBRANCH
  514. jle .L11
  515. KERNEL1 (16 * 4)
  516. KERNEL2 (16 * 4)
  517. KERNEL3 (16 * 4)
  518. KERNEL4 (16 * 4)
  519. KERNEL5 (16 * 4)
  520. KERNEL6 (16 * 4)
  521. KERNEL7 (16 * 4)
  522. KERNEL8 (16 * 4)
  523. KERNEL9 (16 * 4)
  524. KERNEL10(16 * 4)
  525. KERNEL11(16 * 4)
  526. KERNEL12(16 * 4)
  527. KERNEL13(16 * 4)
  528. KERNEL14(16 * 4)
  529. KERNEL15(16 * 4)
  530. KERNEL16(16 * 4)
  531. cmpq $128 * 5, %rax
  532. NOBRANCH
  533. jle .L11
  534. KERNEL1 (16 * 5)
  535. KERNEL2 (16 * 5)
  536. KERNEL3 (16 * 5)
  537. KERNEL4 (16 * 5)
  538. KERNEL5 (16 * 5)
  539. KERNEL6 (16 * 5)
  540. KERNEL7 (16 * 5)
  541. KERNEL8 (16 * 5)
  542. KERNEL9 (16 * 5)
  543. KERNEL10(16 * 5)
  544. KERNEL11(16 * 5)
  545. KERNEL12(16 * 5)
  546. KERNEL13(16 * 5)
  547. KERNEL14(16 * 5)
  548. KERNEL15(16 * 5)
  549. KERNEL16(16 * 5)
  550. cmpq $128 * 6, %rax
  551. NOBRANCH
  552. jle .L11
  553. KERNEL1 (16 * 6)
  554. KERNEL2 (16 * 6)
  555. KERNEL3 (16 * 6)
  556. KERNEL4 (16 * 6)
  557. KERNEL5 (16 * 6)
  558. KERNEL6 (16 * 6)
  559. KERNEL7 (16 * 6)
  560. KERNEL8 (16 * 6)
  561. KERNEL9 (16 * 6)
  562. KERNEL10(16 * 6)
  563. KERNEL11(16 * 6)
  564. KERNEL12(16 * 6)
  565. KERNEL13(16 * 6)
  566. KERNEL14(16 * 6)
  567. KERNEL15(16 * 6)
  568. KERNEL16(16 * 6)
  569. cmpq $128 * 7, %rax
  570. NOBRANCH
  571. jle .L11
  572. KERNEL1 (16 * 7)
  573. KERNEL2 (16 * 7)
  574. KERNEL3 (16 * 7)
  575. KERNEL4 (16 * 7)
  576. KERNEL5 (16 * 7)
  577. KERNEL6 (16 * 7)
  578. KERNEL7 (16 * 7)
  579. KERNEL8 (16 * 7)
  580. KERNEL9 (16 * 7)
  581. KERNEL10(16 * 7)
  582. KERNEL11(16 * 7)
  583. KERNEL12(16 * 7)
  584. KERNEL13(16 * 7)
  585. KERNEL14(16 * 7)
  586. KERNEL15(16 * 7)
  587. KERNEL16(16 * 7)
  588. addq $32 * 8 * SIZE, AO
  589. addq $32 * 8 * SIZE, BO
  590. subq $128 * 8, %rax
  591. jg .L1X
  592. .L11:
  593. leaq (AO, %rax, 2), AO # * 16
  594. leaq (BO, %rax, 2), BO # * 64
  595. ALIGN_4
  596. .L12:
  597. #ifndef TRMMKERNEL
  598. movq K, %rax
  599. #else
  600. movq KKK, %rax
  601. #endif
  602. movddup ALPHA_R, %xmm14
  603. movddup ALPHA_I, %xmm15
  604. andq $7, %rax # remaining k & 7 iterations
  605. BRANCH
  606. je .L14
  607. ALIGN_4
  608. .L13:
  609. mulpd %xmm8, %xmm9
  610. movapd 2 * SIZE(AO), %xmm10
  611. ADD1 %xmm9, %xmm0
  612. movddup 1 * SIZE(BO), %xmm9
  613. mulpd %xmm8, %xmm9
  614. movddup 0 * SIZE(BO), %xmm11
  615. ADD2 %xmm9, %xmm1
  616. movddup 2 * SIZE(BO), %xmm9
  617. mulpd %xmm8, %xmm9
  618. ADD1 %xmm9, %xmm2
  619. movddup 3 * SIZE(BO), %xmm9
  620. mulpd %xmm8, %xmm9
  621. movapd 4 * SIZE(AO), %xmm8
  622. ADD2 %xmm9, %xmm3
  623. movddup 4 * SIZE(BO), %xmm9
  624. mulpd %xmm10, %xmm11
  625. ADD1 %xmm11, %xmm4
  626. movddup 1 * SIZE(BO), %xmm11
  627. mulpd %xmm10, %xmm11
  628. ADD2 %xmm11, %xmm5
  629. movddup 2 * SIZE(BO), %xmm11
  630. mulpd %xmm10, %xmm11
  631. ADD1 %xmm11, %xmm6
  632. movddup 3 * SIZE(BO), %xmm11
  633. mulpd %xmm10, %xmm11
  634. ADD2 %xmm11, %xmm7
  635. addq $4 * SIZE, AO # aoffset += 4
  636. addq $4 * SIZE, BO # boffset1 += 4
  637. decq %rax
  638. jg .L13
  639. ALIGN_4
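/* .L14: fold the partial products into complex results (SHUFPD_1 plus
   addsubpd), scale by ALPHA_R/ALPHA_I, add the existing C entries unless
   TRMMKERNEL or BETAZERO is defined, and store the 2x2 block. */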
  640. .L14:
  641. SHUFPD_1 %xmm1, %xmm1
  642. SHUFPD_1 %xmm3, %xmm3
  643. SHUFPD_1 %xmm5, %xmm5
  644. SHUFPD_1 %xmm7, %xmm7
  645. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  646. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  647. addsubpd %xmm1, %xmm0
  648. addsubpd %xmm3, %xmm2
  649. addsubpd %xmm5, %xmm4
  650. addsubpd %xmm7, %xmm6
  651. movapd %xmm0, %xmm1
  652. movapd %xmm2, %xmm3
  653. movapd %xmm4, %xmm5
  654. movapd %xmm6, %xmm7
  655. #else
  656. addsubpd %xmm0, %xmm1
  657. addsubpd %xmm2, %xmm3
  658. addsubpd %xmm4, %xmm5
  659. addsubpd %xmm6, %xmm7
  660. movapd %xmm1, %xmm0
  661. movapd %xmm3, %xmm2
  662. movapd %xmm5, %xmm4
  663. movapd %xmm7, %xmm6
  664. #endif
  665. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  666. movsd 0 * SIZE(CO1), %xmm8
  667. movhpd 1 * SIZE(CO1), %xmm8
  668. movsd 2 * SIZE(CO1), %xmm10
  669. movhpd 3 * SIZE(CO1), %xmm10
  670. movsd 0 * SIZE(CO2), %xmm9
  671. movhpd 1 * SIZE(CO2), %xmm9
  672. movsd 2 * SIZE(CO2), %xmm11
  673. movhpd 3 * SIZE(CO2), %xmm11
  674. #endif
  675. SHUFPD_1 %xmm1, %xmm1
  676. SHUFPD_1 %xmm3, %xmm3
  677. SHUFPD_1 %xmm5, %xmm5
  678. SHUFPD_1 %xmm7, %xmm7
  679. mulpd %xmm14, %xmm0
  680. mulpd %xmm14, %xmm2
  681. mulpd %xmm14, %xmm4
  682. mulpd %xmm14, %xmm6
  683. mulpd %xmm15, %xmm1
  684. mulpd %xmm15, %xmm3
  685. mulpd %xmm15, %xmm5
  686. mulpd %xmm15, %xmm7
  687. addsubpd %xmm1, %xmm0
  688. addsubpd %xmm3, %xmm2
  689. addsubpd %xmm5, %xmm4
  690. addsubpd %xmm7, %xmm6
  691. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  692. addpd %xmm8, %xmm0
  693. addpd %xmm9, %xmm2
  694. addpd %xmm10, %xmm4
  695. addpd %xmm11, %xmm6
  696. #endif
  697. movsd %xmm0, 0 * SIZE(CO1)
  698. movhpd %xmm0, 1 * SIZE(CO1)
  699. movsd %xmm4, 2 * SIZE(CO1)
  700. movhpd %xmm4, 3 * SIZE(CO1)
  701. movsd %xmm2, 0 * SIZE(CO2)
  702. movhpd %xmm2, 1 * SIZE(CO2)
  703. movsd %xmm6, 2 * SIZE(CO2)
  704. movhpd %xmm6, 3 * SIZE(CO2)
  705. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  706. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  707. movq K, %rax
  708. subq KKK, %rax
  709. leaq (,%rax, SIZE), %rax
  710. leaq (AO, %rax, 4), AO
  711. leaq (BO, %rax, 4), BO
  712. #endif
  713. #if defined(TRMMKERNEL) && defined(LEFT)
  714. addq $2, KK
  715. #endif
  716. addq $4 * SIZE, CO1 # coffset += 4
  717. addq $4 * SIZE, CO2 # coffset += 4
  718. decq I # i --
  719. jg .L10
  720. ALIGN_4
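/* .L30/.L40: leftover single row (M odd) against the current pair of
   columns; same k loop structure with a 1x2 accumulator set. */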
  721. .L30:
  722. testq $1, M
  723. jle .L99
  724. .L40:
  725. #if !defined(TRMMKERNEL) || \
  726. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  727. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  728. movq B, BO
  729. #else
  730. movq KK, %rax
  731. leaq (, %rax, SIZE), %rax
  732. leaq (AO, %rax, 2), AO
  733. leaq (B, %rax, 4), BO
  734. #endif
  735. movapd 0 * SIZE(AO), %xmm8
  736. pxor %xmm0, %xmm0
  737. movddup 0 * SIZE(BO), %xmm9
  738. pxor %xmm1, %xmm1
  739. movapd 8 * SIZE(AO), %xmm10
  740. pxor %xmm2, %xmm2
  741. movddup 8 * SIZE(BO), %xmm11
  742. pxor %xmm3, %xmm3
  743. #ifndef TRMMKERNEL
  744. movq K, %rax
  745. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  746. movq K, %rax
  747. subq KK, %rax
  748. movq %rax, KKK
  749. #else
  750. movq KK, %rax
  751. #ifdef LEFT
  752. addq $1, %rax
  753. #else
  754. addq $2, %rax
  755. #endif
  756. movq %rax, KKK
  757. #endif
  758. sarq $3, %rax
  759. je .L42
  760. .L41:
  761. mulpd %xmm8, %xmm9
  762. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  763. ADD1 %xmm9, %xmm0
  764. movddup 1 * SIZE(BO), %xmm9
  765. mulpd %xmm8, %xmm9
  766. ADD2 %xmm9, %xmm1
  767. movddup 2 * SIZE(BO), %xmm9
  768. mulpd %xmm8, %xmm9
  769. ADD1 %xmm9, %xmm2
  770. movddup 3 * SIZE(BO), %xmm9
  771. mulpd %xmm8, %xmm9
  772. movapd 2 * SIZE(AO), %xmm8
  773. ADD2 %xmm9, %xmm3
  774. movddup 4 * SIZE(BO), %xmm9
  775. mulpd %xmm8, %xmm9
  776. ADD1 %xmm9, %xmm0
  777. movddup 5 * SIZE(BO), %xmm9
  778. mulpd %xmm8, %xmm9
  779. ADD2 %xmm9, %xmm1
  780. movddup 6 * SIZE(BO), %xmm9
  781. mulpd %xmm8, %xmm9
  782. ADD1 %xmm9, %xmm2
  783. movddup 7 * SIZE(BO), %xmm9
  784. mulpd %xmm8, %xmm9
  785. movapd 4 * SIZE(AO), %xmm8
  786. ADD2 %xmm9, %xmm3
  787. movddup 16 * SIZE(BO), %xmm9
  788. mulpd %xmm8, %xmm11
  789. ADD1 %xmm11, %xmm0
  790. movddup 9 * SIZE(BO), %xmm11
  791. mulpd %xmm8, %xmm11
  792. ADD2 %xmm11, %xmm1
  793. movddup 10 * SIZE(BO), %xmm11
  794. mulpd %xmm8, %xmm11
  795. ADD1 %xmm11, %xmm2
  796. movddup 11 * SIZE(BO), %xmm11
  797. mulpd %xmm8, %xmm11
  798. movapd 6 * SIZE(AO), %xmm8
  799. ADD2 %xmm11, %xmm3
  800. movddup 12 * SIZE(BO), %xmm11
  801. mulpd %xmm8, %xmm11
  802. ADD1 %xmm11, %xmm0
  803. movddup 13 * SIZE(BO), %xmm11
  804. mulpd %xmm8, %xmm11
  805. ADD2 %xmm11, %xmm1
  806. movddup 14 * SIZE(BO), %xmm11
  807. mulpd %xmm8, %xmm11
  808. ADD1 %xmm11, %xmm2
  809. movddup 15 * SIZE(BO), %xmm11
  810. mulpd %xmm8, %xmm11
  811. movapd 16 * SIZE(AO), %xmm8
  812. ADD2 %xmm11, %xmm3
  813. movddup 24 * SIZE(BO), %xmm11
  814. mulpd %xmm10, %xmm9
  815. ADD1 %xmm9, %xmm0
  816. movddup 17 * SIZE(BO), %xmm9
  817. mulpd %xmm10, %xmm9
  818. ADD2 %xmm9, %xmm1
  819. movddup 18 * SIZE(BO), %xmm9
  820. mulpd %xmm10, %xmm9
  821. ADD1 %xmm9, %xmm2
  822. movddup 19 * SIZE(BO), %xmm9
  823. mulpd %xmm10, %xmm9
  824. movapd 10 * SIZE(AO), %xmm10
  825. ADD2 %xmm9, %xmm3
  826. movddup 20 * SIZE(BO), %xmm9
  827. mulpd %xmm10, %xmm9
  828. ADD1 %xmm9, %xmm0
  829. movddup 21 * SIZE(BO), %xmm9
  830. mulpd %xmm10, %xmm9
  831. ADD2 %xmm9, %xmm1
  832. movddup 22 * SIZE(BO), %xmm9
  833. mulpd %xmm10, %xmm9
  834. ADD1 %xmm9, %xmm2
  835. movddup 23 * SIZE(BO), %xmm9
  836. mulpd %xmm10, %xmm9
  837. movapd 12 * SIZE(AO), %xmm10
  838. ADD2 %xmm9, %xmm3
  839. movddup 32 * SIZE(BO), %xmm9
  840. mulpd %xmm10, %xmm11
  841. ADD1 %xmm11, %xmm0
  842. movddup 25 * SIZE(BO), %xmm11
  843. mulpd %xmm10, %xmm11
  844. ADD2 %xmm11, %xmm1
  845. movddup 26 * SIZE(BO), %xmm11
  846. mulpd %xmm10, %xmm11
  847. ADD1 %xmm11, %xmm2
  848. movddup 27 * SIZE(BO), %xmm11
  849. mulpd %xmm10, %xmm11
  850. movapd 14 * SIZE(AO), %xmm10
  851. ADD2 %xmm11, %xmm3
  852. movddup 28 * SIZE(BO), %xmm11
  853. mulpd %xmm10, %xmm11
  854. ADD1 %xmm11, %xmm0
  855. movddup 29 * SIZE(BO), %xmm11
  856. mulpd %xmm10, %xmm11
  857. ADD2 %xmm11, %xmm1
  858. movddup 30 * SIZE(BO), %xmm11
  859. mulpd %xmm10, %xmm11
  860. ADD1 %xmm11, %xmm2
  861. movddup 31 * SIZE(BO), %xmm11
  862. mulpd %xmm10, %xmm11
  863. movapd 24 * SIZE(AO), %xmm10
  864. ADD2 %xmm11, %xmm3
  865. movddup 40 * SIZE(BO), %xmm11
  866. addq $16 * SIZE, AO
  867. addq $32 * SIZE, BO
  868. decq %rax
  869. jne .L41
  870. .L42:
  871. #ifndef TRMMKERNEL
  872. movq K, %rax
  873. #else
  874. movq KKK, %rax
  875. #endif
  876. movddup ALPHA_R, %xmm14
  877. movddup ALPHA_I, %xmm15
  878. andq $7, %rax # remaining k & 7 iterations
  879. BRANCH
  880. jle .L44
  881. .L43:
  882. mulpd %xmm8, %xmm9
  883. ADD1 %xmm9, %xmm0
  884. movddup 1 * SIZE(BO), %xmm9
  885. mulpd %xmm8, %xmm9
  886. ADD2 %xmm9, %xmm1
  887. movddup 2 * SIZE(BO), %xmm9
  888. mulpd %xmm8, %xmm9
  889. ADD1 %xmm9, %xmm2
  890. movddup 3 * SIZE(BO), %xmm9
  891. mulpd %xmm8, %xmm9
  892. movapd 2 * SIZE(AO), %xmm8
  893. ADD2 %xmm9, %xmm3
  894. movddup 4 * SIZE(BO), %xmm9
  895. addq $2 * SIZE, AO # aoffset += 2
  896. addq $4 * SIZE, BO # boffset1 += 4
  897. decq %rax
  898. jg .L43
  899. ALIGN_4
  900. .L44:
  901. SHUFPD_1 %xmm1, %xmm1
  902. SHUFPD_1 %xmm3, %xmm3
  903. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  904. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  905. addsubpd %xmm1, %xmm0
  906. addsubpd %xmm3, %xmm2
  907. movapd %xmm0, %xmm1
  908. movapd %xmm2, %xmm3
  909. #else
  910. addsubpd %xmm0, %xmm1
  911. addsubpd %xmm2, %xmm3
  912. movapd %xmm1, %xmm0
  913. movapd %xmm3, %xmm2
  914. #endif
  915. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  916. movsd 0 * SIZE(CO1), %xmm8
  917. movhpd 1 * SIZE(CO1), %xmm8
  918. movsd 0 * SIZE(CO2), %xmm9
  919. movhpd 1 * SIZE(CO2), %xmm9
  920. #endif
  921. SHUFPD_1 %xmm1, %xmm1
  922. SHUFPD_1 %xmm3, %xmm3
  923. mulpd %xmm14, %xmm0
  924. mulpd %xmm14, %xmm2
  925. mulpd %xmm15, %xmm1
  926. mulpd %xmm15, %xmm3
  927. addsubpd %xmm1, %xmm0
  928. addsubpd %xmm3, %xmm2
  929. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  930. addpd %xmm8, %xmm0
  931. addpd %xmm9, %xmm2
  932. #endif
  933. movsd %xmm0, 0 * SIZE(CO1)
  934. movhpd %xmm0, 1 * SIZE(CO1)
  935. movsd %xmm2, 0 * SIZE(CO2)
  936. movhpd %xmm2, 1 * SIZE(CO2)
  937. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  938. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  939. movq K, %rax
  940. subq KKK, %rax
  941. leaq (,%rax, SIZE), %rax
  942. leaq (AO, %rax, 2), AO
  943. leaq (BO, %rax, 4), BO
  944. #endif
  945. #if defined(TRMMKERNEL) && defined(LEFT)
  946. addq $1, KK
  947. #endif
  948. ALIGN_4
  949. .L99:
  950. #if defined(TRMMKERNEL) && !defined(LEFT)
  951. addq $2, KK
  952. #endif
  953. leaq (C, LDC, 2), C # c += 2 * ldc
  954. movq BO, B
  955. decq J # j --
  956. jg .L01
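/* .L100: if N is odd, one column of C remains.  .L110 processes it two
   rows at a time and .L140 finishes a final single element when M is odd. */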
  957. .L100:
  958. testq $1, N
  959. jle .L999
  960. .L101:
  961. #if defined(TRMMKERNEL) && defined(LEFT)
  962. movq OFFSET, %rax
  963. movq %rax, KK
  964. #endif
  965. movq C, CO1 # coffset1 = c
  966. movq A, AO # aoffset = a
  967. movq M, I
  968. sarq $1, I # i = (m >> 1)
  969. jle .L130
  970. ALIGN_4
  971. .L110:
  972. #if !defined(TRMMKERNEL) || \
  973. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  974. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  975. movq B, BO
  976. #else
  977. movq KK, %rax
  978. leaq (, %rax, SIZE), %rax
  979. leaq (AO, %rax, 4), AO
  980. leaq (B, %rax, 2), BO
  981. #endif
  982. movapd 0 * SIZE(AO), %xmm8
  983. pxor %xmm0, %xmm0
  984. movddup 0 * SIZE(BO), %xmm9
  985. pxor %xmm1, %xmm1
  986. movapd 8 * SIZE(AO), %xmm10
  987. pxor %xmm4, %xmm4
  988. movddup 8 * SIZE(BO), %xmm11
  989. pxor %xmm5, %xmm5
  990. prefetchnta 4 * SIZE(CO1)
  991. #ifndef TRMMKERNEL
  992. movq K, %rax
  993. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  994. movq K, %rax
  995. subq KK, %rax
  996. movq %rax, KKK
  997. #else
  998. movq KK, %rax
  999. #ifdef LEFT
  1000. addq $2, %rax
  1001. #else
  1002. addq $1, %rax
  1003. #endif
  1004. movq %rax, KKK
  1005. #endif
  1006. sarq $3, %rax
  1007. je .L112
  1008. .L111:
  1009. mulpd %xmm8, %xmm9
  1010. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1011. ADD1 %xmm9, %xmm0
  1012. movddup 1 * SIZE(BO), %xmm9
  1013. mulpd %xmm8, %xmm9
  1014. movapd 2 * SIZE(AO), %xmm8
  1015. ADD2 %xmm9, %xmm1
  1016. movddup 0 * SIZE(BO), %xmm9
  1017. mulpd %xmm8, %xmm9
  1018. ADD1 %xmm9, %xmm4
  1019. movddup 1 * SIZE(BO), %xmm9
  1020. mulpd %xmm8, %xmm9
  1021. movapd 4 * SIZE(AO), %xmm8
  1022. ADD2 %xmm9, %xmm5
  1023. movddup 2 * SIZE(BO), %xmm9
  1024. mulpd %xmm8, %xmm9
  1025. ADD1 %xmm9, %xmm0
  1026. movddup 3 * SIZE(BO), %xmm9
  1027. mulpd %xmm8, %xmm9
  1028. movapd 6 * SIZE(AO), %xmm8
  1029. ADD2 %xmm9, %xmm1
  1030. movddup 2 * SIZE(BO), %xmm9
  1031. mulpd %xmm8, %xmm9
  1032. ADD1 %xmm9, %xmm4
  1033. movddup 3 * SIZE(BO), %xmm9
  1034. mulpd %xmm8, %xmm9
  1035. movapd 16 * SIZE(AO), %xmm8
  1036. ADD2 %xmm9, %xmm5
  1037. movddup 4 * SIZE(BO), %xmm9
  1038. mulpd %xmm10, %xmm9
  1039. ADD1 %xmm9, %xmm0
  1040. movddup 5 * SIZE(BO), %xmm9
  1041. mulpd %xmm10, %xmm9
  1042. movapd 10 * SIZE(AO), %xmm10
  1043. ADD2 %xmm9, %xmm1
  1044. movddup 4 * SIZE(BO), %xmm9
  1045. mulpd %xmm10, %xmm9
  1046. ADD1 %xmm9, %xmm4
  1047. movddup 5 * SIZE(BO), %xmm9
  1048. mulpd %xmm10, %xmm9
  1049. movapd 12 * SIZE(AO), %xmm10
  1050. ADD2 %xmm9, %xmm5
  1051. movddup 6 * SIZE(BO), %xmm9
  1052. mulpd %xmm10, %xmm9
  1053. ADD1 %xmm9, %xmm0
  1054. movddup 7 * SIZE(BO), %xmm9
  1055. mulpd %xmm10, %xmm9
  1056. movapd 14 * SIZE(AO), %xmm10
  1057. ADD2 %xmm9, %xmm1
  1058. movddup 6 * SIZE(BO), %xmm9
  1059. mulpd %xmm10, %xmm9
  1060. ADD1 %xmm9, %xmm4
  1061. movddup 7 * SIZE(BO), %xmm9
  1062. mulpd %xmm10, %xmm9
  1063. movapd 40 * SIZE(AO), %xmm10
  1064. ADD2 %xmm9, %xmm5
  1065. movddup 16 * SIZE(BO), %xmm9
  1066. mulpd %xmm8, %xmm11
  1067. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  1068. ADD1 %xmm11, %xmm0
  1069. movddup 9 * SIZE(BO), %xmm11
  1070. mulpd %xmm8, %xmm11
  1071. movapd 18 * SIZE(AO), %xmm8
  1072. ADD2 %xmm11, %xmm1
  1073. movddup 8 * SIZE(BO), %xmm11
  1074. mulpd %xmm8, %xmm11
  1075. ADD1 %xmm11, %xmm4
  1076. movddup 9 * SIZE(BO), %xmm11
  1077. mulpd %xmm8, %xmm11
  1078. movapd 20 * SIZE(AO), %xmm8
  1079. ADD2 %xmm11, %xmm5
  1080. movddup 10 * SIZE(BO), %xmm11
  1081. mulpd %xmm8, %xmm11
  1082. ADD1 %xmm11, %xmm0
  1083. movddup 11 * SIZE(BO), %xmm11
  1084. mulpd %xmm8, %xmm11
  1085. movapd 22 * SIZE(AO), %xmm8
  1086. ADD2 %xmm11, %xmm1
  1087. movddup 10 * SIZE(BO), %xmm11
  1088. mulpd %xmm8, %xmm11
  1089. ADD1 %xmm11, %xmm4
  1090. movddup 11 * SIZE(BO), %xmm11
  1091. mulpd %xmm8, %xmm11
  1092. movapd 24 * SIZE(AO), %xmm8
  1093. ADD2 %xmm11, %xmm5
  1094. movddup 12 * SIZE(BO), %xmm11
  1095. mulpd %xmm8, %xmm11
  1096. ADD1 %xmm11, %xmm0
  1097. movddup 13 * SIZE(BO), %xmm11
  1098. mulpd %xmm8, %xmm11
  1099. movapd 26 * SIZE(AO), %xmm8
  1100. ADD2 %xmm11, %xmm1
  1101. movddup 12 * SIZE(BO), %xmm11
  1102. mulpd %xmm8, %xmm11
  1103. ADD1 %xmm11, %xmm4
  1104. movddup 13 * SIZE(BO), %xmm11
  1105. mulpd %xmm8, %xmm11
  1106. movapd 28 * SIZE(AO), %xmm8
  1107. ADD2 %xmm11, %xmm5
  1108. movddup 14 * SIZE(BO), %xmm11
  1109. mulpd %xmm8, %xmm11
  1110. ADD1 %xmm11, %xmm0
  1111. movddup 15 * SIZE(BO), %xmm11
  1112. mulpd %xmm8, %xmm11
  1113. movapd 30 * SIZE(AO), %xmm8
  1114. ADD2 %xmm11, %xmm1
  1115. movddup 14 * SIZE(BO), %xmm11
  1116. mulpd %xmm8, %xmm11
  1117. ADD1 %xmm11, %xmm4
  1118. movddup 15 * SIZE(BO), %xmm11
  1119. mulpd %xmm8, %xmm11
  1120. movapd 32 * SIZE(AO), %xmm8
  1121. ADD2 %xmm11, %xmm5
  1122. movddup 24 * SIZE(BO), %xmm11
  1123. addq $32 * SIZE, AO
  1124. addq $16 * SIZE, BO
  1125. decq %rax
  1126. jne .L111
  1127. ALIGN_4
  1128. .L112:
  1129. #ifndef TRMMKERNEL
  1130. movq K, %rax
  1131. #else
  1132. movq KKK, %rax
  1133. #endif
  1134. movddup ALPHA_R, %xmm14
  1135. movddup ALPHA_I, %xmm15
  1136. andq $7, %rax # remaining k & 7 iterations
  1137. BRANCH
  1138. jle .L114
  1139. .L113:
  1140. mulpd %xmm8, %xmm9
  1141. movapd 2 * SIZE(AO), %xmm10
  1142. ADD1 %xmm9, %xmm0
  1143. movddup 1 * SIZE(BO), %xmm9
  1144. mulpd %xmm8, %xmm9
  1145. movddup 0 * SIZE(BO), %xmm11
  1146. ADD2 %xmm9, %xmm1
  1147. movddup 2 * SIZE(BO), %xmm9
  1148. mulpd %xmm10, %xmm11
  1149. movapd 4 * SIZE(AO), %xmm8
  1150. ADD1 %xmm11, %xmm4
  1151. movddup 1 * SIZE(BO), %xmm11
  1152. mulpd %xmm10, %xmm11
  1153. ADD2 %xmm11, %xmm5
  1154. addq $4 * SIZE, AO # aoffset += 4
  1155. addq $2 * SIZE, BO # boffset1 += 2
  1156. decq %rax
  1157. jg .L113
  1158. ALIGN_4
  1159. .L114:
  1160. SHUFPD_1 %xmm1, %xmm1
  1161. SHUFPD_1 %xmm5, %xmm5
  1162. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1163. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1164. addsubpd %xmm1, %xmm0
  1165. addsubpd %xmm5, %xmm4
  1166. movapd %xmm0, %xmm1
  1167. movapd %xmm4, %xmm5
  1168. #else
  1169. addsubpd %xmm0, %xmm1
  1170. addsubpd %xmm4, %xmm5
  1171. movapd %xmm1, %xmm0
  1172. movapd %xmm5, %xmm4
  1173. #endif
  1174. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1175. movsd 0 * SIZE(CO1), %xmm8
  1176. movhpd 1 * SIZE(CO1), %xmm8
  1177. movsd 2 * SIZE(CO1), %xmm10
  1178. movhpd 3 * SIZE(CO1), %xmm10
  1179. #endif
  1180. SHUFPD_1 %xmm1, %xmm1
  1181. SHUFPD_1 %xmm5, %xmm5
  1182. mulpd %xmm14, %xmm0
  1183. mulpd %xmm14, %xmm4
  1184. mulpd %xmm15, %xmm1
  1185. mulpd %xmm15, %xmm5
  1186. addsubpd %xmm1, %xmm0
  1187. addsubpd %xmm5, %xmm4
  1188. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1189. addpd %xmm8, %xmm0
  1190. addpd %xmm10, %xmm4
  1191. #endif
  1192. movsd %xmm0, 0 * SIZE(CO1)
  1193. movhpd %xmm0, 1 * SIZE(CO1)
  1194. movsd %xmm4, 2 * SIZE(CO1)
  1195. movhpd %xmm4, 3 * SIZE(CO1)
  1196. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1197. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1198. movq K, %rax
  1199. subq KKK, %rax
  1200. leaq (,%rax, SIZE), %rax
  1201. leaq (AO, %rax, 4), AO
  1202. leaq (BO, %rax, 2), BO
  1203. #endif
  1204. #if defined(TRMMKERNEL) && defined(LEFT)
  1205. addq $2, KK
  1206. #endif
  1207. addq $4 * SIZE, CO1 # coffset += 4
  1208. decq I # i --
  1209. jg .L110
  1210. ALIGN_4
  1211. .L130:
  1212. testq $1, M
  1213. jle .L999
  1214. ALIGN_4
  1215. .L140:
  1216. #if !defined(TRMMKERNEL) || \
  1217. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1218. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1219. movq B, BO
  1220. #else
  1221. movq KK, %rax
  1222. leaq (, %rax, SIZE), %rax
  1223. leaq (AO, %rax, 2), AO
  1224. leaq (B, %rax, 2), BO
  1225. #endif
  1226. movapd 0 * SIZE(AO), %xmm8
  1227. pxor %xmm0, %xmm0
  1228. movddup 0 * SIZE(BO), %xmm9
  1229. pxor %xmm1, %xmm1
  1230. movapd 8 * SIZE(AO), %xmm10
  1231. pxor %xmm2, %xmm2
  1232. movddup 8 * SIZE(BO), %xmm11
  1233. pxor %xmm3, %xmm3
  1234. #ifndef TRMMKERNEL
  1235. movq K, %rax
  1236. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1237. movq K, %rax
  1238. subq KK, %rax
  1239. movq %rax, KKK
  1240. #else
  1241. movq KK, %rax
  1242. #ifdef LEFT
  1243. addq $1, %rax
  1244. #else
  1245. addq $1, %rax
  1246. #endif
  1247. movq %rax, KKK
  1248. #endif
  1249. sarq $3, %rax
  1250. je .L142
  1251. .L141:
  1252. mulpd %xmm8, %xmm9
  1253. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1254. ADD1 %xmm9, %xmm0
  1255. movddup 1 * SIZE(BO), %xmm9
  1256. mulpd %xmm8, %xmm9
  1257. movapd 2 * SIZE(AO), %xmm8
  1258. ADD2 %xmm9, %xmm1
  1259. movddup 2 * SIZE(BO), %xmm9
  1260. mulpd %xmm8, %xmm9
  1261. ADD1 %xmm9, %xmm2
  1262. movddup 3 * SIZE(BO), %xmm9
  1263. mulpd %xmm8, %xmm9
  1264. movapd 4 * SIZE(AO), %xmm8
  1265. ADD2 %xmm9, %xmm3
  1266. movddup 4 * SIZE(BO), %xmm9
  1267. mulpd %xmm8, %xmm9
  1268. ADD1 %xmm9, %xmm0
  1269. movddup 5 * SIZE(BO), %xmm9
  1270. mulpd %xmm8, %xmm9
  1271. movapd 6 * SIZE(AO), %xmm8
  1272. ADD2 %xmm9, %xmm1
  1273. movddup 6 * SIZE(BO), %xmm9
  1274. mulpd %xmm8, %xmm9
  1275. ADD1 %xmm9, %xmm2
  1276. movddup 7 * SIZE(BO), %xmm9
  1277. mulpd %xmm8, %xmm9
  1278. movapd 16 * SIZE(AO), %xmm8
  1279. ADD2 %xmm9, %xmm3
  1280. movddup 16 * SIZE(BO), %xmm9
  1281. mulpd %xmm10, %xmm11
  1282. ADD1 %xmm11, %xmm0
  1283. movddup 9 * SIZE(BO), %xmm11
  1284. mulpd %xmm10, %xmm11
  1285. movapd 10 * SIZE(AO), %xmm10
  1286. ADD2 %xmm11, %xmm1
  1287. movddup 10 * SIZE(BO), %xmm11
  1288. mulpd %xmm10, %xmm11
  1289. ADD1 %xmm11, %xmm2
  1290. movddup 11 * SIZE(BO), %xmm11
  1291. mulpd %xmm10, %xmm11
  1292. movapd 12 * SIZE(AO), %xmm10
  1293. ADD2 %xmm11, %xmm3
  1294. movddup 12 * SIZE(BO), %xmm11
  1295. mulpd %xmm10, %xmm11
  1296. ADD1 %xmm11, %xmm0
  1297. movddup 13 * SIZE(BO), %xmm11
  1298. mulpd %xmm10, %xmm11
  1299. movapd 14 * SIZE(AO), %xmm10
  1300. ADD2 %xmm11, %xmm1
  1301. movddup 14 * SIZE(BO), %xmm11
  1302. mulpd %xmm10, %xmm11
  1303. ADD1 %xmm11, %xmm2
  1304. movddup 15 * SIZE(BO), %xmm11
  1305. mulpd %xmm10, %xmm11
  1306. movapd 24 * SIZE(AO), %xmm10
  1307. ADD2 %xmm11, %xmm3
  1308. movddup 24 * SIZE(BO), %xmm11
  1309. addq $16 * SIZE, AO
  1310. addq $16 * SIZE, BO
  1311. decq %rax
  1312. jne .L141
  1313. .L142:
  1314. #ifndef TRMMKERNEL
  1315. movq K, %rax
  1316. #else
  1317. movq KKK, %rax
  1318. #endif
  1319. movddup ALPHA_R, %xmm14
  1320. movddup ALPHA_I, %xmm15
  1321. andq $7, %rax # remaining k & 7 iterations
  1322. BRANCH
  1323. jle .L144
  1324. .L143:
  1325. mulpd %xmm8, %xmm9
  1326. ADD1 %xmm9, %xmm0
  1327. movddup 1 * SIZE(BO), %xmm9
  1328. mulpd %xmm8, %xmm9
  1329. movapd 2 * SIZE(AO), %xmm8
  1330. ADD2 %xmm9, %xmm1
  1331. movddup 2 * SIZE(BO), %xmm9
  1332. addq $2 * SIZE, AO # aoffset += 2
  1333. addq $2 * SIZE, BO # boffset1 += 2
  1334. decq %rax
  1335. jg .L143
  1336. ALIGN_4
  1337. .L144:
  1338. addpd %xmm2, %xmm0
  1339. addpd %xmm3, %xmm1
  1340. SHUFPD_1 %xmm1, %xmm1
  1341. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1342. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1343. addsubpd %xmm1, %xmm0
  1344. movapd %xmm0, %xmm1
  1345. #else
  1346. addsubpd %xmm0, %xmm1
  1347. movapd %xmm1, %xmm0
  1348. #endif
  1349. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1350. movsd 0 * SIZE(CO1), %xmm8
  1351. movhpd 1 * SIZE(CO1), %xmm8
  1352. #endif
  1353. SHUFPD_1 %xmm1, %xmm1
  1354. mulpd %xmm14, %xmm0
  1355. mulpd %xmm15, %xmm1
  1356. addsubpd %xmm1, %xmm0
  1357. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1358. addpd %xmm8, %xmm0
  1359. #endif
  1360. movsd %xmm0, 0 * SIZE(CO1)
  1361. movhpd %xmm0, 1 * SIZE(CO1)
  1362. ALIGN_4
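/* .L999: restore the saved registers (and rdi/rsi, xmm6-xmm15 under
   WINDOWS_ABI), release the stack frame and return. */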
  1363. .L999:
  1364. movq 0(%rsp), %rbx
  1365. movq 8(%rsp), %rbp
  1366. movq 16(%rsp), %r12
  1367. movq 24(%rsp), %r13
  1368. movq 32(%rsp), %r14
  1369. movq 40(%rsp), %r15
  1370. #ifdef WINDOWS_ABI
  1371. movq 48(%rsp), %rdi
  1372. movq 56(%rsp), %rsi
  1373. movups 64(%rsp), %xmm6
  1374. movups 80(%rsp), %xmm7
  1375. movups 96(%rsp), %xmm8
  1376. movups 112(%rsp), %xmm9
  1377. movups 128(%rsp), %xmm10
  1378. movups 144(%rsp), %xmm11
  1379. movups 160(%rsp), %xmm12
  1380. movups 176(%rsp), %xmm13
  1381. movups 192(%rsp), %xmm14
  1382. movups 208(%rsp), %xmm15
  1383. #endif
  1384. addq $STACKSIZE, %rsp
  1385. ret
  1386. EPILOGUE