
zgemm_kernel_4x2_penryn.S 35 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/* 1. Redistributions of source code must retain the above           */
/* copyright notice, this list of conditions and the following       */
/* disclaimer.                                                       */
/*                                                                   */
/* 2. Redistributions in binary form must reproduce the above        */
/* copyright notice, this list of conditions and the following       */
/* disclaimer in the documentation and/or other materials            */
/* provided with the distribution.                                   */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
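
/*
 * Complex GEMM microkernel with 4x2 register blocking for Penryn-class
 * CPUs (SSE3: haddps/addsubps/movddup).  One pass of the main loops
 * computes a 4x2 block of C, accumulating the partial products in
 * %xmm8-%xmm15 and combining real/imaginary parts at write-back.
 */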
#define ASSEMBLER
#include "common.h"

#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx

#define M %r13
#define N %r14
#define K %r15
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %rbx
#define CO2 %rbp
#define BB %r12
#define PREA %rdx

#ifndef WINDOWS_ABI
#define STACKSIZE 128
#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)
#define ALPHA_R 48(%rsp)
#define ALPHA_I 56(%rsp)
#define J 64(%rsp)
#define OFFSET 72(%rsp)
#define KK 80(%rsp)
#define KKK 88(%rsp)
#else
#define STACKSIZE 512
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_B 56 + STACKSIZE(%rsp)
#define OLD_C 64 + STACKSIZE(%rsp)
#define OLD_LDC 72 + STACKSIZE(%rsp)
#define OLD_OFFSET 80 + STACKSIZE(%rsp)
#define ALPHA_R 224(%rsp)
#define ALPHA_I 232(%rsp)
#define J 240(%rsp)
#define OFFSET 248(%rsp)
#define KK 256(%rsp)
#define KKK 264(%rsp)
#endif

#define PREFETCHSIZE (8 * 17 + 4)
#define PREFETCH prefetcht0

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ADD1 addps
#define ADD2 addps
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ADD1 addps
#define ADD2 addps
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADD1 addps
#define ADD2 addps
#else
#define ADD1 addps
#define ADD2 subps
#endif
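
/*
 * ADD1/ADD2 pick how the two groups of partial products are accumulated:
 * every variant uses addps except the RR/RC/CR/CC group, which subtracts
 * the second group; the remaining conjugation handling is done with sign
 * masks at write-back.
 */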
PROLOGUE
PROFCODE

	subq $STACKSIZE, %rsp

	movq %rbx, 0(%rsp)
	movq %rbp, 8(%rsp)
	movq %r12, 16(%rsp)
	movq %r13, 24(%rsp)
	movq %r14, 32(%rsp)
	movq %r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq %rdi, 48(%rsp)
	movq %rsi, 56(%rsp)
	movups %xmm6, 64(%rsp)
	movups %xmm7, 80(%rsp)
	movups %xmm8, 96(%rsp)
	movups %xmm9, 112(%rsp)
	movups %xmm10, 128(%rsp)
	movups %xmm11, 144(%rsp)
	movups %xmm12, 160(%rsp)
	movups %xmm13, 176(%rsp)
	movups %xmm14, 192(%rsp)
	movups %xmm15, 208(%rsp)

	movq ARG1, OLD_M
	movq ARG2, OLD_N
	movq ARG3, OLD_K
	movq OLD_A, A
	movq OLD_B, B
	movq OLD_C, C
	movq OLD_LDC, LDC
#ifdef TRMMKERNEL
	movq OLD_OFFSET, %r11
#endif
	movaps %xmm3, %xmm0
	movss OLD_ALPHA_I, %xmm1
#else
	movq OLD_LDC, LDC
#ifdef TRMMKERNEL
	movq OLD_OFFSET, %r11
#endif
#endif

	unpcklps %xmm0, %xmm0
	unpcklps %xmm1, %xmm1

	movlps %xmm0, ALPHA_R
	movlps %xmm1, ALPHA_I

	subq $-32 * SIZE, A
	subq $-32 * SIZE, B

	movq OLD_M, M
	movq OLD_N, N
	movq OLD_K, K

	salq $ZBASE_SHIFT, LDC

#ifdef TRMMKERNEL
	movq %r11, OFFSET
#ifndef LEFT
	negq %r11
#endif
	movq %r11, KK
#endif

	movq N, J
	sarq $1, J
	NOBRANCH
	jle .L40
	ALIGN_4

.L01:
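/* Outer loop: one iteration per pair of C columns (J = N >> 1). */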
#if defined(TRMMKERNEL) && defined(LEFT)
	movq OFFSET, %rax
	movq %rax, KK
#endif

	movq C, CO1
	leaq (C, LDC, 1), CO2
	movq A, AO

	movq K, %rax
	salq $ZBASE_SHIFT + 1, %rax
	leaq (B, %rax), BB

	movq M, I
	sarq $2, I
	NOBRANCH
	jle .L20
	ALIGN_4

.L11:
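/* 4x2 blocks: I = M >> 2 iterations; CO1/CO2 point at the two C columns. */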
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 2), BO
#endif

	movaps -32 * SIZE(AO), %xmm0
	pxor %xmm3, %xmm3
	movaps -28 * SIZE(AO), %xmm1
	pxor %xmm4, %xmm4
	movaps -32 * SIZE(BO), %xmm2
	pxor %xmm5, %xmm5
	prefetcht0 -32 * SIZE(BB)
	pxor %xmm6, %xmm6

	prefetcht2 7 * SIZE(CO1)
	movapd %xmm4, %xmm8
	movapd %xmm4, %xmm9
	movapd %xmm4, %xmm10
	movapd %xmm4, %xmm11

	prefetcht2 7 * SIZE(CO2)
	movapd %xmm4, %xmm12
	movapd %xmm4, %xmm13
	movapd %xmm4, %xmm14
	movapd %xmm4, %xmm15

	subq $-24 * SIZE, BB
	leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $4, %rax
#else
	addq $2, %rax
#endif
	movq %rax, KKK
#endif
	sarq $3, %rax
	NOBRANCH
	jle .L15
	ALIGN_3

.L12:
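/* Inner K loop, unrolled 8x (sarq $3 above): each pass consumes eight */
/* k iterations (64 elements of A, 32 elements of B).                  */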
	PREFETCH -32 * SIZE(PREA)
	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	movaps %xmm2, %xmm3
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	mulps %xmm1, %xmm3

	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15
	movaps %xmm7, %xmm5
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7
	mulps %xmm1, %xmm5

	ADD1 %xmm2, %xmm8
	movaps -28 * SIZE(BO), %xmm2
	ADD1 %xmm3, %xmm12
	movaps %xmm6, %xmm3
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	mulps %xmm1, %xmm3

	ADD2 %xmm7, %xmm9
	ADD2 %xmm5, %xmm13
	movaps %xmm4, %xmm5
	mulps %xmm0, %xmm4
	movaps -24 * SIZE(AO), %xmm0
	mulps %xmm1, %xmm5
	movaps -20 * SIZE(AO), %xmm1

	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	movaps %xmm2, %xmm3
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	mulps %xmm1, %xmm3

	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15
	movaps %xmm7, %xmm5
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7
	mulps %xmm1, %xmm5

	ADD1 %xmm2, %xmm8
	movaps -24 * SIZE(BO), %xmm2
	ADD1 %xmm3, %xmm12
	movaps %xmm6, %xmm3
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	mulps %xmm1, %xmm3

	ADD2 %xmm7, %xmm9
	ADD2 %xmm5, %xmm13
	movaps %xmm4, %xmm5
	mulps %xmm0, %xmm4
	movaps -16 * SIZE(AO), %xmm0
	mulps %xmm1, %xmm5
	movaps -12 * SIZE(AO), %xmm1

	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	PREFETCH -16 * SIZE(PREA)
	movaps %xmm2, %xmm3
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	mulps %xmm1, %xmm3

	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15
	movaps %xmm7, %xmm5
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7
	mulps %xmm1, %xmm5

	ADD1 %xmm2, %xmm8
	movaps -20 * SIZE(BO), %xmm2
	ADD1 %xmm3, %xmm12
	movaps %xmm6, %xmm3
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	mulps %xmm1, %xmm3

	ADD2 %xmm7, %xmm9
	ADD2 %xmm5, %xmm13
	movaps %xmm4, %xmm5
	mulps %xmm0, %xmm4
	movaps -8 * SIZE(AO), %xmm0
	mulps %xmm1, %xmm5
	movaps -4 * SIZE(AO), %xmm1

	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	movaps %xmm2, %xmm3
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	mulps %xmm1, %xmm3

	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15
	movaps %xmm7, %xmm5
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7
	mulps %xmm1, %xmm5

	ADD1 %xmm2, %xmm8
	movaps -16 * SIZE(BO), %xmm2
	ADD1 %xmm3, %xmm12
	movaps %xmm6, %xmm3
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	mulps %xmm1, %xmm3

	ADD2 %xmm7, %xmm9
	ADD2 %xmm5, %xmm13
	movaps %xmm4, %xmm5
	mulps %xmm0, %xmm4
	movaps 0 * SIZE(AO), %xmm0
	mulps %xmm1, %xmm5
	movaps 4 * SIZE(AO), %xmm1

	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	PREFETCH 0 * SIZE(PREA)
	movaps %xmm2, %xmm3
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	mulps %xmm1, %xmm3

	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15
	movaps %xmm7, %xmm5
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7
	mulps %xmm1, %xmm5

	ADD1 %xmm2, %xmm8
	movaps -12 * SIZE(BO), %xmm2
	ADD1 %xmm3, %xmm12
	movaps %xmm6, %xmm3
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	mulps %xmm1, %xmm3

	ADD2 %xmm7, %xmm9
	ADD2 %xmm5, %xmm13
	movaps %xmm4, %xmm5
	mulps %xmm0, %xmm4
	movaps 8 * SIZE(AO), %xmm0
	mulps %xmm1, %xmm5
	movaps 12 * SIZE(AO), %xmm1

	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	movaps %xmm2, %xmm3
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	mulps %xmm1, %xmm3

	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15
	movaps %xmm7, %xmm5
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7
	mulps %xmm1, %xmm5

	ADD1 %xmm2, %xmm8
	movaps -8 * SIZE(BO), %xmm2
	ADD1 %xmm3, %xmm12
	movaps %xmm6, %xmm3
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	mulps %xmm1, %xmm3

	ADD2 %xmm7, %xmm9
	ADD2 %xmm5, %xmm13
	movaps %xmm4, %xmm5
	mulps %xmm0, %xmm4
	movaps 16 * SIZE(AO), %xmm0
	mulps %xmm1, %xmm5
	movaps 20 * SIZE(AO), %xmm1

	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	PREFETCH 16 * SIZE(PREA)
	movaps %xmm2, %xmm3
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	mulps %xmm1, %xmm3

	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15
	movaps %xmm7, %xmm5
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7
	mulps %xmm1, %xmm5

	ADD1 %xmm2, %xmm8
	movaps -4 * SIZE(BO), %xmm2
	ADD1 %xmm3, %xmm12
	movaps %xmm6, %xmm3
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	mulps %xmm1, %xmm3

	ADD2 %xmm7, %xmm9
	ADD2 %xmm5, %xmm13
	movaps %xmm4, %xmm5
	mulps %xmm0, %xmm4
	movaps 24 * SIZE(AO), %xmm0
	mulps %xmm1, %xmm5
	movaps 28 * SIZE(AO), %xmm1

	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	movaps %xmm2, %xmm3
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	mulps %xmm1, %xmm3

	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15
	movaps %xmm7, %xmm5
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7
	mulps %xmm1, %xmm5

	ADD1 %xmm2, %xmm8
	subq $-64 * SIZE, AO
	movaps 0 * SIZE(BO), %xmm2
	ADD1 %xmm3, %xmm12
	movaps %xmm6, %xmm3
	subq $-32 * SIZE, BO
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	mulps %xmm1, %xmm3

	ADD2 %xmm7, %xmm9
	ADD2 %xmm5, %xmm13
	movaps %xmm4, %xmm5
	mulps %xmm0, %xmm4
	movaps -32 * SIZE(AO), %xmm0
	mulps %xmm1, %xmm5
	movaps -28 * SIZE(AO), %xmm1

	subq $-64 * SIZE, PREA
	subq $1, %rax
	BRANCH
	jg .L12
	ALIGN_3

.L15:
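/* K remainder: handle the k & 7 leftover iterations one at a time. */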
	prefetcht0 -16 * SIZE(BB)

#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $7, %rax # if (k & 7)
	BRANCH
	je .L18
	ALIGN_3

.L16:
	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	movaps %xmm2, %xmm3
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	mulps %xmm1, %xmm3

	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15
	movaps %xmm7, %xmm5
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7
	mulps %xmm1, %xmm5

	ADD1 %xmm2, %xmm8
	movaps -28 * SIZE(BO), %xmm2
	ADD1 %xmm3, %xmm12
	movaps %xmm6, %xmm3
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	mulps %xmm1, %xmm3

	ADD2 %xmm7, %xmm9
	ADD2 %xmm5, %xmm13
	movaps %xmm4, %xmm5
	mulps %xmm0, %xmm4
	movaps -24 * SIZE(AO), %xmm0
	mulps %xmm1, %xmm5
	movaps -20 * SIZE(AO), %xmm1

	addq $8 * SIZE, AO
	addq $4 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L16
	ALIGN_3

.L18:
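/*
 * Write-back for the 4x2 block: flip signs according to the conjugation
 * variant, fold the partial products with haddps, scale by alpha
 * (ALPHA_R/ALPHA_I plus addsubps) and add the result into C (the load of
 * C is skipped for TRMMKERNEL).
 */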
	ADD1 %xmm6, %xmm10
	ADD1 %xmm3, %xmm14
	ADD2 %xmm4, %xmm11
	ADD2 %xmm5, %xmm15

	pcmpeqb %xmm0, %xmm0
	psllq $63, %xmm0

	movddup ALPHA_R, %xmm2
	movddup ALPHA_I, %xmm3

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
	defined(RR) || defined(RC) || defined(CR) || defined(CC)
	pxor %xmm0, %xmm8
	pxor %xmm0, %xmm10
	pxor %xmm0, %xmm12
	pxor %xmm0, %xmm14
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pshufd $0xb1, %xmm0, %xmm0
	pxor %xmm0, %xmm9
	pxor %xmm0, %xmm11
	pxor %xmm0, %xmm13
	pxor %xmm0, %xmm15
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor %xmm0, %xmm9
	pxor %xmm0, %xmm11
	pxor %xmm0, %xmm13
	pxor %xmm0, %xmm15
#endif

	haddps %xmm9, %xmm8
	haddps %xmm11, %xmm10
	haddps %xmm13, %xmm12
	haddps %xmm15, %xmm14

	shufps $0xd8, %xmm8, %xmm8
	shufps $0xd8, %xmm10, %xmm10
	shufps $0xd8, %xmm12, %xmm12
	shufps $0xd8, %xmm14, %xmm14

	movaps %xmm8, %xmm9
	shufps $0xe4, %xmm10, %xmm8
	shufps $0xe4, %xmm9, %xmm10

	movaps %xmm12, %xmm13
	shufps $0xe4, %xmm14, %xmm12
	shufps $0xe4, %xmm13, %xmm14

	pshufd $0xb1, %xmm8, %xmm9
	pshufd $0xb1, %xmm10, %xmm11
	pshufd $0xb1, %xmm12, %xmm13
	pshufd $0xb1, %xmm14, %xmm15

	mulps %xmm2, %xmm8
	mulps %xmm3, %xmm9
	mulps %xmm2, %xmm12
	mulps %xmm3, %xmm13
	mulps %xmm2, %xmm10
	mulps %xmm3, %xmm11
	mulps %xmm2, %xmm14
	mulps %xmm3, %xmm15

	addsubps %xmm9, %xmm8
	addsubps %xmm11, %xmm10
	addsubps %xmm13, %xmm12
	addsubps %xmm15, %xmm14

#ifndef TRMMKERNEL
	movsd 0 * SIZE(CO1), %xmm0
	movhps 2 * SIZE(CO1), %xmm0
	movsd 4 * SIZE(CO1), %xmm1
	movhps 6 * SIZE(CO1), %xmm1
	movsd 0 * SIZE(CO2), %xmm2
	movhps 2 * SIZE(CO2), %xmm2
	movsd 4 * SIZE(CO2), %xmm3
	movhps 6 * SIZE(CO2), %xmm3

	addps %xmm0, %xmm8
	addps %xmm1, %xmm12
	addps %xmm2, %xmm10
	addps %xmm3, %xmm14
#endif

	movsd %xmm8, 0 * SIZE(CO1)
	movhps %xmm8, 2 * SIZE(CO1)
	movsd %xmm12, 4 * SIZE(CO1)
	movhps %xmm12, 6 * SIZE(CO1)
	movsd %xmm10, 0 * SIZE(CO2)
	movhps %xmm10, 2 * SIZE(CO2)
	movsd %xmm14, 4 * SIZE(CO2)
	movhps %xmm14, 6 * SIZE(CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $4, KK
#endif

	addq $8 * SIZE, CO1 # coffset += 4
	addq $8 * SIZE, CO2 # coffset += 4
	decq I # i --
	BRANCH
	jg .L11
	ALIGN_4

.L20:
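/* M remainder: 2x2 block when M & 2 is set. */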
	testq $2, M
	BRANCH
	jle .L30

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 2), BO
#endif

	movaps -32 * SIZE(AO), %xmm0
	pxor %xmm4, %xmm4
	pxor %xmm6, %xmm6
	movaps -32 * SIZE(BO), %xmm2
	pxor %xmm8, %xmm8
	pxor %xmm9, %xmm9
	pxor %xmm10, %xmm10
	pxor %xmm11, %xmm11

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $2, %rax
#else
	addq $2, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L25
	ALIGN_3

.L22:
	ADD1 %xmm6, %xmm10
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -28 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movaps -28 * SIZE(AO), %xmm0

	ADD1 %xmm6, %xmm10
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -24 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movaps -24 * SIZE(AO), %xmm0

	ADD1 %xmm6, %xmm10
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -20 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movaps -20 * SIZE(AO), %xmm0

	ADD1 %xmm6, %xmm10
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -16 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	subq $-16 * SIZE, AO
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movaps -32 * SIZE(AO), %xmm0

	subq $-16 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L22
	ALIGN_3

.L25:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax # if (k & 3)
	BRANCH
	je .L28
	ALIGN_3

.L26:
	ADD1 %xmm6, %xmm10
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -28 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movaps -28 * SIZE(AO), %xmm0

	addq $4 * SIZE, AO
	addq $4 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L26
	ALIGN_3

.L28:
	ADD1 %xmm6, %xmm10
	ADD2 %xmm4, %xmm11

	pcmpeqb %xmm0, %xmm0
	psllq $63, %xmm0

	movddup ALPHA_R, %xmm2
	movddup ALPHA_I, %xmm3

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
	defined(RR) || defined(RC) || defined(CR) || defined(CC)
	pxor %xmm0, %xmm8
	pxor %xmm0, %xmm10
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pshufd $0xb1, %xmm0, %xmm0
	pxor %xmm0, %xmm9
	pxor %xmm0, %xmm11
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor %xmm0, %xmm9
	pxor %xmm0, %xmm11
#endif

	haddps %xmm9, %xmm8
	haddps %xmm11, %xmm10

	shufps $0xd8, %xmm8, %xmm8
	shufps $0xd8, %xmm10, %xmm10

	movaps %xmm8, %xmm9
	shufps $0xe4, %xmm10, %xmm8
	shufps $0xe4, %xmm9, %xmm10

	pshufd $0xb1, %xmm8, %xmm9
	pshufd $0xb1, %xmm10, %xmm11

	mulps %xmm2, %xmm8
	mulps %xmm3, %xmm9
	mulps %xmm2, %xmm10
	mulps %xmm3, %xmm11

	addsubps %xmm9, %xmm8
	addsubps %xmm11, %xmm10

#ifndef TRMMKERNEL
	movsd 0 * SIZE(CO1), %xmm0
	movhps 2 * SIZE(CO1), %xmm0
	movsd 0 * SIZE(CO2), %xmm2
	movhps 2 * SIZE(CO2), %xmm2

	addps %xmm0, %xmm8
	addps %xmm2, %xmm10
#endif

	movsd %xmm8, 0 * SIZE(CO1)
	movhps %xmm8, 2 * SIZE(CO1)
	movsd %xmm10, 0 * SIZE(CO2)
	movhps %xmm10, 2 * SIZE(CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $2, KK
#endif

	addq $4 * SIZE, CO1
	addq $4 * SIZE, CO2
	ALIGN_4

.L30:
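/* M remainder: final 1x2 block when M is odd. */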
	testq $1, M
	BRANCH
	jle .L39

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 2), BO
#endif

	movsd -32 * SIZE(AO), %xmm0
	pxor %xmm4, %xmm4
	pxor %xmm6, %xmm6
	movaps -32 * SIZE(BO), %xmm2
	pxor %xmm8, %xmm8
	pxor %xmm9, %xmm9
	pxor %xmm10, %xmm10
	pxor %xmm11, %xmm11

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $1, %rax
#else
	addq $2, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L35
	ALIGN_3

.L32:
	ADD1 %xmm6, %xmm10
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -28 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movsd -30 * SIZE(AO), %xmm0

	ADD1 %xmm6, %xmm10
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -24 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movsd -28 * SIZE(AO), %xmm0

	ADD1 %xmm6, %xmm10
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -20 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movsd -26 * SIZE(AO), %xmm0

	ADD1 %xmm6, %xmm10
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -16 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	subq $-8 * SIZE, AO
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movsd -32 * SIZE(AO), %xmm0

	subq $-16 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L32
	ALIGN_3

.L35:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax # if (k & 3)
	BRANCH
	je .L38
	ALIGN_3

.L36:
	ADD1 %xmm6, %xmm10
	pshufd $0xb1, %xmm2, %xmm7
	mulps %xmm0, %xmm2
	ADD2 %xmm4, %xmm11
	pshufd $0x1b, %xmm7, %xmm6
	mulps %xmm0, %xmm7

	ADD1 %xmm2, %xmm8
	movaps -28 * SIZE(BO), %xmm2
	pshufd $0xb1, %xmm6, %xmm4
	mulps %xmm0, %xmm6
	ADD2 %xmm7, %xmm9
	mulps %xmm0, %xmm4
	movsd -30 * SIZE(AO), %xmm0

	addq $2 * SIZE, AO
	addq $4 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L36
	ALIGN_3

.L38:
	ADD1 %xmm6, %xmm10
	ADD2 %xmm4, %xmm11

	pcmpeqb %xmm0, %xmm0
	psllq $63, %xmm0

	movddup ALPHA_R, %xmm2
	movddup ALPHA_I, %xmm3

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
	defined(RR) || defined(RC) || defined(CR) || defined(CC)
	pxor %xmm0, %xmm8
	pxor %xmm0, %xmm10
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pshufd $0xb1, %xmm0, %xmm0
	pxor %xmm0, %xmm9
	pxor %xmm0, %xmm11
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor %xmm0, %xmm9
	pxor %xmm0, %xmm11
#endif

	haddps %xmm9, %xmm8
	haddps %xmm11, %xmm10

	shufps $0xd8, %xmm8, %xmm8
	shufps $0xd8, %xmm10, %xmm10

	movaps %xmm8, %xmm9
	shufps $0xe4, %xmm10, %xmm8
	shufps $0xe4, %xmm9, %xmm10

	pshufd $0xb1, %xmm8, %xmm9
	pshufd $0xb1, %xmm10, %xmm11

	mulps %xmm2, %xmm8
	mulps %xmm3, %xmm9
	mulps %xmm2, %xmm10
	mulps %xmm3, %xmm11

	addsubps %xmm9, %xmm8
	addsubps %xmm11, %xmm10

#ifndef TRMMKERNEL
	movsd 0 * SIZE(CO1), %xmm0
	movsd 0 * SIZE(CO2), %xmm2

	addps %xmm0, %xmm8
	addps %xmm2, %xmm10
#endif

	movsd %xmm8, 0 * SIZE(CO1)
	movsd %xmm10, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $1, KK
#endif
	ALIGN_4

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq $2, KK
#endif

	leaq (C, LDC, 2), C
	movq BO, B
	subq $1, J
	BRANCH
	jg .L01
	ALIGN_4

.L40:
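/* N remainder: last single column of C when N is odd. */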
	testq $1, N
	BRANCH
	jle .L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movq OFFSET, %rax
	movq %rax, KK
#endif

	movq C, CO1
	movq A, AO

	movq M, I
	sarq $2, I
	NOBRANCH
	jle .L50
	ALIGN_4

.L41:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 1), BO
#endif

	movaps -32 * SIZE(AO), %xmm0
	pxor %xmm3, %xmm3
	movaps -28 * SIZE(AO), %xmm1
	pxor %xmm4, %xmm4
	movaps -32 * SIZE(BO), %xmm2
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6

	prefetcht0 7 * SIZE(CO1)
	pxor %xmm8, %xmm8
	pxor %xmm9, %xmm9
	pxor %xmm12, %xmm12
	pxor %xmm13, %xmm13

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $4, %rax
#else
	addq $1, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L45
	ALIGN_3

.L42:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addps %xmm3, %xmm8
	pshufd $0x00, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm12
	pshufd $0x00, %xmm2, %xmm4
	mulps %xmm1, %xmm4

	addps %xmm5, %xmm9
	pshufd $0x55, %xmm2, %xmm5
	mulps %xmm0, %xmm5
	movaps -24 * SIZE(AO), %xmm0
	addps %xmm6, %xmm13
	pshufd $0x55, %xmm2, %xmm6
	mulps %xmm1, %xmm6
	movaps -20 * SIZE(AO), %xmm1

	addps %xmm3, %xmm8
	pshufd $0xaa, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm12
	pshufd $0xaa, %xmm2, %xmm4
	mulps %xmm1, %xmm4

	addps %xmm5, %xmm9
	pshufd $0xff, %xmm2, %xmm5
	mulps %xmm0, %xmm5
	movaps -16 * SIZE(AO), %xmm0
	addps %xmm6, %xmm13
	pshufd $0xff, %xmm2, %xmm6
	movaps -28 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm6
	movaps -12 * SIZE(AO), %xmm1

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
	addps %xmm3, %xmm8
	pshufd $0x00, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm12
	pshufd $0x00, %xmm2, %xmm4
	mulps %xmm1, %xmm4

	addps %xmm5, %xmm9
	pshufd $0x55, %xmm2, %xmm5
	mulps %xmm0, %xmm5
	movaps -8 * SIZE(AO), %xmm0
	addps %xmm6, %xmm13
	pshufd $0x55, %xmm2, %xmm6
	mulps %xmm1, %xmm6
	movaps -4 * SIZE(AO), %xmm1

	addps %xmm3, %xmm8
	pshufd $0xaa, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm12
	pshufd $0xaa, %xmm2, %xmm4
	mulps %xmm1, %xmm4

	addps %xmm5, %xmm9
	pshufd $0xff, %xmm2, %xmm5
	mulps %xmm0, %xmm5
	movaps 0 * SIZE(AO), %xmm0
	addps %xmm6, %xmm13
	pshufd $0xff, %xmm2, %xmm6
	movaps -24 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm6
	movaps 4 * SIZE(AO), %xmm1

	subq $-32 * SIZE, AO
	subq $ -8 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L42
	ALIGN_3

.L45:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax # if (k & 3)
	BRANCH
	je .L48
	ALIGN_3

.L46:
	addps %xmm3, %xmm8
	pshufd $0x00, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm12
	pshufd $0x00, %xmm2, %xmm4
	mulps %xmm1, %xmm4

	addps %xmm5, %xmm9
	pshufd $0x55, %xmm2, %xmm5
	mulps %xmm0, %xmm5
	movaps -24 * SIZE(AO), %xmm0
	addps %xmm6, %xmm13
	pshufd $0x55, %xmm2, %xmm6
	movsd -30 * SIZE(BO), %xmm2
	mulps %xmm1, %xmm6
	movaps -20 * SIZE(AO), %xmm1

	addq $8 * SIZE, AO
	addq $2 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L46
	ALIGN_3

.L48:
	addps %xmm3, %xmm8
	addps %xmm4, %xmm12
	addps %xmm5, %xmm9
	addps %xmm6, %xmm13

	pshufd $0xb1, %xmm9, %xmm9
	movddup ALPHA_R, %xmm2
	pshufd $0xb1, %xmm13, %xmm13
	movddup ALPHA_I, %xmm3

	pcmpeqb %xmm0, %xmm0
	psllq $63, %xmm0

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
	pxor %xmm0, %xmm9
	pxor %xmm0, %xmm13

	subps %xmm9, %xmm8
	subps %xmm13, %xmm12
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pxor %xmm0, %xmm9
	pxor %xmm0, %xmm13

	addps %xmm9, %xmm8
	addps %xmm13, %xmm12
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor %xmm0, %xmm8
	pxor %xmm0, %xmm12

	addps %xmm9, %xmm8
	addps %xmm13, %xmm12
#else
	pxor %xmm0, %xmm8
	pxor %xmm0, %xmm12

	subps %xmm9, %xmm8
	subps %xmm13, %xmm12
#endif

	pshufd $0xb1, %xmm8, %xmm9
	pshufd $0xb1, %xmm12, %xmm13

	mulps %xmm2, %xmm8
	mulps %xmm3, %xmm9
	mulps %xmm2, %xmm12
	mulps %xmm3, %xmm13

	addsubps %xmm9, %xmm8
	addsubps %xmm13, %xmm12

#ifndef TRMMKERNEL
	movsd 0 * SIZE(CO1), %xmm0
	movhps 2 * SIZE(CO1), %xmm0
	movsd 4 * SIZE(CO1), %xmm1
	movhps 6 * SIZE(CO1), %xmm1

	addps %xmm0, %xmm8
	addps %xmm1, %xmm12
#endif

	movsd %xmm8, 0 * SIZE(CO1)
	movhps %xmm8, 2 * SIZE(CO1)
	movsd %xmm12, 4 * SIZE(CO1)
	movhps %xmm12, 6 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $4, KK
#endif

	addq $8 * SIZE, CO1 # coffset += 4
	decq I # i --
	BRANCH
	jg .L41
	ALIGN_4

.L50:
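/* 2x1 block of the last column when M & 2 is set. */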
	testq $2, M
	BRANCH
	jle .L60

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 1), BO
#endif

	movaps -32 * SIZE(AO), %xmm0
	pxor %xmm3, %xmm3
	pxor %xmm4, %xmm4
	movaps -32 * SIZE(BO), %xmm2
	pxor %xmm8, %xmm8
	pxor %xmm9, %xmm9

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $2, %rax
#else
	addq $1, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L55
	ALIGN_3

.L52:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addps %xmm3, %xmm8
	pshufd $0x00, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0x55, %xmm2, %xmm4
	mulps %xmm0, %xmm4
	movaps -28 * SIZE(AO), %xmm0

	addps %xmm3, %xmm8
	pshufd $0xaa, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0xff, %xmm2, %xmm4
	movaps -28 * SIZE(BO), %xmm2
	mulps %xmm0, %xmm4
	movaps -24 * SIZE(AO), %xmm0

	addps %xmm3, %xmm8
	pshufd $0x00, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0x55, %xmm2, %xmm4
	mulps %xmm0, %xmm4
	movaps -20 * SIZE(AO), %xmm0

	addps %xmm3, %xmm8
	pshufd $0xaa, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0xff, %xmm2, %xmm4
	movaps -24 * SIZE(BO), %xmm2
	mulps %xmm0, %xmm4
	movaps -16 * SIZE(AO), %xmm0

	subq $-16 * SIZE, AO
	subq $ -8 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L52
	ALIGN_3

.L55:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax # if (k & 3)
	BRANCH
	je .L58
	ALIGN_3

.L56:
	addps %xmm3, %xmm8
	pshufd $0x00, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0x55, %xmm2, %xmm4
	movsd -30 * SIZE(BO), %xmm2
	mulps %xmm0, %xmm4
	movaps -28 * SIZE(AO), %xmm0

	addq $4 * SIZE, AO
	addq $2 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L56
	ALIGN_3

.L58:
	addps %xmm3, %xmm8
	movddup ALPHA_R, %xmm2
	addps %xmm4, %xmm9
	movddup ALPHA_I, %xmm3

	pshufd $0xb1, %xmm9, %xmm9

	pcmpeqb %xmm0, %xmm0
	psllq $63, %xmm0

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
	pxor %xmm0, %xmm9
	subps %xmm9, %xmm8
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pxor %xmm0, %xmm9
	addps %xmm9, %xmm8
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor %xmm0, %xmm8
	addps %xmm9, %xmm8
#else
	pxor %xmm0, %xmm8
	subps %xmm9, %xmm8
#endif

	pshufd $0xb1, %xmm8, %xmm9

	mulps %xmm2, %xmm8
	mulps %xmm3, %xmm9

	addsubps %xmm9, %xmm8

#ifndef TRMMKERNEL
	movsd 0 * SIZE(CO1), %xmm0
	movhps 2 * SIZE(CO1), %xmm0

	addps %xmm0, %xmm8
#endif

	movsd %xmm8, 0 * SIZE(CO1)
	movhps %xmm8, 2 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq $2, KK
#endif

	addq $4 * SIZE, CO1
	ALIGN_4

.L60:
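/* Final 1x1 block when M is odd. */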
	testq $1, M
	BRANCH
	jle .L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq B, BO
	movq KK, %rax
	salq $ZBASE_SHIFT, %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 1), BO
#endif

	movsd -32 * SIZE(AO), %xmm0
	pxor %xmm3, %xmm3
	pxor %xmm4, %xmm4
	movaps -32 * SIZE(BO), %xmm2
	pxor %xmm8, %xmm8
	pxor %xmm9, %xmm9

#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $1, %rax
#else
	addq $1, %rax
#endif
	movq %rax, KKK
#endif
	sarq $2, %rax
	NOBRANCH
	jle .L65
	ALIGN_3

.L62:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addps %xmm3, %xmm8
	pshufd $0x00, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0x55, %xmm2, %xmm4
	mulps %xmm0, %xmm4
	movsd -30 * SIZE(AO), %xmm0

	addps %xmm3, %xmm8
	pshufd $0xaa, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0xff, %xmm2, %xmm4
	movaps -28 * SIZE(BO), %xmm2
	mulps %xmm0, %xmm4
	movsd -28 * SIZE(AO), %xmm0

	addps %xmm3, %xmm8
	pshufd $0x00, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0x55, %xmm2, %xmm4
	mulps %xmm0, %xmm4
	movsd -26 * SIZE(AO), %xmm0

	addps %xmm3, %xmm8
	pshufd $0xaa, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0xff, %xmm2, %xmm4
	movaps -24 * SIZE(BO), %xmm2
	mulps %xmm0, %xmm4
	movsd -24 * SIZE(AO), %xmm0

	subq $-8 * SIZE, AO
	subq $-8 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L62
	ALIGN_3

.L65:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	andq $3, %rax # if (k & 3)
	BRANCH
	je .L68
	ALIGN_3

.L66:
	addps %xmm3, %xmm8
	pshufd $0x00, %xmm2, %xmm3
	mulps %xmm0, %xmm3
	addps %xmm4, %xmm9
	pshufd $0x55, %xmm2, %xmm4
	movsd -30 * SIZE(BO), %xmm2
	mulps %xmm0, %xmm4
	movsd -30 * SIZE(AO), %xmm0

	addq $2 * SIZE, AO
	addq $2 * SIZE, BO
	subq $1, %rax
	BRANCH
	jg .L66
	ALIGN_3

.L68:
	addps %xmm3, %xmm8
	movddup ALPHA_R, %xmm2
	addps %xmm4, %xmm9
	movddup ALPHA_I, %xmm3

	pshufd $0xb1, %xmm9, %xmm9

	pcmpeqb %xmm0, %xmm0
	psllq $63, %xmm0

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
	pxor %xmm0, %xmm9
	subps %xmm9, %xmm8
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pxor %xmm0, %xmm9
	addps %xmm9, %xmm8
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor %xmm0, %xmm8
	addps %xmm9, %xmm8
#else
	pxor %xmm0, %xmm8
	subps %xmm9, %xmm8
#endif

	pshufd $0xb1, %xmm8, %xmm9

	mulps %xmm2, %xmm8
	mulps %xmm3, %xmm9

	addsubps %xmm9, %xmm8

#ifndef TRMMKERNEL
	movsd 0 * SIZE(CO1), %xmm0
	addps %xmm0, %xmm8
#endif

	movsd %xmm8, 0 * SIZE(CO1)
	ALIGN_4

.L999:
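/* Restore callee-saved registers (plus %rdi/%rsi and %xmm6-%xmm15 on */
/* Windows), release the stack frame and return.                      */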
	movq 0(%rsp), %rbx
	movq 8(%rsp), %rbp
	movq 16(%rsp), %r12
	movq 24(%rsp), %r13
	movq 32(%rsp), %r14
	movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq 48(%rsp), %rdi
	movq 56(%rsp), %rsi
	movups 64(%rsp), %xmm6
	movups 80(%rsp), %xmm7
	movups 96(%rsp), %xmm8
	movups 112(%rsp), %xmm9
	movups 128(%rsp), %xmm10
	movups 144(%rsp), %xmm11
	movups 160(%rsp), %xmm12
	movups 176(%rsp), %xmm13
	movups 192(%rsp), %xmm14
	movups 208(%rsp), %xmm15
#endif

	addq $STACKSIZE, %rsp
	ret

	EPILOGUE