ztrsm_kernel_LT_2x1_atom.S (18 kB)
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define M %rdi
  41. #define N %rsi
  42. #define K %rdx
  43. #define A %rcx
  44. #define B %r8
  45. #define C %r9
  46. #define LDC %r10
  47. #define I %r11
  48. #define J %r12
  49. #define AO %r13
  50. #define BO %r14
  51. #define CO1 %r15
  52. #define BB %rbx
  53. #define KK %rbp
  54. #ifndef WINDOWS_ABI
  55. #define STACKSIZE 128
  56. #define OLD_LDC 8 + STACKSIZE(%rsp)
  57. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  58. #define OFFSET 48(%rsp)
  59. #define KKK 56(%rsp)
  60. #define AORIG 64(%rsp)
  61. #else
  62. #define STACKSIZE 256
  63. #define OLD_A 48 + STACKSIZE(%rsp)
  64. #define OLD_B 56 + STACKSIZE(%rsp)
  65. #define OLD_C 64 + STACKSIZE(%rsp)
  66. #define OLD_LDC 72 + STACKSIZE(%rsp)
  67. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  68. #define OFFSET 224(%rsp)
  69. #define KKK 232(%rsp)
  70. #define AORIG 240(%rsp)
  71. #endif
  72. #define PREFETCH prefetcht0
  73. #define PREFETCHSIZE (8 * 8 + 3)
  74. #ifndef CONJ
  75. #define ADDSD1 addsd
  76. #define ADDSD2 addsd
  77. #define ADDSD3 addsd
  78. #define ADDSD4 subsd
  79. #elif defined(LN) || defined(LT)
  80. #define ADDSD1 addsd
  81. #define ADDSD2 addsd
  82. #define ADDSD3 subsd
  83. #define ADDSD4 addsd
  84. #else
  85. #define ADDSD1 addsd
  86. #define ADDSD2 subsd
  87. #define ADDSD3 addsd
  88. #define ADDSD4 addsd
  89. #endif
  90. PROLOGUE
  91. PROFCODE
  92. subq $STACKSIZE, %rsp
  93. movq %rbx, 0(%rsp)
  94. movq %rbp, 8(%rsp)
  95. movq %r12, 16(%rsp)
  96. movq %r13, 24(%rsp)
  97. movq %r14, 32(%rsp)
  98. movq %r15, 40(%rsp)
  99. #ifdef WINDOWS_ABI
  100. movq %rdi, 48(%rsp)
  101. movq %rsi, 56(%rsp)
  102. movups %xmm6, 64(%rsp)
  103. movups %xmm7, 80(%rsp)
  104. movups %xmm8, 96(%rsp)
  105. movups %xmm9, 112(%rsp)
  106. movups %xmm10, 128(%rsp)
  107. movups %xmm11, 144(%rsp)
  108. movups %xmm12, 160(%rsp)
  109. movups %xmm13, 176(%rsp)
  110. movups %xmm14, 192(%rsp)
  111. movups %xmm15, 208(%rsp)
  112. movq ARG1, M
  113. movq ARG2, N
  114. movq ARG3, K
  115. movq OLD_A, A
  116. movq OLD_B, B
  117. movq OLD_C, C
  118. movq OLD_LDC, LDC
  119. #endif
  120. movq OLD_LDC, LDC
  121. movq OLD_OFFSET, KK
  122. movq KK, OFFSET
  123. salq $ZBASE_SHIFT, LDC
  124. #ifdef LN
  125. movq M, %rax
  126. salq $ZBASE_SHIFT, %rax
  127. addq %rax, C
  128. imulq K, %rax
  129. addq %rax, A
  130. #endif
  131. #ifdef RT
  132. movq N, %rax
  133. salq $ZBASE_SHIFT, %rax
  134. imulq K, %rax
  135. addq %rax, B
  136. movq N, %rax
  137. imulq LDC, %rax
  138. addq %rax, C
  139. #endif
  140. #ifdef RN
  141. negq KK
  142. #endif
  143. #ifdef RT
  144. movq N, KK
  145. subq OFFSET, KK
  146. #endif
  147. movq N, J
  148. testq N, N
  149. jle .L999
  150. ALIGN_4
  151. .L01:
  152. #if defined(LT) || defined(RN)
  153. movq A, AO
  154. #else
  155. movq A, AORIG
  156. #endif
  157. #ifdef RT
  158. movq K, %rax
  159. salq $ZBASE_SHIFT, %rax
  160. subq %rax, B
  161. subq LDC, C
  162. #endif
  163. movq C, CO1
  164. #ifndef RT
  165. addq LDC, C
  166. #endif
  167. #ifdef LN
  168. movq OFFSET, KK
  169. addq M, KK
  170. #endif
  171. #ifdef LT
  172. movq OFFSET, KK
  173. #endif
  174. movq K, %rax
  175. salq $ZBASE_SHIFT, %rax
  176. leaq (B, %rax), BB
  177. movq M, I
  178. sarq $1, I
  179. jle .L20
  180. ALIGN_4
  181. .L10:
  182. #ifdef LN
  183. movq K, %rax
  184. salq $1 + ZBASE_SHIFT, %rax
  185. subq %rax, AORIG
  186. #endif
  187. #if defined(LN) || defined(RT)
  188. movq KK, %rax
  189. leaq (, %rax, SIZE), %rax
  190. movq AORIG, AO
  191. leaq (AO, %rax, 4), AO
  192. leaq (B, %rax, 2), BO
  193. #else
  194. movq B, BO
  195. #endif
  196. prefetcht0 0 * SIZE(BB)
  197. subq $-8 * SIZE, BB
  198. movsd 0 * SIZE(AO), %xmm0
  199. xorps %xmm2, %xmm2
  200. movsd 1 * SIZE(AO), %xmm4
  201. xorps %xmm5, %xmm5
  202. movsd 2 * SIZE(AO), %xmm5
  203. xorps %xmm6, %xmm6
  204. xorps %xmm7, %xmm7
  205. movsd 0 * SIZE(BO), %xmm1
  206. xorps %xmm8, %xmm8
  207. xorps %xmm9, %xmm9
  208. movsd 1 * SIZE(BO), %xmm3
  209. xorps %xmm10, %xmm10
  210. xorps %xmm11, %xmm11
  211. prefetcht0 3 * SIZE(CO1)
  212. xorps %xmm12, %xmm12
  213. xorps %xmm13, %xmm13
  214. xorps %xmm14, %xmm14
  215. xorps %xmm15, %xmm15
  216. #if defined(LT) || defined(RN)
  217. movq KK, %rax
  218. #else
  219. movq K, %rax
  220. subq KK, %rax
  221. #endif
  222. sarq $2, %rax
  223. je .L15
  224. ALIGN_4
  225. .L12:
  226. ADDSD2 %xmm2, %xmm13
  227. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  228. movaps %xmm0, %xmm2
  229. mulsd %xmm1, %xmm0
  230. ADDSD3 %xmm7, %xmm14
  231. movsd 3 * SIZE(AO), %xmm7
  232. mulsd %xmm3, %xmm2
  233. ADDSD4 %xmm6, %xmm15
  234. PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO)
  235. movaps %xmm4, %xmm6
  236. mulsd %xmm1, %xmm4
  237. ADDSD1 %xmm0, %xmm8
  238. movsd 4 * SIZE(AO), %xmm0
  239. mulsd %xmm3, %xmm6
  240. ADDSD2 %xmm2, %xmm9
  241. movaps %xmm5, %xmm2
  242. mulsd %xmm1, %xmm5
  243. ADDSD3 %xmm4, %xmm10
  244. movsd 5 * SIZE(AO), %xmm4
  245. mulsd %xmm3, %xmm2
  246. ADDSD4 %xmm6, %xmm11
  247. movaps %xmm7, %xmm6
  248. mulsd %xmm1, %xmm7
  249. movsd 2 * SIZE(BO), %xmm1
  250. ADDSD1 %xmm5, %xmm12
  251. movsd 6 * SIZE(AO), %xmm5
  252. mulsd %xmm3, %xmm6
  253. movsd 3 * SIZE(BO), %xmm3
  254. ADDSD2 %xmm2, %xmm13
  255. movaps %xmm0, %xmm2
  256. mulsd %xmm1, %xmm0
  257. ADDSD3 %xmm7, %xmm14
  258. movsd 7 * SIZE(AO), %xmm7
  259. mulsd %xmm3, %xmm2
  260. ADDSD4 %xmm6, %xmm15
  261. movaps %xmm4, %xmm6
  262. mulsd %xmm1, %xmm4
  263. ADDSD1 %xmm0, %xmm8
  264. movsd 8 * SIZE(AO), %xmm0
  265. mulsd %xmm3, %xmm6
  266. ADDSD2 %xmm2, %xmm9
  267. movaps %xmm5, %xmm2
  268. mulsd %xmm1, %xmm5
  269. ADDSD3 %xmm4, %xmm10
  270. movsd 9 * SIZE(AO), %xmm4
  271. mulsd %xmm3, %xmm2
  272. ADDSD4 %xmm6, %xmm11
  273. movaps %xmm7, %xmm6
  274. mulsd %xmm1, %xmm7
  275. movsd 4 * SIZE(BO), %xmm1
  276. ADDSD1 %xmm5, %xmm12
  277. movsd 10 * SIZE(AO), %xmm5
  278. mulsd %xmm3, %xmm6
  279. movsd 5 * SIZE(BO), %xmm3
  280. ADDSD2 %xmm2, %xmm13
  281. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  282. movaps %xmm0, %xmm2
  283. mulsd %xmm1, %xmm0
  284. ADDSD3 %xmm7, %xmm14
  285. movsd 11 * SIZE(AO), %xmm7
  286. mulsd %xmm3, %xmm2
  287. ADDSD4 %xmm6, %xmm15
  288. movaps %xmm4, %xmm6
  289. mulsd %xmm1, %xmm4
  290. ADDSD1 %xmm0, %xmm8
  291. movsd 12 * SIZE(AO), %xmm0
  292. mulsd %xmm3, %xmm6
  293. ADDSD2 %xmm2, %xmm9
  294. movaps %xmm5, %xmm2
  295. mulsd %xmm1, %xmm5
  296. ADDSD3 %xmm4, %xmm10
  297. movsd 13 * SIZE(AO), %xmm4
  298. mulsd %xmm3, %xmm2
  299. ADDSD4 %xmm6, %xmm11
  300. movaps %xmm7, %xmm6
  301. mulsd %xmm1, %xmm7
  302. movsd 6 * SIZE(BO), %xmm1
  303. ADDSD1 %xmm5, %xmm12
  304. movsd 14 * SIZE(AO), %xmm5
  305. mulsd %xmm3, %xmm6
  306. movsd 7 * SIZE(BO), %xmm3
  307. ADDSD2 %xmm2, %xmm13
  308. movaps %xmm0, %xmm2
  309. mulsd %xmm1, %xmm0
  310. ADDSD3 %xmm7, %xmm14
  311. movsd 15 * SIZE(AO), %xmm7
  312. mulsd %xmm3, %xmm2
  313. subq $-16 * SIZE, AO
  314. ADDSD4 %xmm6, %xmm15
  315. movaps %xmm4, %xmm6
  316. mulsd %xmm1, %xmm4
  317. ADDSD1 %xmm0, %xmm8
  318. movsd 0 * SIZE(AO), %xmm0
  319. mulsd %xmm3, %xmm6
  320. ADDSD2 %xmm2, %xmm9
  321. movaps %xmm5, %xmm2
  322. mulsd %xmm1, %xmm5
  323. addq $ 8 * SIZE, BO
  324. ADDSD3 %xmm4, %xmm10
  325. movsd 1 * SIZE(AO), %xmm4
  326. mulsd %xmm3, %xmm2
  327. decq %rax
  328. ADDSD4 %xmm6, %xmm11
  329. movaps %xmm7, %xmm6
  330. mulsd %xmm1, %xmm7
  331. movsd 0 * SIZE(BO), %xmm1
  332. ADDSD1 %xmm5, %xmm12
  333. movsd 2 * SIZE(AO), %xmm5
  334. mulsd %xmm3, %xmm6
  335. movsd 1 * SIZE(BO), %xmm3
  336. jne .L12
  337. ALIGN_4
  338. .L15:
  339. #if defined(LT) || defined(RN)
  340. movq KK, %rax
  341. #else
  342. movq K, %rax
  343. subq KK, %rax
  344. #endif
  345. andq $3, %rax
  346. BRANCH
  347. BRANCH
  348. je .L18
  349. ALIGN_4
  350. .L16:
  351. ADDSD2 %xmm2, %xmm13
  352. movaps %xmm0, %xmm2
  353. mulsd %xmm1, %xmm0
  354. ADDSD3 %xmm7, %xmm14
  355. movsd 3 * SIZE(AO), %xmm7
  356. mulsd %xmm3, %xmm2
  357. ADDSD4 %xmm6, %xmm15
  358. movaps %xmm4, %xmm6
  359. mulsd %xmm1, %xmm4
  360. ADDSD1 %xmm0, %xmm8
  361. movsd 4 * SIZE(AO), %xmm0
  362. mulsd %xmm3, %xmm6
  363. ADDSD2 %xmm2, %xmm9
  364. movaps %xmm5, %xmm2
  365. mulsd %xmm1, %xmm5
  366. ADDSD3 %xmm4, %xmm10
  367. movsd 5 * SIZE(AO), %xmm4
  368. mulsd %xmm3, %xmm2
  369. ADDSD4 %xmm6, %xmm11
  370. movaps %xmm7, %xmm6
  371. mulsd %xmm1, %xmm7
  372. movsd 2 * SIZE(BO), %xmm1
  373. ADDSD1 %xmm5, %xmm12
  374. movsd 6 * SIZE(AO), %xmm5
  375. mulsd %xmm3, %xmm6
  376. movsd 3 * SIZE(BO), %xmm3
  377. addq $4 * SIZE, AO
  378. addq $2 * SIZE, BO
  379. decq %rax
  380. BRANCH
  381. jg .L16
  382. ALIGN_4
  383. .L18:
  384. ADDSD2 %xmm2, %xmm13
  385. ADDSD3 %xmm7, %xmm14
  386. ADDSD4 %xmm6, %xmm15
  387. addsd %xmm11, %xmm8
  388. addsd %xmm9, %xmm10
  389. addsd %xmm15, %xmm12
  390. addsd %xmm13, %xmm14
  391. #if defined(LN) || defined(RT)
  392. movq KK, %rax
  393. #ifdef LN
  394. subq $2, %rax
  395. #else
  396. subq $1, %rax
  397. #endif
  398. leaq (, %rax, SIZE), %rax
  399. movq AORIG, AO
  400. leaq (AO, %rax, 4), AO
  401. leaq (B, %rax, 2), BO
  402. #endif
  403. #if defined(LN) || defined(LT)
  404. movsd 0 * SIZE(BO), %xmm0
  405. movsd 1 * SIZE(BO), %xmm1
  406. movsd 2 * SIZE(BO), %xmm2
  407. movsd 3 * SIZE(BO), %xmm3
  408. #else
  409. movsd 0 * SIZE(AO), %xmm0
  410. movsd 1 * SIZE(AO), %xmm1
  411. movsd 2 * SIZE(AO), %xmm2
  412. movsd 3 * SIZE(AO), %xmm3
  413. #endif
  414. subsd %xmm8, %xmm0
  415. subsd %xmm10, %xmm1
  416. subsd %xmm12, %xmm2
  417. subsd %xmm14, %xmm3
  418. #ifdef LN
  419. movsd 6 * SIZE(AO), %xmm6
  420. movsd 7 * SIZE(AO), %xmm7
  421. movaps %xmm2, %xmm5
  422. movaps %xmm3, %xmm4
  423. mulsd %xmm6, %xmm2
  424. mulsd %xmm6, %xmm3
  425. movsd 4 * SIZE(AO), %xmm6
  426. mulsd %xmm7, %xmm5
  427. mulsd %xmm7, %xmm4
  428. movsd 5 * SIZE(AO), %xmm7
  429. ADDSD4 %xmm4, %xmm2
  430. ADDSD3 %xmm5, %xmm3
  431. movaps %xmm2, %xmm4
  432. movaps %xmm3, %xmm5
  433. mulsd %xmm6, %xmm4
  434. mulsd %xmm7, %xmm5
  435. mulsd %xmm3, %xmm6
  436. mulsd %xmm2, %xmm7
  437. subsd %xmm4, %xmm0
  438. subsd %xmm6, %xmm1
  439. movsd 0 * SIZE(AO), %xmm6
  440. ADDSD3 %xmm5, %xmm0
  441. ADDSD4 %xmm7, %xmm1
  442. movsd 1 * SIZE(AO), %xmm7
  443. movaps %xmm0, %xmm5
  444. movaps %xmm1, %xmm4
  445. mulsd %xmm6, %xmm0
  446. mulsd %xmm6, %xmm1
  447. mulsd %xmm7, %xmm5
  448. mulsd %xmm7, %xmm4
  449. ADDSD4 %xmm4, %xmm0
  450. ADDSD3 %xmm5, %xmm1
  451. #endif
  452. #ifdef LT
  453. movsd 0 * SIZE(AO), %xmm6
  454. movsd 1 * SIZE(AO), %xmm7
  455. movaps %xmm0, %xmm5
  456. movaps %xmm1, %xmm4
  457. mulsd %xmm6, %xmm0
  458. mulsd %xmm6, %xmm1
  459. movsd 2 * SIZE(AO), %xmm6
  460. mulsd %xmm7, %xmm5
  461. mulsd %xmm7, %xmm4
  462. movsd 3 * SIZE(AO), %xmm7
  463. ADDSD4 %xmm4, %xmm0
  464. ADDSD3 %xmm5, %xmm1
  465. movaps %xmm0, %xmm4
  466. movaps %xmm1, %xmm5
  467. mulsd %xmm6, %xmm4
  468. mulsd %xmm7, %xmm5
  469. mulsd %xmm1, %xmm6
  470. mulsd %xmm0, %xmm7
  471. subsd %xmm4, %xmm2
  472. subsd %xmm6, %xmm3
  473. movsd 6 * SIZE(AO), %xmm6
  474. ADDSD3 %xmm5, %xmm2
  475. ADDSD4 %xmm7, %xmm3
  476. movsd 7 * SIZE(AO), %xmm7
  477. movaps %xmm2, %xmm5
  478. movaps %xmm3, %xmm4
  479. mulsd %xmm6, %xmm2
  480. mulsd %xmm6, %xmm3
  481. mulsd %xmm7, %xmm5
  482. mulsd %xmm7, %xmm4
  483. ADDSD4 %xmm4, %xmm2
  484. ADDSD3 %xmm5, %xmm3
  485. #endif
  486. #if defined(RN) || defined(RT)
  487. movsd 0 * SIZE(BO), %xmm8
  488. movaps %xmm0, %xmm5
  489. movsd 1 * SIZE(BO), %xmm9
  490. movaps %xmm1, %xmm4
  491. movaps %xmm2, %xmm7
  492. movaps %xmm3, %xmm6
  493. mulsd %xmm8, %xmm0
  494. mulsd %xmm8, %xmm1
  495. mulsd %xmm9, %xmm5
  496. mulsd %xmm9, %xmm4
  497. ADDSD4 %xmm4, %xmm0
  498. mulsd %xmm8, %xmm2
  499. ADDSD2 %xmm5, %xmm1
  500. mulsd %xmm8, %xmm3
  501. mulsd %xmm9, %xmm7
  502. mulsd %xmm9, %xmm6
  503. ADDSD4 %xmm6, %xmm2
  504. ADDSD2 %xmm7, %xmm3
  505. #endif
  506. #ifdef LN
  507. subq $4 * SIZE, CO1
  508. #endif
  509. movsd %xmm0, 0 * SIZE(CO1)
  510. movsd %xmm1, 1 * SIZE(CO1)
  511. movsd %xmm2, 2 * SIZE(CO1)
  512. movsd %xmm3, 3 * SIZE(CO1)
  513. #if defined(LN) || defined(LT)
  514. movsd %xmm0, 0 * SIZE(BO)
  515. movsd %xmm1, 1 * SIZE(BO)
  516. movsd %xmm2, 2 * SIZE(BO)
  517. movsd %xmm3, 3 * SIZE(BO)
  518. #else
  519. movsd %xmm0, 0 * SIZE(AO)
  520. movsd %xmm1, 1 * SIZE(AO)
  521. movsd %xmm2, 2 * SIZE(AO)
  522. movsd %xmm3, 3 * SIZE(AO)
  523. #endif
  524. #ifndef LN
  525. addq $4 * SIZE, CO1
  526. #endif
  527. #if defined(LT) || defined(RN)
  528. movq K, %rax
  529. subq KK, %rax
  530. leaq (,%rax, SIZE), %rax
  531. leaq (AO, %rax, 4), AO
  532. leaq (BO, %rax, 2), BO
  533. #endif
  534. #ifdef LN
  535. subq $2, KK
  536. #endif
  537. #ifdef LT
  538. addq $2, KK
  539. #endif
  540. #ifdef RT
  541. movq K, %rax
  542. salq $1 + ZBASE_SHIFT, %rax
  543. addq %rax, AORIG
  544. #endif
  545. decq I # i --
  546. jg .L10
  547. ALIGN_4
  548. .L20:
  549. testq $1, M
  550. jle .L99
  551. #ifdef LN
  552. movq K, %rax
  553. salq $0 + ZBASE_SHIFT, %rax
  554. subq %rax, AORIG
  555. #endif
  556. #if defined(LN) || defined(RT)
  557. movq KK, %rax
  558. leaq (, %rax, SIZE), %rax
  559. movq AORIG, AO
  560. leaq (AO, %rax, 2), AO
  561. leaq (B, %rax, 2), BO
  562. #else
  563. movq B, BO
  564. #endif
  565. movsd 0 * SIZE(AO), %xmm0
  566. xorps %xmm2, %xmm2
  567. movsd 1 * SIZE(AO), %xmm4
  568. xorps %xmm5, %xmm5
  569. movsd 2 * SIZE(AO), %xmm5
  570. xorps %xmm6, %xmm6
  571. movsd 3 * SIZE(AO), %xmm7
  572. movsd 0 * SIZE(BO), %xmm1
  573. xorps %xmm8, %xmm8
  574. xorps %xmm9, %xmm9
  575. movsd 1 * SIZE(BO), %xmm3
  576. xorps %xmm10, %xmm10
  577. xorps %xmm11, %xmm11
  578. #if defined(LT) || defined(RN)
  579. movq KK, %rax
  580. #else
  581. movq K, %rax
  582. subq KK, %rax
  583. #endif
  584. sarq $2, %rax
  585. je .L25
  586. ALIGN_4
  587. .L22:
  588. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  589. ADDSD2 %xmm2, %xmm9
  590. movaps %xmm0, %xmm2
  591. mulsd %xmm1, %xmm0
  592. ADDSD4 %xmm6, %xmm11
  593. movaps %xmm4, %xmm6
  594. mulsd %xmm1, %xmm4
  595. movsd 2 * SIZE(BO), %xmm1
  596. ADDSD1 %xmm0, %xmm8
  597. movsd 4 * SIZE(AO), %xmm0
  598. mulsd %xmm3, %xmm2
  599. ADDSD3 %xmm4, %xmm10
  600. movsd 5 * SIZE(AO), %xmm4
  601. mulsd %xmm3, %xmm6
  602. movsd 3 * SIZE(BO), %xmm3
  603. ADDSD2 %xmm2, %xmm9
  604. movaps %xmm5, %xmm2
  605. mulsd %xmm1, %xmm5
  606. ADDSD4 %xmm6, %xmm11
  607. movaps %xmm7, %xmm6
  608. mulsd %xmm1, %xmm7
  609. movsd 4 * SIZE(BO), %xmm1
  610. ADDSD1 %xmm5, %xmm8
  611. movsd 6 * SIZE(AO), %xmm5
  612. mulsd %xmm3, %xmm2
  613. ADDSD3 %xmm7, %xmm10
  614. movsd 7 * SIZE(AO), %xmm7
  615. mulsd %xmm3, %xmm6
  616. movsd 5 * SIZE(BO), %xmm3
  617. ADDSD2 %xmm2, %xmm9
  618. movaps %xmm0, %xmm2
  619. mulsd %xmm1, %xmm0
  620. ADDSD4 %xmm6, %xmm11
  621. movaps %xmm4, %xmm6
  622. mulsd %xmm1, %xmm4
  623. movsd 6 * SIZE(BO), %xmm1
  624. ADDSD1 %xmm0, %xmm8
  625. movsd 8 * SIZE(AO), %xmm0
  626. mulsd %xmm3, %xmm2
  627. ADDSD3 %xmm4, %xmm10
  628. movsd 9 * SIZE(AO), %xmm4
  629. mulsd %xmm3, %xmm6
  630. movsd 7 * SIZE(BO), %xmm3
  631. ADDSD2 %xmm2, %xmm9
  632. movaps %xmm5, %xmm2
  633. mulsd %xmm1, %xmm5
  634. ADDSD4 %xmm6, %xmm11
  635. movaps %xmm7, %xmm6
  636. mulsd %xmm1, %xmm7
  637. movsd 8 * SIZE(BO), %xmm1
  638. ADDSD1 %xmm5, %xmm8
  639. movsd 10 * SIZE(AO), %xmm5
  640. mulsd %xmm3, %xmm2
  641. ADDSD3 %xmm7, %xmm10
  642. movsd 11 * SIZE(AO), %xmm7
  643. mulsd %xmm3, %xmm6
  644. movsd 9 * SIZE(BO), %xmm3
  645. addq $8 * SIZE, AO
  646. addq $8 * SIZE, BO
  647. decq %rax
  648. jne .L22
  649. ALIGN_4
  650. .L25:
  651. #if defined(LT) || defined(RN)
  652. movq KK, %rax
  653. #else
  654. movq K, %rax
  655. subq KK, %rax
  656. #endif
  657. andq $3, %rax
  658. BRANCH
  659. BRANCH
  660. je .L29
  661. ALIGN_4
  662. .L26:
  663. ADDSD2 %xmm2, %xmm9
  664. movaps %xmm0, %xmm2
  665. mulsd %xmm1, %xmm0
  666. ADDSD4 %xmm6, %xmm11
  667. movaps %xmm4, %xmm6
  668. mulsd %xmm1, %xmm4
  669. movsd 2 * SIZE(BO), %xmm1
  670. mulsd %xmm3, %xmm2
  671. ADDSD1 %xmm0, %xmm8
  672. movsd 2 * SIZE(AO), %xmm0
  673. mulsd %xmm3, %xmm6
  674. movsd 3 * SIZE(BO), %xmm3
  675. ADDSD3 %xmm4, %xmm10
  676. movsd 3 * SIZE(AO), %xmm4
  677. addq $2 * SIZE, AO
  678. addq $2 * SIZE, BO
  679. decq %rax
  680. BRANCH
  681. jg .L26
  682. ALIGN_4
  683. .L29:
  684. ADDSD2 %xmm2, %xmm9
  685. ADDSD4 %xmm6, %xmm11
  686. addsd %xmm11, %xmm8
  687. addsd %xmm9, %xmm10
  688. #if defined(LN) || defined(RT)
  689. movq KK, %rax
  690. #ifdef LN
  691. subq $1, %rax
  692. #else
  693. subq $1, %rax
  694. #endif
  695. leaq (, %rax, SIZE), %rax
  696. movq AORIG, AO
  697. leaq (AO, %rax, 2), AO
  698. leaq (B, %rax, 2), BO
  699. #endif
  700. #if defined(LN) || defined(LT)
  701. movsd 0 * SIZE(BO), %xmm0
  702. movsd 1 * SIZE(BO), %xmm1
  703. #else
  704. movsd 0 * SIZE(AO), %xmm0
  705. movsd 1 * SIZE(AO), %xmm1
  706. #endif
  707. subsd %xmm8, %xmm0
  708. subsd %xmm10, %xmm1
  709. #if defined(LN) || defined(LT)
  710. movsd 0 * SIZE(AO), %xmm6
  711. movaps %xmm0, %xmm5
  712. movsd 1 * SIZE(AO), %xmm7
  713. movaps %xmm1, %xmm4
  714. mulsd %xmm6, %xmm0
  715. mulsd %xmm6, %xmm1
  716. mulsd %xmm7, %xmm5
  717. mulsd %xmm7, %xmm4
  718. ADDSD4 %xmm4, %xmm0
  719. ADDSD3 %xmm5, %xmm1
  720. #endif
  721. #if defined(RN) || defined(RT)
  722. movsd 0 * SIZE(BO), %xmm8
  723. movaps %xmm0, %xmm5
  724. movsd 1 * SIZE(BO), %xmm9
  725. movaps %xmm1, %xmm4
  726. mulsd %xmm8, %xmm0
  727. mulsd %xmm8, %xmm1
  728. mulsd %xmm9, %xmm5
  729. mulsd %xmm9, %xmm4
  730. ADDSD4 %xmm4, %xmm0
  731. ADDSD2 %xmm5, %xmm1
  732. #endif
  733. #ifdef LN
  734. subq $2 * SIZE, CO1
  735. #endif
  736. movsd %xmm0, 0 * SIZE(CO1)
  737. movsd %xmm1, 1 * SIZE(CO1)
  738. #if defined(LN) || defined(LT)
  739. movsd %xmm0, 0 * SIZE(BO)
  740. movsd %xmm1, 1 * SIZE(BO)
  741. #else
  742. movsd %xmm0, 0 * SIZE(AO)
  743. movsd %xmm1, 1 * SIZE(AO)
  744. #endif
  745. #ifndef LN
  746. addq $2 * SIZE, CO1
  747. #endif
  748. #if defined(LT) || defined(RN)
  749. movq K, %rax
  750. subq KK, %rax
  751. leaq (,%rax, SIZE), %rax
  752. leaq (AO, %rax, 2), AO
  753. leaq (BO, %rax, 2), BO
  754. #endif
  755. #ifdef LN
  756. subq $1, KK
  757. #endif
  758. #ifdef LT
  759. addq $1, KK
  760. #endif
  761. #ifdef RT
  762. movq K, %rax
  763. salq $0 + ZBASE_SHIFT, %rax
  764. addq %rax, AORIG
  765. #endif
  766. ALIGN_4
  767. .L99:
  768. #ifdef LN
  769. leaq (, K, SIZE), %rax
  770. leaq (B, %rax, 2), B
  771. #endif
  772. #if defined(LT) || defined(RN)
  773. movq BO, B
  774. #endif
  775. #ifdef RN
  776. addq $1, KK
  777. #endif
  778. #ifdef RT
  779. subq $1, KK
  780. #endif
  781. decq J # j --
  782. jg .L01
  783. ALIGN_4
  784. .L999:
  785. movq 0(%rsp), %rbx
  786. movq 8(%rsp), %rbp
  787. movq 16(%rsp), %r12
  788. movq 24(%rsp), %r13
  789. movq 32(%rsp), %r14
  790. movq 40(%rsp), %r15
  791. #ifdef WINDOWS_ABI
  792. movq 48(%rsp), %rdi
  793. movq 56(%rsp), %rsi
  794. movups 64(%rsp), %xmm6
  795. movups 80(%rsp), %xmm7
  796. movups 96(%rsp), %xmm8
  797. movups 112(%rsp), %xmm9
  798. movups 128(%rsp), %xmm10
  799. movups 144(%rsp), %xmm11
  800. movups 160(%rsp), %xmm12
  801. movups 176(%rsp), %xmm13
  802. movups 192(%rsp), %xmm14
  803. movups 208(%rsp), %xmm15
  804. #endif
  805. addq $STACKSIZE, %rsp
  806. ret
  807. EPILOGUE