You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

xgemm3m_kernel_2x2.S 15 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Assemble-time configuration for this kernel: argument aliases,
   * register roles, stack-frame layout and prefetch flavour.
   * ARG1..ARG6 and the other helper macros used later in the file are
   * not defined here (presumably they come from common.h).
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Incoming arguments: tile dimensions M, N, K and the A, B, C pointers. */
  40. #define M ARG1
  41. #define N ARG2
  42. #define K ARG3
  43. #define A ARG4
  44. #define B ARG5
  45. #define C ARG6
  /* LDC is loaded from the caller's stack in the prologue and then scaled. */
  46. #define LDC %r10
  /* I counts row pairs (starts at M >> 1), J counts column pairs (N >> 1). */
  47. #define I %r12
  48. #define J %r13
  /* Running pointers into A, B and C for the tile currently being computed. */
  49. #define AO %r14
  50. #define BO %r15
  51. #define CO %rbp
  /* KK is only touched inside TRMMKERNEL-conditional code. */
  52. #define KK %r11
  /* KKK lives in the local frame, just above the six saved registers (0..40). */
  53. #define KKK 48(%rsp)
  /* Bytes reserved on the stack by the prologue. */
  54. #define STACKSIZE 64
  /*
   * Operands read from the caller's side of the stack, i.e. beyond the
   * STACKSIZE bytes this routine allocates.  The 16-byte spacing between
   * ALPHA_R and ALPHA_I presumably reflects 16-byte extended-precision
   * slots -- TODO confirm against the caller.
   */
  55. #define ALPHA_R 8 + STACKSIZE(%rsp)
  56. #define ALPHA_I 24 + STACKSIZE(%rsp)
  57. #define OFFSET 48 + STACKSIZE(%rsp)
  /*
   * Prefetch instruction selection.  PREFETCHW is defined for both
   * variants but is not referenced anywhere in the visible code.
   */
  58. #ifdef OPTERON
  59. #define PREFETCH prefetch
  60. #define PREFETCHW prefetchw
  61. #else
  62. #define PREFETCH prefetcht0
  63. #define PREFETCHW prefetcht0
  64. #endif
  /* Prefetch distance along A, in elements (multiplied by SIZE at the use sites). */
  65. #define PREFETCHSIZE (5 + 4 * 10)
  /* Linux + Barcelona builds only: align what follows to a 32768-byte boundary. */
  66. #if defined(OS_LINUX) && defined(CORE_BARCELONA)
  67. .align 32768
  68. #endif
  /*
   * Function entry: allocate the local frame, save callee-saved
   * registers, fetch LDC from the stack, bias the A/B pointers and set
   * up the loop over pairs of columns.  PROLOGUE and PROFCODE are macros
   * defined outside this file.
   */
  69. PROLOGUE
  70. PROFCODE
  /* Windows ABI builds: empty the x87/MMX tag state before using the x87 stack. */
  71. #ifdef WINDOWS_ABI
  72. emms
  73. #endif
  /* Reserve STACKSIZE bytes; offsets 0..40 hold the saved registers, 48 is KKK. */
  74. subq $STACKSIZE, %rsp
  /* %rbx is saved and restored but not otherwise used in the visible code. */
  75. movq %rbx, 0(%rsp)
  76. movq %rbp, 8(%rsp)
  77. movq %r12, 16(%rsp)
  78. movq %r13, 24(%rsp)
  79. movq %r14, 32(%rsp)
  80. movq %r15, 40(%rsp)
  /* LDC arrives on the caller's stack rather than in a register. */
  81. movq 40 + STACKSIZE(%rsp), LDC
  /* TRMM, right-side variant: KK starts at -OFFSET. */
  82. #if defined(TRMMKERNEL) && !defined(LEFT)
  83. movq OFFSET, %rax
  84. negq %rax
  85. movq %rax, KK
  86. #endif
  /*
   * Bias A and B by 8 elements; the compute loops address their operands
   * with displacements starting at -8 * SIZE relative to AO/BO.
   */
  87. addq $8 * SIZE, A
  88. addq $8 * SIZE, B
  /* Scale LDC by 2^ZBASE_SHIFT (presumably elements -> bytes; ZBASE_SHIFT is defined elsewhere). */
  89. salq $ZBASE_SHIFT, LDC
  /* J = N >> 1 column pairs.  The je tests ZF from sarq (movq leaves flags alone). */
  90. movq N, %rax
  91. sarq $1, %rax
  92. movq %rax, J
  93. je .L30
  94. ALIGN_4
  /*
   * Top of the loop over column pairs (J iterations).  Resets the A
   * pointer, takes the current C column pair and then steps C on by two
   * columns for the next iteration.
   */
  95. .L01:
  /* TRMM, left-side variant: KK restarts from OFFSET for every column pair. */
  96. #if defined(TRMMKERNEL) && defined(LEFT)
  97. movq OFFSET, %rax
  98. movq %rax, KK
  99. #endif
  100. movq A, AO
  101. movq C, CO
  /* C += 2 * LDC */
  102. leaq (, LDC, 2), %rax
  103. addq %rax, C
  /* I = M >> 1 row pairs; go straight to the odd-row handling when there are none. */
  104. movq M, I
  105. sarq $1, I
  106. je .L20
  107. ALIGN_4
  /*
   * Set-up for one 2x2 tile (two rows of A against two columns of B):
   * choose the B start, zero four x87 accumulators, prefetch the C
   * destinations and compute the k trip count.
   */
  108. .L11:
  109. #if !defined(TRMMKERNEL) || \
  110. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  111. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  112. movq B, BO
  113. #else
  /* Remaining TRMM variants: start KK k-steps in (2 A elements and 2 B elements per step). */
  114. movq KK, %rax
  115. salq $BASE_SHIFT, %rax
  116. leaq (AO, %rax, 2), AO
  117. leaq (B, %rax, 2), BO
  118. #endif
  /* Push four zeros: st(0)..st(3) are the tile's four accumulators. */
  119. fldz
  120. fldz
  121. fldz
  122. fldz
  /* Prefetch the two C columns this tile will update. */
  123. #if defined(HAVE_3DNOW)
  124. prefetchw 2 * SIZE(CO)
  125. prefetchw 2 * SIZE(CO, LDC, 1)
  126. #elif defined(HAVE_SSE)
  127. prefetchnta 2 * SIZE(CO)
  128. prefetchnta 2 * SIZE(CO, LDC, 1)
  129. #endif
  /*
   * Trip count in %rax: K for the plain kernel; K - KK or KK + 2 for the
   * TRMM variants, which also park the count in KKK for the remainder loop.
   */
  130. #ifndef TRMMKERNEL
  131. movq K, %rax
  132. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  133. movq K, %rax
  134. subq KK, %rax
  135. movq %rax, KKK
  136. #else
  137. movq KK, %rax
  /* Both arms add 2: the tile is 2 wide in either dimension. */
  138. #ifdef LEFT
  139. addq $2, %rax
  140. #else
  141. addq $2, %rax
  142. #endif
  143. movq %rax, KKK
  144. #endif
  /* Number of 4-step unrolled passes; skip the unrolled loop if zero. */
  145. sarq $2, %rax
  146. je .L15
  147. ALIGN_4
  /*
   * Main k-loop of the 2x2 tile, unrolled four k-steps per pass.
   * At the start of every k-step the x87 stack holds exactly the four
   * accumulators st(0)..st(3), called c0..c3 here.  With a0,a1 the two A
   * values and b0,b1 the two B values of a step, each step performs
   *     c0 += a0*b0    c1 += a1*b0    c2 += a0*b1    c3 += a1*b1
   * and returns the stack to depth four.  FLD is a macro defined outside
   * this file; the st(n) indices assume it pushes one value.
   */
  148. .L12:
  149. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: operands at -8 and -7. */
  150. FLD -8 * SIZE(AO)
  151. FLD -8 * SIZE(BO)
  /* Work on a copy of a0 so the original is still there for a0*b1. */
  152. fld %st(1)
  153. fmul %st(1), %st
  /* c0 += a0*b0 */
  154. faddp %st, %st(3)
  155. FLD -7 * SIZE(BO)
  /* The a0 slot becomes a0*b1. */
  156. fmul %st, %st(2)
  157. FLD -7 * SIZE(AO)
  /* The b0 slot becomes a1*b0, then the b1 slot becomes a1*b1. */
  158. fmul %st, %st(2)
  159. fmulp %st, %st(1)
  /* Fold the three remaining products into c3, c1 and c2, in that order. */
  160. faddp %st, %st(6)
  161. faddp %st, %st(3)
  162. faddp %st, %st(3)
  /* k-step 1: operands at -6 and -5, same instruction pattern. */
  163. FLD -6 * SIZE(AO)
  164. FLD -6 * SIZE(BO)
  165. fld %st(1)
  166. fmul %st(1), %st
  167. faddp %st, %st(3)
  168. FLD -5 * SIZE(BO)
  169. fmul %st, %st(2)
  170. FLD -5 * SIZE(AO)
  171. fmul %st, %st(2)
  172. fmulp %st, %st(1)
  173. faddp %st, %st(6)
  174. faddp %st, %st(3)
  175. faddp %st, %st(3)
  176. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  /* k-step 2: operands at -4 and -3. */
  177. FLD -4 * SIZE(AO)
  178. FLD -4 * SIZE(BO)
  179. fld %st(1)
  180. fmul %st(1), %st
  181. faddp %st, %st(3)
  182. FLD -3 * SIZE(BO)
  183. fmul %st, %st(2)
  184. FLD -3 * SIZE(AO)
  185. fmul %st, %st(2)
  186. fmulp %st, %st(1)
  187. faddp %st, %st(6)
  188. faddp %st, %st(3)
  189. faddp %st, %st(3)
  /* k-step 3: operands at -2 and -1. */
  190. FLD -2 * SIZE(AO)
  191. FLD -2 * SIZE(BO)
  192. fld %st(1)
  193. fmul %st(1), %st
  194. faddp %st, %st(3)
  195. FLD -1 * SIZE(BO)
  196. fmul %st, %st(2)
  197. FLD -1 * SIZE(AO)
  198. fmul %st, %st(2)
  199. fmulp %st, %st(1)
  200. faddp %st, %st(6)
  201. faddp %st, %st(3)
  202. faddp %st, %st(3)
  /* Four k-steps consumed: 2 elements each from A and from B per step. */
  203. addq $8 * SIZE,AO
  204. addq $8 * SIZE,BO
  205. decq %rax
  206. jne .L12
  207. ALIGN_4
  /*
   * Remainder k-loop of the 2x2 tile: handles the (count & 3) k-steps left
   * after the 4x unrolled loop, one step per iteration.  The count is K for
   * the plain kernel and the saved KKK value for TRMM builds.
   */
  208. .L15:
  209. #ifndef TRMMKERNEL
  210. movq K, %rax
  211. #else
  212. movq KKK, %rax
  213. #endif
  214. and $3, %rax
  215. je .L18
  216. ALIGN_4
  /*
   * One k-step, same pattern as the unrolled loop: with accumulators
   * c0..c3 in st(0)..st(3) it performs
   *     c0 += a0*b0    c1 += a1*b0    c2 += a0*b1    c3 += a1*b1
   */
  217. .L16:
  218. FLD -8 * SIZE(AO)
  219. FLD -8 * SIZE(BO)
  220. fld %st(1)
  221. fmul %st(1), %st
  222. faddp %st, %st(3)
  223. FLD -7 * SIZE(BO)
  224. fmul %st, %st(2)
  225. FLD -7 * SIZE(AO)
  226. fmul %st, %st(2)
  227. fmulp %st, %st(1)
  228. faddp %st, %st(6)
  229. faddp %st, %st(3)
  230. faddp %st, %st(3)
  /* One k-step consumed: 2 elements of A and 2 of B. */
  231. addq $2 * SIZE,AO
  232. addq $2 * SIZE,BO
  233. decq %rax
  234. jne .L16
  235. ALIGN_4
  /*
   * Write-back for the 2x2 tile.  On entry st(0)..st(3) hold the
   * accumulators c0..c3.  In the plain (non-TRMM) build each accumulator
   * updates two adjacent C elements:
   *     C[even offset] += ALPHA_R * c      C[odd offset] += ALPHA_I * c
   * with c0 -> CO[0,1], c1 -> CO[2,3], c2 -> (CO+LDC)[0,1],
   * c3 -> (CO+LDC)[2,3].  FLD/FST are macros defined outside this file;
   * the st(n) indices only line up if FLD pushes and FST stores and pops.
   */
  236. .L18:
  237. #ifndef TRMMKERNEL
  /* Stack becomes: alpha_r, alpha_i, c0, c1, c2, c3. */
  238. FLD ALPHA_I
  239. FLD ALPHA_R
  /* CO[0] += alpha_r * c0 */
  240. fld %st(2)
  241. fmul %st(1), %st
  242. FLD 0 * SIZE(CO)
  243. faddp %st, %st(1)
  244. FST 0 * SIZE(CO)
  /* CO[2] += alpha_r * c1 */
  245. fld %st(3)
  246. fmul %st(1), %st
  247. FLD 2 * SIZE(CO)
  248. faddp %st, %st(1)
  249. FST 2 * SIZE(CO)
  /* (CO+LDC)[0] += alpha_r * c2 */
  250. fld %st(4)
  251. fmul %st(1), %st
  252. FLD 0 * SIZE(CO, LDC)
  253. faddp %st, %st(1)
  254. FST 0 * SIZE(CO, LDC)
  /* Last use of alpha_r: multiply it in place by c3, so the store below also removes it. */
  255. fmul %st(5), %st
  256. FLD 2 * SIZE(CO, LDC)
  257. faddp %st, %st(1)
  258. FST 2 * SIZE(CO, LDC)
  /* Scale all four accumulators by alpha_i in place and drop alpha_i. */
  259. fmul %st, %st(1)
  260. fmul %st, %st(2)
  261. fmul %st, %st(3)
  262. fmulp %st, %st(4)
  /* Add the scaled accumulators to the odd-offset C elements; the stack ends empty. */
  263. FLD 1 * SIZE(CO)
  264. faddp %st, %st(1)
  265. FST 1 * SIZE(CO)
  266. FLD 3 * SIZE(CO)
  267. faddp %st, %st(1)
  268. FST 3 * SIZE(CO)
  269. FLD 1 * SIZE(CO, LDC)
  270. faddp %st, %st(1)
  271. FST 1 * SIZE(CO, LDC)
  272. FLD 3 * SIZE(CO, LDC)
  273. faddp %st, %st(1)
  274. FST 3 * SIZE(CO, LDC)
  275. #else
  /*
   * TRMM build: the accumulators are stored unscaled, without reading C.
   * NOTE(review): this branch writes offsets 0 and 1 of each column while
   * CO below still advances by 4 * SIZE -- confirm this path is ever built.
   */
  276. FST 0 * SIZE(CO)
  277. FST 1 * SIZE(CO)
  278. FST 0 * SIZE(CO, LDC)
  279. FST 1 * SIZE(CO, LDC)
  280. #endif
  /* These TRMM variants step AO/BO over the K - KKK k-steps that were not computed. */
  281. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  282. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  283. movq K, %rax
  284. subq KKK, %rax
  285. salq $BASE_SHIFT, %rax
  286. leaq (AO, %rax, 2), AO
  287. leaq (BO, %rax, 2), BO
  288. #endif
  289. #if defined(TRMMKERNEL) && defined(LEFT)
  290. addq $2, KK
  291. #endif
  /* Move to the next row pair of C and loop over the remaining I tiles. */
  292. addq $4 * SIZE, CO
  293. decq I
  294. jne .L11
  295. ALIGN_4
  /*
   * Odd-row handling inside a column pair: if M is odd, compute one 1x2
   * tile (one row of A against two columns of B); otherwise go to the
   * end of the column-pair iteration.
   */
  296. .L20:
  297. movq M, %rax
  298. andq $1, %rax
  299. je .L29
  300. ALIGN_4
  /* 1x2 tile set-up: choose the B start, zero two accumulators, compute the k trip count. */
  301. .L21:
  302. #if !defined(TRMMKERNEL) || \
  303. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  304. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  305. movq B, BO
  306. #else
  /* Remaining TRMM variants: start KK k-steps in (1 A element and 2 B elements per step). */
  307. movq KK, %rax
  308. salq $BASE_SHIFT, %rax
  309. leaq (AO, %rax, 1), AO
  310. leaq ( B, %rax, 2), BO
  311. #endif
  /* st(0), st(1): the two accumulators of this tile. */
  312. fldz
  313. fldz
  /* Trip count in %rax: K, or K - KK / KK + tile extent for TRMM (also saved in KKK). */
  314. #ifndef TRMMKERNEL
  315. movq K, %rax
  316. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  317. movq K, %rax
  318. subq KK, %rax
  319. movq %rax, KKK
  320. #else
  321. movq KK, %rax
  /* The tile is 1 row by 2 columns, hence +1 for LEFT and +2 otherwise. */
  322. #ifdef LEFT
  323. addq $1, %rax
  324. #else
  325. addq $2, %rax
  326. #endif
  327. movq %rax, KKK
  328. #endif
  /* Number of 4-step unrolled passes; skip the unrolled loop if zero. */
  329. sarq $2, %rax
  330. je .L25
  331. ALIGN_4
  /*
   * Main k-loop of the 1x2 tile, unrolled four k-steps per pass.
   * st(0), st(1) hold accumulators c0, c1 at the start of each k-step.
   * With a the A value and b0,b1 the two B values of a step:
   *     c0 += a*b0    c1 += a*b1
   * FLD is a macro defined outside this file; the st(n) indices assume
   * it pushes one value.
   */
  332. .L22:
  333. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: a at -8, b0/b1 at -8/-7. */
  334. FLD -8 * SIZE(AO)
  335. FLD -8 * SIZE(BO)
  336. fmul %st(1), %st
  /* c0 += a*b0 */
  337. faddp %st, %st(2)
  338. FLD -7 * SIZE(BO)
  /* Overwrite a with a*b1, then c1 += a*b1. */
  339. fmulp %st, %st(1)
  340. faddp %st, %st(2)
  /* k-step 1: a at -7, b0/b1 at -6/-5. */
  341. FLD -7 * SIZE(AO)
  342. FLD -6 * SIZE(BO)
  343. fmul %st(1), %st
  344. faddp %st, %st(2)
  345. FLD -5 * SIZE(BO)
  346. fmulp %st, %st(1)
  347. faddp %st, %st(2)
  /* k-step 2: a at -6, b0/b1 at -4/-3. */
  348. FLD -6 * SIZE(AO)
  349. FLD -4 * SIZE(BO)
  350. fmul %st(1), %st
  351. faddp %st, %st(2)
  352. FLD -3 * SIZE(BO)
  353. fmulp %st, %st(1)
  354. faddp %st, %st(2)
  /* k-step 3: a at -5, b0/b1 at -2/-1. */
  355. FLD -5 * SIZE(AO)
  356. FLD -2 * SIZE(BO)
  357. fmul %st(1), %st
  358. faddp %st, %st(2)
  359. FLD -1 * SIZE(BO)
  360. fmulp %st, %st(1)
  361. faddp %st, %st(2)
  /* Four k-steps consumed: 1 element of A and 2 of B per step. */
  362. addq $4 * SIZE,AO
  363. addq $8 * SIZE,BO
  364. decq %rax
  365. jne .L22
  366. ALIGN_4
  /*
   * Remainder k-loop of the 1x2 tile: the (count & 3) k-steps left after
   * the unrolled loop, one per iteration.  The count is K for the plain
   * kernel and the saved KKK value for TRMM builds.
   */
  367. .L25:
  368. #ifndef TRMMKERNEL
  369. movq K, %rax
  370. #else
  371. movq KKK, %rax
  372. #endif
  373. and $3, %rax
  374. je .L28
  375. ALIGN_4
  /* One k-step with accumulators c0, c1 in st(0), st(1): c0 += a*b0, c1 += a*b1. */
  376. .L26:
  377. FLD -8 * SIZE(AO)
  378. FLD -8 * SIZE(BO)
  379. fmul %st(1), %st
  380. faddp %st, %st(2)
  381. FLD -7 * SIZE(BO)
  382. fmulp %st, %st(1)
  383. faddp %st, %st(2)
  /* One k-step consumed: 1 element of A and 2 of B. */
  384. addq $1 * SIZE,AO
  385. addq $2 * SIZE,BO
  386. decq %rax
  387. jne .L26
  388. ALIGN_4
  /*
   * Write-back for the 1x2 tile.  On entry st(0), st(1) hold c0, c1.
   * Plain build:
   *     CO[0]       += ALPHA_R * c0      CO[1]       += ALPHA_I * c0
   *     (CO+LDC)[0] += ALPHA_R * c1      (CO+LDC)[1] += ALPHA_I * c1
   * FLD/FST are macros defined outside this file; the st(n) indices only
   * line up if FLD pushes and FST stores and pops.
   */
  389. .L28:
  390. #ifndef TRMMKERNEL
  /* Stack becomes: alpha_r, alpha_i, c0, c1. */
  391. FLD ALPHA_I
  392. FLD ALPHA_R
  393. fld %st(2)
  394. fmul %st(1), %st
  395. FLD 0 * SIZE(CO)
  396. faddp %st, %st(1)
  397. FST 0 * SIZE(CO)
  /* Last use of alpha_r: multiply it in place by c1, so the store below also removes it. */
  398. fmul %st(3), %st
  399. FLD 0 * SIZE(CO, LDC)
  400. faddp %st, %st(1)
  401. FST 0 * SIZE(CO, LDC)
  /* Scale both accumulators by alpha_i in place and drop alpha_i. */
  402. fmul %st, %st(1)
  403. fmulp %st, %st(2)
  404. FLD 1 * SIZE(CO)
  405. faddp %st, %st(1)
  406. FST 1 * SIZE(CO)
  407. FLD 1 * SIZE(CO, LDC)
  408. faddp %st, %st(1)
  409. FST 1 * SIZE(CO, LDC)
  410. #else
  /* TRMM build: store the two accumulators unscaled, without reading C. */
  411. FST 0 * SIZE(CO)
  412. FST 0 * SIZE(CO, LDC)
  413. #endif
  /* These TRMM variants step AO/BO over the K - KKK k-steps that were not computed. */
  414. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  415. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  416. movq K, %rax
  417. subq KKK, %rax
  418. salq $BASE_SHIFT, %rax
  419. leaq (AO, %rax, 1), AO
  420. leaq (BO, %rax, 2), BO
  421. #endif
  422. #if defined(TRMMKERNEL) && defined(LEFT)
  423. addq $1, KK
  424. #endif
  /*
   * NOTE(review): CO advances by 1 * SIZE although the plain path wrote
   * offsets 0 and 1.  CO is not read again before it is reloaded from C,
   * so the value has no visible effect.
   */
  425. addq $1 * SIZE, CO
  426. ALIGN_4
  /*
   * End of one column-pair iteration: carry B forward to where BO
   * stopped and loop while column pairs remain.
   * NOTE(review): if M == 0 neither tile path ran, so BO was never
   * assigned by this routine before being copied into B -- confirm that
   * callers never pass M == 0.
   */
  427. .L29:
  428. #if defined(TRMMKERNEL) && !defined(LEFT)
  429. addq $2, KK
  430. #endif
  431. movq BO, B
  432. decq J
  433. jne .L01
  434. ALIGN_4
  /*
   * Odd-N handling: if N has its low bit set, process one final single
   * column; otherwise jump to the epilogue.
   */
  435. .L30:
  436. movq N, %rax
  437. testq $1, %rax
  438. je .L999
  /* TRMM, left-side variant: KK restarts from OFFSET for this column. */
  439. #if defined(TRMMKERNEL) && defined(LEFT)
  440. movq OFFSET, %rax
  441. movq %rax, KK
  442. #endif
  443. movq A, AO
  444. movq C, CO
  /* Single column, so C steps by one LDC. */
  445. addq LDC, C
  /* I = M >> 1 row pairs; go to the odd-row handling when there are none. */
  446. movq M, I
  447. sarq $1, I
  448. je .L40
  449. ALIGN_4
  /*
   * Set-up for one 2x1 tile (two rows of A against the single remaining
   * column of B): choose the B start, zero two accumulators, prefetch C
   * and compute the k trip count.
   */
  450. .L31:
  451. #if !defined(TRMMKERNEL) || \
  452. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  453. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  454. movq B, BO
  455. #else
  /* Remaining TRMM variants: start KK k-steps in (2 A elements and 1 B element per step). */
  456. movq KK, %rax
  457. salq $BASE_SHIFT, %rax
  458. leaq (AO, %rax, 2), AO
  459. leaq ( B, %rax, 1), BO
  460. #endif
  /* st(0), st(1): the two accumulators of this tile. */
  461. fldz
  462. fldz
  463. #if defined(HAVE_3DNOW)
  464. prefetchw 2 * SIZE(CO)
  465. #elif defined(HAVE_SSE)
  466. prefetchnta 2 * SIZE(CO)
  467. #endif
  /* Trip count in %rax: K, or K - KK / KK + tile extent for TRMM (also saved in KKK). */
  468. #ifndef TRMMKERNEL
  469. movq K, %rax
  470. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  471. movq K, %rax
  472. subq KK, %rax
  473. movq %rax, KKK
  474. #else
  475. movq KK, %rax
  /* The tile is 2 rows by 1 column, hence +2 for LEFT and +1 otherwise. */
  476. #ifdef LEFT
  477. addq $2, %rax
  478. #else
  479. addq $1, %rax
  480. #endif
  481. movq %rax, KKK
  482. #endif
  /* Number of 4-step unrolled passes; skip the unrolled loop if zero. */
  483. sarq $2, %rax
  484. je .L35
  485. ALIGN_4
  /*
   * Main k-loop of the 2x1 tile, unrolled four k-steps per pass.
   * st(0), st(1) hold accumulators c0, c1 at the start of each k-step.
   * With b the B value and a0,a1 the two A values of a step:
   *     c0 += a0*b    c1 += a1*b
   * FLD is a macro defined outside this file; the st(n) indices assume
   * it pushes one value.
   */
  486. .L32:
  487. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: b at -8, a0/a1 at -8/-7. */
  488. FLD -8 * SIZE(BO)
  489. FLD -8 * SIZE(AO)
  490. fmul %st(1), %st
  /* c0 += a0*b */
  491. faddp %st, %st(2)
  492. FLD -7 * SIZE(AO)
  /* Overwrite b with a1*b, then c1 += a1*b. */
  493. fmulp %st, %st(1)
  494. faddp %st, %st(2)
  /* k-step 1: b at -7, a0/a1 at -6/-5. */
  495. FLD -7 * SIZE(BO)
  496. FLD -6 * SIZE(AO)
  497. fmul %st(1), %st
  498. faddp %st, %st(2)
  499. FLD -5 * SIZE(AO)
  500. fmulp %st, %st(1)
  501. faddp %st, %st(2)
  /* k-step 2: b at -6, a0/a1 at -4/-3. */
  502. FLD -6 * SIZE(BO)
  503. FLD -4 * SIZE(AO)
  504. fmul %st(1), %st
  505. faddp %st, %st(2)
  506. FLD -3 * SIZE(AO)
  507. fmulp %st, %st(1)
  508. faddp %st, %st(2)
  /* k-step 3: b at -5, a0/a1 at -2/-1. */
  509. FLD -5 * SIZE(BO)
  510. FLD -2 * SIZE(AO)
  511. fmul %st(1), %st
  512. faddp %st, %st(2)
  513. FLD -1 * SIZE(AO)
  514. fmulp %st, %st(1)
  515. faddp %st, %st(2)
  /* Four k-steps consumed: 2 elements of A and 1 of B per step. */
  516. addq $8 * SIZE,AO
  517. addq $4 * SIZE,BO
  518. decq %rax
  519. jne .L32
  520. ALIGN_4
  /*
   * Remainder k-loop of the 2x1 tile: the (count & 3) k-steps left after
   * the unrolled loop, one per iteration.  The count is K for the plain
   * kernel and the saved KKK value for TRMM builds.
   */
  521. .L35:
  522. #ifndef TRMMKERNEL
  523. movq K, %rax
  524. #else
  525. movq KKK, %rax
  526. #endif
  527. and $3, %rax
  528. je .L38
  529. ALIGN_4
  /* One k-step with accumulators c0, c1 in st(0), st(1): c0 += a0*b, c1 += a1*b. */
  530. .L36:
  531. FLD -8 * SIZE(BO)
  532. FLD -8 * SIZE(AO)
  533. fmul %st(1), %st
  534. faddp %st, %st(2)
  535. FLD -7 * SIZE(AO)
  536. fmulp %st, %st(1)
  537. faddp %st, %st(2)
  /* One k-step consumed: 2 elements of A and 1 of B. */
  538. addq $2 * SIZE,AO
  539. addq $1 * SIZE,BO
  540. decq %rax
  541. jne .L36
  542. ALIGN_4
  /*
   * Write-back for the 2x1 tile.  On entry st(0), st(1) hold c0, c1.
   * Plain build:
   *     CO[0] += ALPHA_R * c0      CO[1] += ALPHA_I * c0
   *     CO[2] += ALPHA_R * c1      CO[3] += ALPHA_I * c1
   * FLD/FST are macros defined outside this file; the st(n) indices only
   * line up if FLD pushes and FST stores and pops.
   */
  543. .L38:
  544. #ifndef TRMMKERNEL
  /* Stack becomes: alpha_r, alpha_i, c0, c1. */
  545. FLD ALPHA_I
  546. FLD ALPHA_R
  547. fld %st(2)
  548. fmul %st(1), %st
  549. FLD 0 * SIZE(CO)
  550. faddp %st, %st(1)
  551. FST 0 * SIZE(CO)
  /* Last use of alpha_r: multiply it in place by c1, so the store below also removes it. */
  552. fmul %st(3), %st
  553. FLD 2 * SIZE(CO)
  554. faddp %st, %st(1)
  555. FST 2 * SIZE(CO)
  /* Scale both accumulators by alpha_i in place and drop alpha_i. */
  556. fmul %st, %st(1)
  557. fmulp %st, %st(2)
  558. FLD 1 * SIZE(CO)
  559. faddp %st, %st(1)
  560. FST 1 * SIZE(CO)
  561. FLD 3 * SIZE(CO)
  562. faddp %st, %st(1)
  563. FST 3 * SIZE(CO)
  564. #else
  /* TRMM build: store the two accumulators unscaled, without reading C. */
  565. FST 0 * SIZE(CO)
  566. FST 1 * SIZE(CO)
  567. #endif
  /* These TRMM variants step AO/BO over the K - KKK k-steps that were not computed. */
  568. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  569. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  570. movq K, %rax
  571. subq KKK, %rax
  572. salq $BASE_SHIFT, %rax
  573. leaq (AO, %rax, 2), AO
  574. leaq (BO, %rax, 1), BO
  575. #endif
  576. #if defined(TRMMKERNEL) && defined(LEFT)
  577. addq $2, KK
  578. #endif
  /* Move to the next row pair of C and loop over the remaining I tiles. */
  579. addq $4 * SIZE, CO
  580. decq I
  581. jne .L31
  582. ALIGN_4
  /*
   * Odd-row handling for the single remaining column: if M is odd,
   * compute one 1x1 tile; otherwise skip to the end of the column.
   */
  583. .L40:
  584. movq M, %rax
  585. andq $1, %rax
  586. je .L49
  587. ALIGN_4
  /* 1x1 tile set-up: choose the B start, zero one accumulator, compute the k trip count. */
  588. .L41:
  589. #if !defined(TRMMKERNEL) || \
  590. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  591. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  592. movq B, BO
  593. #else
  /* Remaining TRMM variants: start KK k-steps in (1 A element and 1 B element per step). */
  594. movq KK, %rax
  595. salq $BASE_SHIFT, %rax
  596. leaq (AO, %rax, 1), AO
  597. leaq ( B, %rax, 1), BO
  598. #endif
  /* st(0): the single accumulator of this tile. */
  599. fldz
  /* Trip count in %rax: K, or K - KK / KK + 1 for TRMM (also saved in KKK). */
  600. #ifndef TRMMKERNEL
  601. movq K, %rax
  602. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  603. movq K, %rax
  604. subq KK, %rax
  605. movq %rax, KKK
  606. #else
  607. movq KK, %rax
  /* Both arms add 1: the tile is 1 wide in either dimension. */
  608. #ifdef LEFT
  609. addq $1, %rax
  610. #else
  611. addq $1, %rax
  612. #endif
  613. movq %rax, KKK
  614. #endif
  /* Number of 4-step unrolled passes; skip the unrolled loop if zero. */
  615. sarq $2, %rax
  616. je .L45
  617. ALIGN_4
  /*
   * Main k-loop of the 1x1 tile, unrolled four k-steps per pass.
   * st(0) is the single accumulator c; each k-step loads one A value and
   * one B value, multiplies them and performs c += a*b.  FLD is a macro
   * defined outside this file; the st(n) indices assume it pushes one value.
   */
  618. .L42:
  619. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0 */
  620. FLD -8 * SIZE(AO)
  621. FLD -8 * SIZE(BO)
  622. fmulp %st, %st(1)
  623. faddp %st, %st(1)
  /* k-step 1 */
  624. FLD -7 * SIZE(AO)
  625. FLD -7 * SIZE(BO)
  626. fmulp %st, %st(1)
  627. faddp %st, %st(1)
  /* k-step 2 */
  628. FLD -6 * SIZE(AO)
  629. FLD -6 * SIZE(BO)
  630. fmulp %st, %st(1)
  631. faddp %st, %st(1)
  /* k-step 3 */
  632. FLD -5 * SIZE(AO)
  633. FLD -5 * SIZE(BO)
  634. fmulp %st, %st(1)
  635. faddp %st, %st(1)
  /* Four k-steps consumed: 1 element each of A and B per step. */
  636. addq $4 * SIZE,AO
  637. addq $4 * SIZE,BO
  638. decq %rax
  639. jne .L42
  640. ALIGN_4
  /*
   * Remainder k-loop of the 1x1 tile: the (count & 3) k-steps left after
   * the unrolled loop, one per iteration.  The count is K for the plain
   * kernel and the saved KKK value for TRMM builds.
   */
  641. .L45:
  642. #ifndef TRMMKERNEL
  643. movq K, %rax
  644. #else
  645. movq KKK, %rax
  646. #endif
  647. and $3, %rax
  648. je .L48
  649. ALIGN_4
  /* One k-step with the accumulator c in st(0): c += a*b. */
  650. .L46:
  651. FLD -8 * SIZE(AO)
  652. FLD -8 * SIZE(BO)
  653. fmulp %st, %st(1)
  654. faddp %st, %st(1)
  655. addq $1 * SIZE,AO
  656. addq $1 * SIZE,BO
  657. decq %rax
  658. jne .L46
  659. ALIGN_4
  /*
   * Write-back for the 1x1 tile.  On entry st(0) holds the accumulator c.
   * Plain build:  CO[0] += ALPHA_R * c,  CO[1] += ALPHA_I * c.
   * FLD/FST are macros defined outside this file; the st(n) indices only
   * line up if FLD pushes and FST stores and pops.
   */
  660. .L48:
  661. #ifndef TRMMKERNEL
  /* Stack becomes: alpha_r, alpha_i, c. */
  662. FLD ALPHA_I
  663. FLD ALPHA_R
  /* alpha_r is multiplied in place by c, so the store below also removes it. */
  664. fmul %st(2), %st
  665. FLD 0 * SIZE(CO)
  666. faddp %st, %st(1)
  667. FST 0 * SIZE(CO)
  /* c *= alpha_i, dropping alpha_i. */
  668. fmulp %st, %st(1)
  669. FLD 1 * SIZE(CO)
  670. faddp %st, %st(1)
  671. FST 1 * SIZE(CO)
  672. #else
  /* TRMM build: store the accumulator unscaled, without reading C. */
  673. FST 0 * SIZE(CO)
  674. #endif
  /* These TRMM variants step AO/BO over the K - KKK k-steps that were not computed. */
  675. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  676. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  677. movq K, %rax
  678. subq KKK, %rax
  679. salq $BASE_SHIFT, %rax
  680. leaq (AO, %rax, 1), AO
  681. leaq (BO, %rax, 1), BO
  682. #endif
  683. #if defined(TRMMKERNEL) && defined(LEFT)
  684. addq $1, KK
  685. #endif
  /*
   * NOTE(review): CO advances by 1 * SIZE although the plain path wrote
   * offsets 0 and 1.  CO is not read again before the routine returns,
   * so the value has no visible effect.
   */
  686. addq $1 * SIZE, CO
  687. ALIGN_4
  /*
   * End of the single odd column: update the TRMM offset (right-side
   * variant only) and carry B forward to where BO stopped.  Nothing
   * after this point reads KK or B; execution falls into the epilogue.
   */
  688. .L49:
  689. #if defined(TRMMKERNEL) && !defined(LEFT)
  690. addq $1, KK
  691. #endif
  692. movq BO, B
  693. ALIGN_4
  /*
   * Common exit: restore the six registers saved by the prologue from
   * offsets 0..40 of the local frame, release the frame and return.
   * EMMS and EPILOGUE are macros defined outside this file.
   */
  694. .L999:
  695. EMMS
  696. movq 0(%rsp), %rbx
  697. movq 8(%rsp), %rbp
  698. movq 16(%rsp), %r12
  699. movq 24(%rsp), %r13
  700. movq 32(%rsp), %r14
  701. movq 40(%rsp), %r15
  702. addq $STACKSIZE, %rsp
  703. ret
  704. EPILOGUE