You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qgemm_kernel_2x2.S 14 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Kernel arguments. ARG1..ARG6 are not defined in this file (they come */
  /* from common.h). M and N are each processed as pairs plus an odd */
  /* remainder; K is the trip count of the innermost loops. */
  40. #define M ARG1
  41. #define N ARG2
  42. #define K ARG3
  /* A and B are the two input panels; the prologue biases both by 8 * SIZE. */
  43. #define A ARG4
  44. #define B ARG5
  /* C is the output pointer; it advances by 2 * LDC or LDC between passes. */
  45. #define C ARG6
  /* LDC is loaded from the stack and shifted by BASE_SHIFT in the prologue. */
  46. #define LDC %r10
  /* I counts row pairs (M >> 1); J counts pairs taken from N (N >> 1). */
  47. #define I %r12
  48. #define J %r13
  /* AO / BO / CO are the running pointers into A, B and C for the current tile. */
  49. #define AO %r14
  50. #define BO %r15
  51. #define CO %rbp
  /* KK and KKK are only touched when TRMMKERNEL is defined. */
  52. #define KK %r11
  /* KKK is a local stack slot, just above the six saved registers (0..40). */
  53. #define KKK 48(%rsp)
  /* Bytes reserved on the stack by the prologue. */
  54. #define STACKSIZE 64
  /* Stack-passed arguments, addressed relative to %rsp after STACKSIZE has */
  /* been subtracted from it. */
  55. #define ALPHA 8 + STACKSIZE(%rsp)
  56. #define OFFSET 32 + STACKSIZE(%rsp)
  /* Prefetch mnemonics: OPTERON builds use prefetch / prefetchw, all other */
  /* builds use prefetcht0 for both. */
  57. #ifdef OPTERON
  58. #define PREFETCH prefetch
  59. #define PREFETCHW prefetchw
  60. #else
  61. #define PREFETCH prefetcht0
  62. #define PREFETCHW prefetcht0
  63. #endif
  /* Prefetch distance, in elements past AO (multiplied by SIZE where used). */
  64. #define PREFETCHSIZE (5 + 4 * 10)
  /* Kernel entry. PROLOGUE and PROFCODE are macros defined outside this file. */
  65. PROLOGUE
  66. PROFCODE
  /* Windows builds only: emms marks every x87 register as empty before the */
  /* kernel starts pushing accumulators onto the FPU stack. */
  67. #ifdef WINDOWS_ABI
  68. emms
  69. #endif
  /* Reserve the local frame and save the callee-saved registers in slots */
  /* 0..40; slot 48 is used as KKK. The same slots are reloaded at .L999. */
  70. subq $STACKSIZE, %rsp
  71. movq %rbx, 0(%rsp)
  72. movq %rbp, 8(%rsp)
  73. movq %r12, 16(%rsp)
  74. movq %r13, 24(%rsp)
  75. movq %r14, 32(%rsp)
  76. movq %r15, 40(%rsp)
  /* LDC is a stack argument, 24 bytes above the %rsp value seen on entry. */
  77. movq 24 + STACKSIZE(%rsp), LDC
  /* TRMM without LEFT: KK starts at -OFFSET. */
  78. #if defined(TRMMKERNEL) && !defined(LEFT)
  79. movq OFFSET, %rax
  80. negq %rax
  81. movq %rax, KK
  82. #endif
  /* Bias A and B by 8 elements so the loops below can address the current */
  /* data with displacements -8 * SIZE .. -1 * SIZE. */
  83. addq $8 * SIZE, A
  84. addq $8 * SIZE, B
  /* Scale LDC by BASE_SHIFT (presumably element count -> byte stride; */
  /* BASE_SHIFT is defined in common.h - confirm). */
  85. salq $BASE_SHIFT, LDC
  /* J = N >> 1. movq does not modify flags, so the je tests the result of */
  /* sarq: with no pairs to process, go straight to the odd-N pass. */
  86. movq N, %rax
  87. sarq $1, %rax
  88. movq %rax, J
  89. je .L30
  90. ALIGN_4
  /* .L01: head of the J loop. Each pass handles a strip of C that spans */
  /* CO and CO + LDC, and is repeated J = N >> 1 times. */
  91. .L01:
  /* TRMM with LEFT: KK restarts from OFFSET for every strip. */
  92. #if defined(TRMMKERNEL) && defined(LEFT)
  93. movq OFFSET, %rax
  94. movq %rax, KK
  95. #endif
  /* Rewind AO to the start of A; CO takes the current strip and C is moved */
  /* on by 2 * LDC for the next pass. */
  96. movq A, AO
  97. movq C, CO
  98. leaq (, LDC, 2), %rax
  99. addq %rax, C
  /* I = M >> 1; with no row pairs, skip to the odd-row handling at .L20. */
  100. movq M, I
  101. sarq $1, I
  102. je .L20
  103. ALIGN_4
  /* .L11: 2x2 tile. Four sums are accumulated on the x87 stack over the */
  /* k-steps and then written to CO[0], CO[1], (CO + LDC)[0], (CO + LDC)[1]. */
  /* With a0, a1 = two consecutive A elements and b0, b1 = two consecutive */
  /* B elements per k-step, the accumulators (top of stack first) hold */
  /* sum(a0*b0), sum(a1*b0), sum(a0*b1), sum(a1*b1). */
  104. .L11:
  /* Pick the B pointer for this tile. In the #else TRMM variants both */
  /* panels are first advanced by KK k-steps (2 elements per step). */
  105. #if !defined(TRMMKERNEL) || \
  106. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  107. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  108. movq B, BO
  109. #else
  110. movq KK, %rax
  111. salq $BASE_SHIFT, %rax
  112. leaq (AO, %rax, 2), AO
  113. leaq (B, %rax, 2), BO
  114. #endif
  /* Push four zeroed accumulators. */
  115. fldz
  116. fldz
  117. fldz
  118. fldz
  /* Optional prefetch of the two destination lines at CO and CO + LDC. */
  119. #if defined(HAVE_3DNOW)
  120. prefetchw 2 * SIZE(CO)
  121. prefetchw 2 * SIZE(CO, LDC, 1)
  122. #elif defined(HAVE_SSE)
  123. prefetchnta 2 * SIZE(CO)
  124. prefetchnta 2 * SIZE(CO, LDC, 1)
  125. #endif
  /* %rax = number of k-steps for this tile: K, or K - KK, or KK + 2. */
  /* The TRMM variants also keep that count in KKK for the remainder loop. */
  126. #ifndef TRMMKERNEL
  127. movq K, %rax
  128. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  129. movq K, %rax
  130. subq KK, %rax
  131. movq %rax, KKK
  132. #else
  133. movq KK, %rax
  /* Both branches add 2 here (tile is 2 wide in either direction). */
  134. #ifdef LEFT
  135. addq $2, %rax
  136. #else
  137. addq $2, %rax
  138. #endif
  139. movq %rax, KKK
  140. #endif
  /* The unrolled loop runs count >> 2 times; zero means remainder only. */
  141. sarq $2, %rax
  142. je .L15
  143. ALIGN_4
  /* .L12: main loop, unrolled by four k-steps. Within a k-step the stack */
  /* reaches a depth of eight (4 accumulators + 4 temporaries), i.e. every */
  /* x87 register is in use, so nothing else may be live on the FPU stack. */
  144. .L12:
  145. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: a0 = AO[-8], a1 = AO[-7], b0 = BO[-8], b1 = BO[-7]. */
  146. FLD -8 * SIZE(AO)
  147. FLD -8 * SIZE(BO)
  /* Duplicate a0 so one copy can be consumed by a0*b0 right away. */
  148. fld %st(1)
  149. fmul %st(1), %st
  150. faddp %st, %st(3)
  151. FLD -7 * SIZE(BO)
  /* The a0 slot becomes a0*b1. */
  152. fmul %st, %st(2)
  153. FLD -7 * SIZE(AO)
  /* The b0 slot becomes a1*b0; then b1*a1 replaces b1 and a1 is popped. */
  154. fmul %st, %st(2)
  155. fmulp %st, %st(1)
  /* Fold a1*b1, a1*b0, a0*b1 into accumulators 3, 1, 2 respectively. */
  156. faddp %st, %st(6)
  157. faddp %st, %st(3)
  158. faddp %st, %st(3)
  /* k-step 1: same sequence on AO[-6], AO[-5], BO[-6], BO[-5]. */
  159. FLD -6 * SIZE(AO)
  160. FLD -6 * SIZE(BO)
  161. fld %st(1)
  162. fmul %st(1), %st
  163. faddp %st, %st(3)
  164. FLD -5 * SIZE(BO)
  165. fmul %st, %st(2)
  166. FLD -5 * SIZE(AO)
  167. fmul %st, %st(2)
  168. fmulp %st, %st(1)
  169. faddp %st, %st(6)
  170. faddp %st, %st(3)
  171. faddp %st, %st(3)
  172. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  /* k-step 2: AO[-4], AO[-3], BO[-4], BO[-3]. */
  173. FLD -4 * SIZE(AO)
  174. FLD -4 * SIZE(BO)
  175. fld %st(1)
  176. fmul %st(1), %st
  177. faddp %st, %st(3)
  178. FLD -3 * SIZE(BO)
  179. fmul %st, %st(2)
  180. FLD -3 * SIZE(AO)
  181. fmul %st, %st(2)
  182. fmulp %st, %st(1)
  183. faddp %st, %st(6)
  184. faddp %st, %st(3)
  185. faddp %st, %st(3)
  /* k-step 3: AO[-2], AO[-1], BO[-2], BO[-1]. */
  186. FLD -2 * SIZE(AO)
  187. FLD -2 * SIZE(BO)
  188. fld %st(1)
  189. fmul %st(1), %st
  190. faddp %st, %st(3)
  191. FLD -1 * SIZE(BO)
  192. fmul %st, %st(2)
  193. FLD -1 * SIZE(AO)
  194. fmul %st, %st(2)
  195. fmulp %st, %st(1)
  196. faddp %st, %st(6)
  197. faddp %st, %st(3)
  198. faddp %st, %st(3)
  /* Four k-steps consumed: 8 elements from each panel. */
  199. addq $8 * SIZE,AO
  200. addq $8 * SIZE,BO
  201. decq %rax
  202. jne .L12
  203. ALIGN_4
  /* .L15: remainder count = (k-step count) & 3; zero skips to the store. */
  204. .L15:
  205. #ifndef TRMMKERNEL
  206. movq K, %rax
  207. #else
  208. movq KKK, %rax
  209. #endif
  210. and $3, %rax
  211. je .L18
  212. ALIGN_4
  /* .L16: one k-step per pass, identical to a single step of .L12. */
  213. .L16:
  214. FLD -8 * SIZE(AO)
  215. FLD -8 * SIZE(BO)
  216. fld %st(1)
  217. fmul %st(1), %st
  218. faddp %st, %st(3)
  219. FLD -7 * SIZE(BO)
  220. fmul %st, %st(2)
  221. FLD -7 * SIZE(AO)
  222. fmul %st, %st(2)
  223. fmulp %st, %st(1)
  224. faddp %st, %st(6)
  225. faddp %st, %st(3)
  226. faddp %st, %st(3)
  227. addq $2 * SIZE,AO
  228. addq $2 * SIZE,BO
  229. decq %rax
  230. jne .L16
  231. ALIGN_4
  /* .L18: write the tile back. The sequence only leaves the FPU stack */
  /* balanced if FST pops after storing (FST is defined in common.h - */
  /* presumably a store-and-pop form; confirm). */
  232. .L18:
  233. #ifndef TRMMKERNEL
  /* GEMM path: scale all four accumulators by ALPHA (the last fmulp pops */
  /* ALPHA), then for each one add the existing C value and store. */
  234. FLD ALPHA
  235. fmul %st, %st(1)
  236. fmul %st, %st(2)
  237. fmul %st, %st(3)
  238. fmulp %st, %st(4)
  239. FLD 0 * SIZE(CO)
  240. faddp %st, %st(1)
  241. FST 0 * SIZE(CO)
  242. FLD 1 * SIZE(CO)
  243. faddp %st, %st(1)
  244. FST 1 * SIZE(CO)
  245. FLD 0 * SIZE(CO, LDC)
  246. faddp %st, %st(1)
  247. FST 0 * SIZE(CO, LDC)
  248. FLD 1 * SIZE(CO, LDC)
  249. faddp %st, %st(1)
  250. FST 1 * SIZE(CO, LDC)
  251. #else
  /* TRMM path: the accumulators are stored as-is; ALPHA is not applied */
  /* and C is not read on this path. */
  252. FST 0 * SIZE(CO)
  253. FST 1 * SIZE(CO)
  254. FST 0 * SIZE(CO, LDC)
  255. FST 1 * SIZE(CO, LDC)
  256. #endif
  /* These TRMM variants processed only KKK k-steps, so advance AO and BO */
  /* by the remaining K - KKK steps (2 elements per step each). */
  257. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  258. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  259. movq K, %rax
  260. subq KKK, %rax
  261. salq $BASE_SHIFT, %rax
  262. leaq (AO, %rax, 2), AO
  263. leaq (BO, %rax, 2), BO
  264. #endif
  /* TRMM with LEFT: two more rows done. */
  265. #if defined(TRMMKERNEL) && defined(LEFT)
  266. addq $2, KK
  267. #endif
  /* Next row pair in the same strip. */
  268. addq $2 * SIZE, CO
  269. decq I
  270. jne .L11
  271. ALIGN_4
  /* .L20: odd-M remainder of the current 2-wide strip; skipped when M is even. */
  272. .L20:
  273. movq M, %rax
  274. andq $1, %rax
  275. je .L29
  276. ALIGN_4
  /* .L21: 1x2 tile. One A element and two B elements per k-step; the two */
  /* accumulators hold sum(a*b0) and sum(a*b1) and end up in CO[0] and */
  /* (CO + LDC)[0]. */
  277. .L21:
  /* Pick BO; the #else TRMM variants first skip KK k-steps (1 element of */
  /* A and 2 elements of B per step). */
  278. #if !defined(TRMMKERNEL) || \
  279. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  280. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  281. movq B, BO
  282. #else
  283. movq KK, %rax
  284. salq $BASE_SHIFT, %rax
  285. leaq (AO, %rax, 1), AO
  286. leaq ( B, %rax, 2), BO
  287. #endif
  /* Two zeroed accumulators. */
  288. fldz
  289. fldz
  /* %rax = k-step count: K, or K - KK, or KK + 1 (LEFT) / KK + 2 (not LEFT). */
  290. #ifndef TRMMKERNEL
  291. movq K, %rax
  292. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  293. movq K, %rax
  294. subq KK, %rax
  295. movq %rax, KKK
  296. #else
  297. movq KK, %rax
  298. #ifdef LEFT
  299. addq $1, %rax
  300. #else
  301. addq $2, %rax
  302. #endif
  303. movq %rax, KKK
  304. #endif
  /* Unrolled loop runs count >> 2 times. */
  305. sarq $2, %rax
  306. je .L25
  307. ALIGN_4
  /* .L22: four k-steps per pass. A is read at -8, -7, -6, -5 and B in */
  /* pairs at (-8,-7), (-6,-5), (-4,-3), (-2,-1). */
  308. .L22:
  309. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: a stays on the stack while it is multiplied by b0, then the */
  /* fmulp turns it into a*b1 and pops b1. */
  310. FLD -8 * SIZE(AO)
  311. FLD -8 * SIZE(BO)
  312. fmul %st(1), %st
  313. faddp %st, %st(2)
  314. FLD -7 * SIZE(BO)
  315. fmulp %st, %st(1)
  316. faddp %st, %st(2)
  /* k-step 1. */
  317. FLD -7 * SIZE(AO)
  318. FLD -6 * SIZE(BO)
  319. fmul %st(1), %st
  320. faddp %st, %st(2)
  321. FLD -5 * SIZE(BO)
  322. fmulp %st, %st(1)
  323. faddp %st, %st(2)
  /* k-step 2. */
  324. FLD -6 * SIZE(AO)
  325. FLD -4 * SIZE(BO)
  326. fmul %st(1), %st
  327. faddp %st, %st(2)
  328. FLD -3 * SIZE(BO)
  329. fmulp %st, %st(1)
  330. faddp %st, %st(2)
  /* k-step 3. */
  331. FLD -5 * SIZE(AO)
  332. FLD -2 * SIZE(BO)
  333. fmul %st(1), %st
  334. faddp %st, %st(2)
  335. FLD -1 * SIZE(BO)
  336. fmulp %st, %st(1)
  337. faddp %st, %st(2)
  /* Four k-steps: 4 elements of A, 8 elements of B. */
  338. addq $4 * SIZE,AO
  339. addq $8 * SIZE,BO
  340. decq %rax
  341. jne .L22
  342. ALIGN_4
  /* .L25: remainder count = (k-step count) & 3. */
  343. .L25:
  344. #ifndef TRMMKERNEL
  345. movq K, %rax
  346. #else
  347. movq KKK, %rax
  348. #endif
  349. and $3, %rax
  350. je .L28
  351. ALIGN_4
  /* .L26: one k-step per pass. */
  352. .L26:
  353. FLD -8 * SIZE(AO)
  354. FLD -8 * SIZE(BO)
  355. fmul %st(1), %st
  356. faddp %st, %st(2)
  357. FLD -7 * SIZE(BO)
  358. fmulp %st, %st(1)
  359. faddp %st, %st(2)
  360. addq $1 * SIZE,AO
  361. addq $2 * SIZE,BO
  362. decq %rax
  363. jne .L26
  364. ALIGN_4
  /* .L28: write-back. As in the other tiles this relies on FST popping */
  /* after the store (FST is defined in common.h - confirm). */
  365. .L28:
  366. #ifndef TRMMKERNEL
  /* GEMM path: scale both accumulators by ALPHA, add C, store. */
  367. FLD ALPHA
  368. fmul %st, %st(1)
  369. fmulp %st, %st(2)
  370. FLD 0 * SIZE(CO)
  371. faddp %st, %st(1)
  372. FST 0 * SIZE(CO)
  373. FLD 0 * SIZE(CO, LDC)
  374. faddp %st, %st(1)
  375. FST 0 * SIZE(CO, LDC)
  376. #else
  /* TRMM path: plain stores, no ALPHA and no read of C. */
  377. FST 0 * SIZE(CO)
  378. FST 0 * SIZE(CO, LDC)
  379. #endif
  /* Advance past the K - KKK k-steps that were not processed (1 element */
  /* of A, 2 elements of B per step). */
  380. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  381. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  382. movq K, %rax
  383. subq KKK, %rax
  384. salq $BASE_SHIFT, %rax
  385. leaq (AO, %rax, 1), AO
  386. leaq (BO, %rax, 2), BO
  387. #endif
  388. #if defined(TRMMKERNEL) && defined(LEFT)
  389. addq $1, KK
  390. #endif
  391. addq $1 * SIZE, CO
  392. ALIGN_4
  /* .L29: tail of the J loop. */
  393. .L29:
  /* TRMM without LEFT: KK grows by the two B elements per k-step just consumed. */
  394. #if defined(TRMMKERNEL) && !defined(LEFT)
  395. addq $2, KK
  396. #endif
  /* B takes over wherever BO ended, so the next strip starts reading B there. */
  397. movq BO, B
  398. decq J
  399. jne .L01
  400. ALIGN_4
  /* .L30: odd-N pass. Runs once when bit 0 of N is set, otherwise the */
  /* kernel is finished and jumps to the epilogue. */
  401. .L30:
  402. movq N, %rax
  403. testq $1, %rax
  404. je .L999
  /* TRMM with LEFT: KK restarts from OFFSET. */
  405. #if defined(TRMMKERNEL) && defined(LEFT)
  406. movq OFFSET, %rax
  407. movq %rax, KK
  408. #endif
  /* Rewind AO to the start of A; CO takes the last strip, which is a */
  /* single LDC wide. */
  409. movq A, AO
  410. movq C, CO
  411. addq LDC, C
  /* I = M >> 1; with no row pairs go to the odd-row handling at .L40. */
  412. movq M, I
  413. sarq $1, I
  414. je .L40
  415. ALIGN_4
  /* .L31: 2x1 tile. Two A elements and one B element per k-step; the two */
  /* accumulators hold sum(a0*b) and sum(a1*b) and end up in CO[0], CO[1]. */
  416. .L31:
  /* Pick BO; the #else TRMM variants first skip KK k-steps (2 elements of */
  /* A and 1 element of B per step). */
  417. #if !defined(TRMMKERNEL) || \
  418. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  419. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  420. movq B, BO
  421. #else
  422. movq KK, %rax
  423. salq $BASE_SHIFT, %rax
  424. leaq (AO, %rax, 2), AO
  425. leaq ( B, %rax, 1), BO
  426. #endif
  /* Two zeroed accumulators. */
  427. fldz
  428. fldz
  /* Optional prefetch of the destination. */
  429. #if defined(HAVE_3DNOW)
  430. prefetchw 2 * SIZE(CO)
  431. #elif defined(HAVE_SSE)
  432. prefetchnta 2 * SIZE(CO)
  433. #endif
  /* %rax = k-step count: K, or K - KK, or KK + 2 (LEFT) / KK + 1 (not LEFT). */
  434. #ifndef TRMMKERNEL
  435. movq K, %rax
  436. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  437. movq K, %rax
  438. subq KK, %rax
  439. movq %rax, KKK
  440. #else
  441. movq KK, %rax
  442. #ifdef LEFT
  443. addq $2, %rax
  444. #else
  445. addq $1, %rax
  446. #endif
  447. movq %rax, KKK
  448. #endif
  /* Unrolled loop runs count >> 2 times. */
  449. sarq $2, %rax
  450. je .L35
  451. ALIGN_4
  /* .L32: four k-steps per pass. B is read at -8, -7, -6, -5 and A in */
  /* pairs at (-8,-7), (-6,-5), (-4,-3), (-2,-1). */
  452. .L32:
  453. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: b stays on the stack while it is multiplied by a0, then the */
  /* fmulp turns it into a1*b and pops a1. */
  454. FLD -8 * SIZE(BO)
  455. FLD -8 * SIZE(AO)
  456. fmul %st(1), %st
  457. faddp %st, %st(2)
  458. FLD -7 * SIZE(AO)
  459. fmulp %st, %st(1)
  460. faddp %st, %st(2)
  /* k-step 1. */
  461. FLD -7 * SIZE(BO)
  462. FLD -6 * SIZE(AO)
  463. fmul %st(1), %st
  464. faddp %st, %st(2)
  465. FLD -5 * SIZE(AO)
  466. fmulp %st, %st(1)
  467. faddp %st, %st(2)
  /* k-step 2. */
  468. FLD -6 * SIZE(BO)
  469. FLD -4 * SIZE(AO)
  470. fmul %st(1), %st
  471. faddp %st, %st(2)
  472. FLD -3 * SIZE(AO)
  473. fmulp %st, %st(1)
  474. faddp %st, %st(2)
  /* k-step 3. */
  475. FLD -5 * SIZE(BO)
  476. FLD -2 * SIZE(AO)
  477. fmul %st(1), %st
  478. faddp %st, %st(2)
  479. FLD -1 * SIZE(AO)
  480. fmulp %st, %st(1)
  481. faddp %st, %st(2)
  /* Four k-steps: 8 elements of A, 4 elements of B. */
  482. addq $8 * SIZE,AO
  483. addq $4 * SIZE,BO
  484. decq %rax
  485. jne .L32
  486. ALIGN_4
  /* .L35: remainder count = (k-step count) & 3. */
  487. .L35:
  488. #ifndef TRMMKERNEL
  489. movq K, %rax
  490. #else
  491. movq KKK, %rax
  492. #endif
  493. and $3, %rax
  494. je .L38
  495. ALIGN_4
  /* .L36: one k-step per pass. */
  496. .L36:
  497. FLD -8 * SIZE(BO)
  498. FLD -8 * SIZE(AO)
  499. fmul %st(1), %st
  500. faddp %st, %st(2)
  501. FLD -7 * SIZE(AO)
  502. fmulp %st, %st(1)
  503. faddp %st, %st(2)
  504. addq $2 * SIZE,AO
  505. addq $1 * SIZE,BO
  506. decq %rax
  507. jne .L36
  508. ALIGN_4
  /* .L38: write-back; relies on FST popping after the store (FST is */
  /* defined in common.h - confirm). */
  509. .L38:
  510. #ifndef TRMMKERNEL
  /* GEMM path: scale both accumulators by ALPHA, add C, store. */
  511. FLD ALPHA
  512. fmul %st, %st(1)
  513. fmulp %st, %st(2)
  514. FLD 0 * SIZE(CO)
  515. faddp %st, %st(1)
  516. FST 0 * SIZE(CO)
  517. FLD 1 * SIZE(CO)
  518. faddp %st, %st(1)
  519. FST 1 * SIZE(CO)
  520. #else
  /* TRMM path: plain stores, no ALPHA and no read of C. */
  521. FST 0 * SIZE(CO)
  522. FST 1 * SIZE(CO)
  523. #endif
  /* Advance past the K - KKK k-steps that were not processed (2 elements */
  /* of A, 1 element of B per step). */
  524. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  525. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  526. movq K, %rax
  527. subq KKK, %rax
  528. salq $BASE_SHIFT, %rax
  529. leaq (AO, %rax, 2), AO
  530. leaq (BO, %rax, 1), BO
  531. #endif
  532. #if defined(TRMMKERNEL) && defined(LEFT)
  533. addq $2, KK
  534. #endif
  /* Next row pair. */
  535. addq $2 * SIZE, CO
  536. decq I
  537. jne .L31
  538. ALIGN_4
  /* .L40: odd-M remainder of the odd-N pass; skipped when M is even. */
  539. .L40:
  540. movq M, %rax
  541. andq $1, %rax
  542. je .L49
  543. ALIGN_4
  /* .L41: 1x1 tile. A single accumulator collects sum(a*b) and is written */
  /* to CO[0]. */
  544. .L41:
  /* Pick BO; the #else TRMM variants first skip KK elements of both panels. */
  545. #if !defined(TRMMKERNEL) || \
  546. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  547. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  548. movq B, BO
  549. #else
  550. movq KK, %rax
  551. salq $BASE_SHIFT, %rax
  552. leaq (AO, %rax, 1), AO
  553. leaq ( B, %rax, 1), BO
  554. #endif
  /* One zeroed accumulator. */
  555. fldz
  /* %rax = k-step count: K, or K - KK, or KK + 1 (both LEFT branches add 1). */
  556. #ifndef TRMMKERNEL
  557. movq K, %rax
  558. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  559. movq K, %rax
  560. subq KK, %rax
  561. movq %rax, KKK
  562. #else
  563. movq KK, %rax
  564. #ifdef LEFT
  565. addq $1, %rax
  566. #else
  567. addq $1, %rax
  568. #endif
  569. movq %rax, KKK
  570. #endif
  /* Unrolled loop runs count >> 2 times. */
  571. sarq $2, %rax
  572. je .L45
  573. ALIGN_4
  /* .L42: four k-steps per pass; each step loads a and b, multiplies them */
  /* (popping b) and adds the product into the accumulator (popping it). */
  574. .L42:
  575. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  576. FLD -8 * SIZE(AO)
  577. FLD -8 * SIZE(BO)
  578. fmulp %st, %st(1)
  579. faddp %st, %st(1)
  580. FLD -7 * SIZE(AO)
  581. FLD -7 * SIZE(BO)
  582. fmulp %st, %st(1)
  583. faddp %st, %st(1)
  584. FLD -6 * SIZE(AO)
  585. FLD -6 * SIZE(BO)
  586. fmulp %st, %st(1)
  587. faddp %st, %st(1)
  588. FLD -5 * SIZE(AO)
  589. FLD -5 * SIZE(BO)
  590. fmulp %st, %st(1)
  591. faddp %st, %st(1)
  /* Four k-steps: 4 elements from each panel. */
  592. addq $4 * SIZE,AO
  593. addq $4 * SIZE,BO
  594. decq %rax
  595. jne .L42
  596. ALIGN_4
  /* .L45: remainder count = (k-step count) & 3. */
  597. .L45:
  598. #ifndef TRMMKERNEL
  599. movq K, %rax
  600. #else
  601. movq KKK, %rax
  602. #endif
  603. and $3, %rax
  604. je .L48
  605. ALIGN_4
  /* .L46: one k-step per pass. */
  606. .L46:
  607. FLD -8 * SIZE(AO)
  608. FLD -8 * SIZE(BO)
  609. fmulp %st, %st(1)
  610. faddp %st, %st(1)
  611. addq $1 * SIZE,AO
  612. addq $1 * SIZE,BO
  613. decq %rax
  614. jne .L46
  615. ALIGN_4
  /* .L48: write-back; relies on FST popping after the store (FST is */
  /* defined in common.h - confirm). */
  616. .L48:
  617. #ifndef TRMMKERNEL
  /* GEMM path: accumulator * ALPHA + C[0]. */
  618. FLD ALPHA
  619. fmulp %st, %st(1)
  620. FLD 0 * SIZE(CO)
  621. faddp %st, %st(1)
  622. FST 0 * SIZE(CO)
  623. #else
  /* TRMM path: plain store, no ALPHA and no read of C. */
  624. FST 0 * SIZE(CO)
  625. #endif
  /* Advance past the K - KKK k-steps that were not processed. */
  626. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  627. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  628. movq K, %rax
  629. subq KKK, %rax
  630. salq $BASE_SHIFT, %rax
  631. leaq (AO, %rax, 1), AO
  632. leaq (BO, %rax, 1), BO
  633. #endif
  634. #if defined(TRMMKERNEL) && defined(LEFT)
  635. addq $1, KK
  636. #endif
  637. addq $1 * SIZE, CO
  638. ALIGN_4
  /* .L49: end of the odd-N pass; same bookkeeping as .L29 but for a strip */
  /* that is one wide, and with no loop back. */
  639. .L49:
  640. #if defined(TRMMKERNEL) && !defined(LEFT)
  641. addq $1, KK
  642. #endif
  643. movq BO, B
  644. ALIGN_4
  /* .L999: common exit. EMMS is a macro defined outside this file */
  /* (presumably the emms instruction or empty, depending on the build - */
  /* confirm). */
  645. .L999:
  646. EMMS
  /* Reload the callee-saved registers from the slots written in the */
  /* prologue, release the frame and return. */
  647. movq 0(%rsp), %rbx
  648. movq 8(%rsp), %rbp
  649. movq 16(%rsp), %r12
  650. movq 24(%rsp), %r13
  651. movq 32(%rsp), %r14
  652. movq 40(%rsp), %r15
  653. addq $STACKSIZE, %rsp
  654. ret
  655. EPILOGUE