You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

qgemm_kernel_2x2.S 14 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included. FLD, FST, SIZE, */
  /* BASE_SHIFT, PROLOGUE, EPILOGUE, PROFCODE and ALIGN_4 are used below but */
  /* not defined in this file; presumably they come from that header. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch mnemonics: prefetch/prefetchw on OPTERON or BARCELONA builds, */
  /* prefetcht0 for both otherwise. PREFETCHW is not referenced in the */
  /* visible part of this file (the C prefetches use literal mnemonics). */
  40. #if defined(OPTERON) || defined(BARCELONA)
  41. #define PREFETCH prefetch
  42. #define PREFETCHW prefetchw
  43. #else
  44. #define PREFETCH prefetcht0
  45. #define PREFETCHW prefetcht0
  46. #endif
  /* Prefetch distance ahead of AO; multiplied by SIZE at each use site. */
  47. #define PREFETCHSIZE (5 + 4 * 10)
  /* Frame layout, relative to %esp after the kernel entry sequence: */
  /*   STACK bytes = the four pushl-saved registers (4 x 4 bytes), */
  /*   ARGS bytes  = scratch reserved by "subl $ARGS, %esp". */
  48. #define STACK 16
  49. #define ARGS 16
  /* Scratch locals living in the ARGS area, just above the saved registers. */
  /* J   : remaining column-pair iterations of the outer loop. */
  /* KK  : TRMM running offset (only touched under TRMMKERNEL). */
  /* KKK : TRMM k-count used by the current tile (only under TRMMKERNEL). */
  50. #define J 0 + STACK(%esp)
  51. #define KK 4 + STACK(%esp)
  52. #define KKK 8 + STACK(%esp)
  /* Incoming stack arguments. Offsets skip the saved registers, the scratch */
  /* area and one more dword (presumably the return address - confirm that */
  /* PROLOGUE does not move %esp). ALPHA owns offsets 16..31, A starts at 32. */
  53. #define M 4 + STACK + ARGS(%esp)
  54. #define N 8 + STACK + ARGS(%esp)
  55. #define K 12 + STACK + ARGS(%esp)
  56. #define ALPHA 16 + STACK + ARGS(%esp)
  57. #define A 32 + STACK + ARGS(%esp)
  58. #define ARG_B 36 + STACK + ARGS(%esp)
  59. #define C 40 + STACK + ARGS(%esp)
  60. #define ARG_LDC 44 + STACK + ARGS(%esp)
  61. #define OFFSET 48 + STACK + ARGS(%esp)
  /* Register roles: */
  /*   I   (%esi) row-pair loop counter        B   (%ebx) base of current B panel */
  /*   CO  (%edi) write cursor into C          AO  (%edx) read cursor into A */
  /*   BO  (%ecx) read cursor into B           LDC (%ebp) column stride of C */
  62. #define I %esi
  63. #define B %ebx
  64. #define CO %edi
  65. #define AO %edx
  66. #define BO %ecx
  67. #define LDC %ebp
  /* Not referenced anywhere in the visible part of this file. */
  68. #define PREFETCH_OFFSET 48
  /* Kernel entry. Builds the frame that the J/KK/KKK and M..OFFSET macros */
  /* index into: ARGS bytes of scratch first, then four saved registers */
  /* (restored in reverse order at .L999). */
  69. PROLOGUE
  70. subl $ARGS, %esp # Generate Stack Frame
  71. pushl %ebp
  72. pushl %edi
  73. pushl %esi
  74. pushl %ebx
  75. PROFCODE
  /* TRMM kernel built without LEFT: KK starts at -OFFSET. */
  76. #if defined(TRMMKERNEL) && !defined(LEFT)
  77. movl OFFSET, %eax
  78. negl %eax
  79. movl %eax, KK
  80. #endif
  81. movl ARG_LDC, LDC
  82. movl ARG_B, B
  /* Bias both panel pointers by 8 elements so that the inner loops can use */
  /* displacements -8*SIZE .. -1*SIZE. A is biased in place in its stack */
  /* slot; B is biased in its register. */
  83. addl $8 * SIZE, A
  84. addl $8 * SIZE, B
  /* Scale LDC by the element-size shift (presumably elements to bytes; */
  /* BASE_SHIFT is defined outside this file). */
  85. sall $BASE_SHIFT, LDC
  /* J = N >> 1 column pairs. movl does not modify flags, so the je below */
  /* still tests the sarl result and skips to .L30 when J is zero. */
  86. movl N, %eax
  87. sarl $1, %eax
  88. movl %eax, J
  89. je .L30
  90. ALIGN_4
  /* Head of the outer loop: one iteration handles two columns of C. */
  91. .L01:
  /* TRMM kernel built with LEFT: KK restarts from OFFSET for every column pair. */
  92. #if defined(TRMMKERNEL) && defined(LEFT)
  93. movl OFFSET, %eax
  94. movl %eax, KK
  95. #endif
  /* AO rewinds to the (biased) start of A; CO takes the current C pointer */
  /* and the stored C is advanced by two columns (2 * LDC) for the next pass. */
  96. movl A, AO
  97. movl C, CO
  98. lea (, LDC, 2), %eax
  99. addl %eax, C
  /* I = M >> 1 row pairs; none means go straight to the odd-row tile. */
  100. movl M, I
  101. sarl $1, I
  102. je .L20
  103. ALIGN_4
  /* Setup for one 2x2 tile of C (two rows, two columns). */
  104. .L11:
  105. #if !defined(TRMMKERNEL) || \
  106. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  107. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* Start reading B at the panel base. */
  108. movl B, BO
  109. #else
  /* Remaining TRMM variants: skip the first KK k-steps of both panels. */
  /* Each k-step is 2 elements of A and 2 elements of B for this tile. */
  110. movl KK, %eax
  111. sall $BASE_SHIFT, %eax
  112. leal (AO, %eax, 2), AO
  113. leal (B, %eax, 2), BO
  114. #endif
  /* Four zeroed accumulators on the x87 stack, one per C element. */
  115. fldz
  116. fldz
  117. fldz
  118. fldz
  /* Prefetch the two C columns that this tile will write. */
  119. #if defined(HAVE_3DNOW)
  120. prefetchw 2 * SIZE(CO)
  121. prefetchw 2 * SIZE(CO, LDC, 1)
  122. #elif defined(HAVE_SSE)
  123. prefetchnta 2 * SIZE(CO)
  124. prefetchnta 2 * SIZE(CO, LDC, 1)
  125. #endif
  /* Choose the k-count for this tile and, under TRMM, remember it in KKK: */
  /*   plain GEMM -> K;  first TRMM group -> K - KK;  otherwise -> KK + 2. */
  126. #ifndef TRMMKERNEL
  127. movl K, %eax
  128. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  129. movl K, %eax
  130. subl KK, %eax
  131. movl %eax, KKK
  132. #else
  133. movl KK, %eax
  /* Both branches add 2 here (the tile is 2 wide in both dimensions); the */
  /* split mirrors the other tiles, where the two branches differ. */
  134. #ifdef LEFT
  135. addl $2, %eax
  136. #else
  137. addl $2, %eax
  138. #endif
  139. movl %eax, KKK
  140. #endif
  /* Number of 4-way unrolled iterations; zero goes to the remainder loop. */
  141. sarl $2, %eax
  142. je .L15
  143. ALIGN_4
  /* 2x2 tile main loop: four k-steps per iteration. */
  /* Per k-step, with a0,a1 = two consecutive A elements and b0,b1 = two */
  /* consecutive B elements, the accumulators (top of stack first) get: */
  /*   acc0 += a0*b0   acc1 += a1*b0   acc2 += a0*b1   acc3 += a1*b1 */
  /* The instruction order is stack-position sensitive: with the four */
  /* accumulators resident, a k-step peaks at eight live x87 registers. */
  /* NOTE(review): this assumes FLD pushes exactly one value - confirm in common.h. */
  144. .L12:
  145. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: load a0, b0; duplicate a0; acc0 += a0*b0. */
  146. FLD -8 * SIZE(AO)
  147. FLD -8 * SIZE(BO)
  148. fld %st(1)
  149. fmul %st(1), %st
  150. faddp %st, %st(3)
  /* Load b1 and turn the spare a0 into a0*b1. */
  151. FLD -7 * SIZE(BO)
  152. fmul %st, %st(2)
  /* Load a1; b0 becomes a1*b0, b1 becomes a1*b1 (a1 is popped). */
  153. FLD -7 * SIZE(AO)
  154. fmul %st, %st(2)
  155. fmulp %st, %st(1)
  /* Fold the three products: acc3 += a1*b1, acc1 += a1*b0, acc2 += a0*b1. */
  156. faddp %st, %st(6)
  157. faddp %st, %st(3)
  158. faddp %st, %st(3)
  /* k-step 1: same sequence on the next element pair. */
  159. FLD -6 * SIZE(AO)
  160. FLD -6 * SIZE(BO)
  161. fld %st(1)
  162. fmul %st(1), %st
  163. faddp %st, %st(3)
  164. FLD -5 * SIZE(BO)
  165. fmul %st, %st(2)
  166. FLD -5 * SIZE(AO)
  167. fmul %st, %st(2)
  168. fmulp %st, %st(1)
  169. faddp %st, %st(6)
  170. faddp %st, %st(3)
  171. faddp %st, %st(3)
  172. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  /* k-step 2. */
  173. FLD -4 * SIZE(AO)
  174. FLD -4 * SIZE(BO)
  175. fld %st(1)
  176. fmul %st(1), %st
  177. faddp %st, %st(3)
  178. FLD -3 * SIZE(BO)
  179. fmul %st, %st(2)
  180. FLD -3 * SIZE(AO)
  181. fmul %st, %st(2)
  182. fmulp %st, %st(1)
  183. faddp %st, %st(6)
  184. faddp %st, %st(3)
  185. faddp %st, %st(3)
  /* k-step 3. */
  186. FLD -2 * SIZE(AO)
  187. FLD -2 * SIZE(BO)
  188. fld %st(1)
  189. fmul %st(1), %st
  190. faddp %st, %st(3)
  191. FLD -1 * SIZE(BO)
  192. fmul %st, %st(2)
  193. FLD -1 * SIZE(AO)
  194. fmul %st, %st(2)
  195. fmulp %st, %st(1)
  196. faddp %st, %st(6)
  197. faddp %st, %st(3)
  198. faddp %st, %st(3)
  /* Four k-steps consumed: 8 elements from each panel. */
  199. addl $8 * SIZE,AO
  200. addl $8 * SIZE,BO
  201. decl %eax
  202. jne .L12
  203. ALIGN_4
  /* 2x2 tile remainder: the k-count modulo 4, one k-step per iteration. */
  204. .L15:
  /* Reload the tile k-count (K for GEMM, the saved KKK for TRMM). */
  205. #ifndef TRMMKERNEL
  206. movl K, %eax
  207. #else
  208. movl KKK, %eax
  209. #endif
  210. and $3, %eax
  211. je .L18
  212. ALIGN_4
  213. .L16:
  /* Same product/accumulate sequence as one k-step of the unrolled loop: */
  /*   acc0 += a0*b0   acc1 += a1*b0   acc2 += a0*b1   acc3 += a1*b1 */
  214. FLD -8 * SIZE(AO)
  215. FLD -8 * SIZE(BO)
  216. fld %st(1)
  217. fmul %st(1), %st
  218. faddp %st, %st(3)
  219. FLD -7 * SIZE(BO)
  220. fmul %st, %st(2)
  221. FLD -7 * SIZE(AO)
  222. fmul %st, %st(2)
  223. fmulp %st, %st(1)
  224. faddp %st, %st(6)
  225. faddp %st, %st(3)
  226. faddp %st, %st(3)
  /* One k-step consumed: 2 elements from each panel. */
  227. addl $2 * SIZE,AO
  228. addl $2 * SIZE,BO
  229. decl %eax
  230. jne .L16
  231. ALIGN_4
  /* 2x2 tile write-back. On entry the x87 stack holds acc0..acc3 (top first). */
  /* NOTE(review): four consecutive FST to different addresses only balance the */
  /* stack if FST stores and pops - confirm the macro in common.h. */
  232. .L18:
  233. #ifndef TRMMKERNEL
  /* GEMM: scale all four accumulators by ALPHA; the final fmulp also pops ALPHA. */
  234. FLD ALPHA
  235. fmul %st, %st(1)
  236. fmul %st, %st(2)
  237. fmul %st, %st(3)
  238. fmulp %st, %st(4)
  /* Then C += scaled accumulator, element by element: */
  /* row 0/col 0, row 1/col 0, row 0/col 1, row 1/col 1. */
  239. FLD 0 * SIZE(CO)
  240. faddp %st, %st(1)
  241. FST 0 * SIZE(CO)
  242. FLD 1 * SIZE(CO)
  243. faddp %st, %st(1)
  244. FST 1 * SIZE(CO)
  245. FLD 0 * SIZE(CO, LDC)
  246. faddp %st, %st(1)
  247. FST 0 * SIZE(CO, LDC)
  248. FLD 1 * SIZE(CO, LDC)
  249. faddp %st, %st(1)
  250. FST 1 * SIZE(CO, LDC)
  251. #else
  /* TRMM: the accumulators overwrite C directly; ALPHA is not applied here. */
  252. FST 0 * SIZE(CO)
  253. FST 1 * SIZE(CO)
  254. FST 0 * SIZE(CO, LDC)
  255. FST 1 * SIZE(CO, LDC)
  256. #endif
  /* TRMM variants that ran only KKK k-steps: step AO and BO over the */
  /* remaining K - KKK k-steps (2 elements each per k-step). */
  257. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  258. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  259. movl K, %eax
  260. subl KKK, %eax
  261. sall $BASE_SHIFT, %eax
  262. leal (AO, %eax, 2), AO
  263. leal (BO, %eax, 2), BO
  264. #endif
  /* With LEFT, KK advances by the two rows just finished. */
  265. #if defined(TRMMKERNEL) && defined(LEFT)
  266. addl $2, KK
  267. #endif
  /* Move to the next row pair of C and loop. */
  268. addl $2 * SIZE, CO
  269. decl I
  270. jne .L11
  271. ALIGN_4
  /* Odd-row handling for the current column pair: a 1x2 tile if M is odd. */
  272. .L20:
  273. movl M, %eax
  274. andl $1, %eax
  275. je .L29
  276. ALIGN_4
  277. .L21:
  278. #if !defined(TRMMKERNEL) || \
  279. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  280. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  281. movl B, BO
  282. #else
  /* Skip the first KK k-steps: 1 element of A and 2 elements of B per k-step. */
  283. movl KK, %eax
  284. sall $BASE_SHIFT, %eax
  285. leal (AO, %eax, 1), AO
  286. leal ( B, %eax, 2), BO
  287. #endif
  /* Two accumulators, one per column. */
  288. fldz
  289. fldz
  /* k-count: K for GEMM; K - KK or KK + (1 with LEFT, else 2) for TRMM, */
  /* saved in KKK. */
  290. #ifndef TRMMKERNEL
  291. movl K, %eax
  292. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  293. movl K, %eax
  294. subl KK, %eax
  295. movl %eax, KKK
  296. #else
  297. movl KK, %eax
  298. #ifdef LEFT
  299. addl $1, %eax
  300. #else
  301. addl $2, %eax
  302. #endif
  303. movl %eax, KKK
  304. #endif
  /* Number of 4-way unrolled iterations; zero goes to the remainder loop. */
  305. sarl $2, %eax
  306. je .L25
  307. ALIGN_4
  /* 1x2 tile main loop: four k-steps per iteration. */
  /* Per k-step, with a = one A element and b0,b1 = two consecutive B elements: */
  /*   acc0 += a*b0   acc1 += a*b1   (acc0 is the top of the stack between steps) */
  308. .L22:
  309. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: a stays on the stack across both products; fmulp consumes it. */
  310. FLD -8 * SIZE(AO)
  311. FLD -8 * SIZE(BO)
  312. fmul %st(1), %st
  313. faddp %st, %st(2)
  314. FLD -7 * SIZE(BO)
  315. fmulp %st, %st(1)
  316. faddp %st, %st(2)
  /* k-step 1. */
  317. FLD -7 * SIZE(AO)
  318. FLD -6 * SIZE(BO)
  319. fmul %st(1), %st
  320. faddp %st, %st(2)
  321. FLD -5 * SIZE(BO)
  322. fmulp %st, %st(1)
  323. faddp %st, %st(2)
  /* k-step 2. */
  324. FLD -6 * SIZE(AO)
  325. FLD -4 * SIZE(BO)
  326. fmul %st(1), %st
  327. faddp %st, %st(2)
  328. FLD -3 * SIZE(BO)
  329. fmulp %st, %st(1)
  330. faddp %st, %st(2)
  /* k-step 3. */
  331. FLD -5 * SIZE(AO)
  332. FLD -2 * SIZE(BO)
  333. fmul %st(1), %st
  334. faddp %st, %st(2)
  335. FLD -1 * SIZE(BO)
  336. fmulp %st, %st(1)
  337. faddp %st, %st(2)
  /* Four k-steps consumed: 4 elements of A, 8 elements of B. */
  338. addl $4 * SIZE,AO
  339. addl $8 * SIZE,BO
  340. decl %eax
  341. jne .L22
  342. ALIGN_4
  /* 1x2 tile remainder: the k-count modulo 4, one k-step per iteration. */
  343. .L25:
  /* Reload the tile k-count (K for GEMM, the saved KKK for TRMM). */
  344. #ifndef TRMMKERNEL
  345. movl K, %eax
  346. #else
  347. movl KKK, %eax
  348. #endif
  349. and $3, %eax
  350. je .L28
  351. ALIGN_4
  352. .L26:
  /* acc0 += a*b0, acc1 += a*b1. */
  353. FLD -8 * SIZE(AO)
  354. FLD -8 * SIZE(BO)
  355. fmul %st(1), %st
  356. faddp %st, %st(2)
  357. FLD -7 * SIZE(BO)
  358. fmulp %st, %st(1)
  359. faddp %st, %st(2)
  /* One k-step consumed: 1 element of A, 2 elements of B. */
  360. addl $1 * SIZE,AO
  361. addl $2 * SIZE,BO
  362. decl %eax
  363. jne .L26
  364. ALIGN_4
  /* 1x2 tile write-back. On entry the x87 stack holds acc0, acc1 (top first). */
  /* NOTE(review): relies on FST storing and popping - confirm in common.h. */
  365. .L28:
  366. #ifndef TRMMKERNEL
  /* GEMM: scale both accumulators by ALPHA (fmulp also pops ALPHA), then */
  /* add each into C: first column at CO, second column at CO + LDC. */
  367. FLD ALPHA
  368. fmul %st, %st(1)
  369. fmulp %st, %st(2)
  370. FLD 0 * SIZE(CO)
  371. faddp %st, %st(1)
  372. FST 0 * SIZE(CO)
  373. FLD 0 * SIZE(CO, LDC)
  374. faddp %st, %st(1)
  375. FST 0 * SIZE(CO, LDC)
  376. #else
  /* TRMM: the accumulators overwrite C directly; ALPHA is not applied here. */
  377. FST 0 * SIZE(CO)
  378. FST 0 * SIZE(CO, LDC)
  379. #endif
  /* TRMM variants that ran only KKK k-steps: step over the remaining */
  /* K - KKK k-steps (1 element of A, 2 elements of B per k-step). */
  380. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  381. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  382. movl K, %eax
  383. subl KKK, %eax
  384. sall $BASE_SHIFT, %eax
  385. leal (AO, %eax, 1), AO
  386. leal (BO, %eax, 2), BO
  387. #endif
  /* With LEFT, KK advances by the single row just finished. */
  388. #if defined(TRMMKERNEL) && defined(LEFT)
  389. addl $1, KK
  390. #endif
  391. addl $1 * SIZE, CO
  392. ALIGN_4
  /* Tail of the column-pair loop. */
  393. .L29:
  /* Without LEFT, KK advances by the two columns just finished. */
  394. #if defined(TRMMKERNEL) && !defined(LEFT)
  395. addl $2, KK
  396. #endif
  /* The B panel base moves to wherever the tile loops left the B cursor. */
  397. movl BO, B
  398. decl J
  399. jne .L01
  400. ALIGN_4
  /* Odd-column handling: one last single column if N is odd, otherwise exit. */
  401. .L30:
  402. movl N, %eax
  403. testl $1, %eax
  404. je .L999
  /* TRMM kernel built with LEFT: KK restarts from OFFSET for this column. */
  405. #if defined(TRMMKERNEL) && defined(LEFT)
  406. movl OFFSET, %eax
  407. movl %eax, KK
  408. #endif
  /* AO rewinds to the (biased) start of A; CO takes the current C pointer */
  /* and the stored C is advanced by one column. */
  409. movl A, AO
  410. movl C, CO
  411. addl LDC, C
  /* I = M >> 1 row pairs; none means go straight to the 1x1 tile. */
  412. movl M, I
  413. sarl $1, I
  414. je .L40
  415. ALIGN_4
  /* Setup for one 2x1 tile of C (two rows, one column). */
  416. .L31:
  417. #if !defined(TRMMKERNEL) || \
  418. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  419. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  420. movl B, BO
  421. #else
  /* Skip the first KK k-steps: 2 elements of A and 1 element of B per k-step. */
  422. movl KK, %eax
  423. sall $BASE_SHIFT, %eax
  424. leal (AO, %eax, 2), AO
  425. leal ( B, %eax, 1), BO
  426. #endif
  /* Two accumulators, one per row. */
  427. fldz
  428. fldz
  /* Prefetch the C column that this tile will write. */
  429. #if defined(HAVE_3DNOW)
  430. prefetchw 2 * SIZE(CO)
  431. #elif defined(HAVE_SSE)
  432. prefetchnta 2 * SIZE(CO)
  433. #endif
  /* k-count: K for GEMM; K - KK or KK + (2 with LEFT, else 1) for TRMM, */
  /* saved in KKK. */
  434. #ifndef TRMMKERNEL
  435. movl K, %eax
  436. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  437. movl K, %eax
  438. subl KK, %eax
  439. movl %eax, KKK
  440. #else
  441. movl KK, %eax
  442. #ifdef LEFT
  443. addl $2, %eax
  444. #else
  445. addl $1, %eax
  446. #endif
  447. movl %eax, KKK
  448. #endif
  /* Number of 4-way unrolled iterations; zero goes to the remainder loop. */
  449. sarl $2, %eax
  450. je .L35
  451. ALIGN_4
  /* 2x1 tile main loop: four k-steps per iteration. */
  /* Per k-step, with b = one B element and a0,a1 = two consecutive A elements: */
  /*   acc0 += a0*b   acc1 += a1*b   (acc0 is the top of the stack between steps) */
  452. .L32:
  453. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: b stays on the stack across both products; fmulp consumes it. */
  454. FLD -8 * SIZE(BO)
  455. FLD -8 * SIZE(AO)
  456. fmul %st(1), %st
  457. faddp %st, %st(2)
  458. FLD -7 * SIZE(AO)
  459. fmulp %st, %st(1)
  460. faddp %st, %st(2)
  /* k-step 1. */
  461. FLD -7 * SIZE(BO)
  462. FLD -6 * SIZE(AO)
  463. fmul %st(1), %st
  464. faddp %st, %st(2)
  465. FLD -5 * SIZE(AO)
  466. fmulp %st, %st(1)
  467. faddp %st, %st(2)
  /* k-step 2. */
  468. FLD -6 * SIZE(BO)
  469. FLD -4 * SIZE(AO)
  470. fmul %st(1), %st
  471. faddp %st, %st(2)
  472. FLD -3 * SIZE(AO)
  473. fmulp %st, %st(1)
  474. faddp %st, %st(2)
  /* k-step 3. */
  475. FLD -5 * SIZE(BO)
  476. FLD -2 * SIZE(AO)
  477. fmul %st(1), %st
  478. faddp %st, %st(2)
  479. FLD -1 * SIZE(AO)
  480. fmulp %st, %st(1)
  481. faddp %st, %st(2)
  /* Four k-steps consumed: 8 elements of A, 4 elements of B. */
  482. addl $8 * SIZE,AO
  483. addl $4 * SIZE,BO
  484. decl %eax
  485. jne .L32
  486. ALIGN_4
  /* 2x1 tile remainder: the k-count modulo 4, one k-step per iteration. */
  487. .L35:
  /* Reload the tile k-count (K for GEMM, the saved KKK for TRMM). */
  488. #ifndef TRMMKERNEL
  489. movl K, %eax
  490. #else
  491. movl KKK, %eax
  492. #endif
  493. and $3, %eax
  494. je .L38
  495. ALIGN_4
  496. .L36:
  /* acc0 += a0*b, acc1 += a1*b. */
  497. FLD -8 * SIZE(BO)
  498. FLD -8 * SIZE(AO)
  499. fmul %st(1), %st
  500. faddp %st, %st(2)
  501. FLD -7 * SIZE(AO)
  502. fmulp %st, %st(1)
  503. faddp %st, %st(2)
  /* One k-step consumed: 2 elements of A, 1 element of B. */
  504. addl $2 * SIZE,AO
  505. addl $1 * SIZE,BO
  506. decl %eax
  507. jne .L36
  508. ALIGN_4
  /* 2x1 tile write-back. On entry the x87 stack holds acc0, acc1 (top first). */
  /* NOTE(review): relies on FST storing and popping - confirm in common.h. */
  509. .L38:
  510. #ifndef TRMMKERNEL
  /* GEMM: scale both accumulators by ALPHA (fmulp also pops ALPHA), then */
  /* add them into the two consecutive C elements at CO. */
  511. FLD ALPHA
  512. fmul %st, %st(1)
  513. fmulp %st, %st(2)
  514. FLD 0 * SIZE(CO)
  515. faddp %st, %st(1)
  516. FST 0 * SIZE(CO)
  517. FLD 1 * SIZE(CO)
  518. faddp %st, %st(1)
  519. FST 1 * SIZE(CO)
  520. #else
  /* TRMM: the accumulators overwrite C directly; ALPHA is not applied here. */
  521. FST 0 * SIZE(CO)
  522. FST 1 * SIZE(CO)
  523. #endif
  /* TRMM variants that ran only KKK k-steps: step over the remaining */
  /* K - KKK k-steps (2 elements of A, 1 element of B per k-step). */
  524. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  525. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  526. movl K, %eax
  527. subl KKK, %eax
  528. sall $BASE_SHIFT, %eax
  529. leal (AO, %eax, 2), AO
  530. leal (BO, %eax, 1), BO
  531. #endif
  /* With LEFT, KK advances by the two rows just finished. */
  532. #if defined(TRMMKERNEL) && defined(LEFT)
  533. addl $2, KK
  534. #endif
  /* Move to the next row pair of C and loop. */
  535. addl $2 * SIZE, CO
  536. decl I
  537. jne .L31
  538. ALIGN_4
  /* Odd-row handling for the last column: a 1x1 tile if M is odd. */
  539. .L40:
  540. movl M, %eax
  541. andl $1, %eax
  542. je .L49
  543. ALIGN_4
  544. .L41:
  545. #if !defined(TRMMKERNEL) || \
  546. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  547. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  548. movl B, BO
  549. #else
  /* Skip the first KK k-steps: 1 element of each panel per k-step. */
  550. movl KK, %eax
  551. sall $BASE_SHIFT, %eax
  552. leal (AO, %eax, 1), AO
  553. leal ( B, %eax, 1), BO
  554. #endif
  /* Single accumulator. */
  555. fldz
  /* k-count: K for GEMM; K - KK or KK + 1 for TRMM, saved in KKK. */
  556. #ifndef TRMMKERNEL
  557. movl K, %eax
  558. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  559. movl K, %eax
  560. subl KK, %eax
  561. movl %eax, KKK
  562. #else
  563. movl KK, %eax
  /* Both branches add 1 here (the tile is 1 wide in both dimensions). */
  564. #ifdef LEFT
  565. addl $1, %eax
  566. #else
  567. addl $1, %eax
  568. #endif
  569. movl %eax, KKK
  570. #endif
  /* Number of 4-way unrolled iterations; zero goes to the remainder loop. */
  571. sarl $2, %eax
  572. je .L45
  573. ALIGN_4
  /* 1x1 tile main loop: four k-steps per iteration, each acc += a*b. */
  574. .L42:
  575. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: load a and b, multiply (pops b), add into acc (pops product). */
  576. FLD -8 * SIZE(AO)
  577. FLD -8 * SIZE(BO)
  578. fmulp %st, %st(1)
  579. faddp %st, %st(1)
  /* k-step 1. */
  580. FLD -7 * SIZE(AO)
  581. FLD -7 * SIZE(BO)
  582. fmulp %st, %st(1)
  583. faddp %st, %st(1)
  /* k-step 2. */
  584. FLD -6 * SIZE(AO)
  585. FLD -6 * SIZE(BO)
  586. fmulp %st, %st(1)
  587. faddp %st, %st(1)
  /* k-step 3. */
  588. FLD -5 * SIZE(AO)
  589. FLD -5 * SIZE(BO)
  590. fmulp %st, %st(1)
  591. faddp %st, %st(1)
  /* Four k-steps consumed: 4 elements from each panel. */
  592. addl $4 * SIZE,AO
  593. addl $4 * SIZE,BO
  594. decl %eax
  595. jne .L42
  596. ALIGN_4
  /* 1x1 tile remainder: the k-count modulo 4, one k-step per iteration. */
  597. .L45:
  /* Reload the tile k-count (K for GEMM, the saved KKK for TRMM). */
  598. #ifndef TRMMKERNEL
  599. movl K, %eax
  600. #else
  601. movl KKK, %eax
  602. #endif
  603. and $3, %eax
  604. je .L48
  605. ALIGN_4
  606. .L46:
  /* acc += a*b. */
  607. FLD -8 * SIZE(AO)
  608. FLD -8 * SIZE(BO)
  609. fmulp %st, %st(1)
  610. faddp %st, %st(1)
  /* One k-step consumed: 1 element from each panel. */
  611. addl $1 * SIZE,AO
  612. addl $1 * SIZE,BO
  613. decl %eax
  614. jne .L46
  615. ALIGN_4
  /* 1x1 tile write-back. On entry the x87 stack holds the single accumulator. */
  /* NOTE(review): relies on FST storing and popping - confirm in common.h. */
  616. .L48:
  617. #ifndef TRMMKERNEL
  /* GEMM: acc *= ALPHA (fmulp pops ALPHA), then C += acc. */
  618. FLD ALPHA
  619. fmulp %st, %st(1)
  620. FLD 0 * SIZE(CO)
  621. faddp %st, %st(1)
  622. FST 0 * SIZE(CO)
  623. #else
  /* TRMM: the accumulator overwrites C directly; ALPHA is not applied here. */
  624. FST 0 * SIZE(CO)
  625. #endif
  /* TRMM variants that ran only KKK k-steps: step over the remaining */
  /* K - KKK k-steps (1 element of each panel per k-step). */
  626. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  627. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  628. movl K, %eax
  629. subl KKK, %eax
  630. sall $BASE_SHIFT, %eax
  631. leal (AO, %eax, 1), AO
  632. leal (BO, %eax, 1), BO
  633. #endif
  /* With LEFT, KK advances by the single row just finished. */
  634. #if defined(TRMMKERNEL) && defined(LEFT)
  635. addl $1, KK
  636. #endif
  637. addl $1 * SIZE, CO
  638. ALIGN_4
  /* End of the odd-column pass. */
  639. .L49:
  /* Without LEFT, KK advances by the single column just finished. */
  640. #if defined(TRMMKERNEL) && !defined(LEFT)
  641. addl $1, KK
  642. #endif
  /* The B panel base moves to wherever the tile loops left the B cursor. */
  643. movl BO, B
  644. ALIGN_4
  /* Common exit: restore the four saved registers in reverse push order, */
  /* release the ARGS bytes of scratch, and return. */
  645. .L999:
  646. popl %ebx
  647. popl %esi
  648. popl %edi
  649. popl %ebp
  650. addl $ARGS, %esp
  651. ret
  652. EPILOGUE