You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

qtrsm_kernel_LT_2x2.S 19 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Assembler-side configuration for this x87 kernel: prefetch selection, */
  /* stack-frame slot offsets and register aliases. */
  /* SIZE, BASE_SHIFT, FLD, FST, PROLOGUE, PROFCODE, EPILOGUE and ALIGN_4 are */
  /* not defined in this file; presumably they are provided via common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* PREFETCH/PREFETCHW map to prefetch/prefetchw when OPTERON is defined, */
  /* otherwise both map to prefetcht0. PREFETCHW is not referenced in the */
  /* code visible here. */
  40. #ifdef OPTERON
  41. #define PREFETCH prefetch
  42. #define PREFETCHW prefetchw
  43. #else
  44. #define PREFETCH prefetcht0
  45. #define PREFETCHW prefetcht0
  46. #endif
  /* Look-ahead distance (in elements of SIZE bytes) used by the PREFETCH */
  /* lines inside the k-loops. */
  47. #define PREFETCHSIZE (5 + 4 * 10)
  /* STACK = 16 bytes: matches the four 4-byte pushl in the prologue. */
  /* ARGS = 16 bytes: local area reserved by "subl $ARGS, %esp". */
  48. #define STACK 16
  49. #define ARGS 16
  /* Locals living in the reserved area, just above the saved registers. */
  50. #define J 0 + STACK(%esp)
  51. #define KK 4 + STACK(%esp)
  52. #define AORIG 8 + STACK(%esp)
  /* Incoming arguments, above the saved registers and the local area. */
  /* The leading 4 presumably skips the return address - TODO confirm. */
  /* ALPHA's slot spans offsets 16..31 and is not referenced in the */
  /* code visible here. */
  53. #define M 4 + STACK + ARGS(%esp)
  54. #define N 8 + STACK + ARGS(%esp)
  55. #define K 12 + STACK + ARGS(%esp)
  56. #define ALPHA 16 + STACK + ARGS(%esp)
  57. #define A 32 + STACK + ARGS(%esp)
  58. #define ARG_B 36 + STACK + ARGS(%esp)
  59. #define C 40 + STACK + ARGS(%esp)
  60. #define ARG_LDC 44 + STACK + ARGS(%esp)
  61. #define OFFSET 48 + STACK + ARGS(%esp)
  /* Register aliases: I counts row pairs, B is the current packed-B base, */
  /* CO the current output pointer, AO/BO the running packed-A/packed-B */
  /* pointers, LDC the scaled leading dimension of C. */
  62. #define I %esi
  63. #define B %ebx
  64. #define CO %edi
  65. #define AO %edx
  66. #define BO %ecx
  67. #define LDC %ebp
  /* PREFETCH_OFFSET is not referenced in the code visible here. */
  68. #define PREFETCH_OFFSET 48
  /* Function entry: build the frame, load LDC and B into registers, bias */
  /* A and B, apply the variant-specific (LN / RT / RN) pointer and KK */
  /* initialisation, then start the loop over pairs of columns. */
  69. PROLOGUE
  70. subl $ARGS, %esp # Generate Stack Frame
  /* Save the four registers that are used as I, B, CO and LDC below. */
  71. pushl %ebp
  72. pushl %edi
  73. pushl %esi
  74. pushl %ebx
  75. PROFCODE
  76. movl ARG_LDC, LDC
  77. movl ARG_B, B
  /* LDC <<= BASE_SHIFT (presumably converts elements to bytes). */
  78. sall $BASE_SHIFT, LDC
  /* Bias A (in its stack slot) and B by 8 elements; every operand below */
  /* is addressed starting from -8 * SIZE relative to AO/BO. */
  79. addl $8 * SIZE, A
  80. addl $8 * SIZE, B
  /* LN: C += M << BASE_SHIFT and A += (M << BASE_SHIFT) * K. */
  81. #ifdef LN
  82. movl M, %eax
  83. sall $BASE_SHIFT, %eax
  84. addl %eax, C
  85. imull K, %eax
  86. addl %eax, A
  87. #endif
  /* RT: B += (N << BASE_SHIFT) * K and C += N * LDC (%ebp is LDC, */
  /* already scaled above). */
  88. #ifdef RT
  89. movl N, %eax
  90. sall $BASE_SHIFT, %eax
  91. imull K, %eax
  92. addl %eax, B
  93. movl N, %eax
  94. imull %ebp, %eax
  95. addl %eax, C
  96. #endif
  /* RN: KK = -OFFSET. */
  97. #ifdef RN
  98. movl OFFSET, %eax
  99. negl %eax
  100. movl %eax, KK
  101. #endif
  /* RT: KK = N - OFFSET. */
  102. #ifdef RT
  103. movl N, %eax
  104. subl OFFSET, %eax
  105. movl %eax, KK
  106. #endif
  /* J = N / 2 column pairs. The je tests ZF left by sarl (the movl in */
  /* between does not modify flags): no pairs -> go to the odd column. */
  107. movl N, %eax
  108. sarl $1, %eax
  109. movl %eax, J
  110. je .L30
  111. ALIGN_4
  /* Head of the loop over column pairs (J iterations, closed by the */
  /* "decl J / jne .L01" after .L29): set the A, B, C pointers and KK */
  /* for this pair, then start the loop over row pairs. */
  112. .L01:
  /* LT/RN walk A through AO directly; LN/RT keep the base in AORIG. */
  113. #if defined(LT) || defined(RN)
  114. movl A, AO
  115. #else
  116. movl A, %eax
  117. movl %eax, AORIG
  118. #endif
  /* RT: step B back by K << (1 + BASE_SHIFT), i.e. two packed columns. */
  119. #ifdef RT
  120. movl K, %eax
  121. sall $1 + BASE_SHIFT, %eax
  122. subl %eax, B
  123. #endif
  /* eax = 2 * LDC. RT moves C back before taking CO; the other variants */
  /* take CO first and then advance C past the two columns. */
  124. lea (, LDC, 2), %eax
  125. #ifdef RT
  126. subl %eax, C
  127. #endif
  128. movl C, CO
  129. #ifndef RT
  130. addl %eax, C
  131. #endif
  /* LN: KK = OFFSET + M.  LT: KK = OFFSET. */
  132. #ifdef LN
  133. movl OFFSET, %eax
  134. addl M, %eax
  135. movl %eax, KK
  136. #endif
  137. #ifdef LT
  138. movl OFFSET, %eax
  139. movl %eax, KK
  140. #endif
  /* I = M / 2 row pairs; none -> handle the possible single row at .L20. */
  141. movl M, I
  142. sarl $1, I
  143. je .L20
  144. ALIGN_4
  /* Per-tile setup for a 2-row x 2-column tile: position AO/BO, clear four */
  /* x87 accumulators and compute the k trip count. */
  145. .L11:
  /* LN: AORIG -= K << (1 + BASE_SHIFT), i.e. back by two packed rows. */
  146. #ifdef LN
  147. movl K, %eax
  148. sall $1 + BASE_SHIFT, %eax
  149. subl %eax, AORIG
  150. #endif
  /* LN/RT: AO = AORIG + 2 * (KK << BASE_SHIFT), BO = B + 2 * (KK << BASE_SHIFT). */
  /* LT/RN: AO continues from the previous tile, BO restarts at B. */
  151. #if defined(LN) || defined(RT)
  152. movl KK, %eax
  153. sall $BASE_SHIFT, %eax
  154. movl AORIG, AO
  155. leal (AO, %eax, 2), AO
  156. leal (B, %eax, 2), BO
  157. #else
  158. movl B, BO
  159. #endif
  /* Four zeroed accumulators on the x87 stack, st(0)..st(3). */
  160. fldz
  161. fldz
  162. fldz
  163. fldz
  /* Optional prefetch of the two output columns at CO and CO + LDC. */
  164. #if defined(HAVE_3DNOW)
  165. prefetchw 2 * SIZE(CO)
  166. prefetchw 2 * SIZE(CO, LDC, 1)
  167. #elif defined(HAVE_SSE)
  168. prefetchnta 2 * SIZE(CO)
  169. prefetchnta 2 * SIZE(CO, LDC, 1)
  170. #endif
  /* Trip count: KK for LT/RN, K - KK for LN/RT. */
  171. #if defined(LT) || defined(RN)
  172. movl KK, %eax
  173. #else
  174. movl K, %eax
  175. subl KK, %eax
  176. #endif
  /* The main loop is unrolled by 4; skip it when count / 4 == 0. */
  177. sarl $2, %eax
  178. je .L15
  179. ALIGN_4
  /* 2x2 inner product loop, unrolled 4x. Each k-step reads two values */
  /* from AO (a0, a1) and two from BO (b0, b1) and updates the four */
  /* accumulators, which sit at st(0)..st(3) between steps in the order */
  /* c00 (a0*b0), c01 (a0*b1), c10 (a1*b0), c11 (a1*b1). */
  /* Stack comments assume FLD pushes the loaded value - FLD is defined */
  /* outside this file. */
  180. .L12:
  181. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0 */
  182. FLD -8 * SIZE(AO) /* a0 */
  183. FLD -8 * SIZE(BO) /* b0 */
  184. fld %st(1) /* copy of a0 */
  185. fmul %st(1), %st /* a0*b0 */
  186. faddp %st, %st(3) /* c00 += a0*b0 */
  187. FLD -7 * SIZE(BO) /* b1 */
  188. fmul %st, %st(2) /* st(2): a0 -> a0*b1 */
  189. FLD -7 * SIZE(AO) /* a1 */
  190. fmul %st, %st(2) /* st(2): b0 -> a1*b0 */
  191. fmulp %st, %st(1) /* st(1): b1 -> a1*b1, pop a1 */
  192. faddp %st, %st(6) /* c11 += a1*b1 */
  193. faddp %st, %st(4) /* c10 += a1*b0 */
  194. faddp %st, %st(2) /* c01 += a0*b1 */
  /* k-step 1: same sequence on elements -6 / -5 */
  195. FLD -6 * SIZE(AO)
  196. FLD -6 * SIZE(BO)
  197. fld %st(1)
  198. fmul %st(1), %st
  199. faddp %st, %st(3)
  200. FLD -5 * SIZE(BO)
  201. fmul %st, %st(2)
  202. FLD -5 * SIZE(AO)
  203. fmul %st, %st(2)
  204. fmulp %st, %st(1)
  205. faddp %st, %st(6)
  206. faddp %st, %st(4)
  207. faddp %st, %st(2)
  208. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  /* k-step 2: same sequence on elements -4 / -3 */
  209. FLD -4 * SIZE(AO)
  210. FLD -4 * SIZE(BO)
  211. fld %st(1)
  212. fmul %st(1), %st
  213. faddp %st, %st(3)
  214. FLD -3 * SIZE(BO)
  215. fmul %st, %st(2)
  216. FLD -3 * SIZE(AO)
  217. fmul %st, %st(2)
  218. fmulp %st, %st(1)
  219. faddp %st, %st(6)
  220. faddp %st, %st(4)
  221. faddp %st, %st(2)
  /* k-step 3: same sequence on elements -2 / -1 */
  222. FLD -2 * SIZE(AO)
  223. FLD -2 * SIZE(BO)
  224. fld %st(1)
  225. fmul %st(1), %st
  226. faddp %st, %st(3)
  227. FLD -1 * SIZE(BO)
  228. fmul %st, %st(2)
  229. FLD -1 * SIZE(AO)
  230. fmul %st, %st(2)
  231. fmulp %st, %st(1)
  232. faddp %st, %st(6)
  233. faddp %st, %st(4)
  234. faddp %st, %st(2)
  /* Four k-steps consumed: 8 elements from each of AO and BO. */
  235. addl $8 * SIZE,AO
  236. addl $8 * SIZE,BO
  237. decl %eax
  238. jne .L12
  239. ALIGN_4
  /* Remainder of the 2x2 inner product: recompute the trip count (KK for */
  /* LT/RN, K - KK for LN/RT) and run count & 3 single k-steps. */
  240. .L15:
  241. #if defined(LT) || defined(RN)
  242. movl KK, %eax
  243. #else
  244. movl K, %eax
  245. subl KK, %eax
  246. #endif
  247. and $3, %eax
  248. je .L18
  249. ALIGN_4
  /* One k-step per iteration, same instruction sequence as a single */
  /* step of .L12; accumulators stay at st(0)..st(3) in the order */
  /* a0*b0, a0*b1, a1*b0, a1*b1. */
  250. .L16:
  251. FLD -8 * SIZE(AO)
  252. FLD -8 * SIZE(BO)
  253. fld %st(1)
  254. fmul %st(1), %st
  255. faddp %st, %st(3)
  256. FLD -7 * SIZE(BO)
  257. fmul %st, %st(2)
  258. FLD -7 * SIZE(AO)
  259. fmul %st, %st(2)
  260. fmulp %st, %st(1)
  261. faddp %st, %st(6)
  262. faddp %st, %st(4)
  263. faddp %st, %st(2)
  /* One k-step consumed: 2 elements from each of AO and BO. */
  264. addl $2 * SIZE,AO
  265. addl $2 * SIZE,BO
  266. decl %eax
  267. jne .L16
  268. ALIGN_4
  /* Solve and store for a 2x2 tile. On entry st(0)..st(3) hold the sums */
  /* built by .L12/.L16, in the order c00 (a0*b0), c01 (a0*b1), */
  /* c10 (a1*b0), c11 (a1*b1). The tile is combined with the packed */
  /* right-hand side, multiplied through the 2x2 triangular block, written */
  /* back to the packed buffer and to C, then pointers and KK advance. */
  269. .L18:
  /* LN/RT: re-aim AO/BO at offset 2 * ((KK - 2) << BASE_SHIFT) from */
  /* AORIG / B. Both branches of the inner #ifdef subtract 2. */
  270. #if defined(LN) || defined(RT)
  271. movl KK, %eax
  272. #ifdef LN
  273. subl $2, %eax
  274. #else
  275. subl $2, %eax
  276. #endif
  277. sall $BASE_SHIFT, %eax
  278. movl AORIG, AO
  279. leal (AO, %eax, 2), AO
  280. leal (B, %eax, 2), BO
  281. #endif
  /* Combine each accumulator with its packed value via fsubp (presumably */
  /* value - accumulator; operand direction follows the assembler's fsubp */
  /* convention - TODO confirm). */
  /* LN/LT read BO in the order c00, c01, c10, c11. */
  /* RN/RT read AO in the order c00, c10, c01, c11 (note st(3) before st(2)). */
  282. #if defined(LN) || defined(LT)
  283. FLD -8 * SIZE(BO)
  284. fsubp %st, %st(1)
  285. FLD -7 * SIZE(BO)
  286. fsubp %st, %st(2)
  287. FLD -6 * SIZE(BO)
  288. fsubp %st, %st(3)
  289. FLD -5 * SIZE(BO)
  290. fsubp %st, %st(4)
  291. #else
  292. FLD -8 * SIZE(AO)
  293. fsubp %st, %st(1)
  294. FLD -7 * SIZE(AO)
  295. fsubp %st, %st(3)
  296. FLD -6 * SIZE(AO)
  297. fsubp %st, %st(2)
  298. FLD -5 * SIZE(AO)
  299. fsubp %st, %st(4)
  300. #endif
  /* Diagonal entries are applied by multiplication, so they are presumably */
  /* stored pre-inverted in the packed buffer - TODO confirm. */
  /* LN: scale c10, c11 by -5(AO); fold -6(AO)*c10 into c00 and */
  /* -6(AO)*c11 into c01 with fsubrp; scale c00, c01 by -8(AO). */
  301. #ifdef LN
  302. FLD -5 * SIZE(AO)
  303. fmul %st, %st(3)
  304. fmulp %st, %st(4)
  305. FLD -6 * SIZE(AO)
  306. fmul %st(3), %st
  307. FLD -6 * SIZE(AO)
  308. fmul %st(5), %st
  309. fsubrp %st, %st(3)
  310. fsubrp %st, %st(1)
  311. FLD -8 * SIZE(AO)
  312. fmul %st, %st(1)
  313. fmulp %st, %st(2)
  314. #endif
  /* LT: scale c00, c01 by -8(AO); fold -7(AO)*c00 into c10 and */
  /* -7(AO)*c01 into c11; scale c10, c11 by -5(AO). */
  315. #ifdef LT
  316. FLD -8 * SIZE(AO)
  317. fmul %st, %st(1)
  318. fmulp %st, %st(2)
  319. FLD -7 * SIZE(AO)
  320. fmul %st(1), %st
  321. FLD -7 * SIZE(AO)
  322. fmul %st(3), %st
  323. fsubrp %st, %st(5)
  324. fsubrp %st, %st(3)
  325. FLD -5 * SIZE(AO)
  326. fmul %st, %st(3)
  327. fmulp %st, %st(4)
  328. #endif
  /* RN: scale c00, c10 by -8(BO); fold -7(BO)*c00 into c01 and */
  /* -7(BO)*c10 into c11; scale c01, c11 by -5(BO). */
  329. #ifdef RN
  330. FLD -8 * SIZE(BO)
  331. fmul %st, %st(1)
  332. fmulp %st, %st(3)
  333. FLD -7 * SIZE(BO)
  334. fmul %st(1), %st
  335. FLD -7 * SIZE(BO)
  336. fmul %st(4), %st
  337. fsubrp %st, %st(5)
  338. fsubrp %st, %st(2)
  339. FLD -5 * SIZE(BO)
  340. fmul %st, %st(2)
  341. fmulp %st, %st(4)
  342. #endif
  /* RT: scale c01, c11 by -5(BO); fold -6(BO)*c01 into c00 and */
  /* -6(BO)*c11 into c10; scale c00, c10 by -8(BO). */
  343. #ifdef RT
  344. FLD -5 * SIZE(BO)
  345. fmul %st, %st(2)
  346. fmulp %st, %st(4)
  347. FLD -6 * SIZE(BO)
  348. fmul %st(2), %st
  349. FLD -6 * SIZE(BO)
  350. fmul %st(5), %st
  351. fsubrp %st, %st(4)
  352. fsubrp %st, %st(1)
  353. FLD -8 * SIZE(BO)
  354. fmul %st, %st(1)
  355. fmulp %st, %st(3)
  356. #endif
  /* LN writes the tile below the current CO, so step CO back first. */
  357. #ifdef LN
  358. subl $2 * SIZE, CO
  359. #endif
  /* Write the results back to the packed buffer, then to C. Each */
  /* "fld %st" duplicates the top so the following FST (presumably a */
  /* store-and-pop, defined outside this file) leaves the value on the */
  /* stack for the final stores to C. */
  /* LN/LT: BO gets c00, c01, c10, c11; the stack then holds */
  /* c11, c00, c01, c10, which go to 1(CO,LDC), 0(CO), 0(CO,LDC), 1(CO). */
  360. #if defined(LN) || defined(LT)
  361. fld %st
  362. FST -8 * SIZE(BO)
  363. fxch %st(1)
  364. fld %st
  365. FST -7 * SIZE(BO)
  366. fxch %st(2)
  367. fld %st
  368. FST -6 * SIZE(BO)
  369. fxch %st(3)
  370. fld %st
  371. FST -5 * SIZE(BO)
  372. FST 1 * SIZE(CO, LDC)
  373. FST 0 * SIZE(CO)
  374. FST 0 * SIZE(CO, LDC)
  375. FST 1 * SIZE(CO)
  /* RN/RT: AO gets c00, c10, c01, c11; the stack then holds */
  /* c11, c10, c00, c01, which go to 1(CO,LDC), 1(CO), 0(CO), 0(CO,LDC). */
  376. #else
  377. fld %st
  378. FST -8 * SIZE(AO)
  379. fxch %st(2)
  380. fld %st
  381. FST -7 * SIZE(AO)
  382. fxch %st(1)
  383. fld %st
  384. FST -6 * SIZE(AO)
  385. fxch %st(3)
  386. fld %st
  387. FST -5 * SIZE(AO)
  388. FST 1 * SIZE(CO, LDC)
  389. FST 1 * SIZE(CO)
  390. FST 0 * SIZE(CO)
  391. FST 0 * SIZE(CO, LDC)
  392. #endif
  /* All variants except LN advance CO past the two rows just written. */
  393. #ifndef LN
  394. addl $2 * SIZE, CO
  395. #endif
  /* LT/RN: skip the remaining K - KK steps of this tile in both buffers */
  /* (2 elements per step each). */
  396. #if defined(LT) || defined(RN)
  397. movl K, %eax
  398. subl KK, %eax
  399. sall $BASE_SHIFT, %eax
  400. leal (AO, %eax, 2), AO
  401. leal (BO, %eax, 2), BO
  402. #endif
  /* KK moves by the two rows handled: down for LN, up for LT. */
  403. #ifdef LN
  404. subl $2, KK
  405. #endif
  406. #ifdef LT
  407. addl $2, KK
  408. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT), i.e. forward by two packed rows. */
  409. #ifdef RT
  410. movl K, %eax
  411. sall $1 + BASE_SHIFT, %eax
  412. addl %eax, AORIG
  413. #endif
  /* Next row pair. */
  414. decl I
  415. jne .L11
  416. ALIGN_4
  /* Leftover single row (M odd) against the current column pair: */
  /* skip to .L29 when M & 1 == 0. */
  417. .L20:
  418. movl M, %eax
  419. andl $1, %eax
  420. je .L29
  421. ALIGN_4
  /* Setup for the 1-row x 2-column tile. */
  422. .L21:
  /* LN: AORIG -= K << BASE_SHIFT, i.e. back by one packed row. */
  423. #ifdef LN
  424. movl K, %eax
  425. sall $0 + BASE_SHIFT, %eax
  426. subl %eax, AORIG
  427. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + 2 * (KK << BASE_SHIFT). */
  /* LT/RN: BO restarts at B. */
  428. #if defined(LN) || defined(RT)
  429. movl KK, %eax
  430. sall $BASE_SHIFT, %eax
  431. movl AORIG, AO
  432. leal (AO, %eax, 1), AO
  433. leal (B, %eax, 2), BO
  434. #else
  435. movl B, BO
  436. #endif
  /* Two zeroed accumulators, st(0) and st(1). */
  437. fldz
  438. fldz
  /* Trip count: KK for LT/RN, K - KK for LN/RT; main loop is unrolled 4x. */
  439. #if defined(LT) || defined(RN)
  440. movl KK, %eax
  441. #else
  442. movl K, %eax
  443. subl KK, %eax
  444. #endif
  445. sarl $2, %eax
  446. je .L25
  447. ALIGN_4
  /* 1x2 inner product loop, unrolled 4x. Each k-step reads one value a */
  /* from AO and two values b0, b1 from BO; the accumulators stay at */
  /* st(0) = sum of a*b0 and st(1) = sum of a*b1 between steps. */
  /* Stack comments assume FLD pushes the loaded value. */
  448. .L22:
  449. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0 */
  450. FLD -8 * SIZE(AO) /* a */
  451. FLD -8 * SIZE(BO) /* b0 */
  452. fmul %st(1), %st /* a*b0 */
  453. faddp %st, %st(2) /* first accumulator += a*b0 */
  454. FLD -7 * SIZE(BO) /* b1 */
  455. fmulp %st, %st(1) /* a -> a*b1, pop b1 */
  456. faddp %st, %st(2) /* second accumulator += a*b1 */
  /* k-step 1 */
  457. FLD -7 * SIZE(AO)
  458. FLD -6 * SIZE(BO)
  459. fmul %st(1), %st
  460. faddp %st, %st(2)
  461. FLD -5 * SIZE(BO)
  462. fmulp %st, %st(1)
  463. faddp %st, %st(2)
  /* k-step 2 */
  464. FLD -6 * SIZE(AO)
  465. FLD -4 * SIZE(BO)
  466. fmul %st(1), %st
  467. faddp %st, %st(2)
  468. FLD -3 * SIZE(BO)
  469. fmulp %st, %st(1)
  470. faddp %st, %st(2)
  /* k-step 3 */
  471. FLD -5 * SIZE(AO)
  472. FLD -2 * SIZE(BO)
  473. fmul %st(1), %st
  474. faddp %st, %st(2)
  475. FLD -1 * SIZE(BO)
  476. fmulp %st, %st(1)
  477. faddp %st, %st(2)
  /* Four k-steps consumed: 4 elements of AO, 8 of BO. */
  478. addl $4 * SIZE,AO
  479. addl $8 * SIZE,BO
  480. decl %eax
  481. jne .L22
  482. ALIGN_4
  /* Remainder of the 1x2 inner product: recompute the trip count (KK for */
  /* LT/RN, K - KK for LN/RT) and run count & 3 single k-steps. */
  483. .L25:
  484. #if defined(LT) || defined(RN)
  485. movl KK, %eax
  486. #else
  487. movl K, %eax
  488. subl KK, %eax
  489. #endif
  490. and $3, %eax
  491. je .L28
  492. ALIGN_4
  /* One k-step: st(0) accumulates a*b0, st(1) accumulates a*b1. */
  493. .L26:
  494. FLD -8 * SIZE(AO)
  495. FLD -8 * SIZE(BO)
  496. fmul %st(1), %st
  497. faddp %st, %st(2)
  498. FLD -7 * SIZE(BO)
  499. fmulp %st, %st(1)
  500. faddp %st, %st(2)
  /* One k-step consumed: 1 element of AO, 2 of BO. */
  501. addl $1 * SIZE,AO
  502. addl $2 * SIZE,BO
  503. decl %eax
  504. jne .L26
  505. ALIGN_4
  /* Solve and store for the 1-row x 2-column tile. On entry st(0) = c0 */
  /* (sum of a*b0) and st(1) = c1 (sum of a*b1), as built by .L22/.L26. */
  506. .L28:
  /* LN/RT: re-aim AO/BO. LN steps back one row (KK - 1), RT steps back */
  /* two columns (KK - 2); AO is scaled by 1, BO by 2. */
  507. #if defined(LN) || defined(RT)
  508. movl KK, %eax
  509. #ifdef LN
  510. subl $1, %eax
  511. #else
  512. subl $2, %eax
  513. #endif
  514. sall $BASE_SHIFT, %eax
  515. movl AORIG, AO
  516. leal (AO, %eax, 1), AO
  517. leal (B, %eax, 2), BO
  518. #endif
  /* Combine c0, c1 with the two packed values (from BO for LN/LT, from AO */
  /* for RN/RT) via fsubp (presumably value - accumulator; operand */
  /* direction follows the assembler's fsubp convention - TODO confirm). */
  519. #if defined(LN) || defined(LT)
  520. FLD -8 * SIZE(BO)
  521. fsubp %st, %st(1)
  522. FLD -7 * SIZE(BO)
  523. fsubp %st, %st(2)
  524. #else
  525. FLD -8 * SIZE(AO)
  526. fsubp %st, %st(1)
  527. FLD -7 * SIZE(AO)
  528. fsubp %st, %st(2)
  529. #endif
  /* LN/LT: the A block is a single element; scale c0 and c1 by -8(AO). */
  /* Applied by multiplication, so presumably stored pre-inverted. */
  530. #if defined(LN) || defined(LT)
  531. FLD -8 * SIZE(AO)
  532. fmul %st, %st(1)
  533. fmulp %st, %st(2)
  534. #endif
  /* RN: scale c0 by -8(BO); fold -7(BO)*c0 into c1; scale c1 by -5(BO). */
  535. #ifdef RN
  536. FLD -8 * SIZE(BO)
  537. fmulp %st, %st(1)
  538. FLD -7 * SIZE(BO)
  539. fmul %st(1), %st
  540. fsubrp %st, %st(2)
  541. FLD -5 * SIZE(BO)
  542. fmulp %st, %st(2)
  543. #endif
  /* RT: scale c1 by -5(BO); fold -6(BO)*c1 into c0; scale c0 by -8(BO). */
  544. #ifdef RT
  545. FLD -5 * SIZE(BO)
  546. fmulp %st, %st(2)
  547. FLD -6 * SIZE(BO)
  548. fmul %st(2), %st
  549. fsubrp %st, %st(1)
  550. FLD -8 * SIZE(BO)
  551. fmulp %st, %st(1)
  552. #endif
  /* LN writes below the current CO, so step CO back first. */
  553. #ifdef LN
  554. subl $1 * SIZE, CO
  555. #endif
  /* Write c0, c1 back to the packed buffer. Each "fld %st" duplicates */
  /* the top so the following FST (presumably store-and-pop) keeps the */
  /* value for the stores to C; afterwards the stack holds c1, c0. */
  556. #if defined(LN) || defined(LT)
  557. fld %st
  558. FST -8 * SIZE(BO)
  559. fxch %st(1)
  560. fld %st
  561. FST -7 * SIZE(BO)
  562. #else
  563. fld %st
  564. FST -8 * SIZE(AO)
  565. fxch %st(1)
  566. fld %st
  567. FST -7 * SIZE(AO)
  568. #endif
  /* c1 goes to the second column (CO + LDC), c0 to the first (CO). */
  569. FST 0 * SIZE(CO, LDC)
  570. FST 0 * SIZE(CO)
  571. #ifndef LN
  572. addl $1 * SIZE, CO
  573. #endif
  /* LT/RN: skip the remaining K - KK steps (1 element per step in AO, */
  /* 2 per step in BO). */
  574. #if defined(LT) || defined(RN)
  575. movl K, %eax
  576. subl KK, %eax
  577. sall $BASE_SHIFT, %eax
  578. leal (AO, %eax, 1), AO
  579. leal (BO, %eax, 2), BO
  580. #endif
  /* KK moves by the single row handled: down for LN, up for LT. */
  581. #ifdef LN
  582. subl $1, KK
  583. #endif
  584. #ifdef LT
  585. addl $1, KK
  586. #endif
  /* RT: AORIG += K << BASE_SHIFT, i.e. forward by one packed row. */
  587. #ifdef RT
  588. movl K, %eax
  589. sall $0 + BASE_SHIFT, %eax
  590. addl %eax, AORIG
  591. #endif
  592. ALIGN_4
  /* Tail of the column-pair loop: advance B and KK past the two columns */
  /* just processed and iterate while J is non-zero. */
  593. .L29:
  /* LN: B += 2 * (K << BASE_SHIFT). */
  594. #ifdef LN
  595. movl K, %eax
  596. sall $BASE_SHIFT, %eax
  597. leal (B, %eax, 2), B
  598. #endif
  /* LT/RN: BO already points past this column pair; adopt it as B. */
  599. #if defined(LT) || defined(RN)
  600. movl BO, B
  601. #endif
  /* KK moves by the two columns handled: up for RN, down for RT. */
  602. #ifdef RN
  603. addl $2, KK
  604. #endif
  605. #ifdef RT
  606. subl $2, KK
  607. #endif
  608. decl J
  609. jne .L01
  610. ALIGN_4
  /* Leftover single column (N odd): nothing to do when N & 1 == 0. */
  /* Otherwise set the A, B, C pointers and KK for one column, mirroring */
  /* .L01 with a stride of one column instead of two. */
  611. .L30:
  612. movl N, %eax
  613. testl $1, %eax
  614. je .L999
  /* LT/RN walk A through AO directly; LN/RT keep the base in AORIG. */
  615. #if defined(LT) || defined(RN)
  616. movl A, AO
  617. #else
  618. movl A, %eax
  619. movl %eax, AORIG
  620. #endif
  /* RT: step B back by K << BASE_SHIFT, i.e. one packed column. */
  621. #ifdef RT
  622. movl K, %eax
  623. sall $0 + BASE_SHIFT, %eax
  624. subl %eax, B
  625. #endif
  /* RT moves C back by LDC before taking CO; the other variants take CO */
  /* first and then advance C by LDC. */
  626. #ifdef RT
  627. subl LDC, C
  628. #endif
  629. movl C, CO
  630. #ifndef RT
  631. addl LDC, C
  632. #endif
  /* LN: KK = OFFSET + M.  LT: KK = OFFSET. */
  633. #ifdef LN
  634. movl OFFSET, %eax
  635. addl M, %eax
  636. movl %eax, KK
  637. #endif
  638. #ifdef LT
  639. movl OFFSET, %eax
  640. movl %eax, KK
  641. #endif
  /* I = M / 2 row pairs; none -> handle the possible single row at .L40. */
  642. movl M, I
  643. sarl $1, I
  644. je .L40
  645. ALIGN_4
  /* Per-tile setup for a 2-row x 1-column tile: position AO/BO, clear two */
  /* accumulators and compute the k trip count. */
  646. .L31:
  /* LN: AORIG -= K << (1 + BASE_SHIFT), i.e. back by two packed rows. */
  647. #ifdef LN
  648. movl K, %eax
  649. sall $1 + BASE_SHIFT, %eax
  650. subl %eax, AORIG
  651. #endif
  /* LN/RT: AO = AORIG + 2 * (KK << BASE_SHIFT), BO = B + (KK << BASE_SHIFT). */
  /* LT/RN: BO restarts at B. */
  652. #if defined(LN) || defined(RT)
  653. movl KK, %eax
  654. sall $BASE_SHIFT, %eax
  655. movl AORIG, AO
  656. leal (AO, %eax, 2), AO
  657. leal (B, %eax, 1), BO
  658. #else
  659. movl B, BO
  660. #endif
  /* Two zeroed accumulators, st(0) and st(1). */
  661. fldz
  662. fldz
  /* Optional prefetch of the output column at CO. */
  663. #if defined(HAVE_3DNOW)
  664. prefetchw 2 * SIZE(CO)
  665. #elif defined(HAVE_SSE)
  666. prefetchnta 2 * SIZE(CO)
  667. #endif
  /* Trip count: KK for LT/RN, K - KK for LN/RT; main loop is unrolled 4x. */
  668. #if defined(LT) || defined(RN)
  669. movl KK, %eax
  670. #else
  671. movl K, %eax
  672. subl KK, %eax
  673. #endif
  674. sarl $2, %eax
  675. je .L35
  676. ALIGN_4
  /* 2x1 inner product loop, unrolled 4x. Each k-step reads one value b */
  /* from BO and two values a0, a1 from AO; the accumulators stay at */
  /* st(0) = sum of a0*b and st(1) = sum of a1*b between steps. */
  /* Stack comments assume FLD pushes the loaded value. */
  677. .L32:
  678. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0 */
  679. FLD -8 * SIZE(BO) /* b */
  680. FLD -8 * SIZE(AO) /* a0 */
  681. fmul %st(1), %st /* a0*b */
  682. faddp %st, %st(2) /* first accumulator += a0*b */
  683. FLD -7 * SIZE(AO) /* a1 */
  684. fmulp %st, %st(1) /* b -> a1*b, pop a1 */
  685. faddp %st, %st(2) /* second accumulator += a1*b */
  /* k-step 1 */
  686. FLD -7 * SIZE(BO)
  687. FLD -6 * SIZE(AO)
  688. fmul %st(1), %st
  689. faddp %st, %st(2)
  690. FLD -5 * SIZE(AO)
  691. fmulp %st, %st(1)
  692. faddp %st, %st(2)
  /* k-step 2 */
  693. FLD -6 * SIZE(BO)
  694. FLD -4 * SIZE(AO)
  695. fmul %st(1), %st
  696. faddp %st, %st(2)
  697. FLD -3 * SIZE(AO)
  698. fmulp %st, %st(1)
  699. faddp %st, %st(2)
  /* k-step 3 */
  700. FLD -5 * SIZE(BO)
  701. FLD -2 * SIZE(AO)
  702. fmul %st(1), %st
  703. faddp %st, %st(2)
  704. FLD -1 * SIZE(AO)
  705. fmulp %st, %st(1)
  706. faddp %st, %st(2)
  /* Four k-steps consumed: 8 elements of AO, 4 of BO. */
  707. addl $8 * SIZE,AO
  708. addl $4 * SIZE,BO
  709. decl %eax
  710. jne .L32
  711. ALIGN_4
  /* Remainder of the 2x1 inner product: recompute the trip count (KK for */
  /* LT/RN, K - KK for LN/RT) and run count & 3 single k-steps. */
  712. .L35:
  713. #if defined(LT) || defined(RN)
  714. movl KK, %eax
  715. #else
  716. movl K, %eax
  717. subl KK, %eax
  718. #endif
  719. and $3, %eax
  720. je .L38
  721. ALIGN_4
  /* One k-step: st(0) accumulates a0*b, st(1) accumulates a1*b. */
  722. .L36:
  723. FLD -8 * SIZE(BO)
  724. FLD -8 * SIZE(AO)
  725. fmul %st(1), %st
  726. faddp %st, %st(2)
  727. FLD -7 * SIZE(AO)
  728. fmulp %st, %st(1)
  729. faddp %st, %st(2)
  /* One k-step consumed: 2 elements of AO, 1 of BO. */
  730. addl $2 * SIZE,AO
  731. addl $1 * SIZE,BO
  732. decl %eax
  733. jne .L36
  734. ALIGN_4
  /* Solve and store for the 2-row x 1-column tile. On entry st(0) = c0 */
  /* (sum of a0*b) and st(1) = c1 (sum of a1*b), as built by .L32/.L36. */
  735. .L38:
  /* LN/RT: re-aim AO/BO. LN steps back two rows (KK - 2), RT steps back */
  /* one column (KK - 1); AO is scaled by 2, BO by 1. */
  736. #if defined(LN) || defined(RT)
  737. movl KK, %eax
  738. #ifdef LN
  739. subl $2, %eax
  740. #else
  741. subl $1, %eax
  742. #endif
  743. sall $BASE_SHIFT, %eax
  744. movl AORIG, AO
  745. leal (AO, %eax, 2), AO
  746. leal (B, %eax, 1), BO
  747. #endif
  /* Combine c0, c1 with the two packed values (from BO for LN/LT, from AO */
  /* for RN/RT) via fsubp (presumably value - accumulator; operand */
  /* direction follows the assembler's fsubp convention - TODO confirm). */
  748. #if defined(LN) || defined(LT)
  749. FLD -8 * SIZE(BO)
  750. fsubp %st, %st(1)
  751. FLD -7 * SIZE(BO)
  752. fsubp %st, %st(2)
  753. #else
  754. FLD -8 * SIZE(AO)
  755. fsubp %st, %st(1)
  756. FLD -7 * SIZE(AO)
  757. fsubp %st, %st(2)
  758. #endif
  /* Diagonal entries are applied by multiplication, so they are presumably */
  /* stored pre-inverted in the packed buffer - TODO confirm. */
  /* LN: scale c1 by -5(AO); fold -6(AO)*c1 into c0; scale c0 by -8(AO). */
  759. #ifdef LN
  760. FLD -5 * SIZE(AO)
  761. fmulp %st, %st(2)
  762. FLD -6 * SIZE(AO)
  763. fmul %st(2), %st
  764. fsubrp %st, %st(1)
  765. FLD -8 * SIZE(AO)
  766. fmulp %st, %st(1)
  767. #endif
  /* LT: scale c0 by -8(AO); fold -7(AO)*c0 into c1; scale c1 by -5(AO). */
  768. #ifdef LT
  769. FLD -8 * SIZE(AO)
  770. fmulp %st, %st(1)
  771. FLD -7 * SIZE(AO)
  772. fmul %st(1), %st
  773. fsubrp %st, %st(2)
  774. FLD -5 * SIZE(AO)
  775. fmulp %st, %st(2)
  776. #endif
  /* RN and RT: the B block is a single element; scale c0 and c1 by -8(BO). */
  777. #ifdef RN
  778. FLD -8 * SIZE(BO)
  779. fmul %st, %st(1)
  780. fmulp %st, %st(2)
  781. #endif
  782. #ifdef RT
  783. FLD -8 * SIZE(BO)
  784. fmul %st, %st(1)
  785. fmulp %st, %st(2)
  786. #endif
  /* LN writes below the current CO, so step CO back first. */
  787. #ifdef LN
  788. subl $2 * SIZE, CO
  789. #endif
  /* Write c0, c1 back to the packed buffer. Each "fld %st" duplicates */
  /* the top so the following FST (presumably store-and-pop) keeps the */
  /* value for the stores to C; afterwards the stack holds c1, c0. */
  790. #if defined(LN) || defined(LT)
  791. fld %st
  792. FST -8 * SIZE(BO)
  793. fxch %st(1)
  794. fld %st
  795. FST -7 * SIZE(BO)
  796. #else
  797. fld %st
  798. FST -8 * SIZE(AO)
  799. fxch %st(1)
  800. fld %st
  801. FST -7 * SIZE(AO)
  802. #endif
  /* c1 goes to the second row (1 * SIZE(CO)), c0 to the first. */
  803. FST 1 * SIZE(CO)
  804. FST 0 * SIZE(CO)
  805. #ifndef LN
  806. addl $2 * SIZE, CO
  807. #endif
  /* LT/RN: skip the remaining K - KK steps (2 elements per step in AO, */
  /* 1 per step in BO). */
  808. #if defined(LT) || defined(RN)
  809. movl K, %eax
  810. subl KK, %eax
  811. sall $BASE_SHIFT, %eax
  812. leal (AO, %eax, 2), AO
  813. leal (BO, %eax, 1), BO
  814. #endif
  /* KK moves by the two rows handled: down for LN, up for LT. */
  815. #ifdef LN
  816. subl $2, KK
  817. #endif
  818. #ifdef LT
  819. addl $2, KK
  820. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT), i.e. forward by two packed rows. */
  821. #ifdef RT
  822. movl K, %eax
  823. sall $1 + BASE_SHIFT, %eax
  824. addl %eax, AORIG
  825. #endif
  /* Next row pair. */
  826. decl I
  827. jne .L31
  828. ALIGN_4
  /* Leftover single row (M odd) against the single leftover column: */
  /* skip to .L49 when M & 1 == 0. */
  829. .L40:
  830. movl M, %eax
  831. andl $1, %eax
  832. je .L49
  833. ALIGN_4
  /* Setup for the 1x1 tile. */
  834. .L41:
  /* LN: AORIG -= K << BASE_SHIFT, i.e. back by one packed row. */
  835. #ifdef LN
  836. movl K, %eax
  837. sall $0 + BASE_SHIFT, %eax
  838. subl %eax, AORIG
  839. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << BASE_SHIFT). */
  /* LT/RN: BO restarts at B. */
  840. #if defined(LN) || defined(RT)
  841. movl KK, %eax
  842. sall $BASE_SHIFT, %eax
  843. movl AORIG, AO
  844. leal (AO, %eax, 1), AO
  845. leal (B, %eax, 1), BO
  846. #else
  847. movl B, BO
  848. #endif
  /* Single zeroed accumulator in st(0). */
  849. fldz
  /* Trip count: KK for LT/RN, K - KK for LN/RT; main loop is unrolled 4x. */
  850. #if defined(LT) || defined(RN)
  851. movl KK, %eax
  852. #else
  853. movl K, %eax
  854. subl KK, %eax
  855. #endif
  856. sarl $2, %eax
  857. je .L45
  858. ALIGN_4
  /* 1x1 inner product loop, unrolled 4x: each k-step loads one value */
  /* from AO and one from BO, multiplies them and adds the product to */
  /* the single accumulator (st(0) between steps). */
  /* Stack comments assume FLD pushes the loaded value. */
  859. .L42:
  860. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  861. FLD -8 * SIZE(AO) /* a */
  862. FLD -8 * SIZE(BO) /* b */
  863. fmulp %st, %st(1) /* a*b, pop b */
  864. faddp %st, %st(1) /* accumulator += a*b */
  865. FLD -7 * SIZE(AO)
  866. FLD -7 * SIZE(BO)
  867. fmulp %st, %st(1)
  868. faddp %st, %st(1)
  869. FLD -6 * SIZE(AO)
  870. FLD -6 * SIZE(BO)
  871. fmulp %st, %st(1)
  872. faddp %st, %st(1)
  873. FLD -5 * SIZE(AO)
  874. FLD -5 * SIZE(BO)
  875. fmulp %st, %st(1)
  876. faddp %st, %st(1)
  /* Four k-steps consumed: 4 elements from each of AO and BO. */
  877. addl $4 * SIZE,AO
  878. addl $4 * SIZE,BO
  879. decl %eax
  880. jne .L42
  881. ALIGN_4
  /* Remainder of the 1x1 inner product: recompute the trip count (KK for */
  /* LT/RN, K - KK for LN/RT) and run count & 3 single k-steps. */
  882. .L45:
  883. #if defined(LT) || defined(RN)
  884. movl KK, %eax
  885. #else
  886. movl K, %eax
  887. subl KK, %eax
  888. #endif
  889. and $3, %eax
  890. je .L48
  891. ALIGN_4
  /* One k-step: accumulator += (value at AO) * (value at BO). */
  892. .L46:
  893. FLD -8 * SIZE(AO)
  894. FLD -8 * SIZE(BO)
  895. fmulp %st, %st(1)
  896. faddp %st, %st(1)
  /* One k-step consumed: 1 element from each of AO and BO. */
  897. addl $1 * SIZE,AO
  898. addl $1 * SIZE,BO
  899. decl %eax
  900. jne .L46
  901. ALIGN_4
  /* Solve and store for the 1x1 tile. On entry st(0) holds the single */
  /* accumulator built by .L42/.L46. */
  902. .L48:
  /* LN/RT: re-aim AO/BO at offset (KK - 1) << BASE_SHIFT from AORIG / B. */
  /* Both branches of the inner #ifdef subtract 1. */
  903. #if defined(LN) || defined(RT)
  904. movl KK, %eax
  905. #ifdef LN
  906. subl $1, %eax
  907. #else
  908. subl $1, %eax
  909. #endif
  910. sall $BASE_SHIFT, %eax
  911. movl AORIG, AO
  912. leal (AO, %eax, 1), AO
  913. leal (B, %eax, 1), BO
  914. #endif
  /* Combine the accumulator with the packed value (from BO for LN/LT, */
  /* from AO for RN/RT) via fsubp (presumably value - accumulator; operand */
  /* direction follows the assembler's fsubp convention - TODO confirm). */
  915. #if defined(LN) || defined(LT)
  916. FLD -8 * SIZE(BO)
  917. fsubp %st, %st(1)
  918. #else
  919. FLD -8 * SIZE(AO)
  920. fsubp %st, %st(1)
  921. #endif
  /* Scale by the single triangular entry: -8(AO) for LN/LT, -8(BO) for */
  /* RN/RT. Applied by multiplication, so presumably stored pre-inverted. */
  922. #ifdef LN
  923. FLD -8 * SIZE(AO)
  924. fmulp %st, %st(1)
  925. #endif
  926. #ifdef LT
  927. FLD -8 * SIZE(AO)
  928. fmulp %st, %st(1)
  929. #endif
  930. #ifdef RN
  931. FLD -8 * SIZE(BO)
  932. fmulp %st, %st(1)
  933. #endif
  934. #ifdef RT
  935. FLD -8 * SIZE(BO)
  936. fmulp %st, %st(1)
  937. #endif
  /* LN writes below the current CO, so step CO back first. */
  938. #ifdef LN
  939. subl $1 * SIZE, CO
  940. #endif
  /* Write the result back to the packed buffer; "fld %st" duplicates the */
  /* top so the FST (presumably store-and-pop) keeps a copy for C. */
  941. #if defined(LN) || defined(LT)
  942. fld %st
  943. FST -8 * SIZE(BO)
  944. #else
  945. fld %st
  946. FST -8 * SIZE(AO)
  947. #endif
  948. FST 0 * SIZE(CO)
  949. #ifndef LN
  950. addl $1 * SIZE, CO
  951. #endif
  /* LT/RN: skip the remaining K - KK steps (1 element per step in each */
  /* buffer). */
  952. #if defined(LT) || defined(RN)
  953. movl K, %eax
  954. subl KK, %eax
  955. sall $BASE_SHIFT, %eax
  956. leal (AO, %eax, 1), AO
  957. leal (BO, %eax, 1), BO
  958. #endif
  /* KK moves by the single row handled: down for LN, up for LT. */
  959. #ifdef LN
  960. subl $1, KK
  961. #endif
  962. #ifdef LT
  963. addl $1, KK
  964. #endif
  /* RT: AORIG += K << BASE_SHIFT, i.e. forward by one packed row. */
  965. #ifdef RT
  966. movl K, %eax
  967. sall $0 + BASE_SHIFT, %eax
  968. addl %eax, AORIG
  969. #endif
  970. ALIGN_4
  /* Tail of the single-column pass: advance B and KK past the column just */
  /* processed, then fall through to the epilogue. */
  971. .L49:
  /* LN: B += K << BASE_SHIFT. */
  972. #ifdef LN
  973. movl K, %eax
  974. sall $BASE_SHIFT, %eax
  975. leal (B, %eax, 1), B
  976. #endif
  /* LT/RN: BO already points past this column; adopt it as B. */
  977. #if defined(LT) || defined(RN)
  978. movl BO, B
  979. #endif
  /* KK moves by the single column handled: up for RN, down for RT. */
  980. #ifdef RN
  981. addl $1, KK
  982. #endif
  983. #ifdef RT
  984. subl $1, KK
  985. #endif
  986. ALIGN_4
  /* Common exit: restore the four registers pushed in the prologue (in */
  /* reverse push order), release the ARGS-byte local area and return. */
  987. .L999:
  988. popl %ebx
  989. popl %esi
  990. popl %edi
  991. popl %ebp
  992. addl $ARGS, %esp
  993. ret
  994. EPILOGUE