You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qtrsm_kernel_RT_2x2.S 19 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch mnemonics: OPTERON builds use prefetch/prefetchw; every other */
  /* build maps both macros to prefetcht0. PREFETCHW is not referenced in  */
  /* the code shown in this file. */
  40. #ifdef OPTERON
  41. #define PREFETCH prefetch
  42. #define PREFETCHW prefetchw
  43. #else
  44. #define PREFETCH prefetcht0
  45. #define PREFETCHW prefetcht0
  46. #endif
  /* Distance, in elements, ahead of AO that the unrolled loops prefetch. */
  47. #define PREFETCHSIZE (5 + 4 * 10)
  /* Frame geometry: STACK is the 16 bytes taken by the four pushl's in the */
  /* prologue; ARGS is the 16 bytes of scratch reserved by "subl $ARGS, %esp". */
  48. #define STACK 16
  49. #define ARGS 16
  /* Scratch slots inside the ARGS area, directly above the saved registers. */
  /*   J     - counter of remaining column pairs (N >> 1, decremented per pair) */
  /*   KK    - running offset used to split K into the part that is          */
  /*           multiplied and the triangular part that is solved             */
  /*   KKK   - not referenced in the code shown in this file                 */
  /*   AORIG - copy of the A pointer kept by the variants that are not LT/RN */
  50. #define J 0 + STACK(%esp)
  51. #define KK 4 + STACK(%esp)
  52. #define KKK 8 + STACK(%esp)
  53. #define AORIG 12 + STACK(%esp)
  /* Incoming stack arguments. With the frame above, STACK + ARGS(%esp) is */
  /* the return address (assuming PROLOGUE itself does not move %esp), so  */
  /* the first argument sits 4 bytes higher. ALPHA is not referenced in    */
  /* the code shown in this file. A, B and C slots are updated in place.   */
  54. #define M 4 + STACK + ARGS(%esp)
  55. #define N 8 + STACK + ARGS(%esp)
  56. #define K 12 + STACK + ARGS(%esp)
  57. #define ALPHA 16 + STACK + ARGS(%esp)
  58. #define A 32 + STACK + ARGS(%esp)
  59. #define ARG_B 36 + STACK + ARGS(%esp)
  60. #define C 40 + STACK + ARGS(%esp)
  61. #define ARG_LDC 44 + STACK + ARGS(%esp)
  62. #define OFFSET 48 + STACK + ARGS(%esp)
  /* Register roles for the whole routine:                                 */
  /*   I   - row-tile counter            B   - current packed-B pointer     */
  /*   CO  - output pointer into C       AO  - read pointer into A          */
  /*   BO  - read pointer into B         LDC - column stride of C (bytes    */
  /*                                           after the prologue scales it) */
  63. #define I %esi
  64. #define B %ebx
  65. #define CO %edi
  66. #define AO %edx
  67. #define BO %ecx
  68. #define LDC %ebp
  /* PREFETCH_OFFSET is not referenced in the code shown in this file. */
  69. #define PREFETCH_OFFSET 48
  /* Kernel entry. Builds the stack frame, loads LDC and B into registers, */
  /* positions A/B/C for the selected variant (LN, LT, RN or RT, chosen by  */
  /* the preprocessor), initialises KK, and - when N is odd - sets up the   */
  /* single leftover column that is processed before the column pairs.      */
  /* PROLOGUE, PROFCODE, SIZE, BASE_SHIFT and ALIGN_4 come from outside     */
  /* this file (presumably common.h).                                       */
  70. PROLOGUE
  71. subl $ARGS, %esp # Generate Stack Frame
  /* Save the registers this routine uses as LDC, CO, I and B. */
  72. pushl %ebp
  73. pushl %edi
  74. pushl %esi
  75. pushl %ebx
  76. PROFCODE
  77. movl ARG_LDC, LDC
  78. movl ARG_B, B
  /* LDC: elements -> bytes (assumes BASE_SHIFT is log2 of SIZE - TODO confirm). */
  79. sall $BASE_SHIFT, LDC
  /* Bias A (in its stack slot) and B by 8 elements; every later load uses */
  /* offsets that start at -8 * SIZE relative to AO/BO.                    */
  80. addl $8 * SIZE, A
  81. addl $8 * SIZE, B
  /* LN: advance C by M elements and A by M * K elements. */
  82. #ifdef LN
  83. movl M, %eax
  84. sall $BASE_SHIFT, %eax
  85. addl %eax, C
  86. imull K, %eax
  87. addl %eax, A
  88. #endif
  /* RT: advance B by N * K elements and C by N columns (N * LDC bytes), */
  /* so the column loops below can walk backwards.                        */
  89. #ifdef RT
  90. movl N, %eax
  91. sall $BASE_SHIFT, %eax
  92. imull K, %eax
  93. addl %eax, B
  94. movl N, %eax
  95. imull %ebp, %eax
  96. addl %eax, C
  97. #endif
  /* RN: KK = -OFFSET. */
  98. #ifdef RN
  99. movl OFFSET, %eax
  100. negl %eax
  101. movl %eax, KK
  102. #endif
  /* RT: KK = N - OFFSET. */
  103. #ifdef RT
  104. movl N, %eax
  105. subl OFFSET, %eax
  106. movl %eax, KK
  107. #endif
  /* If N is even there is no leftover column: go straight to the pairs. */
  108. movl N, %eax
  109. testl $1, %eax
  110. je .L30
  /* ---- single (odd) column ---- */
  /* LT/RN read A forward from its start; the other variants remember the */
  /* base in AORIG and recompute AO per tile.                              */
  111. #if defined(LT) || defined(RN)
  112. movl A, AO
  113. #else
  114. movl A, %eax
  115. movl %eax, AORIG
  116. #endif
  /* RT: step B back by one column of K elements. */
  117. #ifdef RT
  118. movl K, %eax
  119. sall $0 + BASE_SHIFT, %eax
  120. subl %eax, B
  121. #endif
  /* RT moves C back one column before use; all other variants use the */
  /* current column and then advance C past it.                         */
  122. #ifdef RT
  123. subl LDC, C
  124. #endif
  125. movl C, CO
  126. #ifndef RT
  127. addl LDC, C
  128. #endif
  /* LN: KK = OFFSET + M.  LT: KK = OFFSET. */
  129. #ifdef LN
  130. movl OFFSET, %eax
  131. addl M, %eax
  132. movl %eax, KK
  133. #endif
  134. #ifdef LT
  135. movl OFFSET, %eax
  136. movl %eax, KK
  137. #endif
  /* I = M / 2 two-row tiles; skip to the single-row case if there are none. */
  138. movl M, I
  139. sarl $1, I
  140. je .L40
  141. ALIGN_4
  /* 2-row x 1-column tile of the odd column. Accumulates two dot products */
  /* on the x87 stack, combines them with the stored values, applies the    */
  /* small triangular solve for the active variant, and writes the results  */
  /* to both the packed buffer and C. Repeats I times.                      */
  142. .L31:
  /* LN: move AORIG back by one 2-row panel (2 * K elements). */
  143. #ifdef LN
  144. movl K, %eax
  145. sall $1 + BASE_SHIFT, %eax
  146. subl %eax, AORIG
  147. #endif
  /* LN/RT: AO = AORIG + 2 * KK elements, BO = B + KK elements; */
  /* LT/RN: BO simply restarts at B (AO keeps running forward).   */
  148. #if defined(LN) || defined(RT)
  149. movl KK, %eax
  150. sall $BASE_SHIFT, %eax
  151. movl AORIG, AO
  152. leal (AO, %eax, 2), AO
  153. leal (B, %eax, 1), BO
  154. #else
  155. movl B, BO
  156. #endif
  /* Two zeroed accumulators: st(0) for row 0, st(1) for row 1. */
  157. fldz
  158. fldz
  159. #if defined(HAVE_3DNOW)
  160. prefetchw 2 * SIZE(CO)
  161. #elif defined(HAVE_SSE)
  162. prefetchnta 2 * SIZE(CO)
  163. #endif
  /* Trip count: KK for LT/RN, K - KK otherwise; the main loop is unrolled x4. */
  164. #if defined(LT) || defined(RN)
  165. movl KK, %eax
  166. #else
  167. movl K, %eax
  168. subl KK, %eax
  169. #endif
  170. sarl $2, %eax
  171. je .L35
  172. ALIGN_4
  /* Unrolled loop: four k-steps per pass, each loading one B value and */
  /* multiplying it by two A values (8 A elements, 4 B elements total).  */
  173. .L32:
  174. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: acc0 += a0 * b, acc1 += a1 * b */
  175. FLD -8 * SIZE(BO)
  176. FLD -8 * SIZE(AO)
  177. fmul %st(1), %st
  178. faddp %st, %st(2)
  179. FLD -7 * SIZE(AO)
  180. fmulp %st, %st(1)
  181. faddp %st, %st(2)
  /* k-step 1 */
  182. FLD -7 * SIZE(BO)
  183. FLD -6 * SIZE(AO)
  184. fmul %st(1), %st
  185. faddp %st, %st(2)
  186. FLD -5 * SIZE(AO)
  187. fmulp %st, %st(1)
  188. faddp %st, %st(2)
  /* k-step 2 */
  189. FLD -6 * SIZE(BO)
  190. FLD -4 * SIZE(AO)
  191. fmul %st(1), %st
  192. faddp %st, %st(2)
  193. FLD -3 * SIZE(AO)
  194. fmulp %st, %st(1)
  195. faddp %st, %st(2)
  /* k-step 3 */
  196. FLD -5 * SIZE(BO)
  197. FLD -2 * SIZE(AO)
  198. fmul %st(1), %st
  199. faddp %st, %st(2)
  200. FLD -1 * SIZE(AO)
  201. fmulp %st, %st(1)
  202. faddp %st, %st(2)
  203. addl $8 * SIZE,AO
  204. addl $4 * SIZE,BO
  205. decl %eax
  206. jne .L32
  207. ALIGN_4
  /* Remainder: the low two bits of the same trip count, one k-step per pass. */
  208. .L35:
  209. #if defined(LT) || defined(RN)
  210. movl KK, %eax
  211. #else
  212. movl K, %eax
  213. subl KK, %eax
  214. #endif
  215. and $3, %eax
  216. je .L38
  217. ALIGN_4
  218. .L36:
  219. FLD -8 * SIZE(BO)
  220. FLD -8 * SIZE(AO)
  221. fmul %st(1), %st
  222. faddp %st, %st(2)
  223. FLD -7 * SIZE(AO)
  224. fmulp %st, %st(1)
  225. faddp %st, %st(2)
  226. addl $2 * SIZE,AO
  227. addl $1 * SIZE,BO
  228. decl %eax
  229. jne .L36
  230. ALIGN_4
  /* Solve step for the tile. */
  231. .L38:
  /* LN/RT: re-point AO/BO at the tile's own entries: index KK - 2 (LN) or */
  /* KK - 1 (RT), i.e. AO = AORIG + 2 * idx elements, BO = B + idx elements. */
  232. #if defined(LN) || defined(RT)
  233. movl KK, %eax
  234. #ifdef LN
  235. subl $2, %eax
  236. #else
  237. subl $1, %eax
  238. #endif
  239. sall $BASE_SHIFT, %eax
  240. movl AORIG, AO
  241. leal (AO, %eax, 2), AO
  242. leal (B, %eax, 1), BO
  243. #endif
  /* Combine the two stored values (from BO for LN/LT, from AO for RN/RT)  */
  /* with the accumulators. NOTE(review): the intended result is presumably */
  /* (stored value - accumulated sum); GNU as has a documented fsub/fsubr   */
  /* operand-order quirk for the "%st, %st(i)" forms - confirm when reading. */
  244. #if defined(LN) || defined(LT)
  245. FLD -8 * SIZE(BO)
  246. fsubp %st, %st(1)
  247. FLD -7 * SIZE(BO)
  248. fsubp %st, %st(2)
  249. #else
  250. FLD -8 * SIZE(AO)
  251. fsubp %st, %st(1)
  252. FLD -7 * SIZE(AO)
  253. fsubp %st, %st(2)
  254. #endif
  /* The solves below only multiply by the diagonal coefficients (no fdiv), */
  /* so the packed triangle presumably stores reciprocal diagonals - verify */
  /* against the packing routine.                                           */
  /* LN: 2x2 block of A, coefficients at -5, -6, -8: second value first,    */
  /* then eliminate into the first.                                         */
  255. #ifdef LN
  256. FLD -5 * SIZE(AO)
  257. fmulp %st, %st(2)
  258. FLD -6 * SIZE(AO)
  259. fmul %st(2), %st
  260. fsubrp %st, %st(1)
  261. FLD -8 * SIZE(AO)
  262. fmulp %st, %st(1)
  263. #endif
  /* LT: 2x2 block of A, coefficients at -8, -7, -5: first value first, */
  /* then eliminate into the second.                                     */
  264. #ifdef LT
  265. FLD -8 * SIZE(AO)
  266. fmulp %st, %st(1)
  267. FLD -7 * SIZE(AO)
  268. fmul %st(1), %st
  269. fsubrp %st, %st(2)
  270. FLD -5 * SIZE(AO)
  271. fmulp %st, %st(2)
  272. #endif
  /* RN / RT: single coefficient from B scales both values. */
  273. #ifdef RN
  274. FLD -8 * SIZE(BO)
  275. fmul %st, %st(1)
  276. fmulp %st, %st(2)
  277. #endif
  278. #ifdef RT
  279. FLD -8 * SIZE(BO)
  280. fmul %st, %st(1)
  281. fmulp %st, %st(2)
  282. #endif
  /* LN walks C backwards: step CO back two elements before storing. */
  283. #ifdef LN
  284. subl $2 * SIZE, CO
  285. #endif
  /* Write the results back to the packed buffer (BO for LN/LT, AO for      */
  /* RN/RT). Each value is duplicated with "fld %st" before FST, which      */
  /* suggests FST (defined outside this file) stores and pops - TODO confirm. */
  286. #if defined(LN) || defined(LT)
  287. fld %st
  288. FST -8 * SIZE(BO)
  289. fxch %st(1)
  290. fld %st
  291. FST -7 * SIZE(BO)
  292. #else
  293. fld %st
  294. FST -8 * SIZE(AO)
  295. fxch %st(1)
  296. fld %st
  297. FST -7 * SIZE(AO)
  298. #endif
  /* Store the two results to C (after the fxch the top is the row-1 value). */
  299. FST 1 * SIZE(CO)
  300. FST 0 * SIZE(CO)
  301. #ifndef LN
  302. addl $2 * SIZE, CO
  303. #endif
  /* LT/RN: skip the K - KK elements not consumed by this tile so AO/BO */
  /* end up past the full panel.                                         */
  304. #if defined(LT) || defined(RN)
  305. movl K, %eax
  306. subl KK, %eax
  307. sall $BASE_SHIFT, %eax
  308. leal (AO, %eax, 2), AO
  309. leal (BO, %eax, 1), BO
  310. #endif
  /* Row-direction variants move KK by the two rows just solved. */
  311. #ifdef LN
  312. subl $2, KK
  313. #endif
  314. #ifdef LT
  315. addl $2, KK
  316. #endif
  /* RT: advance AORIG to the next 2-row panel (2 * K elements). */
  317. #ifdef RT
  318. movl K, %eax
  319. sall $1 + BASE_SHIFT, %eax
  320. addl %eax, AORIG
  321. #endif
  322. decl I
  323. jne .L31
  324. ALIGN_4
  /* Leftover single row of the odd column (taken only when M is odd): */
  /* a 1x1 tile with one accumulator on the x87 stack.                  */
  325. .L40:
  326. movl M, %eax
  327. andl $1, %eax
  328. je .L49
  329. ALIGN_4
  330. .L41:
  /* LN: move AORIG back by one row panel (K elements). */
  331. #ifdef LN
  332. movl K, %eax
  333. sall $0 + BASE_SHIFT, %eax
  334. subl %eax, AORIG
  335. #endif
  /* LN/RT: AO = AORIG + KK elements, BO = B + KK elements; else BO = B. */
  336. #if defined(LN) || defined(RT)
  337. movl KK, %eax
  338. sall $BASE_SHIFT, %eax
  339. movl AORIG, AO
  340. leal (AO, %eax, 1), AO
  341. leal (B, %eax, 1), BO
  342. #else
  343. movl B, BO
  344. #endif
  /* Single zeroed accumulator. */
  345. fldz
  /* Trip count: KK for LT/RN, K - KK otherwise; main loop unrolled x4. */
  346. #if defined(LT) || defined(RN)
  347. movl KK, %eax
  348. #else
  349. movl K, %eax
  350. subl KK, %eax
  351. #endif
  352. sarl $2, %eax
  353. je .L45
  354. ALIGN_4
  /* Four k-steps per pass: acc += a[k] * b[k]. */
  355. .L42:
  356. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  357. FLD -8 * SIZE(AO)
  358. FLD -8 * SIZE(BO)
  359. fmulp %st, %st(1)
  360. faddp %st, %st(1)
  361. FLD -7 * SIZE(AO)
  362. FLD -7 * SIZE(BO)
  363. fmulp %st, %st(1)
  364. faddp %st, %st(1)
  365. FLD -6 * SIZE(AO)
  366. FLD -6 * SIZE(BO)
  367. fmulp %st, %st(1)
  368. faddp %st, %st(1)
  369. FLD -5 * SIZE(AO)
  370. FLD -5 * SIZE(BO)
  371. fmulp %st, %st(1)
  372. faddp %st, %st(1)
  373. addl $4 * SIZE,AO
  374. addl $4 * SIZE,BO
  375. decl %eax
  376. jne .L42
  377. ALIGN_4
  /* Remainder: low two bits of the trip count, one k-step per pass. */
  378. .L45:
  379. #if defined(LT) || defined(RN)
  380. movl KK, %eax
  381. #else
  382. movl K, %eax
  383. subl KK, %eax
  384. #endif
  385. and $3, %eax
  386. je .L48
  387. ALIGN_4
  388. .L46:
  389. FLD -8 * SIZE(AO)
  390. FLD -8 * SIZE(BO)
  391. fmulp %st, %st(1)
  392. faddp %st, %st(1)
  393. addl $1 * SIZE,AO
  394. addl $1 * SIZE,BO
  395. decl %eax
  396. jne .L46
  397. ALIGN_4
  /* Solve step for the 1x1 tile. */
  398. .L48:
  /* LN/RT: re-point AO/BO at index KK - 1 (both branches of the inner */
  /* #ifdef subtract 1).                                                */
  399. #if defined(LN) || defined(RT)
  400. movl KK, %eax
  401. #ifdef LN
  402. subl $1, %eax
  403. #else
  404. subl $1, %eax
  405. #endif
  406. sall $BASE_SHIFT, %eax
  407. movl AORIG, AO
  408. leal (AO, %eax, 1), AO
  409. leal (B, %eax, 1), BO
  410. #endif
  /* Combine the stored value (BO for LN/LT, AO for RN/RT) with the        */
  /* accumulator. NOTE(review): presumably (stored value - accumulated sum); */
  /* mind GNU as's fsub/fsubr operand-order quirk - confirm.                */
  411. #if defined(LN) || defined(LT)
  412. FLD -8 * SIZE(BO)
  413. fsubp %st, %st(1)
  414. #else
  415. FLD -8 * SIZE(AO)
  416. fsubp %st, %st(1)
  417. #endif
  /* Scale by the single diagonal coefficient: from A for LN/LT, from B for */
  /* RN/RT. Only a multiply is used, so the coefficient is presumably a     */
  /* stored reciprocal - verify against the packing routine.                */
  418. #ifdef LN
  419. FLD -8 * SIZE(AO)
  420. fmulp %st, %st(1)
  421. #endif
  422. #ifdef LT
  423. FLD -8 * SIZE(AO)
  424. fmulp %st, %st(1)
  425. #endif
  426. #ifdef RN
  427. FLD -8 * SIZE(BO)
  428. fmulp %st, %st(1)
  429. #endif
  430. #ifdef RT
  431. FLD -8 * SIZE(BO)
  432. fmulp %st, %st(1)
  433. #endif
  /* LN walks C backwards: step CO back one element before storing. */
  434. #ifdef LN
  435. subl $1 * SIZE, CO
  436. #endif
  /* Write the result back to the packed buffer, then to C. */
  437. #if defined(LN) || defined(LT)
  438. fld %st
  439. FST -8 * SIZE(BO)
  440. #else
  441. fld %st
  442. FST -8 * SIZE(AO)
  443. #endif
  444. FST 0 * SIZE(CO)
  445. #ifndef LN
  446. addl $1 * SIZE, CO
  447. #endif
  /* LT/RN: skip the remaining K - KK elements of this panel in AO and BO. */
  448. #if defined(LT) || defined(RN)
  449. movl K, %eax
  450. subl KK, %eax
  451. sall $BASE_SHIFT, %eax
  452. leal (AO, %eax, 1), AO
  453. leal (BO, %eax, 1), BO
  454. #endif
  /* Row-direction variants move KK by the one row just solved. */
  455. #ifdef LN
  456. subl $1, KK
  457. #endif
  458. #ifdef LT
  459. addl $1, KK
  460. #endif
  /* RT: advance AORIG by one row panel (K elements). */
  461. #ifdef RT
  462. movl K, %eax
  463. sall $0 + BASE_SHIFT, %eax
  464. addl %eax, AORIG
  465. #endif
  466. ALIGN_4
  /* End of the odd-column pass: move B past the column just processed and */
  /* update KK for the column-direction variants.                           */
  467. .L49:
  /* LN: B += K elements (one column). */
  468. #ifdef LN
  469. movl K, %eax
  470. sall $BASE_SHIFT, %eax
  471. leal (B, %eax, 1), B
  472. #endif
  /* LT/RN: BO already points past the column, so adopt it as the new B. */
  473. #if defined(LT) || defined(RN)
  474. movl BO, B
  475. #endif
  /* RN counts KK up by one column; RT counts it down. */
  476. #ifdef RN
  477. addl $1, KK
  478. #endif
  479. #ifdef RT
  480. subl $1, KK
  481. #endif
  482. ALIGN_4
  /* Column-pair loop. J = N / 2 pairs; exit when there are none. The je */
  /* tests the flags left by sarl (the movl in between does not change    */
  /* them).                                                               */
  483. .L30:
  484. movl N, %eax
  485. sarl $1, %eax
  486. movl %eax, J
  487. je .L999
  488. ALIGN_4
  /* Per-pair setup, same shape as the odd-column setup but for 2 columns. */
  489. .L01:
  /* LT/RN read A forward from its start; other variants keep the base in AORIG. */
  490. #if defined(LT) || defined(RN)
  491. movl A, AO
  492. #else
  493. movl A, %eax
  494. movl %eax, AORIG
  495. #endif
  /* RT: step B back by two columns (2 * K elements). */
  496. #ifdef RT
  497. movl K, %eax
  498. sall $1 + BASE_SHIFT, %eax
  499. subl %eax, B
  500. #endif
  /* eax = 2 * LDC: byte width of two columns of C. RT moves C back by    */
  /* that amount before use; the others use C and then advance it.        */
  501. lea (, LDC, 2), %eax
  502. #ifdef RT
  503. subl %eax, C
  504. #endif
  505. movl C, CO
  506. #ifndef RT
  507. addl %eax, C
  508. #endif
  /* LN: KK = OFFSET + M.  LT: KK = OFFSET. */
  509. #ifdef LN
  510. movl OFFSET, %eax
  511. addl M, %eax
  512. movl %eax, KK
  513. #endif
  514. #ifdef LT
  515. movl OFFSET, %eax
  516. movl %eax, KK
  517. #endif
  /* I = M / 2 two-row tiles; skip to the single-row case if there are none. */
  518. movl M, I
  519. sarl $1, I
  520. je .L20
  521. ALIGN_4
  /* 2-row x 2-column tile. Four accumulators live on the x87 stack; after */
  /* each k-step they hold, from st(0) to st(3): a0*b0, a0*b1, a1*b0, a1*b1 */
  /* sums. The loop body peaks at seven occupied x87 registers, so the      */
  /* instruction order must not be disturbed. Repeats I times.              */
  522. .L11:
  /* LN: move AORIG back by one 2-row panel (2 * K elements). */
  523. #ifdef LN
  524. movl K, %eax
  525. sall $1 + BASE_SHIFT, %eax
  526. subl %eax, AORIG
  527. #endif
  /* LN/RT: AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements; else BO = B. */
  528. #if defined(LN) || defined(RT)
  529. movl KK, %eax
  530. sall $BASE_SHIFT, %eax
  531. movl AORIG, AO
  532. leal (AO, %eax, 2), AO
  533. leal (B, %eax, 2), BO
  534. #else
  535. movl B, BO
  536. #endif
  /* Four zeroed accumulators. */
  537. fldz
  538. fldz
  539. fldz
  540. fldz
  /* Touch both output columns of C ahead of the stores. */
  541. #if defined(HAVE_3DNOW)
  542. prefetchw 2 * SIZE(CO)
  543. prefetchw 2 * SIZE(CO, LDC, 1)
  544. #elif defined(HAVE_SSE)
  545. prefetchnta 2 * SIZE(CO)
  546. prefetchnta 2 * SIZE(CO, LDC, 1)
  547. #endif
  /* Trip count: KK for LT/RN, K - KK otherwise; main loop unrolled x4. */
  548. #if defined(LT) || defined(RN)
  549. movl KK, %eax
  550. #else
  551. movl K, %eax
  552. subl KK, %eax
  553. #endif
  554. sarl $2, %eax
  555. je .L15
  556. ALIGN_4
  /* Unrolled loop: four k-steps per pass, each consuming two A and two B */
  /* elements (8 of each per pass).                                       */
  557. .L12:
  558. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: load a0, b0; duplicate a0; form a0*b0, a0*b1, a1*b0, a1*b1 */
  /* and fold them into the four accumulators with the three faddp's.     */
  559. FLD -8 * SIZE(AO)
  560. FLD -8 * SIZE(BO)
  561. fld %st(1)
  562. fmul %st(1), %st
  563. faddp %st, %st(3)
  564. FLD -7 * SIZE(BO)
  565. fmul %st, %st(2)
  566. FLD -7 * SIZE(AO)
  567. fmul %st, %st(2)
  568. fmulp %st, %st(1)
  569. faddp %st, %st(6)
  570. faddp %st, %st(4)
  571. faddp %st, %st(2)
  /* k-step 1 */
  572. FLD -6 * SIZE(AO)
  573. FLD -6 * SIZE(BO)
  574. fld %st(1)
  575. fmul %st(1), %st
  576. faddp %st, %st(3)
  577. FLD -5 * SIZE(BO)
  578. fmul %st, %st(2)
  579. FLD -5 * SIZE(AO)
  580. fmul %st, %st(2)
  581. fmulp %st, %st(1)
  582. faddp %st, %st(6)
  583. faddp %st, %st(4)
  584. faddp %st, %st(2)
  585. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  /* k-step 2 */
  586. FLD -4 * SIZE(AO)
  587. FLD -4 * SIZE(BO)
  588. fld %st(1)
  589. fmul %st(1), %st
  590. faddp %st, %st(3)
  591. FLD -3 * SIZE(BO)
  592. fmul %st, %st(2)
  593. FLD -3 * SIZE(AO)
  594. fmul %st, %st(2)
  595. fmulp %st, %st(1)
  596. faddp %st, %st(6)
  597. faddp %st, %st(4)
  598. faddp %st, %st(2)
  /* k-step 3 */
  599. FLD -2 * SIZE(AO)
  600. FLD -2 * SIZE(BO)
  601. fld %st(1)
  602. fmul %st(1), %st
  603. faddp %st, %st(3)
  604. FLD -1 * SIZE(BO)
  605. fmul %st, %st(2)
  606. FLD -1 * SIZE(AO)
  607. fmul %st, %st(2)
  608. fmulp %st, %st(1)
  609. faddp %st, %st(6)
  610. faddp %st, %st(4)
  611. faddp %st, %st(2)
  612. addl $8 * SIZE,AO
  613. addl $8 * SIZE,BO
  614. decl %eax
  615. jne .L12
  616. ALIGN_4
  /* Remainder: low two bits of the trip count, one k-step per pass. */
  617. .L15:
  618. #if defined(LT) || defined(RN)
  619. movl KK, %eax
  620. #else
  621. movl K, %eax
  622. subl KK, %eax
  623. #endif
  624. and $3, %eax
  625. je .L18
  626. ALIGN_4
  627. .L16:
  628. FLD -8 * SIZE(AO)
  629. FLD -8 * SIZE(BO)
  630. fld %st(1)
  631. fmul %st(1), %st
  632. faddp %st, %st(3)
  633. FLD -7 * SIZE(BO)
  634. fmul %st, %st(2)
  635. FLD -7 * SIZE(AO)
  636. fmul %st, %st(2)
  637. fmulp %st, %st(1)
  638. faddp %st, %st(6)
  639. faddp %st, %st(4)
  640. faddp %st, %st(2)
  641. addl $2 * SIZE,AO
  642. addl $2 * SIZE,BO
  643. decl %eax
  644. jne .L16
  645. ALIGN_4
  /* Solve step for the 2x2 tile. */
  646. .L18:
  /* LN/RT: re-point AO/BO at index KK - 2 (both branches of the inner */
  /* #ifdef subtract 2): AO = AORIG + 2 * idx, BO = B + 2 * idx elements. */
  647. #if defined(LN) || defined(RT)
  648. movl KK, %eax
  649. #ifdef LN
  650. subl $2, %eax
  651. #else
  652. subl $2, %eax
  653. #endif
  654. sall $BASE_SHIFT, %eax
  655. movl AORIG, AO
  656. leal (AO, %eax, 2), AO
  657. leal (B, %eax, 2), BO
  658. #endif
  /* Combine the four stored values with the accumulators. LN/LT read them  */
  /* from BO in stack order; RN/RT read them from AO with the middle two    */
  /* swapped (st(3) before st(2)), matching AO's row-major pairing.         */
  /* NOTE(review): presumably (stored value - accumulated sum); mind GNU    */
  /* as's fsub/fsubr operand-order quirk - confirm.                         */
  659. #if defined(LN) || defined(LT)
  660. FLD -8 * SIZE(BO)
  661. fsubp %st, %st(1)
  662. FLD -7 * SIZE(BO)
  663. fsubp %st, %st(2)
  664. FLD -6 * SIZE(BO)
  665. fsubp %st, %st(3)
  666. FLD -5 * SIZE(BO)
  667. fsubp %st, %st(4)
  668. #else
  669. FLD -8 * SIZE(AO)
  670. fsubp %st, %st(1)
  671. FLD -7 * SIZE(AO)
  672. fsubp %st, %st(3)
  673. FLD -6 * SIZE(AO)
  674. fsubp %st, %st(2)
  675. FLD -5 * SIZE(AO)
  676. fsubp %st, %st(4)
  677. #endif
  /* The four variants below apply a 2x2 triangular solve to both value    */
  /* pairs: scale by one diagonal coefficient, eliminate with the          */
  /* off-diagonal one, scale by the other diagonal. Only multiplies are    */
  /* used, so the diagonals are presumably stored as reciprocals - verify  */
  /* against the packing routine.                                          */
  /* LN: coefficients from A at -5, -6, -8. */
  678. #ifdef LN
  679. FLD -5 * SIZE(AO)
  680. fmul %st, %st(3)
  681. fmulp %st, %st(4)
  682. FLD -6 * SIZE(AO)
  683. fmul %st(3), %st
  684. FLD -6 * SIZE(AO)
  685. fmul %st(5), %st
  686. fsubrp %st, %st(3)
  687. fsubrp %st, %st(1)
  688. FLD -8 * SIZE(AO)
  689. fmul %st, %st(1)
  690. fmulp %st, %st(2)
  691. #endif
  /* LT: coefficients from A at -8, -7, -5. */
  692. #ifdef LT
  693. FLD -8 * SIZE(AO)
  694. fmul %st, %st(1)
  695. fmulp %st, %st(2)
  696. FLD -7 * SIZE(AO)
  697. fmul %st(1), %st
  698. FLD -7 * SIZE(AO)
  699. fmul %st(3), %st
  700. fsubrp %st, %st(5)
  701. fsubrp %st, %st(3)
  702. FLD -5 * SIZE(AO)
  703. fmul %st, %st(3)
  704. fmulp %st, %st(4)
  705. #endif
  /* RN: coefficients from B at -8, -7, -5. */
  706. #ifdef RN
  707. FLD -8 * SIZE(BO)
  708. fmul %st, %st(1)
  709. fmulp %st, %st(3)
  710. FLD -7 * SIZE(BO)
  711. fmul %st(1), %st
  712. FLD -7 * SIZE(BO)
  713. fmul %st(4), %st
  714. fsubrp %st, %st(5)
  715. fsubrp %st, %st(2)
  716. FLD -5 * SIZE(BO)
  717. fmul %st, %st(2)
  718. fmulp %st, %st(4)
  719. #endif
  /* RT: coefficients from B at -5, -6, -8. */
  720. #ifdef RT
  721. FLD -5 * SIZE(BO)
  722. fmul %st, %st(2)
  723. fmulp %st, %st(4)
  724. FLD -6 * SIZE(BO)
  725. fmul %st(2), %st
  726. FLD -6 * SIZE(BO)
  727. fmul %st(5), %st
  728. fsubrp %st, %st(4)
  729. fsubrp %st, %st(1)
  730. FLD -8 * SIZE(BO)
  731. fmul %st, %st(1)
  732. fmulp %st, %st(3)
  733. #endif
  /* LN walks C backwards: step CO back two elements before storing. */
  734. #ifdef LN
  735. subl $2 * SIZE, CO
  736. #endif
  /* Write the four results back to the packed buffer (BO for LN/LT, AO   */
  /* for RN/RT) and then to the 2x2 tile of C at CO and CO + LDC. The     */
  /* fxch sequence rotates each value to the top for its buffer store, so */
  /* the order of the final C stores follows the resulting stack order    */
  /* and differs between the two branches. FST is defined outside this    */
  /* file; the "fld %st" duplicates suggest it stores and pops - TODO     */
  /* confirm.                                                             */
  737. #if defined(LN) || defined(LT)
  738. fld %st
  739. FST -8 * SIZE(BO)
  740. fxch %st(1)
  741. fld %st
  742. FST -7 * SIZE(BO)
  743. fxch %st(2)
  744. fld %st
  745. FST -6 * SIZE(BO)
  746. fxch %st(3)
  747. fld %st
  748. FST -5 * SIZE(BO)
  749. FST 1 * SIZE(CO, LDC)
  750. FST 0 * SIZE(CO)
  751. FST 0 * SIZE(CO, LDC)
  752. FST 1 * SIZE(CO)
  753. #else
  754. fld %st
  755. FST -8 * SIZE(AO)
  756. fxch %st(2)
  757. fld %st
  758. FST -7 * SIZE(AO)
  759. fxch %st(1)
  760. fld %st
  761. FST -6 * SIZE(AO)
  762. fxch %st(3)
  763. fld %st
  764. FST -5 * SIZE(AO)
  765. FST 1 * SIZE(CO, LDC)
  766. FST 1 * SIZE(CO)
  767. FST 0 * SIZE(CO)
  768. FST 0 * SIZE(CO, LDC)
  769. #endif
  770. #ifndef LN
  771. addl $2 * SIZE, CO
  772. #endif
  /* LT/RN: skip the remaining K - KK k-steps of this panel (2 elements */
  /* per step in both AO and BO).                                        */
  773. #if defined(LT) || defined(RN)
  774. movl K, %eax
  775. subl KK, %eax
  776. sall $BASE_SHIFT, %eax
  777. leal (AO, %eax, 2), AO
  778. leal (BO, %eax, 2), BO
  779. #endif
  /* Row-direction variants move KK by the two rows just solved. */
  780. #ifdef LN
  781. subl $2, KK
  782. #endif
  783. #ifdef LT
  784. addl $2, KK
  785. #endif
  /* RT: advance AORIG to the next 2-row panel (2 * K elements). */
  786. #ifdef RT
  787. movl K, %eax
  788. sall $1 + BASE_SHIFT, %eax
  789. addl %eax, AORIG
  790. #endif
  791. decl I
  792. jne .L11
  793. ALIGN_4
  /* Leftover single row of a column pair (taken only when M is odd): a    */
  /* 1-row x 2-column tile with two accumulators, st(0) for column 0 and   */
  /* st(1) for column 1.                                                   */
  794. .L20:
  795. movl M, %eax
  796. andl $1, %eax
  797. je .L29
  798. ALIGN_4
  799. .L21:
  /* LN: move AORIG back by one row panel (K elements). */
  800. #ifdef LN
  801. movl K, %eax
  802. sall $0 + BASE_SHIFT, %eax
  803. subl %eax, AORIG
  804. #endif
  /* LN/RT: AO = AORIG + KK elements, BO = B + 2 * KK elements; else BO = B. */
  805. #if defined(LN) || defined(RT)
  806. movl KK, %eax
  807. sall $BASE_SHIFT, %eax
  808. movl AORIG, AO
  809. leal (AO, %eax, 1), AO
  810. leal (B, %eax, 2), BO
  811. #else
  812. movl B, BO
  813. #endif
  /* Two zeroed accumulators. */
  814. fldz
  815. fldz
  /* Trip count: KK for LT/RN, K - KK otherwise; main loop unrolled x4. */
  816. #if defined(LT) || defined(RN)
  817. movl KK, %eax
  818. #else
  819. movl K, %eax
  820. subl KK, %eax
  821. #endif
  822. sarl $2, %eax
  823. je .L25
  824. ALIGN_4
  /* Unrolled loop: four k-steps per pass, each loading one A value and */
  /* multiplying it by two B values (4 A elements, 8 B elements total).  */
  825. .L22:
  826. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0: acc0 += a * b0, acc1 += a * b1 */
  827. FLD -8 * SIZE(AO)
  828. FLD -8 * SIZE(BO)
  829. fmul %st(1), %st
  830. faddp %st, %st(2)
  831. FLD -7 * SIZE(BO)
  832. fmulp %st, %st(1)
  833. faddp %st, %st(2)
  /* k-step 1 */
  834. FLD -7 * SIZE(AO)
  835. FLD -6 * SIZE(BO)
  836. fmul %st(1), %st
  837. faddp %st, %st(2)
  838. FLD -5 * SIZE(BO)
  839. fmulp %st, %st(1)
  840. faddp %st, %st(2)
  /* k-step 2 */
  841. FLD -6 * SIZE(AO)
  842. FLD -4 * SIZE(BO)
  843. fmul %st(1), %st
  844. faddp %st, %st(2)
  845. FLD -3 * SIZE(BO)
  846. fmulp %st, %st(1)
  847. faddp %st, %st(2)
  /* k-step 3 */
  848. FLD -5 * SIZE(AO)
  849. FLD -2 * SIZE(BO)
  850. fmul %st(1), %st
  851. faddp %st, %st(2)
  852. FLD -1 * SIZE(BO)
  853. fmulp %st, %st(1)
  854. faddp %st, %st(2)
  855. addl $4 * SIZE,AO
  856. addl $8 * SIZE,BO
  857. decl %eax
  858. jne .L22
  859. ALIGN_4
  /* Remainder: low two bits of the trip count, one k-step per pass. */
  860. .L25:
  861. #if defined(LT) || defined(RN)
  862. movl KK, %eax
  863. #else
  864. movl K, %eax
  865. subl KK, %eax
  866. #endif
  867. and $3, %eax
  868. je .L28
  869. ALIGN_4
  870. .L26:
  871. FLD -8 * SIZE(AO)
  872. FLD -8 * SIZE(BO)
  873. fmul %st(1), %st
  874. faddp %st, %st(2)
  875. FLD -7 * SIZE(BO)
  876. fmulp %st, %st(1)
  877. faddp %st, %st(2)
  878. addl $1 * SIZE,AO
  879. addl $2 * SIZE,BO
  880. decl %eax
  881. jne .L26
  882. ALIGN_4
  /* Solve step for the 1x2 tile. */
  883. .L28:
  /* LN/RT: re-point AO/BO at index KK - 1 (LN) or KK - 2 (RT): */
  /* AO = AORIG + idx elements, BO = B + 2 * idx elements.        */
  884. #if defined(LN) || defined(RT)
  885. movl KK, %eax
  886. #ifdef LN
  887. subl $1, %eax
  888. #else
  889. subl $2, %eax
  890. #endif
  891. sall $BASE_SHIFT, %eax
  892. movl AORIG, AO
  893. leal (AO, %eax, 1), AO
  894. leal (B, %eax, 2), BO
  895. #endif
  /* Combine the two stored values (BO for LN/LT, AO for RN/RT) with the   */
  /* accumulators. NOTE(review): presumably (stored value - accumulated    */
  /* sum); mind GNU as's fsub/fsubr operand-order quirk - confirm.         */
  896. #if defined(LN) || defined(LT)
  897. FLD -8 * SIZE(BO)
  898. fsubp %st, %st(1)
  899. FLD -7 * SIZE(BO)
  900. fsubp %st, %st(2)
  901. #else
  902. FLD -8 * SIZE(AO)
  903. fsubp %st, %st(1)
  904. FLD -7 * SIZE(AO)
  905. fsubp %st, %st(2)
  906. #endif
  /* Only multiplies are used below, so the diagonal coefficients are     */
  /* presumably stored as reciprocals - verify against the packing code.  */
  /* LN/LT: a single coefficient from A scales both values. */
  907. #if defined(LN) || defined(LT)
  908. FLD -8 * SIZE(AO)
  909. fmul %st, %st(1)
  910. fmulp %st, %st(2)
  911. #endif
  /* RN: 2x2 block of B, coefficients at -8, -7, -5: first value first, */
  /* then eliminate into the second.                                     */
  912. #ifdef RN
  913. FLD -8 * SIZE(BO)
  914. fmulp %st, %st(1)
  915. FLD -7 * SIZE(BO)
  916. fmul %st(1), %st
  917. fsubrp %st, %st(2)
  918. FLD -5 * SIZE(BO)
  919. fmulp %st, %st(2)
  920. #endif
  /* RT: 2x2 block of B, coefficients at -5, -6, -8: second value first, */
  /* then eliminate into the first.                                       */
  921. #ifdef RT
  922. FLD -5 * SIZE(BO)
  923. fmulp %st, %st(2)
  924. FLD -6 * SIZE(BO)
  925. fmul %st(2), %st
  926. fsubrp %st, %st(1)
  927. FLD -8 * SIZE(BO)
  928. fmulp %st, %st(1)
  929. #endif
  /* LN walks C backwards: step CO back one element before storing. */
  930. #ifdef LN
  931. subl $1 * SIZE, CO
  932. #endif
  /* Write both results back to the packed buffer (BO for LN/LT, AO for */
  /* RN/RT).                                                             */
  933. #if defined(LN) || defined(LT)
  934. fld %st
  935. FST -8 * SIZE(BO)
  936. fxch %st(1)
  937. fld %st
  938. FST -7 * SIZE(BO)
  939. #else
  940. fld %st
  941. FST -8 * SIZE(AO)
  942. fxch %st(1)
  943. fld %st
  944. FST -7 * SIZE(AO)
  945. #endif
  /* Store to C: after the fxch the top is the column-1 value (CO + LDC), */
  /* followed by the column-0 value (CO).                                  */
  946. FST 0 * SIZE(CO, LDC)
  947. FST 0 * SIZE(CO)
  948. #ifndef LN
  949. addl $1 * SIZE, CO
  950. #endif
  /* LT/RN: skip the remaining K - KK k-steps (1 element per step in AO, */
  /* 2 in BO).                                                            */
  951. #if defined(LT) || defined(RN)
  952. movl K, %eax
  953. subl KK, %eax
  954. sall $BASE_SHIFT, %eax
  955. leal (AO, %eax, 1), AO
  956. leal (BO, %eax, 2), BO
  957. #endif
  /* Row-direction variants move KK by the one row just solved. */
  958. #ifdef LN
  959. subl $1, KK
  960. #endif
  961. #ifdef LT
  962. addl $1, KK
  963. #endif
  /* RT: advance AORIG by one row panel (K elements). */
  964. #ifdef RT
  965. movl K, %eax
  966. sall $0 + BASE_SHIFT, %eax
  967. addl %eax, AORIG
  968. #endif
  969. ALIGN_4
  /* End of one column pair: move B past the two columns just processed, */
  /* update KK for the column-direction variants, and loop while J != 0.  */
  970. .L29:
  /* LN: B += 2 * K elements (two columns). */
  971. #ifdef LN
  972. movl K, %eax
  973. sall $BASE_SHIFT, %eax
  974. leal (B, %eax, 2), B
  975. #endif
  /* LT/RN: BO already points past the pair, so adopt it as the new B. */
  976. #if defined(LT) || defined(RN)
  977. movl BO, B
  978. #endif
  /* RN counts KK up by two columns; RT counts it down. */
  979. #ifdef RN
  980. addl $2, KK
  981. #endif
  982. #ifdef RT
  983. subl $2, KK
  984. #endif
  985. decl J
  986. jne .L01
  987. ALIGN_4
  /* Common exit: restore the saved registers in reverse push order, */
  /* release the ARGS bytes of scratch, and return.                   */
  988. .L999:
  989. popl %ebx
  990. popl %esi
  991. popl %edi
  992. popl %ebp
  993. addl $ARGS, %esp
  994. ret
  995. EPILOGUE