You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LT.S 65 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register aliases used by the kernel body.                   */
  /* M, N and K are read directly from %i0-%i2; the visible code never    */
  /* loads them from memory (presumably the first three incoming          */
  /* arguments -- confirm against the caller).                            */
  40. #define M %i0
  41. #define N %i1
  42. #define K %i2
  /* A and B trade places between %i4 and %i5 depending on the build.     */
  /* In 32-bit DOUBLE builds the prologue loads B from the stack frame,   */
  /* so only A is taken from an incoming register in that configuration.  */
  43. #if defined(DOUBLE) && !defined(__64BIT__)
  44. #define A %i5
  45. #define B %i4
  46. #else
  47. #define A %i4
  48. #define B %i5
  49. #endif
  /* C and LDC are loaded from the stack frame by the prologue in every   */
  /* configuration. LDC is shifted left by BASE_SHIFT right after loading */
  /* (presumably converting an element count to a byte stride).           */
  50. #define C %o4
  51. #define LDC %o5
  /* AO / BO: running pointers into the A and B data that the inner       */
  /* loops advance as they consume operands.                              */
  52. #define AO %l0
  53. #define BO %l1
  /* Loop counters: J starts at N >> 2, I starts at M >> 2 for the        */
  /* four-row tiles (and M & 2 for the two-row tail), L is the inner-loop */
  /* trip count.                                                          */
  54. #define I %l2
  55. #define J %l3
  56. #define L %l4
  /* Output pointers for the current group of four columns:               */
  /* C1 = C, C2 = C1 + LDC, C3 = C2 + LDC, C4 = C3 + LDC.                 */
  57. #define C1 %o0
  58. #define C2 %o1
  59. #define C3 %o2
  60. #define C4 %o3
  /* OFFSET is loaded from the stack by the prologue; KK is derived from  */
  /* it (how depends on the LN / LT / RN / RT variant being built).       */
  61. #define OFFSET %l5
  62. #define KK %l6
  /* Scratch registers for address and count arithmetic.                  */
  63. #define TEMP1 %l7
  64. #define TEMP2 %i3
  /* Base pointer derived from A; used only in the LN and RT variants,    */
  /* where AO is recomputed from it for each tile.                        */
  65. #define AORIG %g1
  /* Floating-point register aliases.                                     */
  /*   c01..c16 : accumulators; the 4x4 path stores four of them to each  */
  /*              of C1..C4 after the solve step.                         */
  /*   t1..t4   : hold FMUL products until the next FADD / FSUB folds     */
  /*              them into an accumulator.                               */
  /*   a1..a5   : operands loaded from AO in the multiply loops.          */
  /*   b1..b5   : operands loaded from BO in the multiply loops.          */
  /*              (Both groups are reused as plain scratch in the solve   */
  /*              step.)                                                  */
  /*   FZERO    : source register for the FMOV instructions that clear    */
  /*              accumulators. NOTE(review): presumably initialized by   */
  /*              FCLR(29) in the prologue -- confirm the FCLR macro in   */
  /*              common.h.                                               */
  /* DOUBLE builds map the aliases onto even-numbered registers           */
  /* (%f0 .. %f60); single-precision builds use consecutive registers     */
  /* starting at %f0.                                                     */
  66. #ifdef DOUBLE
  67. #define c01 %f0
  68. #define c02 %f2
  69. #define c03 %f4
  70. #define c04 %f6
  71. #define c05 %f8
  72. #define c06 %f10
  73. #define c07 %f12
  74. #define c08 %f14
  75. #define c09 %f16
  76. #define c10 %f18
  77. #define c11 %f20
  78. #define c12 %f22
  79. #define c13 %f24
  80. #define c14 %f26
  81. #define c15 %f28
  82. #define c16 %f30
  83. #define t1 %f32
  84. #define t2 %f34
  85. #define t3 %f36
  86. #define t4 %f38
  87. #define a1 %f40
  88. #define a2 %f42
  89. #define a3 %f44
  90. #define a4 %f46
  /* a5 is placed after the b registers rather than directly after a4.    */
  91. #define a5 %f58
  92. #define b1 %f48
  93. #define b2 %f50
  94. #define b3 %f52
  95. #define b4 %f54
  96. #define b5 %f56
  97. #define FZERO %f60
  98. #else
  /* Single-precision layout: same roles, one register per value.         */
  99. #define c01 %f0
  100. #define c02 %f1
  101. #define c03 %f2
  102. #define c04 %f3
  103. #define c05 %f4
  104. #define c06 %f5
  105. #define c07 %f6
  106. #define c08 %f7
  107. #define c09 %f8
  108. #define c10 %f9
  109. #define c11 %f10
  110. #define c12 %f11
  111. #define c13 %f12
  112. #define c14 %f13
  113. #define c15 %f14
  114. #define c16 %f15
  115. #define t1 %f16
  116. #define t2 %f17
  117. #define t3 %f18
  118. #define t4 %f19
  119. #define a1 %f20
  120. #define a2 %f21
  121. #define a3 %f22
  122. #define a4 %f23
  /* a5 sits at %f31; %f30 is not assigned to any alias in this layout.   */
  123. #define a5 %f31
  124. #define b1 %f24
  125. #define b2 %f25
  126. #define b3 %f26
  127. #define b4 %f27
  128. #define b5 %f28
  129. #define FZERO %f29
  130. #endif
  131. PROLOGUE
  132. SAVESP
  133. nop
  134. #ifndef __64BIT__
  135. #ifdef DOUBLE
  136. ld [%sp + STACK_START + 28], B
  137. ld [%sp + STACK_START + 32], C
  138. ld [%sp + STACK_START + 36], LDC
  139. ld [%sp + STACK_START + 40], OFFSET
  140. #else
  141. ld [%sp + STACK_START + 28], C
  142. ld [%sp + STACK_START + 32], LDC
  143. ld [%sp + STACK_START + 36], OFFSET
  144. #endif
  145. #else
  146. ldx [%sp+ STACK_START + 56], C
  147. ldx [%sp+ STACK_START + 64], LDC
  148. ldx [%sp+ STACK_START + 72], OFFSET
  149. #endif
  150. FCLR(29)
  151. sll LDC, BASE_SHIFT, LDC
  152. #ifdef LN
  153. smul M, K, TEMP1
  154. sll TEMP1, BASE_SHIFT, TEMP1
  155. add A, TEMP1, A
  156. sll M, BASE_SHIFT, TEMP1
  157. add C, TEMP1, C
  158. #endif
  159. #ifdef RN
  160. neg OFFSET, KK
  161. #endif
  162. #ifdef RT
  163. smul N, K, TEMP1
  164. sll TEMP1, BASE_SHIFT, TEMP1
  165. add B, TEMP1, B
  166. smul N, LDC, TEMP1
  167. add C, TEMP1, C
  168. sub N, OFFSET, KK
  169. #endif
  170. sra N, 2, J
  171. cmp J, 0
  172. ble,pn %icc, .LL100
  173. nop
  174. .LL11:
  175. #ifdef RT
  176. sll K, 2 + BASE_SHIFT, TEMP1
  177. sub B, TEMP1, B
  178. sll LDC, 2, TEMP1
  179. sub C, TEMP1, C
  180. #endif
  181. add C, LDC, C2
  182. FMOV FZERO, t1
  183. nop
  184. mov C, C1
  185. add C2, LDC, C3
  186. FMOV FZERO, t2
  187. sra M, 2, I
  188. add C3, LDC, C4
  189. FMOV FZERO, t3
  190. #ifdef LN
  191. add M, OFFSET, KK
  192. #endif
  193. #ifdef LT
  194. mov OFFSET, KK
  195. #endif
  196. #if defined(LN) || defined(RT)
  197. mov A, AORIG
  198. #else
  199. mov A, AO
  200. #endif
  201. cmp I, 0
  202. #ifndef RT
  203. add C4, LDC, C
  204. #endif
  205. FMOV FZERO, t4
  206. ble,pn %icc, .LL50
  207. FMOV FZERO, c01
  208. .LL21:
  209. FMOV FZERO, c02
  210. FMOV FZERO, c03
  211. #if defined(LT) || defined(RN)
  212. sra KK, 2, L
  213. mov B, BO
  214. cmp L, 0
  215. #else
  216. #ifdef LN
  217. sll K, 2 + BASE_SHIFT, TEMP1
  218. sub AORIG, TEMP1, AORIG
  219. #endif
  220. sll KK, 2 + BASE_SHIFT, TEMP1
  221. add AORIG, TEMP1, AO
  222. add B, TEMP1, BO
  223. sub K, KK, TEMP1
  224. sra TEMP1, 2, L
  225. cmp L, 0
  226. #endif
  227. LDF [AO + 0 * SIZE], a1
  228. FMOV FZERO, c04
  229. LDF [BO + 0 * SIZE], b1
  230. FMOV FZERO, c05
  231. LDF [AO + 1 * SIZE], a2
  232. FMOV FZERO, c06
  233. LDF [BO + 1 * SIZE], b2
  234. FMOV FZERO, c07
  235. LDF [AO + 2 * SIZE], a3
  236. FMOV FZERO, c08
  237. LDF [BO + 2 * SIZE], b3
  238. FMOV FZERO, c09
  239. LDF [AO + 3 * SIZE], a4
  240. FMOV FZERO, c10
  241. LDF [BO + 3 * SIZE], b4
  242. FMOV FZERO, c11
  243. LDF [BO + 4 * SIZE], b5 /* ***** */
  244. LDF [AO + 4 * SIZE], a5 /* ***** */
  245. prefetch [C1 + 3 * SIZE], 3
  246. FMOV FZERO, c12
  247. prefetch [C2 + 3 * SIZE], 3
  248. FMOV FZERO, c13
  249. prefetch [C3 + 3 * SIZE], 3
  250. FMOV FZERO, c14
  251. prefetch [C4 + 3 * SIZE], 3
  252. FMOV FZERO, c15
  253. ble,pn %icc, .LL25
  254. FMOV FZERO, c16
  255. #define APREFETCHSIZE 40
  256. #define BPREFETCHSIZE 40
  257. #define APREFETCH_CATEGORY 0
  258. #define BPREFETCH_CATEGORY 0
  259. .LL22:
  260. FADD c04, t1, c04
  261. prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
  262. FMUL a1, b1, t1
  263. nop
  264. FADD c08, t2, c08
  265. prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
  266. FMUL a1, b2, t2
  267. add AO, 16 * SIZE, AO
  268. FADD c12, t3, c12
  269. LDF [AO - 13 * SIZE], a4
  270. FMUL a1, b3, t3
  271. add BO, 16 * SIZE, BO
  272. FADD c16, t4, c16
  273. nop
  274. FMUL a1, b4, t4
  275. LDF [AO - 8 * SIZE], a1
  276. FADD c01, t1, c01
  277. nop
  278. FMUL a2, b1, t1
  279. nop
  280. FADD c05, t2, c05
  281. nop
  282. FMUL a2, b2, t2
  283. nop
  284. FADD c09, t3, c09
  285. nop
  286. FMUL a2, b3, t3
  287. nop
  288. FADD c13, t4, c13
  289. add L, -1, L
  290. FMUL a2, b4, t4
  291. LDF [AO - 11 * SIZE], a2
  292. FADD c02, t1, c02
  293. nop
  294. FMUL a3, b1, t1
  295. nop
  296. FADD c06, t2, c06
  297. nop
  298. FMUL a3, b2, t2
  299. nop
  300. FADD c10, t3, c10
  301. nop
  302. FMUL a3, b3, t3
  303. nop
  304. FADD c14, t4, c14
  305. nop
  306. FMUL a3, b4, t4
  307. LDF [AO - 10 * SIZE], a3
  308. FADD c03, t1, c03
  309. nop
  310. FMUL a4, b1, t1
  311. LDF [BO - 8 * SIZE], b1
  312. FADD c07, t2, c07
  313. nop
  314. FMUL a4, b2, t2
  315. LDF [BO - 11 * SIZE], b2
  316. FADD c11, t3, c11
  317. nop
  318. FMUL a4, b3, t3
  319. LDF [BO - 10 * SIZE], b3
  320. FADD c15, t4, c15
  321. nop
  322. FMUL a4, b4, t4
  323. LDF [BO - 9 * SIZE], b4
  324. FADD c04, t1, c04
  325. nop
  326. FMUL a5, b5, t1
  327. LDF [AO - 9 * SIZE], a4
  328. FADD c08, t2, c08
  329. nop
  330. FMUL a5, b2, t2
  331. nop
  332. FADD c12, t3, c12
  333. nop
  334. FMUL a5, b3, t3
  335. nop
  336. FADD c16, t4, c16
  337. nop
  338. FMUL a5, b4, t4
  339. LDF [AO - 4 * SIZE], a5
  340. FADD c01, t1, c01
  341. nop
  342. FMUL a2, b5, t1
  343. nop
  344. FADD c05, t2, c05
  345. nop
  346. FMUL a2, b2, t2
  347. nop
  348. FADD c09, t3, c09
  349. nop
  350. FMUL a2, b3, t3
  351. nop
  352. FADD c13, t4, c13
  353. nop
  354. FMUL a2, b4, t4
  355. LDF [AO - 7 * SIZE], a2
  356. FADD c02, t1, c02
  357. nop
  358. FMUL a3, b5, t1
  359. nop
  360. FADD c06, t2, c06
  361. nop
  362. FMUL a3, b2, t2
  363. nop
  364. FADD c10, t3, c10
  365. nop
  366. FMUL a3, b3, t3
  367. nop
  368. FADD c14, t4, c14
  369. nop
  370. FMUL a3, b4, t4
  371. LDF [AO - 6 * SIZE], a3
  372. FADD c03, t1, c03
  373. nop
  374. FMUL a4, b5, t1
  375. LDF [BO - 4 * SIZE], b5
  376. FADD c07, t2, c07
  377. nop
  378. FMUL a4, b2, t2
  379. LDF [BO - 7 * SIZE], b2
  380. FADD c11, t3, c11
  381. nop
  382. FMUL a4, b3, t3
  383. LDF [BO - 6 * SIZE], b3
  384. FADD c15, t4, c15
  385. nop
  386. FMUL a4, b4, t4
  387. LDF [BO - 5 * SIZE], b4
  388. FADD c04, t1, c04
  389. nop
  390. FMUL a1, b1, t1
  391. LDF [AO - 5 * SIZE], a4
  392. FADD c08, t2, c08
  393. nop
  394. FMUL a1, b2, t2
  395. nop
  396. FADD c12, t3, c12
  397. nop
  398. FMUL a1, b3, t3
  399. nop
  400. FADD c16, t4, c16
  401. nop
  402. FMUL a1, b4, t4
  403. LDF [AO - 0 * SIZE], a1
  404. FADD c01, t1, c01
  405. nop
  406. FMUL a2, b1, t1
  407. nop
  408. #ifdef DOUBLE
  409. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  410. #else
  411. nop
  412. #endif
  413. FADD c05, t2, c05
  414. nop
  415. FMUL a2, b2, t2
  416. FADD c09, t3, c09
  417. nop
  418. FMUL a2, b3, t3
  419. nop
  420. FADD c13, t4, c13
  421. nop
  422. FMUL a2, b4, t4
  423. nop
  424. FADD c02, t1, c02
  425. nop
  426. FMUL a3, b1, t1
  427. LDF [AO - 3 * SIZE], a2
  428. FADD c06, t2, c06
  429. #ifdef DOUBLE
  430. prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
  431. #else
  432. nop
  433. #endif
  434. FMUL a3, b2, t2
  435. nop
  436. FADD c10, t3, c10
  437. nop
  438. FMUL a3, b3, t3
  439. nop
  440. FADD c14, t4, c14
  441. nop
  442. FMUL a3, b4, t4
  443. LDF [AO - 2 * SIZE], a3
  444. FADD c03, t1, c03
  445. nop
  446. FMUL a4, b1, t1
  447. LDF [BO - 0 * SIZE], b1
  448. FADD c07, t2, c07
  449. nop
  450. FMUL a4, b2, t2
  451. LDF [BO - 3 * SIZE], b2
  452. FADD c11, t3, c11
  453. nop
  454. FMUL a4, b3, t3
  455. LDF [BO - 2 * SIZE], b3
  456. FADD c15, t4, c15
  457. nop
  458. FMUL a4, b4, t4
  459. LDF [BO - 1 * SIZE], b4
  460. FADD c04, t1, c04
  461. nop
  462. FMUL a5, b5, t1
  463. LDF [AO - 1 * SIZE], a4
  464. FADD c08, t2, c08
  465. FMUL a5, b2, t2
  466. FADD c12, t3, c12
  467. FMUL a5, b3, t3
  468. FADD c16, t4, c16
  469. nop
  470. FMUL a5, b4, t4
  471. LDF [AO + 4 * SIZE], a5
  472. FADD c01, t1, c01
  473. nop
  474. FMUL a2, b5, t1
  475. nop
  476. FADD c05, t2, c05
  477. nop
  478. FMUL a2, b2, t2
  479. nop
  480. FADD c09, t3, c09
  481. nop
  482. FMUL a2, b3, t3
  483. nop
  484. FADD c13, t4, c13
  485. nop
  486. FMUL a2, b4, t4
  487. LDF [AO + 1 * SIZE], a2
  488. FADD c02, t1, c02
  489. nop
  490. FMUL a3, b5, t1
  491. nop
  492. FADD c06, t2, c06
  493. nop
  494. FMUL a3, b2, t2
  495. nop
  496. FADD c10, t3, c10
  497. nop
  498. FMUL a3, b3, t3
  499. nop
  500. FADD c14, t4, c14
  501. nop
  502. FMUL a3, b4, t4
  503. LDF [AO + 2 * SIZE], a3
  504. FADD c03, t1, c03
  505. cmp L, 0
  506. FMUL a4, b5, t1
  507. LDF [BO + 4 * SIZE], b5
  508. FADD c07, t2, c07
  509. nop
  510. FMUL a4, b2, t2
  511. LDF [BO + 1 * SIZE], b2
  512. FADD c11, t3, c11
  513. nop
  514. FMUL a4, b3, t3
  515. LDF [BO + 2 * SIZE], b3
  516. FADD c15, t4, c15
  517. FMUL a4, b4, t4
  518. bg,pt %icc, .LL22
  519. LDF [BO + 3 * SIZE], b4
  520. .LL25:
  521. #if defined(LT) || defined(RN)
  522. and KK, 3, L
  523. #else
  524. and TEMP1, 3, L
  525. #endif
  526. cmp L, 0
  527. ble,a,pn %icc, .LL29
  528. nop
  529. .LL26:
  530. FADD c04, t1, c04
  531. LDF [AO + 3 * SIZE], a4
  532. FMUL a1, b1, t1
  533. add AO, 4 * SIZE, AO
  534. FADD c08, t2, c08
  535. add BO, 4 * SIZE, BO
  536. FMUL a1, b2, t2
  537. add L, -1, L
  538. FADD c12, t3, c12
  539. nop
  540. FMUL a1, b3, t3
  541. cmp L, 0
  542. FADD c16, t4, c16
  543. nop
  544. FMUL a1, b4, t4
  545. LDF [AO + 0 * SIZE], a1
  546. FADD c01, t1, c01
  547. nop
  548. FMUL a2, b1, t1
  549. nop
  550. FADD c05, t2, c05
  551. nop
  552. FMUL a2, b2, t2
  553. nop
  554. FADD c09, t3, c09
  555. nop
  556. FMUL a2, b3, t3
  557. nop
  558. FADD c13, t4, c13
  559. nop
  560. FMUL a2, b4, t4
  561. LDF [AO + 1 * SIZE], a2
  562. FADD c02, t1, c02
  563. nop
  564. FMUL a3, b1, t1
  565. nop
  566. FADD c06, t2, c06
  567. nop
  568. FMUL a3, b2, t2
  569. nop
  570. FADD c10, t3, c10
  571. nop
  572. FMUL a3, b3, t3
  573. nop
  574. FADD c14, t4, c14
  575. nop
  576. FMUL a3, b4, t4
  577. LDF [AO + 2 * SIZE], a3
  578. FADD c03, t1, c03
  579. nop
  580. FMUL a4, b1, t1
  581. LDF [BO + 0 * SIZE], b1
  582. FADD c07, t2, c07
  583. nop
  584. FMUL a4, b2, t2
  585. LDF [BO + 1 * SIZE], b2
  586. FADD c11, t3, c11
  587. nop
  588. FMUL a4, b3, t3
  589. LDF [BO + 2 * SIZE], b3
  590. FADD c15, t4, c15
  591. FMUL a4, b4, t4
  592. bg,pt %icc, .LL26
  593. LDF [BO + 3 * SIZE], b4
  594. .LL29:
  595. #if defined(LN) || defined(RT)
  596. sub KK, 4, TEMP1
  597. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  598. add AORIG, TEMP1, AO
  599. add B, TEMP1, BO
  600. #endif
  601. FADD c04, t1, c04
  602. FADD c08, t2, c08
  603. FADD c12, t3, c12
  604. FADD c16, t4, c16
  605. #if defined(LN) || defined(LT)
  606. LDF [BO + 0 * SIZE], a1
  607. LDF [BO + 1 * SIZE], a2
  608. LDF [BO + 2 * SIZE], a3
  609. LDF [BO + 3 * SIZE], a4
  610. LDF [BO + 4 * SIZE], b1
  611. LDF [BO + 5 * SIZE], b2
  612. LDF [BO + 6 * SIZE], b3
  613. LDF [BO + 7 * SIZE], b4
  614. FSUB a1, c01, c01
  615. FSUB a2, c05, c05
  616. FSUB a3, c09, c09
  617. FSUB a4, c13, c13
  618. FSUB b1, c02, c02
  619. FSUB b2, c06, c06
  620. FSUB b3, c10, c10
  621. FSUB b4, c14, c14
  622. LDF [BO + 8 * SIZE], a1
  623. LDF [BO + 9 * SIZE], a2
  624. LDF [BO + 10 * SIZE], a3
  625. LDF [BO + 11 * SIZE], a4
  626. LDF [BO + 12 * SIZE], b1
  627. LDF [BO + 13 * SIZE], b2
  628. LDF [BO + 14 * SIZE], b3
  629. LDF [BO + 15 * SIZE], b4
  630. FSUB a1, c03, c03
  631. FSUB a2, c07, c07
  632. FSUB a3, c11, c11
  633. FSUB a4, c15, c15
  634. FSUB b1, c04, c04
  635. FSUB b2, c08, c08
  636. FSUB b3, c12, c12
  637. FSUB b4, c16, c16
  638. #else
  639. LDF [AO + 0 * SIZE], a1
  640. LDF [AO + 1 * SIZE], a2
  641. LDF [AO + 2 * SIZE], a3
  642. LDF [AO + 3 * SIZE], a4
  643. LDF [AO + 4 * SIZE], b1
  644. LDF [AO + 5 * SIZE], b2
  645. LDF [AO + 6 * SIZE], b3
  646. LDF [AO + 7 * SIZE], b4
  647. FSUB a1, c01, c01
  648. FSUB a2, c02, c02
  649. FSUB a3, c03, c03
  650. FSUB a4, c04, c04
  651. FSUB b1, c05, c05
  652. FSUB b2, c06, c06
  653. FSUB b3, c07, c07
  654. FSUB b4, c08, c08
  655. LDF [AO + 8 * SIZE], a1
  656. LDF [AO + 9 * SIZE], a2
  657. LDF [AO + 10 * SIZE], a3
  658. LDF [AO + 11 * SIZE], a4
  659. LDF [AO + 12 * SIZE], b1
  660. LDF [AO + 13 * SIZE], b2
  661. LDF [AO + 14 * SIZE], b3
  662. LDF [AO + 15 * SIZE], b4
  663. FSUB a1, c09, c09
  664. FSUB a2, c10, c10
  665. FSUB a3, c11, c11
  666. FSUB a4, c12, c12
  667. FSUB b1, c13, c13
  668. FSUB b2, c14, c14
  669. FSUB b3, c15, c15
  670. FSUB b4, c16, c16
  671. #endif
  672. #ifdef LN
  673. LDF [AO + 15 * SIZE], a1
  674. LDF [AO + 14 * SIZE], a2
  675. LDF [AO + 13 * SIZE], a3
  676. LDF [AO + 12 * SIZE], a4
  677. FMUL a1, c04, c04
  678. FMUL a1, c08, c08
  679. FMUL a1, c12, c12
  680. FMUL a1, c16, c16
  681. FMUL a2, c04, t1
  682. FMUL a2, c08, t2
  683. FMUL a2, c12, t3
  684. FMUL a2, c16, t4
  685. FSUB c03, t1, c03
  686. FSUB c07, t2, c07
  687. FSUB c11, t3, c11
  688. FSUB c15, t4, c15
  689. FMUL a3, c04, t1
  690. FMUL a3, c08, t2
  691. FMUL a3, c12, t3
  692. FMUL a3, c16, t4
  693. FSUB c02, t1, c02
  694. FSUB c06, t2, c06
  695. FSUB c10, t3, c10
  696. FSUB c14, t4, c14
  697. FMUL a4, c04, t1
  698. FMUL a4, c08, t2
  699. FMUL a4, c12, t3
  700. FMUL a4, c16, t4
  701. FSUB c01, t1, c01
  702. FSUB c05, t2, c05
  703. FSUB c09, t3, c09
  704. FSUB c13, t4, c13
  705. LDF [AO + 10 * SIZE], a1
  706. LDF [AO + 9 * SIZE], a2
  707. LDF [AO + 8 * SIZE], a3
  708. FMUL a1, c03, c03
  709. FMUL a1, c07, c07
  710. FMUL a1, c11, c11
  711. FMUL a1, c15, c15
  712. FMUL a2, c03, t1
  713. FMUL a2, c07, t2
  714. FMUL a2, c11, t3
  715. FMUL a2, c15, t4
  716. FSUB c02, t1, c02
  717. FSUB c06, t2, c06
  718. FSUB c10, t3, c10
  719. FSUB c14, t4, c14
  720. FMUL a3, c03, t1
  721. FMUL a3, c07, t2
  722. FMUL a3, c11, t3
  723. FMUL a3, c15, t4
  724. FSUB c01, t1, c01
  725. FSUB c05, t2, c05
  726. FSUB c09, t3, c09
  727. FSUB c13, t4, c13
  728. LDF [AO + 5 * SIZE], a1
  729. LDF [AO + 4 * SIZE], a2
  730. FMUL a1, c02, c02
  731. FMUL a1, c06, c06
  732. FMUL a1, c10, c10
  733. FMUL a1, c14, c14
  734. FMUL a2, c02, t1
  735. FMUL a2, c06, t2
  736. FMUL a2, c10, t3
  737. FMUL a2, c14, t4
  738. FSUB c01, t1, c01
  739. FSUB c05, t2, c05
  740. FSUB c09, t3, c09
  741. FSUB c13, t4, c13
  742. LDF [AO + 0 * SIZE], a1
  743. FMUL a1, c01, c01
  744. FMUL a1, c05, c05
  745. FMUL a1, c09, c09
  746. FMUL a1, c13, c13
  747. #endif
  748. #ifdef LT
  749. LDF [AO + 0 * SIZE], a1
  750. LDF [AO + 1 * SIZE], a2
  751. LDF [AO + 2 * SIZE], a3
  752. LDF [AO + 3 * SIZE], a4
  753. FMUL a1, c01, c01
  754. FMUL a1, c05, c05
  755. FMUL a1, c09, c09
  756. FMUL a1, c13, c13
  757. FMUL a2, c01, t1
  758. FMUL a2, c05, t2
  759. FMUL a2, c09, t3
  760. FMUL a2, c13, t4
  761. FSUB c02, t1, c02
  762. FSUB c06, t2, c06
  763. FSUB c10, t3, c10
  764. FSUB c14, t4, c14
  765. FMUL a3, c01, t1
  766. FMUL a3, c05, t2
  767. FMUL a3, c09, t3
  768. FMUL a3, c13, t4
  769. FSUB c03, t1, c03
  770. FSUB c07, t2, c07
  771. FSUB c11, t3, c11
  772. FSUB c15, t4, c15
  773. FMUL a4, c01, t1
  774. FMUL a4, c05, t2
  775. FMUL a4, c09, t3
  776. FMUL a4, c13, t4
  777. FSUB c04, t1, c04
  778. FSUB c08, t2, c08
  779. FSUB c12, t3, c12
  780. FSUB c16, t4, c16
  781. LDF [AO + 5 * SIZE], a1
  782. LDF [AO + 6 * SIZE], a2
  783. LDF [AO + 7 * SIZE], a3
  784. FMUL a1, c02, c02
  785. FMUL a1, c06, c06
  786. FMUL a1, c10, c10
  787. FMUL a1, c14, c14
  788. FMUL a2, c02, t1
  789. FMUL a2, c06, t2
  790. FMUL a2, c10, t3
  791. FMUL a2, c14, t4
  792. FSUB c03, t1, c03
  793. FSUB c07, t2, c07
  794. FSUB c11, t3, c11
  795. FSUB c15, t4, c15
  796. FMUL a3, c02, t1
  797. FMUL a3, c06, t2
  798. FMUL a3, c10, t3
  799. FMUL a3, c14, t4
  800. FSUB c04, t1, c04
  801. FSUB c08, t2, c08
  802. FSUB c12, t3, c12
  803. FSUB c16, t4, c16
  804. LDF [AO + 10 * SIZE], a1
  805. LDF [AO + 11 * SIZE], a2
  806. FMUL a1, c03, c03
  807. FMUL a1, c07, c07
  808. FMUL a1, c11, c11
  809. FMUL a1, c15, c15
  810. FMUL a2, c03, t1
  811. FMUL a2, c07, t2
  812. FMUL a2, c11, t3
  813. FMUL a2, c15, t4
  814. FSUB c04, t1, c04
  815. FSUB c08, t2, c08
  816. FSUB c12, t3, c12
  817. FSUB c16, t4, c16
  818. LDF [AO + 15 * SIZE], a1
  819. FMUL a1, c04, c04
  820. FMUL a1, c08, c08
  821. FMUL a1, c12, c12
  822. FMUL a1, c16, c16
  823. #endif
  824. #ifdef RN
  825. LDF [BO + 0 * SIZE], a1
  826. LDF [BO + 1 * SIZE], a2
  827. LDF [BO + 2 * SIZE], a3
  828. LDF [BO + 3 * SIZE], a4
  829. FMUL a1, c01, c01
  830. FMUL a1, c02, c02
  831. FMUL a1, c03, c03
  832. FMUL a1, c04, c04
  833. FMUL a2, c01, t1
  834. FMUL a2, c02, t2
  835. FMUL a2, c03, t3
  836. FMUL a2, c04, t4
  837. FSUB c05, t1, c05
  838. FSUB c06, t2, c06
  839. FSUB c07, t3, c07
  840. FSUB c08, t4, c08
  841. FMUL a3, c01, t1
  842. FMUL a3, c02, t2
  843. FMUL a3, c03, t3
  844. FMUL a3, c04, t4
  845. FSUB c09, t1, c09
  846. FSUB c10, t2, c10
  847. FSUB c11, t3, c11
  848. FSUB c12, t4, c12
  849. FMUL a4, c01, t1
  850. FMUL a4, c02, t2
  851. FMUL a4, c03, t3
  852. FMUL a4, c04, t4
  853. FSUB c13, t1, c13
  854. FSUB c14, t2, c14
  855. FSUB c15, t3, c15
  856. FSUB c16, t4, c16
  857. LDF [BO + 5 * SIZE], a1
  858. LDF [BO + 6 * SIZE], a2
  859. LDF [BO + 7 * SIZE], a3
  860. FMUL a1, c05, c05
  861. FMUL a1, c06, c06
  862. FMUL a1, c07, c07
  863. FMUL a1, c08, c08
  864. FMUL a2, c05, t1
  865. FMUL a2, c06, t2
  866. FMUL a2, c07, t3
  867. FMUL a2, c08, t4
  868. FSUB c09, t1, c09
  869. FSUB c10, t2, c10
  870. FSUB c11, t3, c11
  871. FSUB c12, t4, c12
  872. FMUL a3, c05, t1
  873. FMUL a3, c06, t2
  874. FMUL a3, c07, t3
  875. FMUL a3, c08, t4
  876. FSUB c13, t1, c13
  877. FSUB c14, t2, c14
  878. FSUB c15, t3, c15
  879. FSUB c16, t4, c16
  880. LDF [BO + 10 * SIZE], a1
  881. LDF [BO + 11 * SIZE], a2
  882. FMUL a1, c09, c09
  883. FMUL a1, c10, c10
  884. FMUL a1, c11, c11
  885. FMUL a1, c12, c12
  886. FMUL a2, c09, t1
  887. FMUL a2, c10, t2
  888. FMUL a2, c11, t3
  889. FMUL a2, c12, t4
  890. FSUB c13, t1, c13
  891. FSUB c14, t2, c14
  892. FSUB c15, t3, c15
  893. FSUB c16, t4, c16
  894. LDF [BO + 15 * SIZE], a1
  895. FMUL a1, c13, c13
  896. FMUL a1, c14, c14
  897. FMUL a1, c15, c15
  898. FMUL a1, c16, c16
  899. #endif
  900. #ifdef RT
  901. LDF [BO + 15 * SIZE], a1
  902. LDF [BO + 14 * SIZE], a2
  903. LDF [BO + 13 * SIZE], a3
  904. LDF [BO + 12 * SIZE], a4
  905. FMUL a1, c13, c13
  906. FMUL a1, c14, c14
  907. FMUL a1, c15, c15
  908. FMUL a1, c16, c16
  909. FMUL a2, c13, t1
  910. FMUL a2, c14, t2
  911. FMUL a2, c15, t3
  912. FMUL a2, c16, t4
  913. FSUB c09, t1, c09
  914. FSUB c10, t2, c10
  915. FSUB c11, t3, c11
  916. FSUB c12, t4, c12
  917. FMUL a3, c13, t1
  918. FMUL a3, c14, t2
  919. FMUL a3, c15, t3
  920. FMUL a3, c16, t4
  921. FSUB c05, t1, c05
  922. FSUB c06, t2, c06
  923. FSUB c07, t3, c07
  924. FSUB c08, t4, c08
  925. FMUL a4, c13, t1
  926. FMUL a4, c14, t2
  927. FMUL a4, c15, t3
  928. FMUL a4, c16, t4
  929. FSUB c01, t1, c01
  930. FSUB c02, t2, c02
  931. FSUB c03, t3, c03
  932. FSUB c04, t4, c04
  933. LDF [BO + 10 * SIZE], a1
  934. LDF [BO + 9 * SIZE], a2
  935. LDF [BO + 8 * SIZE], a3
  936. FMUL a1, c09, c09
  937. FMUL a1, c10, c10
  938. FMUL a1, c11, c11
  939. FMUL a1, c12, c12
  940. FMUL a2, c09, t1
  941. FMUL a2, c10, t2
  942. FMUL a2, c11, t3
  943. FMUL a2, c12, t4
  944. FSUB c05, t1, c05
  945. FSUB c06, t2, c06
  946. FSUB c07, t3, c07
  947. FSUB c08, t4, c08
  948. FMUL a3, c09, t1
  949. FMUL a3, c10, t2
  950. FMUL a3, c11, t3
  951. FMUL a3, c12, t4
  952. FSUB c01, t1, c01
  953. FSUB c02, t2, c02
  954. FSUB c03, t3, c03
  955. FSUB c04, t4, c04
  956. LDF [BO + 5 * SIZE], a1
  957. LDF [BO + 4 * SIZE], a2
  958. FMUL a1, c05, c05
  959. FMUL a1, c06, c06
  960. FMUL a1, c07, c07
  961. FMUL a1, c08, c08
  962. FMUL a2, c05, t1
  963. FMUL a2, c06, t2
  964. FMUL a2, c07, t3
  965. FMUL a2, c08, t4
  966. FSUB c01, t1, c01
  967. FSUB c02, t2, c02
  968. FSUB c03, t3, c03
  969. FSUB c04, t4, c04
  970. LDF [BO + 0 * SIZE], a1
  971. FMUL a1, c01, c01
  972. FMUL a1, c02, c02
  973. FMUL a1, c03, c03
  974. FMUL a1, c04, c04
  975. #endif
  976. #ifdef LN
  977. add C1, -4 * SIZE, C1
  978. add C2, -4 * SIZE, C2
  979. add C3, -4 * SIZE, C3
  980. add C4, -4 * SIZE, C4
  981. #endif
  982. #if defined(LN) || defined(LT)
  983. STF c01, [BO + 0 * SIZE]
  984. STF c05, [BO + 1 * SIZE]
  985. STF c09, [BO + 2 * SIZE]
  986. STF c13, [BO + 3 * SIZE]
  987. STF c02, [BO + 4 * SIZE]
  988. STF c06, [BO + 5 * SIZE]
  989. STF c10, [BO + 6 * SIZE]
  990. STF c14, [BO + 7 * SIZE]
  991. STF c03, [BO + 8 * SIZE]
  992. STF c07, [BO + 9 * SIZE]
  993. STF c11, [BO + 10 * SIZE]
  994. STF c15, [BO + 11 * SIZE]
  995. STF c04, [BO + 12 * SIZE]
  996. STF c08, [BO + 13 * SIZE]
  997. STF c12, [BO + 14 * SIZE]
  998. STF c16, [BO + 15 * SIZE]
  999. #else
  1000. STF c01, [AO + 0 * SIZE]
  1001. STF c02, [AO + 1 * SIZE]
  1002. STF c03, [AO + 2 * SIZE]
  1003. STF c04, [AO + 3 * SIZE]
  1004. STF c05, [AO + 4 * SIZE]
  1005. STF c06, [AO + 5 * SIZE]
  1006. STF c07, [AO + 6 * SIZE]
  1007. STF c08, [AO + 7 * SIZE]
  1008. STF c09, [AO + 8 * SIZE]
  1009. STF c10, [AO + 9 * SIZE]
  1010. STF c11, [AO + 10 * SIZE]
  1011. STF c12, [AO + 11 * SIZE]
  1012. STF c13, [AO + 12 * SIZE]
  1013. STF c14, [AO + 13 * SIZE]
  1014. STF c15, [AO + 14 * SIZE]
  1015. STF c16, [AO + 15 * SIZE]
  1016. #endif
  1017. STF c01, [C1 + 0 * SIZE]
  1018. STF c02, [C1 + 1 * SIZE]
  1019. STF c03, [C1 + 2 * SIZE]
  1020. STF c04, [C1 + 3 * SIZE]
  1021. STF c05, [C2 + 0 * SIZE]
  1022. STF c06, [C2 + 1 * SIZE]
  1023. STF c07, [C2 + 2 * SIZE]
  1024. STF c08, [C2 + 3 * SIZE]
  1025. STF c09, [C3 + 0 * SIZE]
  1026. STF c10, [C3 + 1 * SIZE]
  1027. STF c11, [C3 + 2 * SIZE]
  1028. STF c12, [C3 + 3 * SIZE]
  1029. STF c13, [C4 + 0 * SIZE]
  1030. STF c14, [C4 + 1 * SIZE]
  1031. STF c15, [C4 + 2 * SIZE]
  1032. STF c16, [C4 + 3 * SIZE]
  1033. FMOV FZERO, t1
  1034. FMOV FZERO, t2
  1035. FMOV FZERO, t3
  1036. FMOV FZERO, t4
  1037. #ifndef LN
  1038. add C1, 4 * SIZE, C1
  1039. add C2, 4 * SIZE, C2
  1040. add C3, 4 * SIZE, C3
  1041. add C4, 4 * SIZE, C4
  1042. #endif
  1043. #ifdef RT
  1044. sll K, 2 + BASE_SHIFT, TEMP1
  1045. add AORIG, TEMP1, AORIG
  1046. #endif
  1047. #if defined(LT) || defined(RN)
  1048. sub K, KK, TEMP1
  1049. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  1050. add AO, TEMP1, AO
  1051. add BO, TEMP1, BO
  1052. #endif
  1053. #ifdef LT
  1054. add KK, 4, KK
  1055. #endif
  1056. #ifdef LN
  1057. sub KK, 4, KK
  1058. #endif
  1059. add I, -1, I
  1060. cmp I, 0
  1061. bg,pt %icc, .LL21
  1062. FMOV FZERO, c01
  1063. .LL50:
  1064. and M, 2, I
  1065. cmp I, 0
  1066. ble,pn %icc, .LL70
  1067. nop
  1068. #if defined(LT) || defined(RN)
  1069. sra KK, 2, L
  1070. mov B, BO
  1071. cmp L, 0
  1072. #else
  1073. #ifdef LN
  1074. sll K, 1 + BASE_SHIFT, TEMP1
  1075. sub AORIG, TEMP1, AORIG
  1076. #endif
  1077. sll KK, 1 + BASE_SHIFT, TEMP1
  1078. sll KK, 2 + BASE_SHIFT, TEMP2
  1079. add AORIG, TEMP1, AO
  1080. add B, TEMP2, BO
  1081. sub K, KK, TEMP1
  1082. sra TEMP1, 2, L
  1083. cmp L, 0
  1084. #endif
  1085. FMOV FZERO, c02
  1086. FMOV FZERO, t1
  1087. FMOV FZERO, c04
  1088. LDF [AO + 0 * SIZE], a1
  1089. FMOV FZERO, t2
  1090. LDF [BO + 0 * SIZE], b1
  1091. FMOV FZERO, c06
  1092. LDF [AO + 1 * SIZE], a2
  1093. FMOV FZERO, t3
  1094. LDF [BO + 1 * SIZE], b2
  1095. FMOV FZERO, c08
  1096. LDF [AO + 2 * SIZE], a3
  1097. FMOV FZERO, t4
  1098. LDF [BO + 2 * SIZE], b3
  1099. FMOV FZERO, c01
  1100. LDF [AO + 3 * SIZE], a4
  1101. FMOV FZERO, c03
  1102. LDF [BO + 3 * SIZE], b4
  1103. FMOV FZERO, c05
  1104. ble,pn %icc, .LL55
  1105. FMOV FZERO, c07
  1106. .LL52:
  1107. FADD c02, t1, c02
  1108. add AO, 8 * SIZE, AO
  1109. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1110. FMUL a1, b1, t1
  1111. add BO, 16 * SIZE, BO
  1112. FADD c04, t2, c04
  1113. add L, -1, L
  1114. FMUL a1, b2, t2
  1115. FADD c06, t3, c06
  1116. cmp L, 0
  1117. FMUL a1, b3, t3
  1118. FADD c08, t4, c08
  1119. FMUL a1, b4, t4
  1120. LDF [AO - 4 * SIZE], a1
  1121. FADD c01, t1, c01
  1122. FMUL a2, b1, t1
  1123. LDF [BO - 12 * SIZE], b1
  1124. FADD c03, t2, c03
  1125. FMUL a2, b2, t2
  1126. LDF [BO - 11 * SIZE], b2
  1127. FADD c05, t3, c05
  1128. FMUL a2, b3, t3
  1129. LDF [BO - 10 * SIZE], b3
  1130. FADD c07, t4, c07
  1131. FMUL a2, b4, t4
  1132. LDF [BO - 9 * SIZE], b4
  1133. FADD c02, t1, c02
  1134. FMUL a3, b1, t1
  1135. LDF [AO - 3 * SIZE], a2
  1136. FADD c04, t2, c04
  1137. FMUL a3, b2, t2
  1138. FADD c06, t3, c06
  1139. FMUL a3, b3, t3
  1140. FADD c08, t4, c08
  1141. FMUL a3, b4, t4
  1142. LDF [AO - 2 * SIZE], a3
  1143. FADD c01, t1, c01
  1144. FMUL a4, b1, t1
  1145. LDF [BO - 8 * SIZE], b1
  1146. FADD c03, t2, c03
  1147. FMUL a4, b2, t2
  1148. LDF [BO - 7 * SIZE], b2
  1149. FADD c05, t3, c05
  1150. FMUL a4, b3, t3
  1151. LDF [BO - 6 * SIZE], b3
  1152. FADD c07, t4, c07
  1153. FMUL a4, b4, t4
  1154. LDF [BO - 5 * SIZE], b4
  1155. FADD c02, t1, c02
  1156. FMUL a1, b1, t1
  1157. LDF [AO - 1 * SIZE], a4
  1158. FADD c04, t2, c04
  1159. FMUL a1, b2, t2
  1160. FADD c06, t3, c06
  1161. FMUL a1, b3, t3
  1162. FADD c08, t4, c08
  1163. FMUL a1, b4, t4
  1164. LDF [AO + 0 * SIZE], a1
  1165. FADD c01, t1, c01
  1166. FMUL a2, b1, t1
  1167. LDF [BO - 4 * SIZE], b1
  1168. FADD c03, t2, c03
  1169. FMUL a2, b2, t2
  1170. LDF [BO - 3 * SIZE], b2
  1171. FADD c05, t3, c05
  1172. FMUL a2, b3, t3
  1173. LDF [BO - 2 * SIZE], b3
  1174. FADD c07, t4, c07
  1175. FMUL a2, b4, t4
  1176. LDF [BO - 1 * SIZE], b4
  1177. FADD c02, t1, c02
  1178. FMUL a3, b1, t1
  1179. LDF [AO + 1 * SIZE], a2
  1180. FADD c04, t2, c04
  1181. FMUL a3, b2, t2
  1182. FADD c06, t3, c06
  1183. FMUL a3, b3, t3
  1184. FADD c08, t4, c08
  1185. FMUL a3, b4, t4
  1186. LDF [AO + 2 * SIZE], a3
  1187. FADD c01, t1, c01
  1188. FMUL a4, b1, t1
  1189. LDF [BO + 0 * SIZE], b1
  1190. FADD c03, t2, c03
  1191. FMUL a4, b2, t2
  1192. LDF [BO + 1 * SIZE], b2
  1193. FADD c05, t3, c05
  1194. FMUL a4, b3, t3
  1195. LDF [BO + 2 * SIZE], b3
  1196. FADD c07, t4, c07
  1197. FMUL a4, b4, t4
  1198. LDF [BO + 3 * SIZE], b4
  1199. bg,pt %icc, .LL52
  1200. LDF [AO + 3 * SIZE], a4
  1201. .LL55:
  1202. #if defined(LT) || defined(RN)
  1203. and KK, 3, L
  1204. #else
  1205. and TEMP1, 3, L
  1206. #endif
  1207. cmp L, 0
  1208. ble,a,pn %icc, .LL59
  1209. nop
  1210. .LL56:
  1211. FADD c02, t1, c02
  1212. add AO, 2 * SIZE, AO
  1213. FMUL a1, b1, t1
  1214. add L, -1, L
  1215. add BO, 4 * SIZE, BO
  1216. FADD c04, t2, c04
  1217. cmp L, 0
  1218. FMUL a1, b2, t2
  1219. FADD c06, t3, c06
  1220. FMUL a1, b3, t3
  1221. FADD c08, t4, c08
  1222. FMUL a1, b4, t4
  1223. LDF [AO + 0 * SIZE], a1
  1224. FADD c01, t1, c01
  1225. FMUL a2, b1, t1
  1226. LDF [BO + 0 * SIZE], b1
  1227. FADD c03, t2, c03
  1228. FMUL a2, b2, t2
  1229. LDF [BO + 1 * SIZE], b2
  1230. FADD c05, t3, c05
  1231. FMUL a2, b3, t3
  1232. LDF [BO + 2 * SIZE], b3
  1233. FADD c07, t4, c07
  1234. FMUL a2, b4, t4
  1235. LDF [BO + 3 * SIZE], b4
  1236. bg,pt %icc, .LL56
  1237. LDF [AO + 1 * SIZE], a2
  1238. .LL59:
  /* 2x4 tile epilogue: fold in the last pending products, form          */
  /* (packed value - accumulated product), apply the triangular solve    */
  /* selected by LN / LT / RN / RT, write the result to the packed       */
  /* buffer and to C, then step the pointers and KK for the next tile.   */
  1239. #if defined(LN) || defined(RT)
  /* Re-derive AO and BO from AORIG and B.  The k offset is KK - 2 for   */
  /* LN and KK - 4 for RT, shifted by 1 + BASE_SHIFT for AO (2 values    */
  /* per k-step) and 2 + BASE_SHIFT for BO (4 values per k-step).        */
  1240. #ifdef LN
  1241. sub KK, 2, TEMP1
  1242. #else
  1243. sub KK, 4, TEMP1
  1244. #endif
  1245. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  1246. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  1247. add AORIG, TEMP2, AO
  1248. add B, TEMP1, BO
  1249. #endif
  /* Drain the pipeline: add the products left pending by the loops.     */
  1250. FADD c02, t1, c02
  1251. FADD c04, t2, c04
  1252. FADD c06, t3, c06
  1253. FADD c08, t4, c08
  1254. #if defined(LN) || defined(LT)
  /* LN/LT: the eight tile values are read from BO, stored as            */
  /* c01,c03,c05,c07 followed by c02,c04,c06,c08.  cXX = loaded - cXX.   */
  1255. LDF [BO + 0 * SIZE], a1
  1256. LDF [BO + 1 * SIZE], a2
  1257. LDF [BO + 2 * SIZE], a3
  1258. LDF [BO + 3 * SIZE], a4
  1259. LDF [BO + 4 * SIZE], b1
  1260. LDF [BO + 5 * SIZE], b2
  1261. LDF [BO + 6 * SIZE], b3
  1262. LDF [BO + 7 * SIZE], b4
  1263. FSUB a1, c01, c01
  1264. FSUB a2, c03, c03
  1265. FSUB a3, c05, c05
  1266. FSUB a4, c07, c07
  1267. FSUB b1, c02, c02
  1268. FSUB b2, c04, c04
  1269. FSUB b3, c06, c06
  1270. FSUB b4, c08, c08
  1271. #else
  /* RN/RT: the eight tile values are read from AO in c01..c08 order.    */
  1272. LDF [AO + 0 * SIZE], a1
  1273. LDF [AO + 1 * SIZE], a2
  1274. LDF [AO + 2 * SIZE], a3
  1275. LDF [AO + 3 * SIZE], a4
  1276. LDF [AO + 4 * SIZE], b1
  1277. LDF [AO + 5 * SIZE], b2
  1278. LDF [AO + 6 * SIZE], b3
  1279. LDF [AO + 7 * SIZE], b4
  1280. FSUB a1, c01, c01
  1281. FSUB a2, c02, c02
  1282. FSUB a3, c03, c03
  1283. FSUB a4, c04, c04
  1284. FSUB b1, c05, c05
  1285. FSUB b2, c06, c06
  1286. FSUB b3, c07, c07
  1287. FSUB b4, c08, c08
  1288. #endif
  /* NOTE(review): every solve below MULTIPLIES by the diagonal entry,   */
  /* so the packed triangle presumably holds reciprocals of the          */
  /* diagonal - confirm against the packing routine.                     */
  1289. #ifdef LN
  /* LN: 2x2 triangle read from AO[3], AO[2], AO[0].  The even-numbered  */
  /* accumulators are scaled first, then eliminated from the odd ones.   */
  1290. LDF [AO + 3 * SIZE], a1
  1291. LDF [AO + 2 * SIZE], a2
  1292. LDF [AO + 0 * SIZE], a3
  1293. FMUL a1, c02, c02
  1294. FMUL a1, c04, c04
  1295. FMUL a1, c06, c06
  1296. FMUL a1, c08, c08
  1297. FMUL a2, c02, t1
  1298. FMUL a2, c04, t2
  1299. FMUL a2, c06, t3
  1300. FMUL a2, c08, t4
  1301. FSUB c01, t1, c01
  1302. FSUB c03, t2, c03
  1303. FSUB c05, t3, c05
  1304. FSUB c07, t4, c07
  1305. FMUL a3, c01, c01
  1306. FMUL a3, c03, c03
  1307. FMUL a3, c05, c05
  1308. FMUL a3, c07, c07
  1309. #endif
  1310. #ifdef LT
  /* LT: 2x2 triangle read from AO[0], AO[1], AO[3].  The odd-numbered   */
  /* accumulators are scaled first, then eliminated from the even ones.  */
  1311. LDF [AO + 0 * SIZE], a1
  1312. LDF [AO + 1 * SIZE], a2
  1313. LDF [AO + 3 * SIZE], a3
  1314. FMUL a1, c01, c01
  1315. FMUL a1, c03, c03
  1316. FMUL a1, c05, c05
  1317. FMUL a1, c07, c07
  1318. FMUL a2, c01, t1
  1319. FMUL a2, c03, t2
  1320. FMUL a2, c05, t3
  1321. FMUL a2, c07, t4
  1322. FSUB c02, t1, c02
  1323. FSUB c04, t2, c04
  1324. FSUB c06, t3, c06
  1325. FSUB c08, t4, c08
  1326. FMUL a3, c02, c02
  1327. FMUL a3, c04, c04
  1328. FMUL a3, c06, c06
  1329. FMUL a3, c08, c08
  1330. #endif
  1331. #ifdef RN
  /* RN: 4x4 triangle read from BO[0..3], BO[5..7], BO[10..11], BO[15].  */
  /* Pairs are solved in the order (c01,c02), (c03,c04), (c05,c06),      */
  /* (c07,c08); each solved pair is eliminated from the later pairs.     */
  1332. LDF [BO + 0 * SIZE], a1
  1333. LDF [BO + 1 * SIZE], a2
  1334. LDF [BO + 2 * SIZE], a3
  1335. LDF [BO + 3 * SIZE], a4
  1336. FMUL a1, c01, c01
  1337. FMUL a1, c02, c02
  1338. FMUL a2, c01, t1
  1339. FMUL a2, c02, t2
  1340. FSUB c03, t1, c03
  1341. FSUB c04, t2, c04
  1342. FMUL a3, c01, t1
  1343. FMUL a3, c02, t2
  1344. FSUB c05, t1, c05
  1345. FSUB c06, t2, c06
  1346. FMUL a4, c01, t1
  1347. FMUL a4, c02, t2
  1348. FSUB c07, t1, c07
  1349. FSUB c08, t2, c08
  1350. LDF [BO + 5 * SIZE], a1
  1351. LDF [BO + 6 * SIZE], a2
  1352. LDF [BO + 7 * SIZE], a3
  1353. FMUL a1, c03, c03
  1354. FMUL a1, c04, c04
  1355. FMUL a2, c03, t1
  1356. FMUL a2, c04, t2
  1357. FSUB c05, t1, c05
  1358. FSUB c06, t2, c06
  1359. FMUL a3, c03, t1
  1360. FMUL a3, c04, t2
  1361. FSUB c07, t1, c07
  1362. FSUB c08, t2, c08
  1363. LDF [BO + 10 * SIZE], a1
  1364. LDF [BO + 11 * SIZE], a2
  1365. FMUL a1, c05, c05
  1366. FMUL a1, c06, c06
  1367. FMUL a2, c05, t1
  1368. FMUL a2, c06, t2
  1369. FSUB c07, t1, c07
  1370. FSUB c08, t2, c08
  1371. LDF [BO + 15 * SIZE], a1
  1372. FMUL a1, c07, c07
  1373. FMUL a1, c08, c08
  1374. #endif
  1375. #ifdef RT
  /* RT: same 4x4 solve walked in the opposite direction, reading        */
  /* BO[15..12], BO[10..8], BO[5..4], BO[0]; pairs are solved in the     */
  /* order (c07,c08), (c05,c06), (c03,c04), (c01,c02).                   */
  1376. LDF [BO + 15 * SIZE], a1
  1377. LDF [BO + 14 * SIZE], a2
  1378. LDF [BO + 13 * SIZE], a3
  1379. LDF [BO + 12 * SIZE], a4
  1380. FMUL a1, c07, c07
  1381. FMUL a1, c08, c08
  1382. FMUL a2, c07, t1
  1383. FMUL a2, c08, t2
  1384. FSUB c05, t1, c05
  1385. FSUB c06, t2, c06
  1386. FMUL a3, c07, t1
  1387. FMUL a3, c08, t2
  1388. FSUB c03, t1, c03
  1389. FSUB c04, t2, c04
  1390. FMUL a4, c07, t1
  1391. FMUL a4, c08, t2
  1392. FSUB c01, t1, c01
  1393. FSUB c02, t2, c02
  1394. LDF [BO + 10 * SIZE], a1
  1395. LDF [BO + 9 * SIZE], a2
  1396. LDF [BO + 8 * SIZE], a3
  1397. FMUL a1, c05, c05
  1398. FMUL a1, c06, c06
  1399. FMUL a2, c05, t1
  1400. FMUL a2, c06, t2
  1401. FSUB c03, t1, c03
  1402. FSUB c04, t2, c04
  1403. FMUL a3, c05, t1
  1404. FMUL a3, c06, t2
  1405. FSUB c01, t1, c01
  1406. FSUB c02, t2, c02
  1407. LDF [BO + 5 * SIZE], a1
  1408. LDF [BO + 4 * SIZE], a2
  1409. FMUL a1, c03, c03
  1410. FMUL a1, c04, c04
  1411. FMUL a2, c03, t1
  1412. FMUL a2, c04, t2
  1413. FSUB c01, t1, c01
  1414. FSUB c02, t2, c02
  1415. LDF [BO + 0 * SIZE], a1
  1416. FMUL a1, c01, c01
  1417. FMUL a1, c02, c02
  1418. #endif
  1419. #ifdef LN
  /* LN moves the C pointers back by the tile height BEFORE storing.     */
  1420. add C1, -2 * SIZE, C1
  1421. add C2, -2 * SIZE, C2
  1422. add C3, -2 * SIZE, C3
  1423. add C4, -2 * SIZE, C4
  1424. #endif
  /* Write the solved values back to the packed buffer they were read    */
  /* from (same element order as the loads above) ...                    */
  1425. #if defined(LN) || defined(LT)
  1426. STF c01, [BO + 0 * SIZE]
  1427. STF c03, [BO + 1 * SIZE]
  1428. STF c05, [BO + 2 * SIZE]
  1429. STF c07, [BO + 3 * SIZE]
  1430. STF c02, [BO + 4 * SIZE]
  1431. STF c04, [BO + 5 * SIZE]
  1432. STF c06, [BO + 6 * SIZE]
  1433. STF c08, [BO + 7 * SIZE]
  1434. #else
  1435. STF c01, [AO + 0 * SIZE]
  1436. STF c02, [AO + 1 * SIZE]
  1437. STF c03, [AO + 2 * SIZE]
  1438. STF c04, [AO + 3 * SIZE]
  1439. STF c05, [AO + 4 * SIZE]
  1440. STF c06, [AO + 5 * SIZE]
  1441. STF c07, [AO + 6 * SIZE]
  1442. STF c08, [AO + 7 * SIZE]
  1443. #endif
  /* ... and to C: two consecutive values through each of C1..C4.        */
  1444. STF c01, [C1 + 0 * SIZE]
  1445. STF c02, [C1 + 1 * SIZE]
  1446. STF c03, [C2 + 0 * SIZE]
  1447. STF c04, [C2 + 1 * SIZE]
  1448. STF c05, [C3 + 0 * SIZE]
  1449. STF c06, [C3 + 1 * SIZE]
  1450. STF c07, [C4 + 0 * SIZE]
  1451. STF c08, [C4 + 1 * SIZE]
  /* Clear the product temporaries for the next tile's pipelined loop.   */
  1452. FMOV FZERO, t1
  1453. FMOV FZERO, t2
  1454. FMOV FZERO, t3
  1455. FMOV FZERO, t4
  1456. #ifndef LN
  1457. add C1, 2 * SIZE, C1
  1458. add C2, 2 * SIZE, C2
  1459. add C3, 2 * SIZE, C3
  1460. add C4, 2 * SIZE, C4
  1461. #endif
  1462. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT).                                 */
  1463. sll K, 1 + BASE_SHIFT, TEMP1
  1464. add AORIG, TEMP1, AORIG
  1465. #endif
  1466. #if defined(LT) || defined(RN)
  /* LT/RN: step AO and BO over the remaining K - KK k-steps.            */
  1467. sub K, KK, TEMP1
  1468. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  1469. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  1470. add AO, TEMP2, AO
  1471. add BO, TEMP1, BO
  1472. #endif
  1473. #ifdef LT
  1474. add KK, 2, KK
  1475. #endif
  1476. #ifdef LN
  1477. sub KK, 2, KK
  1478. #endif
  1479. .LL70:
  /* 1x4 tile: taken only when bit 0 of M is set, otherwise skip to      */
  /* .LL99.  Sets up AO/BO and the trip count, preloads a1..a4 and       */
  /* b1..b4, and zeroes the accumulators c01..c04 and temporaries        */
  /* t1..t4.                                                             */
  1480. and M, 1, I
  1481. cmp I, 0
  1482. ble,pn %icc, .LL99
  1483. nop
  1484. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK / 4 iterations of the unrolled loop; BO restarts at   */
  /* B.  AO is not reset here, it carries on from the previous tile.     */
  1485. sra KK, 2, L
  1486. mov B, BO
  1487. cmp L, 0
  1488. #else
  1489. #ifdef LN
  /* LN: AORIG -= K << BASE_SHIFT.                                       */
  1490. sll K, 0 + BASE_SHIFT, TEMP1
  1491. sub AORIG, TEMP1, AORIG
  1492. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT),                             */
  /* BO = B + (KK << (2 + BASE_SHIFT)), L = (K - KK) / 4.                */
  /* TEMP1 keeps K - KK for the remainder count taken at .LL75.          */
  1493. sll KK, 0 + BASE_SHIFT, TEMP1
  1494. sll KK, 2 + BASE_SHIFT, TEMP2
  1495. add AORIG, TEMP1, AO
  1496. add B, TEMP2, BO
  1497. sub K, KK, TEMP1
  1498. sra TEMP1, 2, L
  1499. cmp L, 0
  1500. #endif
  /* No integer instruction follows the cmp above, so its condition      */
  /* codes are still the ones tested by the ble below.                   */
  1501. LDF [AO + 0 * SIZE], a1
  1502. FMOV FZERO, c01
  1503. LDF [BO + 0 * SIZE], b1
  1504. FMOV FZERO, t1
  1505. LDF [AO + 1 * SIZE], a2
  1506. FMOV FZERO, c02
  1507. LDF [BO + 1 * SIZE], b2
  1508. FMOV FZERO, t2
  1509. LDF [AO + 2 * SIZE], a3
  1510. FMOV FZERO, c03
  1511. LDF [BO + 2 * SIZE], b3
  1512. FMOV FZERO, t3
  1513. LDF [AO + 3 * SIZE], a4
  1514. FMOV FZERO, c04
  1515. LDF [BO + 3 * SIZE], b4
  1516. FMOV FZERO, t4
  1517. ble,pn %icc, .LL75 /* L <= 0: skip the unrolled loop */
  1518. nop
  1519. .LL72:
  /* 1x4 tile main loop, four k-steps per iteration: AO advances by 4    */
  /* elements and BO by 16.  a1..a4 hold the A values of the four        */
  /* steps; b1..b4 are reloaded for every step.  Each product goes to    */
  /* t1..t4 and is added to c01..c04 by the following group of FADDs     */
  /* (software pipelined), so the last four products are still pending   */
  /* in t1..t4 on exit.                                                  */
  1520. FADD c01, t1, c01
  1521. add L, -1, L
  1522. FMUL a1, b1, t1
  1523. LDF [BO + 4 * SIZE], b1
  1524. FADD c02, t2, c02
  1525. cmp L, 0 /* sets %icc for the bg at the bottom of the loop */
  1526. FMUL a1, b2, t2
  1527. LDF [BO + 5 * SIZE], b2
  1528. FADD c03, t3, c03
  1529. FMUL a1, b3, t3
  1530. LDF [BO + 6 * SIZE], b3
  1531. FADD c04, t4, c04
  1532. FMUL a1, b4, t4
  1533. LDF [BO + 7 * SIZE], b4
  1534. LDF [AO + 4 * SIZE], a1
  1535. FADD c01, t1, c01
  1536. add AO, 4 * SIZE, AO /* the AO loads below use the advanced pointer */
  1537. FMUL a2, b1, t1
  1538. LDF [BO + 8 * SIZE], b1
  1539. FADD c02, t2, c02
  1540. FMUL a2, b2, t2
  1541. LDF [BO + 9 * SIZE], b2
  1542. FADD c03, t3, c03
  1543. FMUL a2, b3, t3
  1544. LDF [BO + 10 * SIZE], b3
  1545. FADD c04, t4, c04
  1546. FMUL a2, b4, t4
  1547. LDF [BO + 11 * SIZE], b4
  1548. LDF [AO + 1 * SIZE], a2
  1549. FADD c01, t1, c01
  1550. FMUL a3, b1, t1
  1551. LDF [BO + 12 * SIZE], b1
  1552. FADD c02, t2, c02
  1553. FMUL a3, b2, t2
  1554. LDF [BO + 13 * SIZE], b2
  1555. FADD c03, t3, c03
  1556. FMUL a3, b3, t3
  1557. LDF [BO + 14 * SIZE], b3
  1558. FADD c04, t4, c04
  1559. FMUL a3, b4, t4
  1560. LDF [BO + 15 * SIZE], b4
  1561. LDF [AO + 2 * SIZE], a3
  1562. FADD c01, t1, c01
  1563. FMUL a4, b1, t1
  1564. LDF [BO + 16 * SIZE], b1
  1565. FADD c02, t2, c02
  1566. FMUL a4, b2, t2
  1567. LDF [BO + 17 * SIZE], b2
  1568. FADD c03, t3, c03
  1569. FMUL a4, b3, t3
  1570. LDF [BO + 18 * SIZE], b3
  1571. FADD c04, t4, c04
  1572. FMUL a4, b4, t4
  1573. LDF [BO + 19 * SIZE], b4
  1574. add BO, 16 * SIZE, BO
  1575. bg,pt %icc, .LL72
  1576. LDF [AO + 3 * SIZE], a4 /* delay slot: runs on taken and fall-through paths */
  1577. .LL75:
  /* 1x4 tile, leftover k-steps: L = (trip count) & 3, where the trip    */
  /* count is KK for LT/RN and TEMP1 (= K - KK from .LL70) otherwise.    */
  1578. #if defined(LT) || defined(RN)
  1579. and KK, 3, L
  1580. #else
  1581. and TEMP1, 3, L
  1582. #endif
  1583. cmp L, 0
  1584. ble,a,pn %icc, .LL79 /* no leftover steps: go straight to the solve */
  1585. nop /* delay slot (annul bit set on the branch) */
  1586. .LL76:
  /* One k-step per iteration: AO advances by 1 element, BO by 4.  Same  */
  /* pipelining as .LL72: a1*b1..b4 -> t1..t4 -> c01..c04.               */
  1587. FADD c01, t1, c01
  1588. add AO, 1 * SIZE, AO
  1589. FMUL a1, b1, t1
  1590. LDF [BO + 4 * SIZE], b1
  1591. FADD c02, t2, c02
  1592. add L, -1, L
  1593. FMUL a1, b2, t2
  1594. LDF [BO + 5 * SIZE], b2
  1595. FADD c03, t3, c03
  1596. cmp L, 0 /* sets %icc for the bg at the bottom of the loop */
  1597. FMUL a1, b3, t3
  1598. LDF [BO + 6 * SIZE], b3
  1599. FADD c04, t4, c04
  1600. add BO, 4 * SIZE, BO /* the b4 load below uses the advanced pointer */
  1601. FMUL a1, b4, t4
  1602. LDF [AO + 0 * SIZE], a1
  1603. bg,pt %icc, .LL76
  1604. LDF [BO + 3 * SIZE], b4 /* delay slot: runs on taken and fall-through paths */
  1605. .LL79:
  /* 1x4 tile epilogue: drain the pending products, form                 */
  /* (packed value - accumulated product), apply the solve selected by   */
  /* LN / LT / RN / RT, store to the packed buffer and to C, then step   */
  /* the pointers and KK.                                                */
  1606. FADD c01, t1, c01
  1607. FADD c02, t2, c02
  1608. FADD c03, t3, c03
  1609. FADD c04, t4, c04
  1610. #if defined(LN) || defined(RT)
  /* Re-derive AO and BO from AORIG and B.  The k offset is KK - 1 for   */
  /* LN and KK - 4 for RT, shifted by BASE_SHIFT for AO and by           */
  /* 2 + BASE_SHIFT for BO.                                              */
  1611. #ifdef LN
  1612. sub KK, 1, TEMP1
  1613. #else
  1614. sub KK, 4, TEMP1
  1615. #endif
  1616. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  1617. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  1618. add AORIG, TEMP2, AO
  1619. add B, TEMP1, BO
  1620. #endif
  /* cXX = loaded - cXX; the four values come from BO for LN/LT and      */
  /* from AO for RN/RT, in c01..c04 order in both cases.                 */
  1621. #if defined(LN) || defined(LT)
  1622. LDF [BO + 0 * SIZE], a1
  1623. LDF [BO + 1 * SIZE], a2
  1624. LDF [BO + 2 * SIZE], a3
  1625. LDF [BO + 3 * SIZE], a4
  1626. FSUB a1, c01, c01
  1627. FSUB a2, c02, c02
  1628. FSUB a3, c03, c03
  1629. FSUB a4, c04, c04
  1630. #else
  1631. LDF [AO + 0 * SIZE], a1
  1632. LDF [AO + 1 * SIZE], a2
  1633. LDF [AO + 2 * SIZE], a3
  1634. LDF [AO + 3 * SIZE], a4
  1635. FSUB a1, c01, c01
  1636. FSUB a2, c02, c02
  1637. FSUB a3, c03, c03
  1638. FSUB a4, c04, c04
  1639. #endif
  /* NOTE(review): the solves MULTIPLY by the diagonal entry, so the     */
  /* packed triangle presumably holds reciprocals of the diagonal -      */
  /* confirm against the packing routine.                                */
  1640. #ifdef LN
  /* LN and LT are identical here: one diagonal entry, AO[0], scales     */
  /* all four values.                                                    */
  1641. LDF [AO + 0 * SIZE], a1
  1642. FMUL a1, c01, c01
  1643. FMUL a1, c02, c02
  1644. FMUL a1, c03, c03
  1645. FMUL a1, c04, c04
  1646. #endif
  1647. #ifdef LT
  1648. LDF [AO + 0 * SIZE], a1
  1649. FMUL a1, c01, c01
  1650. FMUL a1, c02, c02
  1651. FMUL a1, c03, c03
  1652. FMUL a1, c04, c04
  1653. #endif
  1654. #ifdef RN
  /* RN: 4x4 triangle read from BO[0..3], BO[5..7], BO[10..11], BO[15];  */
  /* c01 is solved first and eliminated from c02..c04, and so on.        */
  1655. LDF [BO + 0 * SIZE], a1
  1656. LDF [BO + 1 * SIZE], a2
  1657. LDF [BO + 2 * SIZE], a3
  1658. LDF [BO + 3 * SIZE], a4
  1659. FMUL a1, c01, c01
  1660. FMUL a2, c01, t1
  1661. FSUB c02, t1, c02
  1662. FMUL a3, c01, t1
  1663. FSUB c03, t1, c03
  1664. FMUL a4, c01, t1
  1665. FSUB c04, t1, c04
  1666. LDF [BO + 5 * SIZE], a1
  1667. LDF [BO + 6 * SIZE], a2
  1668. LDF [BO + 7 * SIZE], a3
  1669. FMUL a1, c02, c02
  1670. FMUL a2, c02, t1
  1671. FSUB c03, t1, c03
  1672. FMUL a3, c02, t1
  1673. FSUB c04, t1, c04
  1674. LDF [BO + 10 * SIZE], a1
  1675. LDF [BO + 11 * SIZE], a2
  1676. FMUL a1, c03, c03
  1677. FMUL a2, c03, t1
  1678. FSUB c04, t1, c04
  1679. LDF [BO + 15 * SIZE], a1
  1680. FMUL a1, c04, c04
  1681. #endif
  1682. #ifdef RT
  /* RT: same 4x4 solve walked in the opposite direction, reading        */
  /* BO[15..12], BO[10..8], BO[5..4], BO[0]; c04 is solved first.        */
  1683. LDF [BO + 15 * SIZE], a1
  1684. LDF [BO + 14 * SIZE], a2
  1685. LDF [BO + 13 * SIZE], a3
  1686. LDF [BO + 12 * SIZE], a4
  1687. FMUL a1, c04, c04
  1688. FMUL a2, c04, t1
  1689. FSUB c03, t1, c03
  1690. FMUL a3, c04, t1
  1691. FSUB c02, t1, c02
  1692. FMUL a4, c04, t1
  1693. FSUB c01, t1, c01
  1694. LDF [BO + 10 * SIZE], a1
  1695. LDF [BO + 9 * SIZE], a2
  1696. LDF [BO + 8 * SIZE], a3
  1697. FMUL a1, c03, c03
  1698. FMUL a2, c03, t1
  1699. FSUB c02, t1, c02
  1700. FMUL a3, c03, t1
  1701. FSUB c01, t1, c01
  1702. LDF [BO + 5 * SIZE], a1
  1703. LDF [BO + 4 * SIZE], a2
  1704. FMUL a1, c02, c02
  1705. FMUL a2, c02, t1
  1706. FSUB c01, t1, c01
  1707. LDF [BO + 0 * SIZE], a1
  1708. FMUL a1, c01, c01
  1709. #endif
  1710. #ifdef LN
  /* LN moves the C pointers back by one element BEFORE storing.         */
  1711. add C1, -1 * SIZE, C1
  1712. add C2, -1 * SIZE, C2
  1713. add C3, -1 * SIZE, C3
  1714. add C4, -1 * SIZE, C4
  1715. #endif
  /* Write the solved values back to the packed buffer they came from    */
  /* and to C (one value through each of C1..C4).                        */
  1716. #if defined(LN) || defined(LT)
  1717. STF c01, [BO + 0 * SIZE]
  1718. STF c02, [BO + 1 * SIZE]
  1719. STF c03, [BO + 2 * SIZE]
  1720. STF c04, [BO + 3 * SIZE]
  1721. #else
  1722. STF c01, [AO + 0 * SIZE]
  1723. STF c02, [AO + 1 * SIZE]
  1724. STF c03, [AO + 2 * SIZE]
  1725. STF c04, [AO + 3 * SIZE]
  1726. #endif
  1727. STF c01, [C1 + 0 * SIZE]
  1728. STF c02, [C2 + 0 * SIZE]
  1729. STF c03, [C3 + 0 * SIZE]
  1730. STF c04, [C4 + 0 * SIZE]
  /* Clear the product temporaries for the next pipelined loop.          */
  1731. FMOV FZERO, t1
  1732. FMOV FZERO, t2
  1733. FMOV FZERO, t3
  1734. FMOV FZERO, t4
  1735. #ifndef LN
  1736. add C1, 1 * SIZE, C1
  1737. add C2, 1 * SIZE, C2
  1738. add C3, 1 * SIZE, C3
  1739. add C4, 1 * SIZE, C4
  1740. #endif
  1741. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT.                                       */
  1742. sll K, 0 + BASE_SHIFT, TEMP1
  1743. add AORIG, TEMP1, AORIG
  1744. #endif
  1745. #if defined(LT) || defined(RN)
  /* LT/RN: step AO and BO over the remaining K - KK k-steps.            */
  1746. sub K, KK, TEMP1
  1747. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  1748. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  1749. add AO, TEMP2, AO
  1750. add BO, TEMP1, BO
  1751. #endif
  1752. #ifdef LT
  1753. add KK, 1, KK
  1754. #endif
  1755. #ifdef LN
  1756. sub KK, 1, KK
  1757. #endif
  1758. .LL99:
  /* End of one pass over a 4-wide panel: update B and KK for the        */
  /* variant in use, then decrement J and loop back to .LL11 while       */
  /* J > 0.                                                              */
  1759. #ifdef LN
  /* LN: B += K << (2 + BASE_SHIFT).                                     */
  1760. sll K, 2 + BASE_SHIFT, TEMP1
  1761. add B, TEMP1, B
  1762. #endif
  1763. #if defined(LT) || defined(RN)
  /* LT/RN: B takes the position BO reached after the last tile.         */
  1764. mov BO, B
  1765. #endif
  1766. #ifdef RN
  1767. add KK, 4, KK
  1768. #endif
  1769. #ifdef RT
  1770. sub KK, 4, KK
  1771. #endif
  1772. add J, -1, J
  1773. cmp J, 0
  1774. bg,pt %icc, .LL11
  1775. nop
  1776. .LL100: /* n & 2 */
  /* 2-wide panel: taken only when bit 1 of N is set, otherwise skip to  */
  /* .LL200.  Sets up C1/C2, KK and the A pointer, then dispatches on    */
  /* the number of 4-high tiles (M >> 2).                                */
  1777. and N, 2, J
  1778. cmp J, 0
  1779. ble,pn %icc, .LL200
  1780. nop
  1781. #ifdef RT
  /* RT: B -= K << (1 + BASE_SHIFT) and C -= 2 * LDC before use.         */
  1782. sll K, 1 + BASE_SHIFT, TEMP1
  1783. sub B, TEMP1, B
  1784. sll LDC, 1, TEMP1
  1785. sub C, TEMP1, C
  1786. #endif
  1787. mov C, C1
  1788. add C, LDC, C2 /* C2 = C1 + LDC */
  1789. #ifdef LN
  1790. add M, OFFSET, KK
  1791. #endif
  1792. #ifdef LT
  1793. mov OFFSET, KK
  1794. #endif
  1795. #if defined(LN) || defined(RT)
  1796. mov A, AORIG
  1797. #else
  1798. mov A, AO
  1799. #endif
  1800. #ifndef RT
  /* All variants except RT advance C past this panel (C = C2 + LDC).    */
  1801. add C2, LDC, C
  1802. #endif
  1803. sra M, 2, I
  1804. cmp I, 0
  1805. ble,pn %icc, .LL150 /* no 4-high tiles: go to the M & 2 tile */
  1806. FMOV FZERO, c03 /* delay slot: runs on taken and fall-through paths */
  1807. .LL121:
  /* 4x2 tile setup: compute AO/BO and the trip count L, preload a1..a4  */
  /* and b1..b4, prefetch the two C locations, and zero the              */
  /* accumulators and t1..t4.  c03 is not zeroed here; both branches     */
  /* that reach this label carry "FMOV FZERO, c03" in their delay slot.  */
  1808. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK / 4; BO restarts at B, AO carries on.                 */
  1809. sra KK, 2, L
  1810. mov B, BO
  1811. cmp L, 0
  1812. #else
  1813. #ifdef LN
  /* LN: AORIG -= K << (2 + BASE_SHIFT).                                 */
  1814. sll K, 2 + BASE_SHIFT, TEMP1
  1815. sub AORIG, TEMP1, AORIG
  1816. #endif
  /* LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)),                       */
  /* BO = B + (KK << (1 + BASE_SHIFT)), L = (K - KK) / 4.                */
  /* TEMP1 keeps K - KK for the remainder count taken at .LL125.         */
  1817. sll KK, 2 + BASE_SHIFT, TEMP1
  1818. sll KK, 1 + BASE_SHIFT, TEMP2
  1819. add AORIG, TEMP1, AO
  1820. add B, TEMP2, BO
  1821. sub K, KK, TEMP1
  1822. sra TEMP1, 2, L
  1823. cmp L, 0
  1824. #endif
  /* No integer instruction follows the cmp above, so its condition      */
  /* codes are still the ones tested by the ble below.                   */
  1825. LDF [AO + 0 * SIZE], a1
  1826. FMOV FZERO, t1
  1827. LDF [BO + 0 * SIZE], b1
  1828. FMOV FZERO, c07
  1829. LDF [AO + 1 * SIZE], a2
  1830. FMOV FZERO, t2
  1831. LDF [BO + 1 * SIZE], b2
  1832. FMOV FZERO, c04
  1833. LDF [AO + 2 * SIZE], a3
  1834. FMOV FZERO, t3
  1835. LDF [BO + 2 * SIZE], b3
  1836. FMOV FZERO, c08
  1837. LDF [AO + 3 * SIZE], a4
  1838. FMOV FZERO, t4
  1839. LDF [BO + 3 * SIZE], b4
  1840. FMOV FZERO, c01
  1841. prefetch [C1 + 3 * SIZE], 2
  1842. FMOV FZERO, c05
  1843. prefetch [C2 + 3 * SIZE], 2
  1844. FMOV FZERO, c02
  1845. ble,pn %icc, .LL125 /* L <= 0: skip the unrolled loop */
  1846. FMOV FZERO, c06 /* delay slot: runs on taken and fall-through paths */
  1847. .LL122:
  /* 4x2 tile main loop, four k-steps per iteration: AO advances by 16   */
  /* elements and BO by 8.  Both pointers are bumped near the top, so    */
  /* most loads below use negative offsets.  Software pipelined:         */
  /* products go to t1..t4 and are accumulated by the following FADD     */
  /* group.  With bA/bB the two B values of the current step (b1/b2 or   */
  /* b3/b4):                                                             */
  /*   a1*bA -> c01   a1*bB -> c05                                       */
  /*   a2*bA -> c02   a2*bB -> c06                                       */
  /*   a3*bA -> c03   a3*bB -> c07                                       */
  /*   a4*bA -> c04   a4*bB -> c08                                       */
  /* The a3/a4 products of the last step are still pending on exit.      */
  /* NOTE(review): the interleaved nops look like issue-slot padding     */
  /* for instruction scheduling - confirm before removing any.           */
  1848. FADD c03, t1, c03
  1849. add L, -1, L
  1850. FMUL a1, b1, t1
  1851. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1852. FADD c07, t2, c07
  1853. add BO, 8 * SIZE, BO
  1854. FMUL a1, b2, t2
  1855. LDF [AO + 4 * SIZE], a1
  1856. FADD c04, t3, c04
  1857. add AO, 16 * SIZE, AO
  1858. FMUL a2, b1, t3
  1859. cmp L, 0 /* sets %icc for the bg at the bottom of the loop */
  1860. FADD c08, t4, c08
  1861. nop
  1862. FMUL a2, b2, t4
  1863. LDF [AO - 11 * SIZE], a2
  1864. FADD c01, t1, c01
  1865. nop
  1866. FMUL a3, b1, t1
  1867. nop
  1868. FADD c05, t2, c05
  1869. nop
  1870. FMUL a3, b2, t2
  1871. LDF [AO - 10 * SIZE], a3
  1872. FADD c02, t3, c02
  1873. nop
  1874. FMUL a4, b1, t3
  1875. LDF [BO - 4 * SIZE], b1
  1876. FADD c06, t4, c06
  1877. nop
  1878. FMUL a4, b2, t4
  1879. LDF [BO - 3 * SIZE], b2
  1880. FADD c03, t1, c03
  1881. nop
  1882. FMUL a1, b3, t1
  1883. LDF [AO - 9 * SIZE], a4
  1884. FADD c07, t2, c07
  1885. nop
  1886. FMUL a1, b4, t2
  1887. LDF [AO - 8 * SIZE], a1
  1888. FADD c04, t3, c04
  1889. nop
  1890. FMUL a2, b3, t3
  1891. nop
  1892. FADD c08, t4, c08
  1893. nop
  1894. FMUL a2, b4, t4
  1895. LDF [AO - 7 * SIZE], a2
  1896. FADD c01, t1, c01
  1897. nop
  1898. FMUL a3, b3, t1
  1899. nop
  1900. FADD c05, t2, c05
  1901. nop
  1902. FMUL a3, b4, t2
  1903. LDF [AO - 6 * SIZE], a3
  1904. FADD c02, t3, c02
  1905. nop
  1906. FMUL a4, b3, t3
  1907. LDF [BO - 2 * SIZE], b3
  1908. FADD c06, t4, c06
  1909. nop
  1910. FMUL a4, b4, t4
  1911. LDF [BO - 1 * SIZE], b4
  1912. FADD c03, t1, c03
  1913. nop
  1914. FMUL a1, b1, t1
  1915. LDF [AO - 5 * SIZE], a4
  1916. FADD c07, t2, c07
  1917. nop
  1918. FMUL a1, b2, t2
  1919. LDF [AO - 4 * SIZE], a1
  1920. FADD c04, t3, c04
  1921. nop
  1922. FMUL a2, b1, t3
  1923. nop
  1924. FADD c08, t4, c08
  1925. nop
  1926. FMUL a2, b2, t4
  1927. LDF [AO - 3 * SIZE], a2
  1928. FADD c01, t1, c01
  1929. nop
  1930. FMUL a3, b1, t1
  1931. nop
  1932. FADD c05, t2, c05
  1933. nop
  1934. FMUL a3, b2, t2
  1935. LDF [AO - 2 * SIZE], a3
  1936. FADD c02, t3, c02
  1937. nop
  1938. FMUL a4, b1, t3
  1939. LDF [BO + 0 * SIZE], b1
  1940. FADD c06, t4, c06
  1941. nop
  1942. FMUL a4, b2, t4
  1943. LDF [BO + 1 * SIZE], b2
  1944. FADD c03, t1, c03
  1945. nop
  1946. FMUL a1, b3, t1
  1947. LDF [AO - 1 * SIZE], a4
  1948. FADD c07, t2, c07
  1949. nop
  1950. FMUL a1, b4, t2
  1951. LDF [AO + 0 * SIZE], a1
  1952. FADD c04, t3, c04
  1953. nop
  1954. FMUL a2, b3, t3
  1955. nop
  1956. FADD c08, t4, c08
  1957. nop
  1958. FMUL a2, b4, t4
  1959. LDF [AO + 1 * SIZE], a2
  1960. FADD c01, t1, c01
  1961. nop
  1962. FMUL a3, b3, t1
  1963. nop
  1964. FADD c05, t2, c05
  1965. nop
  1966. FMUL a3, b4, t2
  1967. LDF [AO + 2 * SIZE], a3
  1968. FADD c02, t3, c02
  1969. nop
  1970. FMUL a4, b3, t3
  1971. LDF [BO + 2 * SIZE], b3
  1972. FADD c06, t4, c06
  1973. FMUL a4, b4, t4
  1974. LDF [AO + 3 * SIZE], a4
  1975. bg,pt %icc, .LL122
  1976. LDF [BO + 3 * SIZE], b4 /* delay slot: runs on taken and fall-through paths */
  1977. .LL125:
  /* 4x2 tile, leftover k-steps: L = (trip count) & 3, where the trip    */
  /* count is KK for LT/RN and TEMP1 (= K - KK from .LL121) otherwise.   */
  1978. #if defined(LT) || defined(RN)
  1979. and KK, 3, L
  1980. #else
  1981. and TEMP1, 3, L
  1982. #endif
  1983. cmp L, 0
  1984. ble,a,pn %icc, .LL129 /* no leftover steps: go straight to the solve */
  1985. nop /* delay slot (annul bit set on the branch) */
  1986. .LL126:
  /* One k-step per iteration: AO advances by 4 elements, BO by 2.       */
  /* Same accumulator mapping and pipelining as .LL122:                  */
  /*   a1..a4 * b1 -> c01..c04,  a1..a4 * b2 -> c05..c08.                */
  1987. FADD c03, t1, c03
  1988. add AO, 4 * SIZE, AO
  1989. FMUL a1, b1, t1
  1990. add BO, 2 * SIZE, BO
  1991. FADD c07, t2, c07
  1992. add L, -1, L
  1993. FMUL a1, b2, t2
  1994. LDF [AO + 0 * SIZE], a1
  1995. FADD c04, t3, c04
  1996. cmp L, 0 /* sets %icc for the bg at the bottom of the loop */
  1997. FMUL a2, b1, t3
  1998. FADD c08, t4, c08
  1999. FMUL a2, b2, t4
  2000. LDF [AO + 1 * SIZE], a2
  2001. FADD c01, t1, c01
  2002. FMUL a3, b1, t1
  2003. FADD c05, t2, c05
  2004. FMUL a3, b2, t2
  2005. LDF [AO + 2 * SIZE], a3
  2006. FADD c02, t3, c02
  2007. FMUL a4, b1, t3
  2008. LDF [BO + 0 * SIZE], b1
  2009. FADD c06, t4, c06
  2010. FMUL a4, b2, t4
  2011. LDF [BO + 1 * SIZE], b2
  2012. bg,pt %icc, .LL126
  2013. LDF [AO + 3 * SIZE], a4 /* delay slot: runs on taken and fall-through paths */
  2014. .LL129:
  /* 4x2 tile epilogue: drain the pending products, form                 */
  /* (packed value - accumulated product), apply the triangular solve    */
  /* selected by LN / LT / RN / RT, store to the packed buffer and to    */
  /* C, step the pointers and KK, then loop back to .LL121 while I > 0.  */
  2015. FADD c03, t1, c03
  2016. FADD c07, t2, c07
  2017. FADD c04, t3, c04
  2018. FADD c08, t4, c08
  2019. #if defined(LN) || defined(RT)
  /* Re-derive AO and BO from AORIG and B.  The k offset is KK - 4 for   */
  /* LN and KK - 2 for RT, shifted by 2 + BASE_SHIFT for AO and by       */
  /* 1 + BASE_SHIFT for BO.                                              */
  2020. #ifdef LN
  2021. sub KK, 4, TEMP1
  2022. #else
  2023. sub KK, 2, TEMP1
  2024. #endif
  2025. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  2026. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2027. add AORIG, TEMP2, AO
  2028. add B, TEMP1, BO
  2029. #endif
  2030. #if defined(LN) || defined(LT)
  /* LN/LT: values are read from BO interleaved as                       */
  /* c01,c05,c02,c06,c03,c07,c04,c08.  cXX = loaded - cXX.               */
  2031. LDF [BO + 0 * SIZE], a1
  2032. LDF [BO + 1 * SIZE], a2
  2033. LDF [BO + 2 * SIZE], a3
  2034. LDF [BO + 3 * SIZE], a4
  2035. LDF [BO + 4 * SIZE], b1
  2036. LDF [BO + 5 * SIZE], b2
  2037. LDF [BO + 6 * SIZE], b3
  2038. LDF [BO + 7 * SIZE], b4
  2039. FSUB a1, c01, c01
  2040. FSUB a2, c05, c05
  2041. FSUB a3, c02, c02
  2042. FSUB a4, c06, c06
  2043. FSUB b1, c03, c03
  2044. FSUB b2, c07, c07
  2045. FSUB b3, c04, c04
  2046. FSUB b4, c08, c08
  2047. #else
  /* RN/RT: values are read from AO in c01..c08 order.                   */
  2048. LDF [AO + 0 * SIZE], a1
  2049. LDF [AO + 1 * SIZE], a2
  2050. LDF [AO + 2 * SIZE], a3
  2051. LDF [AO + 3 * SIZE], a4
  2052. LDF [AO + 4 * SIZE], b1
  2053. LDF [AO + 5 * SIZE], b2
  2054. LDF [AO + 6 * SIZE], b3
  2055. LDF [AO + 7 * SIZE], b4
  2056. FSUB a1, c01, c01
  2057. FSUB a2, c02, c02
  2058. FSUB a3, c03, c03
  2059. FSUB a4, c04, c04
  2060. FSUB b1, c05, c05
  2061. FSUB b2, c06, c06
  2062. FSUB b3, c07, c07
  2063. FSUB b4, c08, c08
  2064. #endif
  /* NOTE(review): the solves MULTIPLY by the diagonal entry, so the     */
  /* packed triangle presumably holds reciprocals of the diagonal -      */
  /* confirm against the packing routine.                                */
  2065. #ifdef LN
  /* LN: 4x4 triangle read from AO[15..12], AO[10..8], AO[5..4], AO[0].  */
  /* Pairs are solved in the order (c04,c08), (c03,c07), (c02,c06),      */
  /* (c01,c05); each solved pair is eliminated from the remaining ones.  */
  2066. LDF [AO + 15 * SIZE], a1
  2067. LDF [AO + 14 * SIZE], a2
  2068. LDF [AO + 13 * SIZE], a3
  2069. LDF [AO + 12 * SIZE], a4
  2070. FMUL a1, c04, c04
  2071. FMUL a1, c08, c08
  2072. FMUL a2, c04, t1
  2073. FMUL a2, c08, t2
  2074. FSUB c03, t1, c03
  2075. FSUB c07, t2, c07
  2076. FMUL a3, c04, t1
  2077. FMUL a3, c08, t2
  2078. FSUB c02, t1, c02
  2079. FSUB c06, t2, c06
  2080. FMUL a4, c04, t1
  2081. FMUL a4, c08, t2
  2082. FSUB c01, t1, c01
  2083. FSUB c05, t2, c05
  2084. LDF [AO + 10 * SIZE], a1
  2085. LDF [AO + 9 * SIZE], a2
  2086. LDF [AO + 8 * SIZE], a3
  2087. FMUL a1, c03, c03
  2088. FMUL a1, c07, c07
  2089. FMUL a2, c03, t1
  2090. FMUL a2, c07, t2
  2091. FSUB c02, t1, c02
  2092. FSUB c06, t2, c06
  2093. FMUL a3, c03, t1
  2094. FMUL a3, c07, t2
  2095. FSUB c01, t1, c01
  2096. FSUB c05, t2, c05
  2097. LDF [AO + 5 * SIZE], a1
  2098. LDF [AO + 4 * SIZE], a2
  2099. FMUL a1, c02, c02
  2100. FMUL a1, c06, c06
  2101. FMUL a2, c02, t1
  2102. FMUL a2, c06, t2
  2103. FSUB c01, t1, c01
  2104. FSUB c05, t2, c05
  2105. LDF [AO + 0 * SIZE], a1
  2106. FMUL a1, c01, c01
  2107. FMUL a1, c05, c05
  2108. #endif
  2109. #ifdef LT
  /* LT: same 4x4 solve walked in the opposite direction, reading        */
  /* AO[0..3], AO[5..7], AO[10..11], AO[15]; pairs are solved in the     */
  /* order (c01,c05), (c02,c06), (c03,c07), (c04,c08).                   */
  2110. LDF [AO + 0 * SIZE], a1
  2111. LDF [AO + 1 * SIZE], a2
  2112. LDF [AO + 2 * SIZE], a3
  2113. LDF [AO + 3 * SIZE], a4
  2114. FMUL a1, c01, c01
  2115. FMUL a1, c05, c05
  2116. FMUL a2, c01, t1
  2117. FMUL a2, c05, t2
  2118. FSUB c02, t1, c02
  2119. FSUB c06, t2, c06
  2120. FMUL a3, c01, t1
  2121. FMUL a3, c05, t2
  2122. FSUB c03, t1, c03
  2123. FSUB c07, t2, c07
  2124. FMUL a4, c01, t1
  2125. FMUL a4, c05, t2
  2126. FSUB c04, t1, c04
  2127. FSUB c08, t2, c08
  2128. LDF [AO + 5 * SIZE], a1
  2129. LDF [AO + 6 * SIZE], a2
  2130. LDF [AO + 7 * SIZE], a3
  2131. FMUL a1, c02, c02
  2132. FMUL a1, c06, c06
  2133. FMUL a2, c02, t1
  2134. FMUL a2, c06, t2
  2135. FSUB c03, t1, c03
  2136. FSUB c07, t2, c07
  2137. FMUL a3, c02, t1
  2138. FMUL a3, c06, t2
  2139. FSUB c04, t1, c04
  2140. FSUB c08, t2, c08
  2141. LDF [AO + 10 * SIZE], a1
  2142. LDF [AO + 11 * SIZE], a2
  2143. FMUL a1, c03, c03
  2144. FMUL a1, c07, c07
  2145. FMUL a2, c03, t1
  2146. FMUL a2, c07, t2
  2147. FSUB c04, t1, c04
  2148. FSUB c08, t2, c08
  2149. LDF [AO + 15 * SIZE], a1
  2150. FMUL a1, c04, c04
  2151. FMUL a1, c08, c08
  2152. #endif
  2153. #ifdef RN
  /* RN: 2x2 triangle read from BO[0], BO[1], BO[3]; c01..c04 are        */
  /* scaled first, then eliminated from c05..c08.                        */
  2154. LDF [BO + 0 * SIZE], a1
  2155. LDF [BO + 1 * SIZE], a2
  2156. LDF [BO + 3 * SIZE], a3
  2157. FMUL a1, c01, c01
  2158. FMUL a1, c02, c02
  2159. FMUL a1, c03, c03
  2160. FMUL a1, c04, c04
  2161. FMUL a2, c01, t1
  2162. FMUL a2, c02, t2
  2163. FMUL a2, c03, t3
  2164. FMUL a2, c04, t4
  2165. FSUB c05, t1, c05
  2166. FSUB c06, t2, c06
  2167. FSUB c07, t3, c07
  2168. FSUB c08, t4, c08
  2169. FMUL a3, c05, c05
  2170. FMUL a3, c06, c06
  2171. FMUL a3, c07, c07
  2172. FMUL a3, c08, c08
  2173. #endif
  2174. #ifdef RT
  /* RT: 2x2 triangle read from BO[3], BO[2], BO[0]; c05..c08 are        */
  /* scaled first, then eliminated from c01..c04.                        */
  2175. LDF [BO + 3 * SIZE], a1
  2176. LDF [BO + 2 * SIZE], a2
  2177. LDF [BO + 0 * SIZE], a3
  2178. FMUL a1, c05, c05
  2179. FMUL a1, c06, c06
  2180. FMUL a1, c07, c07
  2181. FMUL a1, c08, c08
  2182. FMUL a2, c05, t1
  2183. FMUL a2, c06, t2
  2184. FMUL a2, c07, t3
  2185. FMUL a2, c08, t4
  2186. FSUB c01, t1, c01
  2187. FSUB c02, t2, c02
  2188. FSUB c03, t3, c03
  2189. FSUB c04, t4, c04
  2190. FMUL a3, c01, c01
  2191. FMUL a3, c02, c02
  2192. FMUL a3, c03, c03
  2193. FMUL a3, c04, c04
  2194. #endif
  2195. #ifdef LN
  /* LN moves the C pointers back by the tile height BEFORE storing.     */
  2196. add C1, -4 * SIZE, C1
  2197. add C2, -4 * SIZE, C2
  2198. #endif
  /* Write the solved values back to the packed buffer they came from    */
  /* (same element order as the loads above) ...                         */
  2199. #if defined(LN) || defined(LT)
  2200. STF c01, [BO + 0 * SIZE]
  2201. STF c05, [BO + 1 * SIZE]
  2202. STF c02, [BO + 2 * SIZE]
  2203. STF c06, [BO + 3 * SIZE]
  2204. STF c03, [BO + 4 * SIZE]
  2205. STF c07, [BO + 5 * SIZE]
  2206. STF c04, [BO + 6 * SIZE]
  2207. STF c08, [BO + 7 * SIZE]
  2208. #else
  2209. STF c01, [AO + 0 * SIZE]
  2210. STF c02, [AO + 1 * SIZE]
  2211. STF c03, [AO + 2 * SIZE]
  2212. STF c04, [AO + 3 * SIZE]
  2213. STF c05, [AO + 4 * SIZE]
  2214. STF c06, [AO + 5 * SIZE]
  2215. STF c07, [AO + 6 * SIZE]
  2216. STF c08, [AO + 7 * SIZE]
  2217. #endif
  /* ... and to C: c01..c04 through C1, c05..c08 through C2.             */
  2218. STF c01, [C1 + 0 * SIZE]
  2219. STF c02, [C1 + 1 * SIZE]
  2220. STF c03, [C1 + 2 * SIZE]
  2221. STF c04, [C1 + 3 * SIZE]
  2222. STF c05, [C2 + 0 * SIZE]
  2223. STF c06, [C2 + 1 * SIZE]
  2224. STF c07, [C2 + 2 * SIZE]
  2225. STF c08, [C2 + 3 * SIZE]
  /* Clear the product temporaries for the next pipelined loop.          */
  2226. FMOV FZERO, t1
  2227. FMOV FZERO, t2
  2228. FMOV FZERO, t3
  2229. FMOV FZERO, t4
  2230. #ifndef LN
  2231. add C1, 4 * SIZE, C1
  2232. add C2, 4 * SIZE, C2
  2233. #endif
  2234. #ifdef RT
  /* RT: AORIG += K << (2 + BASE_SHIFT).                                 */
  2235. sll K, 2 + BASE_SHIFT, TEMP1
  2236. add AORIG, TEMP1, AORIG
  2237. #endif
  2238. #if defined(LT) || defined(RN)
  /* LT/RN: step AO and BO over the remaining K - KK k-steps.            */
  2239. sub K, KK, TEMP1
  2240. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  2241. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2242. add AO, TEMP2, AO
  2243. add BO, TEMP1, BO
  2244. #endif
  2245. #ifdef LT
  2246. add KK, 4, KK
  2247. #endif
  2248. #ifdef LN
  2249. sub KK, 4, KK
  2250. #endif
  2251. add I, -1, I
  2252. cmp I, 0
  2253. bg,pt %icc, .LL121 /* next 4-high tile of this panel */
  2254. FMOV FZERO, c03 /* delay slot: zeroes c03 for .LL121, which does not do so itself */
  2255. .LL150:
  /* 2x2 tile: taken only when bit 1 of M is set, otherwise skip to      */
  /* .LL170.  Sets up AO/BO and the trip count, preloads a1..a4 and      */
  /* b1..b4, and zeroes c01..c04 and t1..t4.                             */
  2256. and M, 2, I
  2257. cmp I, 0
  2258. ble,pn %icc, .LL170
  2259. nop
  2260. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK / 4; BO restarts at B, AO carries on.                 */
  2261. sra KK, 2, L
  2262. mov B, BO
  2263. cmp L, 0
  2264. #else
  2265. #ifdef LN
  /* LN: AORIG -= K << (1 + BASE_SHIFT).                                 */
  2266. sll K, 1 + BASE_SHIFT, TEMP1
  2267. sub AORIG, TEMP1, AORIG
  2268. #endif
  /* LN/RT: AO and BO both get the offset KK << (1 + BASE_SHIFT);        */
  /* L = (K - KK) / 4, and TEMP1 keeps K - KK for .LL155.                */
  2269. sll KK, 1 + BASE_SHIFT, TEMP1
  2270. sll KK, 1 + BASE_SHIFT, TEMP2
  2271. add AORIG, TEMP1, AO
  2272. add B, TEMP2, BO
  2273. sub K, KK, TEMP1
  2274. sra TEMP1, 2, L
  2275. cmp L, 0
  2276. #endif
  2277. LDF [AO + 0 * SIZE], a1
  2278. FMOV FZERO, c01
  2279. LDF [BO + 0 * SIZE], b1
  2280. FMOV FZERO, t1
  2281. LDF [AO + 1 * SIZE], a2
  2282. cmp L, 0 /* repeats the compare above; L is unchanged in between */
  2283. FMOV FZERO, c02
  2284. LDF [BO + 1 * SIZE], b2
  2285. FMOV FZERO, t2
  2286. LDF [AO + 2 * SIZE], a3
  2287. FMOV FZERO, c03
  2288. LDF [BO + 2 * SIZE], b3
  2289. FMOV FZERO, t3
  2290. LDF [AO + 3 * SIZE], a4
  2291. FMOV FZERO, c04
  2292. LDF [BO + 3 * SIZE], b4
  2293. FMOV FZERO, t4
  2294. ble,pn %icc, .LL155 /* L <= 0: skip the unrolled loop */
  2295. nop
  2296. .LL152:
  /* 2x2 tile main loop, four k-steps per iteration: AO and BO each      */
  /* advance by 8 elements.  a1/a2 with b1/b2 carry one k-step and       */
  /* a3/a4 with b3/b4 carry the next.  Software pipelined through        */
  /* t1..t4:                                                             */
  /*   a1*b1, a3*b3 -> c01     a1*b2, a3*b4 -> c02                       */
  /*   a2*b1, a4*b3 -> c03     a2*b2, a4*b4 -> c04                       */
  /* The last four products are still pending in t1..t4 on exit.         */
  2297. FADD c01, t1, c01
  2298. add L, -1, L
  2299. FMUL a1, b1, t1
  2300. prefetch [AO + APREFETCHSIZE * SIZE], 0
  2301. FADD c02, t2, c02
  2302. add BO, 8 * SIZE, BO /* BO loads below use the advanced pointer */
  2303. FMUL a1, b2, t2
  2304. LDF [AO + 4 * SIZE], a1
  2305. FADD c03, t3, c03
  2306. cmp L, 0 /* sets %icc for the bg at the bottom of the loop */
  2307. FMUL a2, b1, t3
  2308. LDF [BO - 4 * SIZE], b1
  2309. FADD c04, t4, c04
  2310. nop
  2311. FMUL a2, b2, t4
  2312. LDF [AO + 5 * SIZE], a2
  2313. FADD c01, t1, c01
  2314. nop
  2315. FMUL a3, b3, t1
  2316. LDF [BO - 3 * SIZE], b2
  2317. FADD c02, t2, c02
  2318. nop
  2319. FMUL a3, b4, t2
  2320. LDF [AO + 6 * SIZE], a3
  2321. FADD c03, t3, c03
  2322. nop
  2323. FMUL a4, b3, t3
  2324. LDF [BO - 2 * SIZE], b3
  2325. FADD c04, t4, c04
  2326. nop
  2327. FMUL a4, b4, t4
  2328. LDF [AO + 7 * SIZE], a4
  2329. FADD c01, t1, c01
  2330. nop
  2331. FMUL a1, b1, t1
  2332. LDF [BO - 1 * SIZE], b4
  2333. FADD c02, t2, c02
  2334. FMUL a1, b2, t2
  2335. LDF [AO + 8 * SIZE], a1
  2336. FADD c03, t3, c03
  2337. FMUL a2, b1, t3
  2338. LDF [BO + 0 * SIZE], b1
  2339. FADD c04, t4, c04
  2340. FMUL a2, b2, t4
  2341. LDF [AO + 9 * SIZE], a2
  2342. FADD c01, t1, c01
  2343. FMUL a3, b3, t1
  2344. LDF [BO + 1 * SIZE], b2
  2345. FADD c02, t2, c02
  2346. FMUL a3, b4, t2
  2347. LDF [AO + 10 * SIZE], a3
  2348. FADD c03, t3, c03
  2349. FMUL a4, b3, t3
  2350. LDF [BO + 2 * SIZE], b3
  2351. FADD c04, t4, c04
  2352. FMUL a4, b4, t4
  2353. LDF [AO + 11 * SIZE], a4
  2354. add AO, 8 * SIZE, AO
  2355. bg,pt %icc, .LL152
  2356. LDF [BO + 3 * SIZE], b4 /* delay slot: runs on taken and fall-through paths */
  2357. .LL155:
  /* 2x2 tile, leftover k-steps: L = (trip count) & 3, where the trip    */
  /* count is KK for LT/RN and TEMP1 (= K - KK from .LL150) otherwise.   */
  2358. #if defined(LT) || defined(RN)
  2359. and KK, 3, L
  2360. #else
  2361. and TEMP1, 3, L
  2362. #endif
  2363. cmp L, 0
  2364. ble,a,pn %icc, .LL159 /* no leftover steps: go straight to the solve */
  2365. nop /* delay slot (annul bit set on the branch) */
  2366. .LL156:
  /* One k-step per iteration: AO and BO each advance by 2 elements.     */
  /* Unlike the other remainder loops, this one reloads a1/a2/b1/b2 at   */
  /* the top of every iteration instead of preloading them for the       */
  /* next.  The products are still accumulated one step late through     */
  /* t1..t4.                                                             */
  2367. LDF [AO + 0 * SIZE], a1
  2368. LDF [AO + 1 * SIZE], a2
  2369. LDF [BO + 0 * SIZE], b1
  2370. LDF [BO + 1 * SIZE], b2
  2371. FADD c01, t1, c01
  2372. FADD c02, t2, c02
  2373. FADD c03, t3, c03
  2374. FADD c04, t4, c04
  2375. FMUL a1, b1, t1
  2376. FMUL a1, b2, t2
  2377. FMUL a2, b1, t3
  2378. FMUL a2, b2, t4
  2379. add AO, 2 * SIZE, AO
  2380. add BO, 2 * SIZE, BO
  2381. add L, -1, L
  2382. cmp L, 0
  2383. bg,pt %icc, .LL156
  2384. nop
  2385. .LL159:
  /* 2x2 tile epilogue: drain the pending products, form                 */
  /* (packed value - accumulated product), apply the 2x2 triangular      */
  /* solve selected by LN / LT / RN / RT, store to the packed buffer     */
  /* and to C, then step the pointers and KK.  Here c01/c02 come from    */
  /* a1 and c03/c04 from a2; c01/c03 use b1 and c02/c04 use b2.          */
  2386. FADD c01, t1, c01
  2387. FADD c02, t2, c02
  2388. FADD c03, t3, c03
  2389. FADD c04, t4, c04
  2390. #if defined(LN) || defined(RT)
  /* Re-derive AO and BO from AORIG and B.  Both arms of the #ifdef      */
  /* subtract 2, and both pointers use the shift 1 + BASE_SHIFT.         */
  2391. #ifdef LN
  2392. sub KK, 2, TEMP1
  2393. #else
  2394. sub KK, 2, TEMP1
  2395. #endif
  2396. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  2397. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2398. add AORIG, TEMP2, AO
  2399. add B, TEMP1, BO
  2400. #endif
  2401. #if defined(LN) || defined(LT)
  /* LN/LT: values are read from BO in c01,c02,c03,c04 order.            */
  2402. LDF [BO + 0 * SIZE], a1
  2403. LDF [BO + 1 * SIZE], a2
  2404. LDF [BO + 2 * SIZE], a3
  2405. LDF [BO + 3 * SIZE], a4
  2406. FSUB a1, c01, c01
  2407. FSUB a2, c02, c02
  2408. FSUB a3, c03, c03
  2409. FSUB a4, c04, c04
  2410. #else
  /* RN/RT: values are read from AO in c01,c03,c02,c04 order.            */
  2411. LDF [AO + 0 * SIZE], a1
  2412. LDF [AO + 1 * SIZE], a2
  2413. LDF [AO + 2 * SIZE], a3
  2414. LDF [AO + 3 * SIZE], a4
  2415. FSUB a1, c01, c01
  2416. FSUB a2, c03, c03
  2417. FSUB a3, c02, c02
  2418. FSUB a4, c04, c04
  2419. #endif
  /* NOTE(review): the solves MULTIPLY by the diagonal entry, so the     */
  /* packed triangle presumably holds reciprocals of the diagonal -      */
  /* confirm against the packing routine.                                */
  2420. #ifdef LN
  /* LN: triangle from AO[3], AO[2], AO[0]; (c03,c04) solved first.      */
  2421. LDF [AO + 3 * SIZE], a1
  2422. LDF [AO + 2 * SIZE], a2
  2423. LDF [AO + 0 * SIZE], a3
  2424. FMUL a1, c03, c03
  2425. FMUL a1, c04, c04
  2426. FMUL a2, c03, t1
  2427. FMUL a2, c04, t2
  2428. FSUB c01, t1, c01
  2429. FSUB c02, t2, c02
  2430. FMUL a3, c01, c01
  2431. FMUL a3, c02, c02
  2432. #endif
  2433. #ifdef LT
  /* LT: triangle from AO[0], AO[1], AO[3]; (c01,c02) solved first.      */
  2434. LDF [AO + 0 * SIZE], a1
  2435. LDF [AO + 1 * SIZE], a2
  2436. LDF [AO + 3 * SIZE], a3
  2437. FMUL a1, c01, c01
  2438. FMUL a1, c02, c02
  2439. FMUL a2, c01, t1
  2440. FMUL a2, c02, t2
  2441. FSUB c03, t1, c03
  2442. FSUB c04, t2, c04
  2443. FMUL a3, c03, c03
  2444. FMUL a3, c04, c04
  2445. #endif
  2446. #ifdef RN
  /* RN: triangle from BO[0], BO[1], BO[3]; (c01,c03) solved first.      */
  2447. LDF [BO + 0 * SIZE], a1
  2448. LDF [BO + 1 * SIZE], a2
  2449. LDF [BO + 3 * SIZE], a3
  2450. FMUL a1, c01, c01
  2451. FMUL a1, c03, c03
  2452. FMUL a2, c01, t1
  2453. FMUL a2, c03, t2
  2454. FSUB c02, t1, c02
  2455. FSUB c04, t2, c04
  2456. FMUL a3, c02, c02
  2457. FMUL a3, c04, c04
  2458. #endif
  2459. #ifdef RT
  /* RT: triangle from BO[3], BO[2], BO[0]; (c02,c04) solved first.      */
  2460. LDF [BO + 3 * SIZE], a1
  2461. LDF [BO + 2 * SIZE], a2
  2462. LDF [BO + 0 * SIZE], a3
  2463. FMUL a1, c02, c02
  2464. FMUL a1, c04, c04
  2465. FMUL a2, c02, t1
  2466. FMUL a2, c04, t2
  2467. FSUB c01, t1, c01
  2468. FSUB c03, t2, c03
  2469. FMUL a3, c01, c01
  2470. FMUL a3, c03, c03
  2471. #endif
  2472. #ifdef LN
  /* LN moves the C pointers back by the tile height BEFORE storing.     */
  2473. add C1, -2 * SIZE, C1
  2474. add C2, -2 * SIZE, C2
  2475. #endif
  /* Write the solved values back to the packed buffer they came from    */
  /* (same element order as the loads above) ...                         */
  2476. #if defined(LN) || defined(LT)
  2477. STF c01, [BO + 0 * SIZE]
  2478. STF c02, [BO + 1 * SIZE]
  2479. STF c03, [BO + 2 * SIZE]
  2480. STF c04, [BO + 3 * SIZE]
  2481. #else
  2482. STF c01, [AO + 0 * SIZE]
  2483. STF c03, [AO + 1 * SIZE]
  2484. STF c02, [AO + 2 * SIZE]
  2485. STF c04, [AO + 3 * SIZE]
  2486. #endif
  /* ... and to C: c01,c03 through C1 and c02,c04 through C2.            */
  2487. STF c01, [C1 + 0 * SIZE]
  2488. STF c03, [C1 + 1 * SIZE]
  2489. STF c02, [C2 + 0 * SIZE]
  2490. STF c04, [C2 + 1 * SIZE]
  /* Clear the product temporaries for the next pipelined loop.          */
  2491. FMOV FZERO, t1
  2492. FMOV FZERO, t2
  2493. FMOV FZERO, t3
  2494. FMOV FZERO, t4
  2495. #ifndef LN
  2496. add C1, 2 * SIZE, C1
  2497. add C2, 2 * SIZE, C2
  2498. #endif
  2499. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT).                                 */
  2500. sll K, 1 + BASE_SHIFT, TEMP1
  2501. add AORIG, TEMP1, AORIG
  2502. #endif
  2503. #if defined(LT) || defined(RN)
  /* LT/RN: step AO and BO over the remaining K - KK k-steps.            */
  2504. sub K, KK, TEMP1
  2505. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  2506. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2507. add AO, TEMP2, AO
  2508. add BO, TEMP1, BO
  2509. #endif
  2510. #ifdef LT
  2511. add KK, 2, KK
  2512. #endif
  2513. #ifdef LN
  2514. sub KK, 2, KK
  2515. #endif
  2516. .LL170: /* M & 1 remainder of the pass that writes through C1 and C2: set AO/BO and the loop count, then preload operands. */
  2517. and M, 1, I /* I = M & 1 */
  2518. cmp I, 0
  2519. ble,pn %icc, .LL199 /* no odd element: go straight to the end-of-pass bookkeeping */
  2520. nop /* branch delay slot */
  2521. #if defined(LT) || defined(RN)
  2522. sra KK, 2, L /* L = KK >> 2: trip count of the 4x-unrolled loop .LL172 */
  2523. mov B, BO /* BO restarts at B; AO is left as it was on entry */
  2524. cmp L, 0 /* %icc is consumed by the ble after the preloads */
  2525. #else
  2526. #ifdef LN
  2527. sll K, 0 + BASE_SHIFT, TEMP1
  2528. sub AORIG, TEMP1, AORIG /* AORIG -= K << BASE_SHIFT */
  2529. #endif
  2530. sll KK, 0 + BASE_SHIFT, TEMP1
  2531. sll KK, 1 + BASE_SHIFT, TEMP2
  2532. add AORIG, TEMP1, AO /* AO = AORIG + (KK << BASE_SHIFT) */
  2533. add B, TEMP2, BO /* BO = B + (KK << (1 + BASE_SHIFT)) */
  2534. sub K, KK, TEMP1 /* TEMP1 = K - KK; read again at .LL175 for the remainder count */
  2535. sra TEMP1, 2, L /* L = (K - KK) >> 2 */
  2536. cmp L, 0
  2537. #endif
  2538. LDF [AO + 0 * SIZE], a1 /* preload four values from AO and four from BO, interleaved with clearing */
  2539. FMOV FZERO, c01 /* c01..c04 are accumulators, t1..t4 hold pending products; all start at FZERO */
  2540. LDF [BO + 0 * SIZE], b1
  2541. FMOV FZERO, t1
  2542. LDF [AO + 1 * SIZE], a2
  2543. FMOV FZERO, c02
  2544. LDF [BO + 1 * SIZE], b2
  2545. FMOV FZERO, t2
  2546. LDF [AO + 2 * SIZE], a3
  2547. FMOV FZERO, c03
  2548. LDF [BO + 2 * SIZE], b3
  2549. FMOV FZERO, t3
  2550. LDF [AO + 3 * SIZE], a4
  2551. FMOV FZERO, c04
  2552. LDF [BO + 3 * SIZE], b4
  2553. FMOV FZERO, t4
  2554. ble,pn %icc, .LL175 /* L <= 0: skip the unrolled loop (flags from the cmp L, 0 above) */
  2555. nop /* branch delay slot */
  2556. .LL172: /* 1x2 accumulation loop, unrolled four times: each pass consumes 4 values from AO and 8 from BO. */
  2557. FADD c01, t1, c01 /* add the product left pending by the previous step */
  2558. add AO, 4 * SIZE, AO /* AO moves first, so the a-loads below use offsets 0..3 for the next pass */
  2559. FMUL a1, b1, t1
  2560. LDF [BO + 4 * SIZE], b1
  2561. FADD c02, t2, c02
  2562. FMUL a1, b2, t2
  2563. LDF [BO + 5 * SIZE], b2
  2564. add L, -1, L
  2565. LDF [AO + 0 * SIZE], a1
  2566. FADD c03, t3, c03 /* a2 and a4 terms accumulate in c03/c04; .LL179 merges them into c01/c02 */
  2567. cmp L, 0 /* sets %icc for the bg at the bottom of the loop */
  2568. FMUL a2, b3, t3
  2569. LDF [BO + 6 * SIZE], b3
  2570. FADD c04, t4, c04
  2571. FMUL a2, b4, t4
  2572. LDF [BO + 7 * SIZE], b4
  2573. LDF [AO + 1 * SIZE], a2
  2574. FADD c01, t1, c01
  2575. FMUL a3, b1, t1 /* b1/b2 here are the values just loaded from BO + 4 and BO + 5 */
  2576. LDF [BO + 8 * SIZE], b1
  2577. FADD c02, t2, c02
  2578. FMUL a3, b2, t2
  2579. LDF [BO + 9 * SIZE], b2
  2580. LDF [AO + 2 * SIZE], a3
  2581. FADD c03, t3, c03
  2582. FMUL a4, b3, t3
  2583. LDF [BO + 10 * SIZE], b3
  2584. FADD c04, t4, c04
  2585. FMUL a4, b4, t4
  2586. LDF [BO + 11 * SIZE], b4
  2587. add BO, 8 * SIZE, BO /* the values loaded from offsets 8..11 now correspond to BO + 0..3 */
  2588. bg,pt %icc, .LL172 /* repeat while L > 0 */
  2589. LDF [AO + 3 * SIZE], a4 /* branch delay slot: executed whether or not the branch is taken */
  2590. .LL175: /* Compute the count for the remainder loop .LL176: the low two bits of the total iteration count. */
  2591. #if defined(LT) || defined(RN)
  2592. and KK, 3, L /* L = KK & 3 */
  2593. #else
  2594. and TEMP1, 3, L /* TEMP1 still holds K - KK from the setup at .LL170 */
  2595. #endif
  2596. cmp L, 0
  2597. ble,a,pn %icc, .LL179 /* nothing left: skip the remainder loop */
  2598. nop /* delay slot; the annul bit only matters when the branch is not taken, and this is a nop either way */
  2599. .LL176: /* 1x2 remainder loop: one value from AO and two from BO per pass. */
  2600. FADD c01, t1, c01 /* add the product left pending by the previous step */
  2601. add L, -1, L
  2602. FMUL a1, b1, t1
  2603. add AO, 1 * SIZE, AO
  2604. LDF [BO + 2 * SIZE], b1 /* next pair of b values, read before BO is advanced */
  2605. FADD c02, t2, c02
  2606. cmp L, 0 /* sets %icc for the bg below */
  2607. FMUL a1, b2, t2
  2608. LDF [BO + 3 * SIZE], b2
  2609. add BO, 2 * SIZE, BO
  2610. bg,pt %icc, .LL176 /* repeat while L > 0 */
  2611. LDF [AO + 0 * SIZE], a1 /* delay slot: AO was already advanced above, so offset 0 is the next value */
  2612. .LL179: /* Finish the 1x2 accumulation, apply the solve step for the selected variant, store the results, advance pointers. */
  2613. FADD c01, t1, c01 /* drain the last pending products */
  2614. FADD c02, t2, c02
  2615. FADD c03, t3, c03
  2616. FADD c04, t4, c04
  2617. FADD c01, c03, c01 /* merge the two partial sums kept per output value */
  2618. FADD c02, c04, c02
  2619. #if defined(LN) || defined(RT)
  2620. #ifdef LN
  2621. sub KK, 1, TEMP1
  2622. #else
  2623. sub KK, 2, TEMP1
  2624. #endif
  2625. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  2626. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2627. add AORIG, TEMP2, AO /* AO = AORIG + (TEMP1 << BASE_SHIFT), with TEMP1 = KK - 1 (LN) or KK - 2 (RT) */
  2628. add B, TEMP1, BO /* BO = B + twice that byte offset */
  2629. #endif
  2630. #if defined(LN) || defined(LT)
  2631. LDF [BO + 0 * SIZE], a1 /* LN/LT: the values to be solved are read from the BO buffer */
  2632. LDF [BO + 1 * SIZE], a2
  2633. FSUB a1, c01, c01 /* c01 = loaded value - accumulated sum */
  2634. FSUB a2, c02, c02
  2635. #else
  2636. LDF [AO + 0 * SIZE], a1 /* RN/RT: the values to be solved are read from the AO buffer */
  2637. LDF [AO + 1 * SIZE], a2
  2638. FSUB a1, c01, c01
  2639. FSUB a2, c02, c02
  2640. #endif
  2641. #ifdef LN
  2642. LDF [AO + 0 * SIZE], a1
  2643. FMUL a1, c01, c01 /* scaled by multiplication; presumably AO[0] holds a pre-inverted diagonal entry - TODO confirm against the packing code */
  2644. FMUL a1, c02, c02
  2645. #endif
  2646. #ifdef LT
  2647. LDF [AO + 0 * SIZE], a1
  2648. FMUL a1, c01, c01
  2649. FMUL a1, c02, c02
  2650. #endif
  2651. #ifdef RN
  2652. LDF [BO + 0 * SIZE], a1 /* 2x2 triangle read from BO[0], BO[1], BO[3] */
  2653. LDF [BO + 1 * SIZE], a2
  2654. LDF [BO + 3 * SIZE], a3
  2655. FMUL a1, c01, c01 /* c01 *= BO[0] */
  2656. FMUL a2, c01, t1
  2657. FSUB c02, t1, c02 /* c02 -= BO[1] * c01 */
  2658. FMUL a3, c02, c02 /* c02 *= BO[3] */
  2659. #endif
  2660. #ifdef RT
  2661. LDF [BO + 3 * SIZE], a1 /* same triangle walked in the opposite order: BO[3], BO[2], BO[0] */
  2662. LDF [BO + 2 * SIZE], a2
  2663. LDF [BO + 0 * SIZE], a3
  2664. FMUL a1, c02, c02 /* c02 *= BO[3] */
  2665. FMUL a2, c02, t1
  2666. FSUB c01, t1, c01 /* c01 -= BO[2] * c02 */
  2667. FMUL a3, c01, c01 /* c01 *= BO[0] */
  2668. #endif
  2669. #ifdef LN
  2670. add C1, -1 * SIZE, C1 /* LN steps the output pointers back before storing */
  2671. add C2, -1 * SIZE, C2
  2672. #endif
  2673. #if defined(LN) || defined(LT)
  2674. STF c01, [BO + 0 * SIZE] /* write the solved values back into the buffer they were read from */
  2675. STF c02, [BO + 1 * SIZE]
  2676. #else
  2677. STF c01, [AO + 0 * SIZE]
  2678. STF c02, [AO + 1 * SIZE]
  2679. #endif
  2680. STF c01, [C1 + 0 * SIZE] /* and into the output through C1 and C2 */
  2681. STF c02, [C2 + 0 * SIZE]
  2682. FMOV FZERO, t1 /* reset the product temporaries */
  2683. FMOV FZERO, t2
  2684. FMOV FZERO, t3
  2685. FMOV FZERO, t4
  2686. #ifndef LN
  2687. add C1, 1 * SIZE, C1 /* all variants except LN step the output pointers forward after storing */
  2688. add C2, 1 * SIZE, C2
  2689. #endif
  2690. #ifdef RT
  2691. sll K, 0 + BASE_SHIFT, TEMP1
  2692. add AORIG, TEMP1, AORIG /* AORIG += K << BASE_SHIFT */
  2693. #endif
  2694. #if defined(LT) || defined(RN)
  2695. sub K, KK, TEMP1
  2696. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  2697. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2698. add AO, TEMP2, AO /* AO += (K - KK) << BASE_SHIFT */
  2699. add BO, TEMP1, BO /* BO += (K - KK) << (1 + BASE_SHIFT) */
  2700. #endif
  2701. #ifdef LT
  2702. add KK, 1, KK
  2703. #endif
  2704. #ifdef LN
  2705. sub KK, 1, KK
  2706. #endif
  2707. .LL199: /* End of the pass that writes through C1 and C2: update B and KK for whatever follows. */
  2708. #ifdef LN
  2709. sll K, 1 + BASE_SHIFT, TEMP1
  2710. add B, TEMP1, B /* B += K << (1 + BASE_SHIFT) */
  2711. #endif
  2712. #if defined(LT) || defined(RN)
  2713. mov BO, B /* B continues from wherever BO ended up */
  2714. #endif
  2715. #ifdef RN
  2716. add KK, 2, KK
  2717. #endif
  2718. #ifdef RT
  2719. sub KK, 2, KK
  2720. #endif
  2721. .LL200: /* Pass taken only when N is odd; it writes through C1 alone. Sets up B, C, C1, KK and the A pointer. */
  2722. and N, 1, J /* J = N & 1 */
  2723. cmp J, 0
  2724. ble,pn %icc, .LL999 /* N even: nothing left, go to the exit */
  2725. nop /* branch delay slot */
  2726. #ifdef RT
  2727. sll K, 0 + BASE_SHIFT, TEMP1
  2728. sub B, TEMP1, B /* B -= K << BASE_SHIFT */
  2729. sub C, LDC, C /* RT moves C back by LDC before using it */
  2730. #endif
  2731. mov C, C1 /* C1 is the base address for all stores in this pass */
  2732. #ifdef LN
  2733. add M, OFFSET, KK /* KK = M + OFFSET */
  2734. #endif
  2735. #ifdef LT
  2736. mov OFFSET, KK /* KK = OFFSET */
  2737. #endif
  2738. #if defined(LN) || defined(RT)
  2739. mov A, AORIG /* LN/RT derive AO from AORIG for each block */
  2740. #else
  2741. mov A, AO /* LT/RN walk AO forward from A */
  2742. #endif
  2743. #ifndef RT
  2744. add C, LDC, C /* all variants except RT move C forward by LDC */
  2745. #endif
  2746. sra M, 2, I /* I = M >> 2: number of 4-wide blocks processed by the .LL221 loop */
  2747. cmp I, 0
  2748. ble,pn %icc, .LL250 /* no full 4-wide block: go to the M & 2 remainder */
  2749. nop /* branch delay slot */
  2750. .LL221: /* Top of the per-block loop for 4-wide blocks (counter I): set AO/BO and the loop count, then preload operands. */
  2751. #if defined(LT) || defined(RN)
  2752. sra KK, 2, L /* L = KK >> 2: trip count of the 4x-unrolled loop .LL222 */
  2753. mov B, BO
  2754. cmp L, 0 /* %icc is consumed by the ble after the preloads */
  2755. #else
  2756. #ifdef LN
  2757. sll K, 2 + BASE_SHIFT, TEMP1
  2758. sub AORIG, TEMP1, AORIG /* AORIG -= K << (2 + BASE_SHIFT) */
  2759. #endif
  2760. sll KK, 2 + BASE_SHIFT, TEMP1
  2761. sll KK, 0 + BASE_SHIFT, TEMP2
  2762. add AORIG, TEMP1, AO /* AO = AORIG + (KK << (2 + BASE_SHIFT)) */
  2763. add B, TEMP2, BO /* BO = B + (KK << BASE_SHIFT) */
  2764. sub K, KK, TEMP1 /* TEMP1 = K - KK; read again at .LL225 for the remainder count */
  2765. sra TEMP1, 2, L /* L = (K - KK) >> 2 */
  2766. cmp L, 0
  2767. #endif
  2768. LDF [AO + 0 * SIZE], a1 /* preload four values from AO and four from BO, interleaved with clearing */
  2769. FMOV FZERO, c01 /* c01..c04 are the four accumulators, t1..t4 the pending products */
  2770. LDF [BO + 0 * SIZE], b1
  2771. FMOV FZERO, t1
  2772. LDF [AO + 1 * SIZE], a2
  2773. FMOV FZERO, c02
  2774. LDF [BO + 1 * SIZE], b2
  2775. FMOV FZERO, t2
  2776. LDF [AO + 2 * SIZE], a3
  2777. FMOV FZERO, c03
  2778. LDF [BO + 2 * SIZE], b3
  2779. FMOV FZERO, t3
  2780. LDF [AO + 3 * SIZE], a4
  2781. FMOV FZERO, c04
  2782. LDF [BO + 3 * SIZE], b4
  2783. FMOV FZERO, t4
  2784. ble,pn %icc, .LL225 /* L <= 0: skip the unrolled loop */
  2785. prefetch [C1 + 4 * SIZE], 2 /* delay slot (not annulled): the prefetch of the output area is issued on both paths */
  2786. .LL222: /* 4x1 accumulation loop, unrolled four times: each pass consumes 16 values from AO and 4 from BO. */
  2787. FADD c01, t1, c01 /* add the product left pending by the previous step */
  2788. add BO, 4 * SIZE, BO /* BO moves first, so the b-loads below use offsets 0..3 for the next pass */
  2789. FMUL a1, b1, t1 /* step 1 of 4: a1..a4 times b1 */
  2790. LDF [AO + 4 * SIZE], a1
  2791. FADD c02, t2, c02
  2792. FMUL a2, b1, t2
  2793. LDF [AO + 5 * SIZE], a2
  2794. FADD c03, t3, c03
  2795. add L, -1, L
  2796. FMUL a3, b1, t3
  2797. LDF [AO + 6 * SIZE], a3
  2798. FADD c04, t4, c04
  2799. FMUL a4, b1, t4
  2800. LDF [AO + 7 * SIZE], a4
  2801. LDF [BO + 0 * SIZE], b1
  2802. FADD c01, t1, c01
  2803. cmp L, 0 /* sets %icc for the bg at the bottom of the loop */
  2804. FMUL a1, b2, t1 /* step 2 of 4: values from AO + 4..7 times b2 */
  2805. LDF [AO + 8 * SIZE], a1
  2806. FADD c02, t2, c02
  2807. FMUL a2, b2, t2
  2808. LDF [AO + 9 * SIZE], a2
  2809. FADD c03, t3, c03
  2810. FMUL a3, b2, t3
  2811. LDF [AO + 10 * SIZE], a3
  2812. FADD c04, t4, c04
  2813. FMUL a4, b2, t4
  2814. LDF [AO + 11 * SIZE], a4
  2815. LDF [BO + 1 * SIZE], b2
  2816. FADD c01, t1, c01
  2817. FMUL a1, b3, t1 /* step 3 of 4: values from AO + 8..11 times b3 */
  2818. LDF [AO + 12 * SIZE], a1
  2819. FADD c02, t2, c02
  2820. FMUL a2, b3, t2
  2821. LDF [AO + 13 * SIZE], a2
  2822. FADD c03, t3, c03
  2823. FMUL a3, b3, t3
  2824. LDF [AO + 14 * SIZE], a3
  2825. FADD c04, t4, c04
  2826. FMUL a4, b3, t4
  2827. LDF [AO + 15 * SIZE], a4
  2828. LDF [BO + 2 * SIZE], b3
  2829. FADD c01, t1, c01
  2830. FMUL a1, b4, t1 /* step 4 of 4: values from AO + 12..15 times b4 */
  2831. LDF [AO + 16 * SIZE], a1 /* offsets 16..19 are the first four values of the next pass */
  2832. FADD c02, t2, c02
  2833. FMUL a2, b4, t2
  2834. LDF [AO + 17 * SIZE], a2
  2835. FADD c03, t3, c03
  2836. FMUL a3, b4, t3
  2837. LDF [AO + 18 * SIZE], a3
  2838. FADD c04, t4, c04
  2839. FMUL a4, b4, t4
  2840. LDF [AO + 19 * SIZE], a4
  2841. add AO, 16 * SIZE, AO
  2842. bg,pt %icc, .LL222 /* repeat while L > 0 */
  2843. LDF [BO + 3 * SIZE], b4 /* branch delay slot: executed whether or not the branch is taken */
  2844. .LL225: /* Compute the count for the remainder loop .LL226: the low two bits of the total iteration count. */
  2845. #if defined(LT) || defined(RN)
  2846. and KK, 3, L /* L = KK & 3 */
  2847. #else
  2848. and TEMP1, 3, L /* TEMP1 still holds K - KK from the setup at .LL221 */
  2849. #endif
  2850. cmp L, 0
  2851. ble,a,pn %icc, .LL229 /* nothing left: skip the remainder loop */
  2852. nop /* delay slot; a nop, so the annul bit has no visible effect */
  2853. .LL226: /* 4x1 remainder loop: four values from AO and one from BO per pass. */
  2854. FADD c01, t1, c01 /* add the product left pending by the previous step */
  2855. add BO, 1 * SIZE, BO
  2856. FMUL a1, b1, t1
  2857. LDF [AO + 4 * SIZE], a1 /* next four a values, read before AO is advanced */
  2858. FADD c02, t2, c02
  2859. add L, -1, L
  2860. FMUL a2, b1, t2
  2861. LDF [AO + 5 * SIZE], a2
  2862. FADD c03, t3, c03
  2863. cmp L, 0 /* sets %icc for the bg below */
  2864. FMUL a3, b1, t3
  2865. LDF [AO + 6 * SIZE], a3
  2866. FADD c04, t4, c04
  2867. FMUL a4, b1, t4
  2868. LDF [AO + 7 * SIZE], a4
  2869. add AO, 4 * SIZE, AO
  2870. bg,pt %icc, .LL226 /* repeat while L > 0 */
  2871. LDF [BO + 0 * SIZE], b1 /* delay slot: BO was already advanced above, so offset 0 is the next value */
  2872. .LL229: /* Finish the 4x1 accumulation, apply the solve step for the selected variant, store, advance, and loop to the next 4-wide block. */
  2873. FADD c01, t1, c01 /* drain the last pending products */
  2874. FADD c02, t2, c02
  2875. FADD c03, t3, c03
  2876. FADD c04, t4, c04
  2877. #if defined(LN) || defined(RT)
  2878. #ifdef LN
  2879. sub KK, 4, TEMP1
  2880. #else
  2881. sub KK, 1, TEMP1
  2882. #endif
  2883. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  2884. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  2885. add AORIG, TEMP2, AO /* AO = AORIG + (TEMP1 << (2 + BASE_SHIFT)), with TEMP1 = KK - 4 (LN) or KK - 1 (RT) */
  2886. add B, TEMP1, BO /* BO = B + (TEMP1 << BASE_SHIFT) */
  2887. #endif
  2888. #if defined(LN) || defined(LT)
  2889. LDF [BO + 0 * SIZE], a1 /* LN/LT: the four values to be solved are read from the BO buffer */
  2890. LDF [BO + 1 * SIZE], a2
  2891. LDF [BO + 2 * SIZE], a3
  2892. LDF [BO + 3 * SIZE], a4
  2893. FSUB a1, c01, c01 /* cNN = loaded value - accumulated sum */
  2894. FSUB a2, c02, c02
  2895. FSUB a3, c03, c03
  2896. FSUB a4, c04, c04
  2897. #else
  2898. LDF [AO + 0 * SIZE], a1 /* RN/RT: the four values to be solved are read from the AO buffer */
  2899. LDF [AO + 1 * SIZE], a2
  2900. LDF [AO + 2 * SIZE], a3
  2901. LDF [AO + 3 * SIZE], a4
  2902. FSUB a1, c01, c01
  2903. FSUB a2, c02, c02
  2904. FSUB a3, c03, c03
  2905. FSUB a4, c04, c04
  2906. #endif
  2907. #ifdef LN
  2908. LDF [AO + 15 * SIZE], a1 /* LN: substitution from c04 down to c01 over a 4x4 block at AO, indexed with stride 4 */
  2909. LDF [AO + 14 * SIZE], a2
  2910. LDF [AO + 13 * SIZE], a3
  2911. LDF [AO + 12 * SIZE], a4
  2912. FMUL a1, c04, c04 /* c04 *= AO[15]; multiplication, so the diagonal entry is presumably stored inverted - TODO confirm */
  2913. FMUL a2, c04, t1
  2914. FSUB c03, t1, c03 /* c03 -= AO[14] * c04 */
  2915. FMUL a3, c04, t1
  2916. FSUB c02, t1, c02 /* c02 -= AO[13] * c04 */
  2917. FMUL a4, c04, t1
  2918. FSUB c01, t1, c01 /* c01 -= AO[12] * c04 */
  2919. LDF [AO + 10 * SIZE], a1
  2920. LDF [AO + 9 * SIZE], a2
  2921. LDF [AO + 8 * SIZE], a3
  2922. FMUL a1, c03, c03 /* c03 *= AO[10] */
  2923. FMUL a2, c03, t1
  2924. FSUB c02, t1, c02 /* c02 -= AO[9] * c03 */
  2925. FMUL a3, c03, t1
  2926. FSUB c01, t1, c01 /* c01 -= AO[8] * c03 */
  2927. LDF [AO + 5 * SIZE], a1
  2928. LDF [AO + 4 * SIZE], a2
  2929. FMUL a1, c02, c02 /* c02 *= AO[5] */
  2930. FMUL a2, c02, t1
  2931. FSUB c01, t1, c01 /* c01 -= AO[4] * c02 */
  2932. LDF [AO + 0 * SIZE], a1
  2933. FMUL a1, c01, c01 /* c01 *= AO[0] */
  2934. #endif
  2935. #ifdef LT
  2936. LDF [AO + 0 * SIZE], a1 /* LT: substitution from c01 up to c04 over the same 4x4 layout */
  2937. LDF [AO + 1 * SIZE], a2
  2938. LDF [AO + 2 * SIZE], a3
  2939. LDF [AO + 3 * SIZE], a4
  2940. FMUL a1, c01, c01 /* c01 *= AO[0] */
  2941. FMUL a2, c01, t1
  2942. FSUB c02, t1, c02 /* c02 -= AO[1] * c01 */
  2943. FMUL a3, c01, t1
  2944. FSUB c03, t1, c03 /* c03 -= AO[2] * c01 */
  2945. FMUL a4, c01, t1
  2946. FSUB c04, t1, c04 /* c04 -= AO[3] * c01 */
  2947. LDF [AO + 5 * SIZE], a1
  2948. LDF [AO + 6 * SIZE], a2
  2949. LDF [AO + 7 * SIZE], a3
  2950. FMUL a1, c02, c02 /* c02 *= AO[5] */
  2951. FMUL a2, c02, t1
  2952. FSUB c03, t1, c03 /* c03 -= AO[6] * c02 */
  2953. FMUL a3, c02, t1
  2954. FSUB c04, t1, c04 /* c04 -= AO[7] * c02 */
  2955. LDF [AO + 10 * SIZE], a1
  2956. LDF [AO + 11 * SIZE], a2
  2957. FMUL a1, c03, c03 /* c03 *= AO[10] */
  2958. FMUL a2, c03, t1
  2959. FSUB c04, t1, c04 /* c04 -= AO[11] * c03 */
  2960. LDF [AO + 15 * SIZE], a1
  2961. FMUL a1, c04, c04 /* c04 *= AO[15] */
  2962. #endif
  2963. #ifdef RN
  2964. LDF [BO + 0 * SIZE], a1 /* RN: a single factor from BO[0] scales all four values */
  2965. FMUL a1, c01, c01
  2966. FMUL a1, c02, c02
  2967. FMUL a1, c03, c03
  2968. FMUL a1, c04, c04
  2969. #endif
  2970. #ifdef RT
  2971. LDF [BO + 0 * SIZE], a1 /* RT: same scaling as RN in this one-wide case */
  2972. FMUL a1, c01, c01
  2973. FMUL a1, c02, c02
  2974. FMUL a1, c03, c03
  2975. FMUL a1, c04, c04
  2976. #endif
  2977. #ifdef LN
  2978. add C1, -4 * SIZE, C1 /* LN steps the output pointer back before storing */
  2979. #endif
  2980. #if defined(LN) || defined(LT)
  2981. STF c01, [BO + 0 * SIZE] /* write the solved values back into the buffer they were read from */
  2982. STF c02, [BO + 1 * SIZE]
  2983. STF c03, [BO + 2 * SIZE]
  2984. STF c04, [BO + 3 * SIZE]
  2985. #else
  2986. STF c01, [AO + 0 * SIZE]
  2987. STF c02, [AO + 1 * SIZE]
  2988. STF c03, [AO + 2 * SIZE]
  2989. STF c04, [AO + 3 * SIZE]
  2990. #endif
  2991. STF c01, [C1 + 0 * SIZE] /* and into four consecutive output slots at C1 */
  2992. STF c02, [C1 + 1 * SIZE]
  2993. STF c03, [C1 + 2 * SIZE]
  2994. STF c04, [C1 + 3 * SIZE]
  2995. FMOV FZERO, t1 /* reset the product temporaries */
  2996. FMOV FZERO, t2
  2997. FMOV FZERO, t3
  2998. FMOV FZERO, t4
  2999. #ifndef LN
  3000. add C1, 4 * SIZE, C1 /* all variants except LN step the output pointer forward after storing */
  3001. #endif
  3002. #ifdef RT
  3003. sll K, 2 + BASE_SHIFT, TEMP1
  3004. add AORIG, TEMP1, AORIG /* AORIG += K << (2 + BASE_SHIFT) */
  3005. #endif
  3006. #if defined(LT) || defined(RN)
  3007. sub K, KK, TEMP1
  3008. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  3009. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  3010. add AO, TEMP2, AO /* AO += (K - KK) << (2 + BASE_SHIFT) */
  3011. add BO, TEMP1, BO /* BO += (K - KK) << BASE_SHIFT */
  3012. #endif
  3013. #ifdef LT
  3014. add KK, 4, KK
  3015. #endif
  3016. #ifdef LN
  3017. sub KK, 4, KK
  3018. #endif
  3019. add I, -1, I /* one 4-wide block done */
  3020. cmp I, 0
  3021. bg,pt %icc, .LL221 /* more 4-wide blocks remain */
  3022. nop /* branch delay slot */
  3023. .LL250: /* M & 2 remainder of the C1-only pass: set AO/BO and the loop count, then preload operands. */
  3024. and M, 2, I /* I = M & 2 */
  3025. cmp I, 0
  3026. ble,pn %icc, .LL270 /* bit 1 of M clear: go to the M & 1 remainder */
  3027. nop /* branch delay slot */
  3028. #if defined(LT) || defined(RN)
  3029. sra KK, 2, L /* L = KK >> 2: trip count of the 4x-unrolled loop .LL252 */
  3030. mov B, BO
  3031. cmp L, 0 /* %icc is consumed by the ble after the preloads */
  3032. #else
  3033. #ifdef LN
  3034. sll K, 1 + BASE_SHIFT, TEMP1
  3035. sub AORIG, TEMP1, AORIG /* AORIG -= K << (1 + BASE_SHIFT) */
  3036. #endif
  3037. sll KK, 1 + BASE_SHIFT, TEMP1
  3038. sll KK, 0 + BASE_SHIFT, TEMP2
  3039. add AORIG, TEMP1, AO /* AO = AORIG + (KK << (1 + BASE_SHIFT)) */
  3040. add B, TEMP2, BO /* BO = B + (KK << BASE_SHIFT) */
  3041. sub K, KK, TEMP1 /* TEMP1 = K - KK; read again at .LL255 for the remainder count */
  3042. sra TEMP1, 2, L /* L = (K - KK) >> 2 */
  3043. cmp L, 0
  3044. #endif
  3045. LDF [AO + 0 * SIZE], a1 /* preload four values from AO and four from BO, interleaved with clearing */
  3046. FMOV FZERO, c01 /* c01..c04 are accumulators, t1..t4 hold pending products */
  3047. LDF [BO + 0 * SIZE], b1
  3048. FMOV FZERO, t1
  3049. LDF [AO + 1 * SIZE], a2
  3050. FMOV FZERO, c02
  3051. LDF [BO + 1 * SIZE], b2
  3052. FMOV FZERO, t2
  3053. LDF [AO + 2 * SIZE], a3
  3054. FMOV FZERO, c03
  3055. LDF [BO + 2 * SIZE], b3
  3056. FMOV FZERO, t3
  3057. LDF [AO + 3 * SIZE], a4
  3058. FMOV FZERO, c04
  3059. LDF [BO + 3 * SIZE], b4
  3060. FMOV FZERO, t4
  3061. ble,pn %icc, .LL255 /* L <= 0: skip the unrolled loop */
  3062. nop /* branch delay slot */
  3063. .LL252: /* 2x1 accumulation loop, unrolled four times: each pass consumes 8 values from AO and 4 from BO. */
  3064. FADD c01, t1, c01 /* add the product left pending by the previous step */
  3065. add L, -1, L
  3066. FMUL a1, b1, t1 /* steps 1 and 3 accumulate in c01/c02 */
  3067. LDF [AO + 4 * SIZE], a1
  3068. FADD c02, t2, c02
  3069. FMUL a2, b1, t2
  3070. LDF [AO + 5 * SIZE], a2
  3071. LDF [BO + 4 * SIZE], b1
  3072. FADD c03, t3, c03 /* steps 2 and 4 accumulate in c03/c04; .LL259 merges them into c01/c02 */
  3073. cmp L, 0 /* sets %icc for the bg at the bottom of the loop */
  3074. FMUL a3, b2, t3
  3075. LDF [AO + 6 * SIZE], a3
  3076. FADD c04, t4, c04
  3077. FMUL a4, b2, t4
  3078. LDF [AO + 7 * SIZE], a4
  3079. LDF [BO + 5 * SIZE], b2
  3080. FADD c01, t1, c01
  3081. FMUL a1, b3, t1 /* a1/a2 here are the values just loaded from AO + 4 and AO + 5 */
  3082. LDF [AO + 8 * SIZE], a1 /* offsets 8..11 are the first four values of the next pass */
  3083. FADD c02, t2, c02
  3084. FMUL a2, b3, t2
  3085. LDF [AO + 9 * SIZE], a2
  3086. LDF [BO + 6 * SIZE], b3
  3087. FADD c03, t3, c03
  3088. FMUL a3, b4, t3
  3089. LDF [AO + 10 * SIZE], a3
  3090. FADD c04, t4, c04
  3091. FMUL a4, b4, t4
  3092. LDF [AO + 11 * SIZE], a4
  3093. add AO, 8 * SIZE, AO
  3094. LDF [BO + 7 * SIZE], b4
  3095. bg,pt %icc, .LL252 /* repeat while L > 0 */
  3096. add BO, 4 * SIZE, BO /* branch delay slot: BO is advanced whether or not the branch is taken */
  3097. .LL255: /* Compute the count for the remainder loop .LL256: the low two bits of the total iteration count. */
  3098. #if defined(LT) || defined(RN)
  3099. and KK, 3, L /* L = KK & 3 */
  3100. #else
  3101. and TEMP1, 3, L /* TEMP1 still holds K - KK from the setup at .LL250 */
  3102. #endif
  3103. cmp L, 0
  3104. ble,a,pn %icc, .LL259 /* nothing left: skip the remainder loop */
  3105. nop /* delay slot; a nop, so the annul bit has no visible effect */
  3106. .LL256: /* 2x1 remainder loop: two values from AO and one from BO per pass. */
  3107. FADD c01, t1, c01 /* add the product left pending by the previous step */
  3108. add L, -1, L
  3109. FMUL a1, b1, t1
  3110. LDF [AO + 2 * SIZE], a1 /* next pair of a values, read before AO is advanced */
  3111. FADD c02, t2, c02
  3112. cmp L, 0 /* sets %icc for the bg below */
  3113. FMUL a2, b1, t2
  3114. LDF [AO + 3 * SIZE], a2
  3115. LDF [BO + 1 * SIZE], b1 /* next b value, read before BO is advanced in the delay slot */
  3116. add AO, 2 * SIZE, AO
  3117. bg,pt %icc, .LL256 /* repeat while L > 0 */
  3118. add BO, 1 * SIZE, BO /* branch delay slot: executed whether or not the branch is taken */
  3119. .LL259: /* Finish the 2x1 accumulation, apply the solve step for the selected variant, store the results, advance pointers. */
  3120. FADD c01, t1, c01 /* drain the last pending products */
  3121. FADD c02, t2, c02
  3122. FADD c03, t3, c03
  3123. FADD c04, t4, c04
  3124. FADD c01, c03, c01 /* merge the two partial sums kept per output value */
  3125. FADD c02, c04, c02
  3126. #if defined(LN) || defined(RT)
  3127. #ifdef LN
  3128. sub KK, 2, TEMP1
  3129. #else
  3130. sub KK, 1, TEMP1
  3131. #endif
  3132. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  3133. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  3134. add AORIG, TEMP2, AO /* AO = AORIG + (TEMP1 << (1 + BASE_SHIFT)), with TEMP1 = KK - 2 (LN) or KK - 1 (RT) */
  3135. add B, TEMP1, BO /* BO = B + (TEMP1 << BASE_SHIFT) */
  3136. #endif
  3137. #if defined(LN) || defined(LT)
  3138. LDF [BO + 0 * SIZE], a1 /* LN/LT: the two values to be solved are read from the BO buffer */
  3139. LDF [BO + 1 * SIZE], a2
  3140. FSUB a1, c01, c01 /* cNN = loaded value - accumulated sum */
  3141. FSUB a2, c02, c02
  3142. #else
  3143. LDF [AO + 0 * SIZE], a1 /* RN/RT: the two values to be solved are read from the AO buffer */
  3144. LDF [AO + 1 * SIZE], a2
  3145. FSUB a1, c01, c01
  3146. FSUB a2, c02, c02
  3147. #endif
  3148. #ifdef LN
  3149. LDF [AO + 3 * SIZE], a1 /* LN: 2x2 triangle at AO walked as AO[3], AO[2], AO[0] */
  3150. LDF [AO + 2 * SIZE], a2
  3151. LDF [AO + 0 * SIZE], a3
  3152. FMUL a1, c02, c02 /* c02 *= AO[3]; multiplication, so the diagonal entry is presumably stored inverted - TODO confirm */
  3153. FMUL a2, c02, t1
  3154. FSUB c01, t1, c01 /* c01 -= AO[2] * c02 */
  3155. FMUL a3, c01, c01 /* c01 *= AO[0] */
  3156. #endif
  3157. #ifdef LT
  3158. LDF [AO + 0 * SIZE], a1 /* LT: same triangle walked as AO[0], AO[1], AO[3] */
  3159. LDF [AO + 1 * SIZE], a2
  3160. LDF [AO + 3 * SIZE], a3
  3161. FMUL a1, c01, c01 /* c01 *= AO[0] */
  3162. FMUL a2, c01, t1
  3163. FSUB c02, t1, c02 /* c02 -= AO[1] * c01 */
  3164. FMUL a3, c02, c02 /* c02 *= AO[3] */
  3165. #endif
  3166. #ifdef RN
  3167. LDF [BO + 0 * SIZE], a1 /* RN: a single factor from BO[0] scales both values */
  3168. FMUL a1, c01, c01
  3169. FMUL a1, c02, c02
  3170. #endif
  3171. #ifdef RT
  3172. LDF [BO + 0 * SIZE], a1 /* RT: same scaling as RN in this one-wide case */
  3173. FMUL a1, c01, c01
  3174. FMUL a1, c02, c02
  3175. #endif
  3176. #ifdef LN
  3177. add C1, -2 * SIZE, C1 /* LN steps the output pointer back before storing */
  3178. #endif
  3179. #if defined(LN) || defined(LT)
  3180. STF c01, [BO + 0 * SIZE] /* write the solved values back into the buffer they were read from */
  3181. STF c02, [BO + 1 * SIZE]
  3182. #else
  3183. STF c01, [AO + 0 * SIZE]
  3184. STF c02, [AO + 1 * SIZE]
  3185. #endif
  3186. STF c01, [C1 + 0 * SIZE] /* and into two consecutive output slots at C1 */
  3187. STF c02, [C1 + 1 * SIZE]
  3188. FMOV FZERO, t1 /* reset the product temporaries */
  3189. FMOV FZERO, t2
  3190. FMOV FZERO, t3
  3191. FMOV FZERO, t4
  3192. #ifndef LN
  3193. add C1, 2 * SIZE, C1 /* all variants except LN step the output pointer forward after storing */
  3194. #endif
  3195. #ifdef RT
  3196. sll K, 1 + BASE_SHIFT, TEMP1
  3197. add AORIG, TEMP1, AORIG /* AORIG += K << (1 + BASE_SHIFT) */
  3198. #endif
  3199. #if defined(LT) || defined(RN)
  3200. sub K, KK, TEMP1
  3201. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  3202. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  3203. add AO, TEMP2, AO /* AO += (K - KK) << (1 + BASE_SHIFT) */
  3204. add BO, TEMP1, BO /* BO += (K - KK) << BASE_SHIFT */
  3205. #endif
  3206. #ifdef LT
  3207. add KK, 2, KK
  3208. #endif
  3209. #ifdef LN
  3210. sub KK, 2, KK
  3211. #endif
  3212. .LL270: /* M & 1 remainder of the C1-only pass: set AO/BO and the loop count, then preload operands. */
  3213. and M, 1, I /* I = M & 1 */
  3214. cmp I, 0
  3215. ble,pn %icc, .LL299 /* no odd element: go to the end-of-pass bookkeeping */
  3216. nop /* branch delay slot */
  3217. #if defined(LT) || defined(RN)
  3218. sra KK, 2, L /* L = KK >> 2: trip count of the 4x-unrolled loop .LL272 */
  3219. mov B, BO
  3220. cmp L, 0 /* %icc is consumed by the ble after the preloads */
  3221. #else
  3222. #ifdef LN
  3223. sll K, 0 + BASE_SHIFT, TEMP1
  3224. sub AORIG, TEMP1, AORIG /* AORIG -= K << BASE_SHIFT */
  3225. #endif
  3226. sll KK, 0 + BASE_SHIFT, TEMP1 /* the same byte offset serves both AO and BO in this 1x1 case */
  3227. add AORIG, TEMP1, AO
  3228. add B, TEMP1, BO
  3229. sub K, KK, TEMP1 /* TEMP1 = K - KK; read again at .LL275 for the remainder count */
  3230. sra TEMP1, 2, L /* L = (K - KK) >> 2 */
  3231. cmp L, 0
  3232. #endif
  3233. LDF [AO + 0 * SIZE], a1 /* preload four values from AO, then four from BO */
  3234. FMOV FZERO, t1
  3235. LDF [AO + 1 * SIZE], a2
  3236. FMOV FZERO, c01 /* only c01 and c02 are used as accumulators in this section */
  3237. LDF [AO + 2 * SIZE], a3
  3238. FMOV FZERO, t2
  3239. LDF [AO + 3 * SIZE], a4
  3240. FMOV FZERO, c02
  3241. LDF [BO + 0 * SIZE], b1
  3242. FMOV FZERO, t3
  3243. LDF [BO + 1 * SIZE], b2
  3244. FMOV FZERO, t4
  3245. LDF [BO + 2 * SIZE], b3
  3246. ble,pn %icc, .LL275 /* L <= 0: skip the unrolled loop */
  3247. LDF [BO + 3 * SIZE], b4 /* delay slot (not annulled): the last preload happens on both paths */
  3248. .LL272: /* 1x1 accumulation loop, unrolled four times: each pass consumes 4 values from AO and 4 from BO. */
  3249. FADD c01, t1, c01 /* t1 and t3 feed c01; t2 and t4 feed c02 */
  3250. add L, -1, L
  3251. add AO, 4 * SIZE, AO /* both pointers move first, so the loads below use offsets 0..3 for the next pass */
  3252. FMUL a1, b1, t1
  3253. add BO, 4 * SIZE, BO
  3254. LDF [AO + 0 * SIZE], a1
  3255. FADD c02, t2, c02
  3256. cmp L, 0 /* sets %icc for the bg at the bottom of the loop */
  3257. LDF [BO + 0 * SIZE], b1
  3258. FMUL a2, b2, t2
  3259. LDF [AO + 1 * SIZE], a2
  3260. FADD c01, t3, c01
  3261. LDF [BO + 1 * SIZE], b2
  3262. FMUL a3, b3, t3
  3263. LDF [AO + 2 * SIZE], a3
  3264. FADD c02, t4, c02
  3265. LDF [BO + 2 * SIZE], b3
  3266. FMUL a4, b4, t4
  3267. LDF [AO + 3 * SIZE], a4
  3268. bg,pt %icc, .LL272 /* repeat while L > 0 */
  3269. LDF [BO + 3 * SIZE], b4 /* branch delay slot: executed whether or not the branch is taken */
  3270. .LL275: /* Compute the count for the remainder loop .LL276: the low two bits of the total iteration count. */
  3271. #if defined(LT) || defined(RN)
  3272. and KK, 3, L /* L = KK & 3 */
  3273. #else
  3274. and TEMP1, 3, L /* TEMP1 still holds K - KK from the setup at .LL270 */
  3275. #endif
  3276. cmp L, 0
  3277. ble,a,pn %icc, .LL279 /* nothing left: skip the remainder loop */
  3278. nop /* delay slot; a nop, so the annul bit has no visible effect */
  3279. .LL276: /* 1x1 remainder loop: one value from AO and one from BO per pass. */
  3280. FADD c01, t1, c01 /* add the product left pending by the previous step */
  3281. add L, -1, L
  3282. FMUL a1, b1, t1
  3283. LDF [AO + 1 * SIZE], a1 /* next operands, read before the pointers are advanced */
  3284. LDF [BO + 1 * SIZE], b1
  3285. add BO, 1 * SIZE, BO
  3286. cmp L, 0 /* sets %icc for the bg below */
  3287. bg,pt %icc, .LL276 /* repeat while L > 0 */
  3288. add AO, 1 * SIZE, AO /* branch delay slot: AO is advanced whether or not the branch is taken */
  3289. .LL279: /* Finish the 1x1 accumulation, apply the solve step for the selected variant, store the result, advance pointers. */
  3290. FADD c01, t1, c01 /* drain the four pending products into c01/c02 */
  3291. FADD c02, t2, c02
  3292. FADD c01, t3, c01
  3293. FADD c02, t4, c02
  3294. FADD c01, c02, c01 /* single total in c01 */
  3295. #if defined(LN) || defined(RT)
  3296. sub KK, 1, TEMP1
  3297. sll TEMP1, 0 + BASE_SHIFT, TEMP1 /* one byte offset, (KK - 1) << BASE_SHIFT, used for both buffers */
  3298. add AORIG, TEMP1, AO
  3299. add B, TEMP1, BO
  3300. #endif
  3301. #if defined(LN) || defined(LT)
  3302. LDF [BO + 0 * SIZE], a1 /* LN/LT: the value to be solved is read from the BO buffer */
  3303. FSUB a1, c01, c01 /* c01 = loaded value - accumulated sum */
  3304. #else
  3305. LDF [AO + 0 * SIZE], a1 /* RN/RT: the value to be solved is read from the AO buffer */
  3306. FSUB a1, c01, c01
  3307. #endif
  3308. #ifdef LN
  3309. LDF [AO + 0 * SIZE], a1
  3310. FMUL a1, c01, c01 /* scaled by multiplication; presumably AO[0] holds a pre-inverted diagonal entry - TODO confirm */
  3311. #endif
  3312. #ifdef LT
  3313. LDF [AO + 0 * SIZE], a1
  3314. FMUL a1, c01, c01
  3315. #endif
  3316. #ifdef RN
  3317. LDF [BO + 0 * SIZE], a1 /* RN/RT take the factor from BO[0] instead */
  3318. FMUL a1, c01, c01
  3319. #endif
  3320. #ifdef RT
  3321. LDF [BO + 0 * SIZE], a1
  3322. FMUL a1, c01, c01
  3323. #endif
  3324. #ifdef LN
  3325. add C1, -1 * SIZE, C1 /* LN steps the output pointer back before storing */
  3326. #endif
  3327. #if defined(LN) || defined(LT)
  3328. STF c01, [BO + 0 * SIZE] /* write the solved value back into the buffer it was read from */
  3329. #else
  3330. STF c01, [AO + 0 * SIZE]
  3331. #endif
  3332. STF c01, [C1 + 0 * SIZE] /* and into the output at C1 */
  3333. FMOV FZERO, t1 /* reset the product temporaries */
  3334. FMOV FZERO, t2
  3335. FMOV FZERO, t3
  3336. FMOV FZERO, t4
  3337. #ifndef LN
  3338. add C1, 1 * SIZE, C1 /* all variants except LN step the output pointer forward after storing */
  3339. #endif
  3340. #ifdef RT
  3341. sll K, 0 + BASE_SHIFT, TEMP1
  3342. add AORIG, TEMP1, AORIG /* AORIG += K << BASE_SHIFT */
  3343. #endif
  3344. #if defined(LT) || defined(RN)
  3345. sub K, KK, TEMP1
  3346. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  3347. add AO, TEMP1, AO /* both pointers advance by (K - KK) << BASE_SHIFT */
  3348. add BO, TEMP1, BO
  3349. #endif
  3350. #ifdef LT
  3351. add KK, 1, KK
  3352. #endif
  3353. #ifdef LN
  3354. sub KK, 1, KK
  3355. #endif
  3356. .LL299: /* End of the C1-only pass: update B and KK, then fall through to the exit. */
  3357. #ifdef LN
  3358. sll K, 0 + BASE_SHIFT, TEMP1
  3359. add B, TEMP1, B /* B += K << BASE_SHIFT */
  3360. #endif
  3361. #if defined(LT) || defined(RN)
  3362. mov BO, B /* B continues from wherever BO ended up */
  3363. #endif
  3364. #ifdef RN
  3365. add KK, 1, KK
  3366. #endif
  3367. #ifdef RT
  3368. sub KK, 1, KK
  3369. #endif
  3370. .LL999: /* Common exit, reached from the N & 1 test at .LL200 or by falling through .LL299. */
  3371. return %i7 + 8 /* return to the caller at %i7 + 8 */
  3372. clr %o0 /* delay slot of the return: clears %o0, presumably the routine's result register as seen by the caller - TODO confirm */
  3373. EPILOGUE