You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LN.S 66 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register aliases used by the kernel below. */
  40. #define M %i0 /* tiled by the I loops as M & 1, M & 2 and M >> 2 */
  41. #define N %i1 /* the J loop count is taken from N >> 2 */
  42. #define K %i2 /* depth; inner loops run over KK or K - KK depending on the variant */
  43. #if defined(DOUBLE) && !defined(__64BIT__)
  /* 32-bit DOUBLE build: A and B trade registers, and the prologue loads B */
  /* from the stack. NOTE(review): presumably because the double-precision */
  /* scalar argument takes two argument slots - confirm against the caller. */
  44. #define A %i5
  45. #define B %i4
  46. #else
  47. #define A %i4
  48. #define B %i5
  49. #endif
  50. #define C %o4 /* loaded from the stack in the prologue */
  51. #define LDC %o5 /* loaded from the stack, then shifted left by BASE_SHIFT */
  52. #define AO %l0 /* running pointer into A inside the inner loops */
  53. #define BO %l1 /* running pointer into B inside the inner loops */
  54. #define I %l2 /* tile selector / counter derived from M */
  55. #define J %l3 /* counter derived from N */
  56. #define L %l4 /* inner (depth) loop counter */
  57. #define C1 %o0 /* C1..C4 = C, C + LDC, C + 2*LDC, C + 3*LDC */
  58. #define C2 %o1
  59. #define C3 %o2
  60. #define C4 %o3
  61. #define OFFSET %l5 /* loaded from the stack; used to seed KK */
  62. #define KK %l6 /* running offset, stepped by the tile height after each tile (LN/LT) */
  63. #define TEMP1 %l7 /* scratch for address and count arithmetic */
  64. #define TEMP2 %i3 /* second scratch register */
  65. #define AORIG %g1 /* base of A for the LN/RT variants; AO is recomputed from it */
  /* Floating-point register aliases. */
  /* c01..c16 : accumulators (zeroed from FZERO, updated with FADD). */
  /* t1..t4   : FMUL results waiting to be added into an accumulator. */
  /* a1..a5   : operands loaded from AO; b1..b5 : operands loaded from BO. */
  /*            a5/b5 are only used by the 4x4 unrolled loop. */
  /* FZERO    : source register for the FMOV-based zeroing. */
  /* ALPHA    : defined here but not referenced in the visible part of the kernel. */
  66. #ifdef DOUBLE
  /* DOUBLE build: even-numbered registers %f0..%f62. */
  67. #define c01 %f0
  68. #define c02 %f2
  69. #define c03 %f4
  70. #define c04 %f6
  71. #define c05 %f8
  72. #define c06 %f10
  73. #define c07 %f12
  74. #define c08 %f14
  75. #define c09 %f16
  76. #define c10 %f18
  77. #define c11 %f20
  78. #define c12 %f22
  79. #define c13 %f24
  80. #define c14 %f26
  81. #define c15 %f28
  82. #define c16 %f30
  83. #define t1 %f32
  84. #define t2 %f34
  85. #define t3 %f36
  86. #define t4 %f38
  87. #define a1 %f40
  88. #define a2 %f42
  89. #define a3 %f44
  90. #define a4 %f46
  91. #define a5 %f58
  92. #define b1 %f48
  93. #define b2 %f50
  94. #define b3 %f52
  95. #define b4 %f54
  96. #define b5 %f56
  /* NOTE(review): the prologue issues FCLR(29); presumably that macro clears */
  /* this register in the DOUBLE build as well - confirm in common.h. */
  97. #define FZERO %f60
  98. #define ALPHA %f62
  99. #else
  /* Single-precision build: consecutive registers %f0..%f31. */
  100. #define c01 %f0
  101. #define c02 %f1
  102. #define c03 %f2
  103. #define c04 %f3
  104. #define c05 %f4
  105. #define c06 %f5
  106. #define c07 %f6
  107. #define c08 %f7
  108. #define c09 %f8
  109. #define c10 %f9
  110. #define c11 %f10
  111. #define c12 %f11
  112. #define c13 %f12
  113. #define c14 %f13
  114. #define c15 %f14
  115. #define c16 %f15
  116. #define t1 %f16
  117. #define t2 %f17
  118. #define t3 %f18
  119. #define t4 %f19
  120. #define a1 %f20
  121. #define a2 %f21
  122. #define a3 %f22
  123. #define a4 %f23
  124. #define a5 %f31
  125. #define b1 %f24
  126. #define b2 %f25
  127. #define b3 %f26
  128. #define b4 %f27
  129. #define b5 %f28
  130. #define FZERO %f29 /* same register number as the prologue's FCLR(29) */
  131. #define ALPHA %f30
  132. #endif
  /* Prefetch tuning for the 4x4 inner loop. The *SIZE values are look-ahead */
  /* distances, multiplied by SIZE and added to AO / BO; the *CATEGORY values */
  /* are passed as the second operand of the `prefetch` instruction. */
  133. #define APREFETCHSIZE 40 /* look-ahead on the A stream, in units of SIZE */
  134. #define BPREFETCHSIZE 40 /* look-ahead on the B stream, in units of SIZE */
  135. #define APREFETCH_CATEGORY 0 /* NOTE(review): 0 should be "several reads" in SPARC V9 - confirm */
  136. #define BPREFETCH_CATEGORY 0 /* same function code for the B stream */
  137. PROLOGUE
  138. SAVESP
  139. nop
  140. #ifndef __64BIT__
  141. #ifdef DOUBLE
  142. ld [%sp + STACK_START + 28], B
  143. ld [%sp + STACK_START + 32], C
  144. ld [%sp + STACK_START + 36], LDC
  145. ld [%sp + STACK_START + 40], OFFSET
  146. #else
  147. ld [%sp + STACK_START + 28], C
  148. ld [%sp + STACK_START + 32], LDC
  149. ld [%sp + STACK_START + 36], OFFSET
  150. #endif
  151. #else
  152. ldx [%sp+ STACK_START + 56], C
  153. ldx [%sp+ STACK_START + 64], LDC
  154. ldx [%sp+ STACK_START + 72], OFFSET
  155. #endif
  156. FCLR(29)
  157. sll LDC, BASE_SHIFT, LDC
  158. #ifdef LN
  159. smul M, K, TEMP1
  160. sll TEMP1, BASE_SHIFT, TEMP1
  161. add A, TEMP1, A
  162. sll M, BASE_SHIFT, TEMP1
  163. add C, TEMP1, C
  164. #endif
  165. #ifdef RN
  166. neg OFFSET, KK
  167. #endif
  168. #ifdef RT
  169. smul N, K, TEMP1
  170. sll TEMP1, BASE_SHIFT, TEMP1
  171. add B, TEMP1, B
  172. smul N, LDC, TEMP1
  173. add C, TEMP1, C
  174. sub N, OFFSET, KK
  175. #endif
  176. sra N, 2, J
  177. cmp J, 0
  178. ble,pn %icc, .LL100
  179. nop
  180. .LL11:
  181. #ifdef RT
  182. sll K, 2 + BASE_SHIFT, TEMP1
  183. sub B, TEMP1, B
  184. sll LDC, 2, TEMP1
  185. sub C, TEMP1, C
  186. #endif
  187. mov C, C1
  188. add C, LDC, C2
  189. add C2, LDC, C3
  190. add C3, LDC, C4
  191. #ifdef LN
  192. add M, OFFSET, KK
  193. #endif
  194. #ifdef LT
  195. mov OFFSET, KK
  196. #endif
  197. #if defined(LN) || defined(RT)
  198. mov A, AORIG
  199. #else
  200. mov A, AO
  201. #endif
  202. #ifndef RT
  203. add C4, LDC, C
  204. #endif
  205. and M, 1, I
  206. cmp I, 0
  207. ble,pn %icc, .LL50
  208. nop
  209. #if defined(LT) || defined(RN)
  210. sra KK, 2, L
  211. mov B, BO
  212. cmp L, 0
  213. #else
  214. #ifdef LN
  215. sll K, 0 + BASE_SHIFT, TEMP1
  216. sub AORIG, TEMP1, AORIG
  217. #endif
  218. sll KK, 0 + BASE_SHIFT, TEMP1
  219. sll KK, 2 + BASE_SHIFT, TEMP2
  220. add AORIG, TEMP1, AO
  221. add B, TEMP2, BO
  222. sub K, KK, TEMP1
  223. sra TEMP1, 2, L
  224. cmp L, 0
  225. #endif
  226. LDF [AO + 0 * SIZE], a1
  227. FMOV FZERO, c01
  228. LDF [BO + 0 * SIZE], b1
  229. FMOV FZERO, t1
  230. LDF [AO + 1 * SIZE], a2
  231. FMOV FZERO, c02
  232. LDF [BO + 1 * SIZE], b2
  233. FMOV FZERO, t2
  234. LDF [AO + 2 * SIZE], a3
  235. FMOV FZERO, c03
  236. LDF [BO + 2 * SIZE], b3
  237. FMOV FZERO, t3
  238. LDF [AO + 3 * SIZE], a4
  239. FMOV FZERO, c04
  240. LDF [BO + 3 * SIZE], b4
  241. FMOV FZERO, t4
  242. ble,pn %icc, .LL75
  243. nop
  244. .LL72:
  245. FADD c01, t1, c01
  246. add L, -1, L
  247. FMUL a1, b1, t1
  248. LDF [BO + 4 * SIZE], b1
  249. FADD c02, t2, c02
  250. cmp L, 0
  251. FMUL a1, b2, t2
  252. LDF [BO + 5 * SIZE], b2
  253. FADD c03, t3, c03
  254. FMUL a1, b3, t3
  255. LDF [BO + 6 * SIZE], b3
  256. FADD c04, t4, c04
  257. FMUL a1, b4, t4
  258. LDF [BO + 7 * SIZE], b4
  259. LDF [AO + 4 * SIZE], a1
  260. FADD c01, t1, c01
  261. add AO, 4 * SIZE, AO
  262. FMUL a2, b1, t1
  263. LDF [BO + 8 * SIZE], b1
  264. FADD c02, t2, c02
  265. FMUL a2, b2, t2
  266. LDF [BO + 9 * SIZE], b2
  267. FADD c03, t3, c03
  268. FMUL a2, b3, t3
  269. LDF [BO + 10 * SIZE], b3
  270. FADD c04, t4, c04
  271. FMUL a2, b4, t4
  272. LDF [BO + 11 * SIZE], b4
  273. LDF [AO + 1 * SIZE], a2
  274. FADD c01, t1, c01
  275. FMUL a3, b1, t1
  276. LDF [BO + 12 * SIZE], b1
  277. FADD c02, t2, c02
  278. FMUL a3, b2, t2
  279. LDF [BO + 13 * SIZE], b2
  280. FADD c03, t3, c03
  281. FMUL a3, b3, t3
  282. LDF [BO + 14 * SIZE], b3
  283. FADD c04, t4, c04
  284. FMUL a3, b4, t4
  285. LDF [BO + 15 * SIZE], b4
  286. LDF [AO + 2 * SIZE], a3
  287. FADD c01, t1, c01
  288. FMUL a4, b1, t1
  289. LDF [BO + 16 * SIZE], b1
  290. FADD c02, t2, c02
  291. FMUL a4, b2, t2
  292. LDF [BO + 17 * SIZE], b2
  293. FADD c03, t3, c03
  294. FMUL a4, b3, t3
  295. LDF [BO + 18 * SIZE], b3
  296. FADD c04, t4, c04
  297. FMUL a4, b4, t4
  298. LDF [BO + 19 * SIZE], b4
  299. add BO, 16 * SIZE, BO
  300. bg,pt %icc, .LL72
  301. LDF [AO + 3 * SIZE], a4
  302. .LL75:
  303. #if defined(LT) || defined(RN)
  304. and KK, 3, L
  305. #else
  306. and TEMP1, 3, L
  307. #endif
  308. cmp L, 0
  309. ble,a,pn %icc, .LL79
  310. nop
  311. .LL76:
  312. FADD c01, t1, c01
  313. add AO, 1 * SIZE, AO
  314. FMUL a1, b1, t1
  315. LDF [BO + 4 * SIZE], b1
  316. FADD c02, t2, c02
  317. add L, -1, L
  318. FMUL a1, b2, t2
  319. LDF [BO + 5 * SIZE], b2
  320. FADD c03, t3, c03
  321. cmp L, 0
  322. FMUL a1, b3, t3
  323. LDF [BO + 6 * SIZE], b3
  324. FADD c04, t4, c04
  325. add BO, 4 * SIZE, BO
  326. FMUL a1, b4, t4
  327. LDF [AO + 0 * SIZE], a1
  328. bg,pt %icc, .LL76
  329. LDF [BO + 3 * SIZE], b4
  330. .LL79:
  331. FADD c01, t1, c01
  332. FADD c02, t2, c02
  333. FADD c03, t3, c03
  334. FADD c04, t4, c04
  335. #if defined(LN) || defined(RT)
  336. #ifdef LN
  337. sub KK, 1, TEMP1
  338. #else
  339. sub KK, 4, TEMP1
  340. #endif
  341. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  342. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  343. add AORIG, TEMP2, AO
  344. add B, TEMP1, BO
  345. #endif
  346. #if defined(LN) || defined(LT)
  347. LDF [BO + 0 * SIZE], a1
  348. LDF [BO + 1 * SIZE], a2
  349. LDF [BO + 2 * SIZE], a3
  350. LDF [BO + 3 * SIZE], a4
  351. FSUB a1, c01, c01
  352. FSUB a2, c02, c02
  353. FSUB a3, c03, c03
  354. FSUB a4, c04, c04
  355. #else
  356. LDF [AO + 0 * SIZE], a1
  357. LDF [AO + 1 * SIZE], a2
  358. LDF [AO + 2 * SIZE], a3
  359. LDF [AO + 3 * SIZE], a4
  360. FSUB a1, c01, c01
  361. FSUB a2, c02, c02
  362. FSUB a3, c03, c03
  363. FSUB a4, c04, c04
  364. #endif
  365. #ifdef LN
  366. LDF [AO + 0 * SIZE], a1
  367. FMUL a1, c01, c01
  368. FMUL a1, c02, c02
  369. FMUL a1, c03, c03
  370. FMUL a1, c04, c04
  371. #endif
  372. #ifdef LT
  373. LDF [AO + 0 * SIZE], a1
  374. FMUL a1, c01, c01
  375. FMUL a1, c02, c02
  376. FMUL a1, c03, c03
  377. FMUL a1, c04, c04
  378. #endif
  379. #ifdef RN
  380. LDF [BO + 0 * SIZE], a1
  381. LDF [BO + 1 * SIZE], a2
  382. LDF [BO + 2 * SIZE], a3
  383. LDF [BO + 3 * SIZE], a4
  384. FMUL a1, c01, c01
  385. FMUL a2, c01, t1
  386. FSUB c02, t1, c02
  387. FMUL a3, c01, t1
  388. FSUB c03, t1, c03
  389. FMUL a4, c01, t1
  390. FSUB c04, t1, c04
  391. LDF [BO + 5 * SIZE], a1
  392. LDF [BO + 6 * SIZE], a2
  393. LDF [BO + 7 * SIZE], a3
  394. FMUL a1, c02, c02
  395. FMUL a2, c02, t1
  396. FSUB c03, t1, c03
  397. FMUL a3, c02, t1
  398. FSUB c04, t1, c04
  399. LDF [BO + 10 * SIZE], a1
  400. LDF [BO + 11 * SIZE], a2
  401. FMUL a1, c03, c03
  402. FMUL a2, c03, t1
  403. FSUB c04, t1, c04
  404. LDF [BO + 15 * SIZE], a1
  405. FMUL a1, c04, c04
  406. #endif
  407. #ifdef RT
  408. LDF [BO + 15 * SIZE], a1
  409. LDF [BO + 14 * SIZE], a2
  410. LDF [BO + 13 * SIZE], a3
  411. LDF [BO + 12 * SIZE], a4
  412. FMUL a1, c04, c04
  413. FMUL a2, c04, t1
  414. FSUB c03, t1, c03
  415. FMUL a3, c04, t1
  416. FSUB c02, t1, c02
  417. FMUL a4, c04, t1
  418. FSUB c01, t1, c01
  419. LDF [BO + 10 * SIZE], a1
  420. LDF [BO + 9 * SIZE], a2
  421. LDF [BO + 8 * SIZE], a3
  422. FMUL a1, c03, c03
  423. FMUL a2, c03, t1
  424. FSUB c02, t1, c02
  425. FMUL a3, c03, t1
  426. FSUB c01, t1, c01
  427. LDF [BO + 5 * SIZE], a1
  428. LDF [BO + 4 * SIZE], a2
  429. FMUL a1, c02, c02
  430. FMUL a2, c02, t1
  431. FSUB c01, t1, c01
  432. LDF [BO + 0 * SIZE], a1
  433. FMUL a1, c01, c01
  434. #endif
  435. #ifdef LN
  436. add C1, -1 * SIZE, C1
  437. add C2, -1 * SIZE, C2
  438. add C3, -1 * SIZE, C3
  439. add C4, -1 * SIZE, C4
  440. #endif
  441. #if defined(LN) || defined(LT)
  442. STF c01, [BO + 0 * SIZE]
  443. STF c02, [BO + 1 * SIZE]
  444. STF c03, [BO + 2 * SIZE]
  445. STF c04, [BO + 3 * SIZE]
  446. #else
  447. STF c01, [AO + 0 * SIZE]
  448. STF c02, [AO + 1 * SIZE]
  449. STF c03, [AO + 2 * SIZE]
  450. STF c04, [AO + 3 * SIZE]
  451. #endif
  452. STF c01, [C1 + 0 * SIZE]
  453. STF c02, [C2 + 0 * SIZE]
  454. STF c03, [C3 + 0 * SIZE]
  455. STF c04, [C4 + 0 * SIZE]
  456. FMOV FZERO, t1
  457. FMOV FZERO, t2
  458. FMOV FZERO, t3
  459. FMOV FZERO, t4
  460. #ifndef LN
  461. add C1, 1 * SIZE, C1
  462. add C2, 1 * SIZE, C2
  463. add C3, 1 * SIZE, C3
  464. add C4, 1 * SIZE, C4
  465. #endif
  466. #ifdef RT
  467. sll K, 0 + BASE_SHIFT, TEMP1
  468. add AORIG, TEMP1, AORIG
  469. #endif
  470. #if defined(LT) || defined(RN)
  471. sub K, KK, TEMP1
  472. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  473. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  474. add AO, TEMP2, AO
  475. add BO, TEMP1, BO
  476. #endif
  477. #ifdef LT
  478. add KK, 1, KK
  479. #endif
  480. #ifdef LN
  481. sub KK, 1, KK
  482. #endif
  483. .LL50:
  484. and M, 2, I
  485. cmp I, 0
  486. ble,pn %icc, .LL70
  487. nop
  488. #if defined(LT) || defined(RN)
  489. sra KK, 2, L
  490. mov B, BO
  491. cmp L, 0
  492. #else
  493. #ifdef LN
  494. sll K, 1 + BASE_SHIFT, TEMP1
  495. sub AORIG, TEMP1, AORIG
  496. #endif
  497. sll KK, 1 + BASE_SHIFT, TEMP1
  498. sll KK, 2 + BASE_SHIFT, TEMP2
  499. add AORIG, TEMP1, AO
  500. add B, TEMP2, BO
  501. sub K, KK, TEMP1
  502. sra TEMP1, 2, L
  503. cmp L, 0
  504. #endif
  505. FMOV FZERO, c02
  506. FMOV FZERO, t1
  507. FMOV FZERO, c04
  508. LDF [AO + 0 * SIZE], a1
  509. FMOV FZERO, t2
  510. LDF [BO + 0 * SIZE], b1
  511. FMOV FZERO, c06
  512. LDF [AO + 1 * SIZE], a2
  513. FMOV FZERO, t3
  514. LDF [BO + 1 * SIZE], b2
  515. FMOV FZERO, c08
  516. LDF [AO + 2 * SIZE], a3
  517. FMOV FZERO, t4
  518. LDF [BO + 2 * SIZE], b3
  519. FMOV FZERO, c01
  520. LDF [AO + 3 * SIZE], a4
  521. FMOV FZERO, c03
  522. LDF [BO + 3 * SIZE], b4
  523. FMOV FZERO, c05
  524. ble,pn %icc, .LL55
  525. FMOV FZERO, c07
  526. .LL52:
  527. FADD c02, t1, c02
  528. add AO, 8 * SIZE, AO
  529. prefetch [AO + APREFETCHSIZE * SIZE], 0
  530. FMUL a1, b1, t1
  531. add BO, 16 * SIZE, BO
  532. FADD c04, t2, c04
  533. add L, -1, L
  534. FMUL a1, b2, t2
  535. FADD c06, t3, c06
  536. cmp L, 0
  537. FMUL a1, b3, t3
  538. FADD c08, t4, c08
  539. FMUL a1, b4, t4
  540. LDF [AO - 4 * SIZE], a1
  541. FADD c01, t1, c01
  542. FMUL a2, b1, t1
  543. LDF [BO - 12 * SIZE], b1
  544. FADD c03, t2, c03
  545. FMUL a2, b2, t2
  546. LDF [BO - 11 * SIZE], b2
  547. FADD c05, t3, c05
  548. FMUL a2, b3, t3
  549. LDF [BO - 10 * SIZE], b3
  550. FADD c07, t4, c07
  551. FMUL a2, b4, t4
  552. LDF [BO - 9 * SIZE], b4
  553. FADD c02, t1, c02
  554. FMUL a3, b1, t1
  555. LDF [AO - 3 * SIZE], a2
  556. FADD c04, t2, c04
  557. FMUL a3, b2, t2
  558. FADD c06, t3, c06
  559. FMUL a3, b3, t3
  560. FADD c08, t4, c08
  561. FMUL a3, b4, t4
  562. LDF [AO - 2 * SIZE], a3
  563. FADD c01, t1, c01
  564. FMUL a4, b1, t1
  565. LDF [BO - 8 * SIZE], b1
  566. FADD c03, t2, c03
  567. FMUL a4, b2, t2
  568. LDF [BO - 7 * SIZE], b2
  569. FADD c05, t3, c05
  570. FMUL a4, b3, t3
  571. LDF [BO - 6 * SIZE], b3
  572. FADD c07, t4, c07
  573. FMUL a4, b4, t4
  574. LDF [BO - 5 * SIZE], b4
  575. FADD c02, t1, c02
  576. FMUL a1, b1, t1
  577. LDF [AO - 1 * SIZE], a4
  578. FADD c04, t2, c04
  579. FMUL a1, b2, t2
  580. FADD c06, t3, c06
  581. FMUL a1, b3, t3
  582. FADD c08, t4, c08
  583. FMUL a1, b4, t4
  584. LDF [AO + 0 * SIZE], a1
  585. FADD c01, t1, c01
  586. FMUL a2, b1, t1
  587. LDF [BO - 4 * SIZE], b1
  588. FADD c03, t2, c03
  589. FMUL a2, b2, t2
  590. LDF [BO - 3 * SIZE], b2
  591. FADD c05, t3, c05
  592. FMUL a2, b3, t3
  593. LDF [BO - 2 * SIZE], b3
  594. FADD c07, t4, c07
  595. FMUL a2, b4, t4
  596. LDF [BO - 1 * SIZE], b4
  597. FADD c02, t1, c02
  598. FMUL a3, b1, t1
  599. LDF [AO + 1 * SIZE], a2
  600. FADD c04, t2, c04
  601. FMUL a3, b2, t2
  602. FADD c06, t3, c06
  603. FMUL a3, b3, t3
  604. FADD c08, t4, c08
  605. FMUL a3, b4, t4
  606. LDF [AO + 2 * SIZE], a3
  607. FADD c01, t1, c01
  608. FMUL a4, b1, t1
  609. LDF [BO + 0 * SIZE], b1
  610. FADD c03, t2, c03
  611. FMUL a4, b2, t2
  612. LDF [BO + 1 * SIZE], b2
  613. FADD c05, t3, c05
  614. FMUL a4, b3, t3
  615. LDF [BO + 2 * SIZE], b3
  616. FADD c07, t4, c07
  617. FMUL a4, b4, t4
  618. LDF [BO + 3 * SIZE], b4
  619. bg,pt %icc, .LL52
  620. LDF [AO + 3 * SIZE], a4
  621. .LL55:
  622. #if defined(LT) || defined(RN)
  623. and KK, 3, L
  624. #else
  625. and TEMP1, 3, L
  626. #endif
  627. cmp L, 0
  628. ble,a,pn %icc, .LL59
  629. nop
  630. .LL56:
  631. FADD c02, t1, c02
  632. add AO, 2 * SIZE, AO
  633. FMUL a1, b1, t1
  634. add L, -1, L
  635. add BO, 4 * SIZE, BO
  636. FADD c04, t2, c04
  637. cmp L, 0
  638. FMUL a1, b2, t2
  639. FADD c06, t3, c06
  640. FMUL a1, b3, t3
  641. FADD c08, t4, c08
  642. FMUL a1, b4, t4
  643. LDF [AO + 0 * SIZE], a1
  644. FADD c01, t1, c01
  645. FMUL a2, b1, t1
  646. LDF [BO + 0 * SIZE], b1
  647. FADD c03, t2, c03
  648. FMUL a2, b2, t2
  649. LDF [BO + 1 * SIZE], b2
  650. FADD c05, t3, c05
  651. FMUL a2, b3, t3
  652. LDF [BO + 2 * SIZE], b3
  653. FADD c07, t4, c07
  654. FMUL a2, b4, t4
  655. LDF [BO + 3 * SIZE], b4
  656. bg,pt %icc, .LL56
  657. LDF [AO + 1 * SIZE], a2
  658. .LL59:
  659. #if defined(LN) || defined(RT)
  660. #ifdef LN
  661. sub KK, 2, TEMP1
  662. #else
  663. sub KK, 4, TEMP1
  664. #endif
  665. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  666. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  667. add AORIG, TEMP2, AO
  668. add B, TEMP1, BO
  669. #endif
  670. FADD c02, t1, c02
  671. FADD c04, t2, c04
  672. FADD c06, t3, c06
  673. FADD c08, t4, c08
  674. #if defined(LN) || defined(LT)
  675. LDF [BO + 0 * SIZE], a1
  676. LDF [BO + 1 * SIZE], a2
  677. LDF [BO + 2 * SIZE], a3
  678. LDF [BO + 3 * SIZE], a4
  679. LDF [BO + 4 * SIZE], b1
  680. LDF [BO + 5 * SIZE], b2
  681. LDF [BO + 6 * SIZE], b3
  682. LDF [BO + 7 * SIZE], b4
  683. FSUB a1, c01, c01
  684. FSUB a2, c03, c03
  685. FSUB a3, c05, c05
  686. FSUB a4, c07, c07
  687. FSUB b1, c02, c02
  688. FSUB b2, c04, c04
  689. FSUB b3, c06, c06
  690. FSUB b4, c08, c08
  691. #else
  692. LDF [AO + 0 * SIZE], a1
  693. LDF [AO + 1 * SIZE], a2
  694. LDF [AO + 2 * SIZE], a3
  695. LDF [AO + 3 * SIZE], a4
  696. LDF [AO + 4 * SIZE], b1
  697. LDF [AO + 5 * SIZE], b2
  698. LDF [AO + 6 * SIZE], b3
  699. LDF [AO + 7 * SIZE], b4
  700. FSUB a1, c01, c01
  701. FSUB a2, c02, c02
  702. FSUB a3, c03, c03
  703. FSUB a4, c04, c04
  704. FSUB b1, c05, c05
  705. FSUB b2, c06, c06
  706. FSUB b3, c07, c07
  707. FSUB b4, c08, c08
  708. #endif
  709. #ifdef LN
  710. LDF [AO + 3 * SIZE], a1
  711. LDF [AO + 2 * SIZE], a2
  712. LDF [AO + 0 * SIZE], a3
  713. FMUL a1, c02, c02
  714. FMUL a1, c04, c04
  715. FMUL a1, c06, c06
  716. FMUL a1, c08, c08
  717. FMUL a2, c02, t1
  718. FMUL a2, c04, t2
  719. FMUL a2, c06, t3
  720. FMUL a2, c08, t4
  721. FSUB c01, t1, c01
  722. FSUB c03, t2, c03
  723. FSUB c05, t3, c05
  724. FSUB c07, t4, c07
  725. FMUL a3, c01, c01
  726. FMUL a3, c03, c03
  727. FMUL a3, c05, c05
  728. FMUL a3, c07, c07
  729. #endif
  730. #ifdef LT
  731. LDF [AO + 0 * SIZE], a1
  732. LDF [AO + 1 * SIZE], a2
  733. LDF [AO + 3 * SIZE], a3
  734. FMUL a1, c01, c01
  735. FMUL a1, c03, c03
  736. FMUL a1, c05, c05
  737. FMUL a1, c07, c07
  738. FMUL a2, c01, t1
  739. FMUL a2, c03, t2
  740. FMUL a2, c05, t3
  741. FMUL a2, c07, t4
  742. FSUB c02, t1, c02
  743. FSUB c04, t2, c04
  744. FSUB c06, t3, c06
  745. FSUB c08, t4, c08
  746. FMUL a3, c02, c02
  747. FMUL a3, c04, c04
  748. FMUL a3, c06, c06
  749. FMUL a3, c08, c08
  750. #endif
  751. #ifdef RN
  752. LDF [BO + 0 * SIZE], a1
  753. LDF [BO + 1 * SIZE], a2
  754. LDF [BO + 2 * SIZE], a3
  755. LDF [BO + 3 * SIZE], a4
  756. FMUL a1, c01, c01
  757. FMUL a1, c02, c02
  758. FMUL a2, c01, t1
  759. FMUL a2, c02, t2
  760. FSUB c03, t1, c03
  761. FSUB c04, t2, c04
  762. FMUL a3, c01, t1
  763. FMUL a3, c02, t2
  764. FSUB c05, t1, c05
  765. FSUB c06, t2, c06
  766. FMUL a4, c01, t1
  767. FMUL a4, c02, t2
  768. FSUB c07, t1, c07
  769. FSUB c08, t2, c08
  770. LDF [BO + 5 * SIZE], a1
  771. LDF [BO + 6 * SIZE], a2
  772. LDF [BO + 7 * SIZE], a3
  773. FMUL a1, c03, c03
  774. FMUL a1, c04, c04
  775. FMUL a2, c03, t1
  776. FMUL a2, c04, t2
  777. FSUB c05, t1, c05
  778. FSUB c06, t2, c06
  779. FMUL a3, c03, t1
  780. FMUL a3, c04, t2
  781. FSUB c07, t1, c07
  782. FSUB c08, t2, c08
  783. LDF [BO + 10 * SIZE], a1
  784. LDF [BO + 11 * SIZE], a2
  785. FMUL a1, c05, c05
  786. FMUL a1, c06, c06
  787. FMUL a2, c05, t1
  788. FMUL a2, c06, t2
  789. FSUB c07, t1, c07
  790. FSUB c08, t2, c08
  791. LDF [BO + 15 * SIZE], a1
  792. FMUL a1, c07, c07
  793. FMUL a1, c08, c08
  794. #endif
  795. #ifdef RT
  796. LDF [BO + 15 * SIZE], a1
  797. LDF [BO + 14 * SIZE], a2
  798. LDF [BO + 13 * SIZE], a3
  799. LDF [BO + 12 * SIZE], a4
  800. FMUL a1, c07, c07
  801. FMUL a1, c08, c08
  802. FMUL a2, c07, t1
  803. FMUL a2, c08, t2
  804. FSUB c05, t1, c05
  805. FSUB c06, t2, c06
  806. FMUL a3, c07, t1
  807. FMUL a3, c08, t2
  808. FSUB c03, t1, c03
  809. FSUB c04, t2, c04
  810. FMUL a4, c07, t1
  811. FMUL a4, c08, t2
  812. FSUB c01, t1, c01
  813. FSUB c02, t2, c02
  814. LDF [BO + 10 * SIZE], a1
  815. LDF [BO + 9 * SIZE], a2
  816. LDF [BO + 8 * SIZE], a3
  817. FMUL a1, c05, c05
  818. FMUL a1, c06, c06
  819. FMUL a2, c05, t1
  820. FMUL a2, c06, t2
  821. FSUB c03, t1, c03
  822. FSUB c04, t2, c04
  823. FMUL a3, c05, t1
  824. FMUL a3, c06, t2
  825. FSUB c01, t1, c01
  826. FSUB c02, t2, c02
  827. LDF [BO + 5 * SIZE], a1
  828. LDF [BO + 4 * SIZE], a2
  829. FMUL a1, c03, c03
  830. FMUL a1, c04, c04
  831. FMUL a2, c03, t1
  832. FMUL a2, c04, t2
  833. FSUB c01, t1, c01
  834. FSUB c02, t2, c02
  835. LDF [BO + 0 * SIZE], a1
  836. FMUL a1, c01, c01
  837. FMUL a1, c02, c02
  838. #endif
  839. #ifdef LN
  840. add C1, -2 * SIZE, C1
  841. add C2, -2 * SIZE, C2
  842. add C3, -2 * SIZE, C3
  843. add C4, -2 * SIZE, C4
  844. #endif
  845. #if defined(LN) || defined(LT)
  846. STF c01, [BO + 0 * SIZE]
  847. STF c03, [BO + 1 * SIZE]
  848. STF c05, [BO + 2 * SIZE]
  849. STF c07, [BO + 3 * SIZE]
  850. STF c02, [BO + 4 * SIZE]
  851. STF c04, [BO + 5 * SIZE]
  852. STF c06, [BO + 6 * SIZE]
  853. STF c08, [BO + 7 * SIZE]
  854. #else
  855. STF c01, [AO + 0 * SIZE]
  856. STF c02, [AO + 1 * SIZE]
  857. STF c03, [AO + 2 * SIZE]
  858. STF c04, [AO + 3 * SIZE]
  859. STF c05, [AO + 4 * SIZE]
  860. STF c06, [AO + 5 * SIZE]
  861. STF c07, [AO + 6 * SIZE]
  862. STF c08, [AO + 7 * SIZE]
  863. #endif
  864. STF c01, [C1 + 0 * SIZE]
  865. STF c02, [C1 + 1 * SIZE]
  866. STF c03, [C2 + 0 * SIZE]
  867. STF c04, [C2 + 1 * SIZE]
  868. STF c05, [C3 + 0 * SIZE]
  869. STF c06, [C3 + 1 * SIZE]
  870. STF c07, [C4 + 0 * SIZE]
  871. STF c08, [C4 + 1 * SIZE]
  872. FMOV FZERO, t1
  873. FMOV FZERO, t2
  874. FMOV FZERO, t3
  875. FMOV FZERO, t4
  876. #ifndef LN
  877. add C1, 2 * SIZE, C1
  878. add C2, 2 * SIZE, C2
  879. add C3, 2 * SIZE, C3
  880. add C4, 2 * SIZE, C4
  881. #endif
  882. #ifdef RT
  883. sll K, 1 + BASE_SHIFT, TEMP1
  884. add AORIG, TEMP1, AORIG
  885. #endif
  886. #if defined(LT) || defined(RN)
  887. sub K, KK, TEMP1
  888. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  889. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  890. add AO, TEMP2, AO
  891. add BO, TEMP1, BO
  892. #endif
  893. #ifdef LT
  894. add KK, 2, KK
  895. #endif
  896. #ifdef LN
  897. sub KK, 2, KK
  898. #endif
  899. .LL70:
  900. sra M, 2, I
  901. cmp I, 0
  902. ble,pn %icc, .LL99
  903. nop
  904. .LL21:
  905. FMOV FZERO, t1
  906. FMOV FZERO, t2
  907. FMOV FZERO, t3
  908. FMOV FZERO, t4
  909. FMOV FZERO, c01
  910. FMOV FZERO, c02
  911. FMOV FZERO, c03
  912. #if defined(LT) || defined(RN)
  913. sra KK, 2, L
  914. mov B, BO
  915. cmp L, 0
  916. #else
  917. #ifdef LN
  918. sll K, 2 + BASE_SHIFT, TEMP1
  919. sub AORIG, TEMP1, AORIG
  920. #endif
  921. sll KK, 2 + BASE_SHIFT, TEMP1
  922. add AORIG, TEMP1, AO
  923. add B, TEMP1, BO
  924. sub K, KK, TEMP1
  925. sra TEMP1, 2, L
  926. cmp L, 0
  927. #endif
  928. LDF [AO + 0 * SIZE], a1
  929. FMOV FZERO, c04
  930. LDF [BO + 0 * SIZE], b1
  931. FMOV FZERO, c05
  932. LDF [AO + 1 * SIZE], a2
  933. FMOV FZERO, c06
  934. LDF [BO + 1 * SIZE], b2
  935. FMOV FZERO, c07
  936. LDF [AO + 2 * SIZE], a3
  937. FMOV FZERO, c08
  938. LDF [BO + 2 * SIZE], b3
  939. FMOV FZERO, c09
  940. LDF [AO + 3 * SIZE], a4
  941. FMOV FZERO, c10
  942. LDF [BO + 3 * SIZE], b4
  943. FMOV FZERO, c11
  944. LDF [BO + 4 * SIZE], b5 /* ***** */
  945. LDF [AO + 4 * SIZE], a5 /* ***** */
  946. #ifdef LN
  947. prefetch [C1 + 3 * SIZE], 3
  948. FMOV FZERO, c12
  949. prefetch [C2 + 3 * SIZE], 3
  950. FMOV FZERO, c13
  951. prefetch [C3 + 3 * SIZE], 3
  952. FMOV FZERO, c14
  953. prefetch [C4 + 3 * SIZE], 3
  954. FMOV FZERO, c15
  955. #else
  956. prefetch [C1 - 3 * SIZE], 3
  957. FMOV FZERO, c12
  958. prefetch [C2 - 3 * SIZE], 3
  959. FMOV FZERO, c13
  960. prefetch [C3 - 3 * SIZE], 3
  961. FMOV FZERO, c14
  962. prefetch [C4 - 3 * SIZE], 3
  963. FMOV FZERO, c15
  964. #endif
  965. ble,pn %icc, .LL25
  966. FMOV FZERO, c16
  967. .LL22:
  968. FADD c04, t1, c04
  969. prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
  970. FMUL a1, b1, t1
  971. nop
  972. FADD c08, t2, c08
  973. prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
  974. FMUL a1, b2, t2
  975. add AO, 16 * SIZE, AO
  976. FADD c12, t3, c12
  977. LDF [AO - 13 * SIZE], a4
  978. FMUL a1, b3, t3
  979. add BO, 16 * SIZE, BO
  980. FADD c16, t4, c16
  981. nop
  982. FMUL a1, b4, t4
  983. LDF [AO - 8 * SIZE], a1
  984. FADD c01, t1, c01
  985. nop
  986. FMUL a2, b1, t1
  987. nop
  988. FADD c05, t2, c05
  989. nop
  990. FMUL a2, b2, t2
  991. nop
  992. FADD c09, t3, c09
  993. nop
  994. FMUL a2, b3, t3
  995. nop
  996. FADD c13, t4, c13
  997. add L, -1, L
  998. FMUL a2, b4, t4
  999. LDF [AO - 11 * SIZE], a2
  1000. FADD c02, t1, c02
  1001. nop
  1002. FMUL a3, b1, t1
  1003. nop
  1004. FADD c06, t2, c06
  1005. nop
  1006. FMUL a3, b2, t2
  1007. nop
  1008. FADD c10, t3, c10
  1009. nop
  1010. FMUL a3, b3, t3
  1011. nop
  1012. FADD c14, t4, c14
  1013. nop
  1014. FMUL a3, b4, t4
  1015. LDF [AO - 10 * SIZE], a3
  1016. FADD c03, t1, c03
  1017. nop
  1018. FMUL a4, b1, t1
  1019. LDF [BO - 8 * SIZE], b1
  1020. FADD c07, t2, c07
  1021. nop
  1022. FMUL a4, b2, t2
  1023. LDF [BO - 11 * SIZE], b2
  1024. FADD c11, t3, c11
  1025. nop
  1026. FMUL a4, b3, t3
  1027. LDF [BO - 10 * SIZE], b3
  1028. FADD c15, t4, c15
  1029. nop
  1030. FMUL a4, b4, t4
  1031. LDF [BO - 9 * SIZE], b4
  1032. FADD c04, t1, c04
  1033. nop
  1034. FMUL a5, b5, t1
  1035. LDF [AO - 9 * SIZE], a4
  1036. FADD c08, t2, c08
  1037. nop
  1038. FMUL a5, b2, t2
  1039. nop
  1040. FADD c12, t3, c12
  1041. nop
  1042. FMUL a5, b3, t3
  1043. nop
  1044. FADD c16, t4, c16
  1045. nop
  1046. FMUL a5, b4, t4
  1047. LDF [AO - 4 * SIZE], a5
  1048. FADD c01, t1, c01
  1049. nop
  1050. FMUL a2, b5, t1
  1051. nop
  1052. FADD c05, t2, c05
  1053. nop
  1054. FMUL a2, b2, t2
  1055. nop
  1056. FADD c09, t3, c09
  1057. nop
  1058. FMUL a2, b3, t3
  1059. nop
  1060. FADD c13, t4, c13
  1061. nop
  1062. FMUL a2, b4, t4
  1063. LDF [AO - 7 * SIZE], a2
  1064. FADD c02, t1, c02
  1065. nop
  1066. FMUL a3, b5, t1
  1067. nop
  1068. FADD c06, t2, c06
  1069. nop
  1070. FMUL a3, b2, t2
  1071. nop
  1072. FADD c10, t3, c10
  1073. nop
  1074. FMUL a3, b3, t3
  1075. nop
  1076. FADD c14, t4, c14
  1077. nop
  1078. FMUL a3, b4, t4
  1079. LDF [AO - 6 * SIZE], a3
  1080. FADD c03, t1, c03
  1081. nop
  1082. FMUL a4, b5, t1
  1083. LDF [BO - 4 * SIZE], b5
  1084. FADD c07, t2, c07
  1085. nop
  1086. FMUL a4, b2, t2
  1087. LDF [BO - 7 * SIZE], b2
  1088. FADD c11, t3, c11
  1089. nop
  1090. FMUL a4, b3, t3
  1091. LDF [BO - 6 * SIZE], b3
  1092. FADD c15, t4, c15
  1093. nop
  1094. FMUL a4, b4, t4
  1095. LDF [BO - 5 * SIZE], b4
  1096. FADD c04, t1, c04
  1097. nop
  1098. FMUL a1, b1, t1
  1099. LDF [AO - 5 * SIZE], a4
  1100. FADD c08, t2, c08
  1101. nop
  1102. FMUL a1, b2, t2
  1103. nop
  1104. FADD c12, t3, c12
  1105. nop
  1106. FMUL a1, b3, t3
  1107. nop
  1108. FADD c16, t4, c16
  1109. nop
  1110. FMUL a1, b4, t4
  1111. LDF [AO - 0 * SIZE], a1
  1112. FADD c01, t1, c01
  1113. nop
  1114. FMUL a2, b1, t1
  1115. nop
  1116. #ifdef DOUBLE
  1117. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  1118. #else
  1119. nop
  1120. #endif
  1121. FADD c05, t2, c05
  1122. nop
  1123. FMUL a2, b2, t2
  1124. FADD c09, t3, c09
  1125. nop
  1126. FMUL a2, b3, t3
  1127. nop
  1128. FADD c13, t4, c13
  1129. nop
  1130. FMUL a2, b4, t4
  1131. nop
  1132. FADD c02, t1, c02
  1133. nop
  1134. FMUL a3, b1, t1
  1135. LDF [AO - 3 * SIZE], a2
  1136. FADD c06, t2, c06
  1137. #ifdef DOUBLE
  1138. prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
  1139. #else
  1140. nop
  1141. #endif
  1142. FMUL a3, b2, t2
  1143. nop
  1144. FADD c10, t3, c10
  1145. nop
  1146. FMUL a3, b3, t3
  1147. nop
  1148. FADD c14, t4, c14
  1149. nop
  1150. FMUL a3, b4, t4
  1151. LDF [AO - 2 * SIZE], a3
  1152. FADD c03, t1, c03
  1153. nop
  1154. FMUL a4, b1, t1
  1155. LDF [BO - 0 * SIZE], b1
  1156. FADD c07, t2, c07
  1157. nop
  1158. FMUL a4, b2, t2
  1159. LDF [BO - 3 * SIZE], b2
  1160. FADD c11, t3, c11
  1161. nop
  1162. FMUL a4, b3, t3
  1163. LDF [BO - 2 * SIZE], b3
  1164. FADD c15, t4, c15
  1165. nop
  1166. FMUL a4, b4, t4
  1167. LDF [BO - 1 * SIZE], b4
  1168. FADD c04, t1, c04
  1169. nop
  1170. FMUL a5, b5, t1
  1171. LDF [AO - 1 * SIZE], a4
  1172. FADD c08, t2, c08
  1173. FMUL a5, b2, t2
  1174. FADD c12, t3, c12
  1175. FMUL a5, b3, t3
  1176. FADD c16, t4, c16
  1177. nop
  1178. FMUL a5, b4, t4
  1179. LDF [AO + 4 * SIZE], a5
  1180. FADD c01, t1, c01
  1181. nop
  1182. FMUL a2, b5, t1
  1183. nop
  1184. FADD c05, t2, c05
  1185. nop
  1186. FMUL a2, b2, t2
  1187. nop
  1188. FADD c09, t3, c09
  1189. nop
  1190. FMUL a2, b3, t3
  1191. nop
  1192. FADD c13, t4, c13
  1193. nop
  1194. FMUL a2, b4, t4
  1195. LDF [AO + 1 * SIZE], a2
  1196. FADD c02, t1, c02
  1197. nop
  1198. FMUL a3, b5, t1
  1199. nop
  1200. FADD c06, t2, c06
  1201. nop
  1202. FMUL a3, b2, t2
  1203. nop
  1204. FADD c10, t3, c10
  1205. nop
  1206. FMUL a3, b3, t3
  1207. nop
  1208. FADD c14, t4, c14
  1209. nop
  1210. FMUL a3, b4, t4
  1211. LDF [AO + 2 * SIZE], a3
  1212. FADD c03, t1, c03
  1213. cmp L, 0
  1214. FMUL a4, b5, t1
  1215. LDF [BO + 4 * SIZE], b5
  1216. FADD c07, t2, c07
  1217. nop
  1218. FMUL a4, b2, t2
  1219. LDF [BO + 1 * SIZE], b2
  1220. FADD c11, t3, c11
  1221. nop
  1222. FMUL a4, b3, t3
  1223. LDF [BO + 2 * SIZE], b3
  1224. FADD c15, t4, c15
  1225. FMUL a4, b4, t4
  1226. bg,pt %icc, .LL22
  1227. LDF [BO + 3 * SIZE], b4
  1228. .LL25:
  1229. #if defined(LT) || defined(RN)
  1230. and KK, 3, L
  1231. #else
  1232. and TEMP1, 3, L
  1233. #endif
  1234. cmp L, 0
  1235. ble,a,pn %icc, .LL29
  1236. nop
  /* .LL26: leftover k loop, one k step per iteration. */
  /* Each step multiplies a1..a4 by b1..b4 (16 products) and accumulates them */
  /* into c01..c16. The loop is software pipelined through t1..t4: every FADD */
  /* consumes the product written by the FMUL four multiplies earlier, so the */
  /* first four FADDs (c04, c08, c12, c16) retire the a4*b1..b4 products that */
  /* were left pending in t1..t4 by the previous pass. */
  /* FADD/FMUL/LDF are macros defined outside this chunk; these notes assume */
  /* the usual SPARC "src1, src2, dst" operand order. */
  1237. .LL26:
  1238. FADD c04, t1, c04
  /* a4 is fetched before AO is advanced, so it belongs to the current step. */
  1239. LDF [AO + 3 * SIZE], a4
  1240. FMUL a1, b1, t1
  /* Advance both packed pointers by four elements. */
  1241. add AO, 4 * SIZE, AO
  1242. FADD c08, t2, c08
  1243. add BO, 4 * SIZE, BO
  1244. FMUL a1, b2, t2
  1245. add L, -1, L
  1246. FADD c12, t3, c12
  1247. nop
  1248. FMUL a1, b3, t3
  /* Sets %icc for the bg at the bottom; the intervening lines are not */
  /* expected to modify the integer condition codes. */
  1249. cmp L, 0
  1250. FADD c16, t4, c16
  1251. nop
  1252. FMUL a1, b4, t4
  /* From here on the loads refill a1..a3 and b1..b4 for the next step. */
  1253. LDF [AO + 0 * SIZE], a1
  1254. FADD c01, t1, c01
  1255. nop
  1256. FMUL a2, b1, t1
  1257. nop
  1258. FADD c05, t2, c05
  1259. nop
  1260. FMUL a2, b2, t2
  1261. nop
  1262. FADD c09, t3, c09
  1263. nop
  1264. FMUL a2, b3, t3
  1265. nop
  1266. FADD c13, t4, c13
  1267. nop
  1268. FMUL a2, b4, t4
  1269. LDF [AO + 1 * SIZE], a2
  1270. FADD c02, t1, c02
  1271. nop
  1272. FMUL a3, b1, t1
  1273. nop
  1274. FADD c06, t2, c06
  1275. nop
  1276. FMUL a3, b2, t2
  1277. nop
  1278. FADD c10, t3, c10
  1279. nop
  1280. FMUL a3, b3, t3
  1281. nop
  1282. FADD c14, t4, c14
  1283. nop
  1284. FMUL a3, b4, t4
  1285. LDF [AO + 2 * SIZE], a3
  1286. FADD c03, t1, c03
  1287. nop
  1288. FMUL a4, b1, t1
  1289. LDF [BO + 0 * SIZE], b1
  1290. FADD c07, t2, c07
  1291. nop
  1292. FMUL a4, b2, t2
  1293. LDF [BO + 1 * SIZE], b2
  1294. FADD c11, t3, c11
  1295. nop
  1296. FMUL a4, b3, t3
  1297. LDF [BO + 2 * SIZE], b3
  1298. FADD c15, t4, c15
  1299. FMUL a4, b4, t4
  /* Repeat while L > 0. The b4 reload sits in the branch delay slot and is */
  /* executed whether or not the branch is taken (no annul bit). */
  1300. bg,pt %icc, .LL26
  1301. LDF [BO + 3 * SIZE], b4
  /* .LL29: the k loop is finished. Reposition the packed pointers (LN/RT), */
  /* drain the last pending products, then form c = (packed value) - c for all */
  /* sixteen accumulators. */
  1302. .LL29:
  1303. #if defined(LN) || defined(RT)
  /* AO = AORIG + ((KK - 4) << (2 + BASE_SHIFT)), BO = B + the same offset. */
  /* NOTE(review): presumably BASE_SHIFT is log2 of the element size, which */
  /* would make this an offset of (KK - 4) * 4 elements -- confirm. */
  1304. sub KK, 4, TEMP1
  1305. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  1306. add AORIG, TEMP1, AO
  1307. add B, TEMP1, BO
  1308. #endif
  /* Retire the four products still held in t1..t4 by the pipelined loop. */
  1309. FADD c04, t1, c04
  1310. FADD c08, t2, c08
  1311. FADD c12, t3, c12
  1312. FADD c16, t4, c16
  1313. #if defined(LN) || defined(LT)
  /* LN/LT: values come from BO. Consecutive BO entries map to */
  /* c01,c05,c09,c13, then c02,c06,c10,c14, then c03.., then c04... */
  1314. LDF [BO + 0 * SIZE], a1
  1315. LDF [BO + 1 * SIZE], a2
  1316. LDF [BO + 2 * SIZE], a3
  1317. LDF [BO + 3 * SIZE], a4
  1318. LDF [BO + 4 * SIZE], b1
  1319. LDF [BO + 5 * SIZE], b2
  1320. LDF [BO + 6 * SIZE], b3
  1321. LDF [BO + 7 * SIZE], b4
  1322. FSUB a1, c01, c01
  1323. FSUB a2, c05, c05
  1324. FSUB a3, c09, c09
  1325. FSUB a4, c13, c13
  1326. FSUB b1, c02, c02
  1327. FSUB b2, c06, c06
  1328. FSUB b3, c10, c10
  1329. FSUB b4, c14, c14
  1330. LDF [BO + 8 * SIZE], a1
  1331. LDF [BO + 9 * SIZE], a2
  1332. LDF [BO + 10 * SIZE], a3
  1333. LDF [BO + 11 * SIZE], a4
  1334. LDF [BO + 12 * SIZE], b1
  1335. LDF [BO + 13 * SIZE], b2
  1336. LDF [BO + 14 * SIZE], b3
  1337. LDF [BO + 15 * SIZE], b4
  1338. FSUB a1, c03, c03
  1339. FSUB a2, c07, c07
  1340. FSUB a3, c11, c11
  1341. FSUB a4, c15, c15
  1342. FSUB b1, c04, c04
  1343. FSUB b2, c08, c08
  1344. FSUB b3, c12, c12
  1345. FSUB b4, c16, c16
  1346. #else
  /* RN/RT: values come from AO, and AO[0..15] map to c01..c16 in order. */
  1347. LDF [AO + 0 * SIZE], a1
  1348. LDF [AO + 1 * SIZE], a2
  1349. LDF [AO + 2 * SIZE], a3
  1350. LDF [AO + 3 * SIZE], a4
  1351. LDF [AO + 4 * SIZE], b1
  1352. LDF [AO + 5 * SIZE], b2
  1353. LDF [AO + 6 * SIZE], b3
  1354. LDF [AO + 7 * SIZE], b4
  1355. FSUB a1, c01, c01
  1356. FSUB a2, c02, c02
  1357. FSUB a3, c03, c03
  1358. FSUB a4, c04, c04
  1359. FSUB b1, c05, c05
  1360. FSUB b2, c06, c06
  1361. FSUB b3, c07, c07
  1362. FSUB b4, c08, c08
  1363. LDF [AO + 8 * SIZE], a1
  1364. LDF [AO + 9 * SIZE], a2
  1365. LDF [AO + 10 * SIZE], a3
  1366. LDF [AO + 11 * SIZE], a4
  1367. LDF [AO + 12 * SIZE], b1
  1368. LDF [AO + 13 * SIZE], b2
  1369. LDF [AO + 14 * SIZE], b3
  1370. LDF [AO + 15 * SIZE], b4
  1371. FSUB a1, c09, c09
  1372. FSUB a2, c10, c10
  1373. FSUB a3, c11, c11
  1374. FSUB a4, c12, c12
  1375. FSUB b1, c13, c13
  1376. FSUB b2, c14, c14
  1377. FSUB b3, c15, c15
  1378. FSUB b4, c16, c16
  1379. #endif
  /* LN variant: substitution with the 4x4 coefficient block at AO, working */
  /* from the last group (c04,c08,c12,c16) back to the first (c01,c05,c09,c13). */
  /* Entries AO[15], AO[10], AO[5] and AO[0] are applied by multiplication. */
  /* NOTE(review): that suggests the packed block stores reciprocal diagonal */
  /* entries -- confirm against the packing routine. */
  1380. #ifdef LN
  /* Step 1: scale group 4 by AO[15], then eliminate it from groups 3, 2, 1 */
  /* using AO[14], AO[13], AO[12]. */
  1381. LDF [AO + 15 * SIZE], a1
  1382. LDF [AO + 14 * SIZE], a2
  1383. LDF [AO + 13 * SIZE], a3
  1384. LDF [AO + 12 * SIZE], a4
  1385. FMUL a1, c04, c04
  1386. FMUL a1, c08, c08
  1387. FMUL a1, c12, c12
  1388. FMUL a1, c16, c16
  1389. FMUL a2, c04, t1
  1390. FMUL a2, c08, t2
  1391. FMUL a2, c12, t3
  1392. FMUL a2, c16, t4
  1393. FSUB c03, t1, c03
  1394. FSUB c07, t2, c07
  1395. FSUB c11, t3, c11
  1396. FSUB c15, t4, c15
  1397. FMUL a3, c04, t1
  1398. FMUL a3, c08, t2
  1399. FMUL a3, c12, t3
  1400. FMUL a3, c16, t4
  1401. FSUB c02, t1, c02
  1402. FSUB c06, t2, c06
  1403. FSUB c10, t3, c10
  1404. FSUB c14, t4, c14
  1405. FMUL a4, c04, t1
  1406. FMUL a4, c08, t2
  1407. FMUL a4, c12, t3
  1408. FMUL a4, c16, t4
  1409. FSUB c01, t1, c01
  1410. FSUB c05, t2, c05
  1411. FSUB c09, t3, c09
  1412. FSUB c13, t4, c13
  /* Step 2: scale group 3 by AO[10], eliminate from groups 2 and 1 with */
  /* AO[9] and AO[8]. */
  1413. LDF [AO + 10 * SIZE], a1
  1414. LDF [AO + 9 * SIZE], a2
  1415. LDF [AO + 8 * SIZE], a3
  1416. FMUL a1, c03, c03
  1417. FMUL a1, c07, c07
  1418. FMUL a1, c11, c11
  1419. FMUL a1, c15, c15
  1420. FMUL a2, c03, t1
  1421. FMUL a2, c07, t2
  1422. FMUL a2, c11, t3
  1423. FMUL a2, c15, t4
  1424. FSUB c02, t1, c02
  1425. FSUB c06, t2, c06
  1426. FSUB c10, t3, c10
  1427. FSUB c14, t4, c14
  1428. FMUL a3, c03, t1
  1429. FMUL a3, c07, t2
  1430. FMUL a3, c11, t3
  1431. FMUL a3, c15, t4
  1432. FSUB c01, t1, c01
  1433. FSUB c05, t2, c05
  1434. FSUB c09, t3, c09
  1435. FSUB c13, t4, c13
  /* Step 3: scale group 2 by AO[5], eliminate from group 1 with AO[4]. */
  1436. LDF [AO + 5 * SIZE], a1
  1437. LDF [AO + 4 * SIZE], a2
  1438. FMUL a1, c02, c02
  1439. FMUL a1, c06, c06
  1440. FMUL a1, c10, c10
  1441. FMUL a1, c14, c14
  1442. FMUL a2, c02, t1
  1443. FMUL a2, c06, t2
  1444. FMUL a2, c10, t3
  1445. FMUL a2, c14, t4
  1446. FSUB c01, t1, c01
  1447. FSUB c05, t2, c05
  1448. FSUB c09, t3, c09
  1449. FSUB c13, t4, c13
  /* Step 4: scale group 1 by AO[0]. */
  1450. LDF [AO + 0 * SIZE], a1
  1451. FMUL a1, c01, c01
  1452. FMUL a1, c05, c05
  1453. FMUL a1, c09, c09
  1454. FMUL a1, c13, c13
  1455. #endif
  /* LT variant: substitution with the 4x4 coefficient block at AO, working */
  /* from the first group (c01,c05,c09,c13) forward to the last */
  /* (c04,c08,c12,c16). AO[0], AO[5], AO[10], AO[15] are applied by */
  /* multiplication. */
  /* NOTE(review): presumably these are stored as reciprocals -- confirm. */
  1456. #ifdef LT
  /* Step 1: scale group 1 by AO[0], eliminate from groups 2, 3, 4 with */
  /* AO[1], AO[2], AO[3]. */
  1457. LDF [AO + 0 * SIZE], a1
  1458. LDF [AO + 1 * SIZE], a2
  1459. LDF [AO + 2 * SIZE], a3
  1460. LDF [AO + 3 * SIZE], a4
  1461. FMUL a1, c01, c01
  1462. FMUL a1, c05, c05
  1463. FMUL a1, c09, c09
  1464. FMUL a1, c13, c13
  1465. FMUL a2, c01, t1
  1466. FMUL a2, c05, t2
  1467. FMUL a2, c09, t3
  1468. FMUL a2, c13, t4
  1469. FSUB c02, t1, c02
  1470. FSUB c06, t2, c06
  1471. FSUB c10, t3, c10
  1472. FSUB c14, t4, c14
  1473. FMUL a3, c01, t1
  1474. FMUL a3, c05, t2
  1475. FMUL a3, c09, t3
  1476. FMUL a3, c13, t4
  1477. FSUB c03, t1, c03
  1478. FSUB c07, t2, c07
  1479. FSUB c11, t3, c11
  1480. FSUB c15, t4, c15
  1481. FMUL a4, c01, t1
  1482. FMUL a4, c05, t2
  1483. FMUL a4, c09, t3
  1484. FMUL a4, c13, t4
  1485. FSUB c04, t1, c04
  1486. FSUB c08, t2, c08
  1487. FSUB c12, t3, c12
  1488. FSUB c16, t4, c16
  /* Step 2: scale group 2 by AO[5], eliminate from groups 3 and 4 with */
  /* AO[6] and AO[7]. */
  1489. LDF [AO + 5 * SIZE], a1
  1490. LDF [AO + 6 * SIZE], a2
  1491. LDF [AO + 7 * SIZE], a3
  1492. FMUL a1, c02, c02
  1493. FMUL a1, c06, c06
  1494. FMUL a1, c10, c10
  1495. FMUL a1, c14, c14
  1496. FMUL a2, c02, t1
  1497. FMUL a2, c06, t2
  1498. FMUL a2, c10, t3
  1499. FMUL a2, c14, t4
  1500. FSUB c03, t1, c03
  1501. FSUB c07, t2, c07
  1502. FSUB c11, t3, c11
  1503. FSUB c15, t4, c15
  1504. FMUL a3, c02, t1
  1505. FMUL a3, c06, t2
  1506. FMUL a3, c10, t3
  1507. FMUL a3, c14, t4
  1508. FSUB c04, t1, c04
  1509. FSUB c08, t2, c08
  1510. FSUB c12, t3, c12
  1511. FSUB c16, t4, c16
  /* Step 3: scale group 3 by AO[10], eliminate from group 4 with AO[11]. */
  1512. LDF [AO + 10 * SIZE], a1
  1513. LDF [AO + 11 * SIZE], a2
  1514. FMUL a1, c03, c03
  1515. FMUL a1, c07, c07
  1516. FMUL a1, c11, c11
  1517. FMUL a1, c15, c15
  1518. FMUL a2, c03, t1
  1519. FMUL a2, c07, t2
  1520. FMUL a2, c11, t3
  1521. FMUL a2, c15, t4
  1522. FSUB c04, t1, c04
  1523. FSUB c08, t2, c08
  1524. FSUB c12, t3, c12
  1525. FSUB c16, t4, c16
  /* Step 4: scale group 4 by AO[15]. */
  1526. LDF [AO + 15 * SIZE], a1
  1527. FMUL a1, c04, c04
  1528. FMUL a1, c08, c08
  1529. FMUL a1, c12, c12
  1530. FMUL a1, c16, c16
  1531. #endif
  /* RN variant: substitution with the 4x4 coefficient block at BO, working */
  /* from c01..c04 forward to c13..c16 (the four-value groups that are later */
  /* stored through C1, C2, C3 and C4 respectively). BO[0], BO[5], BO[10], */
  /* BO[15] are applied by multiplication. */
  /* NOTE(review): presumably these are stored as reciprocals -- confirm. */
  1532. #ifdef RN
  /* Step 1: scale c01..c04 by BO[0]; eliminate from c05..c08, c09..c12 and */
  /* c13..c16 with BO[1], BO[2], BO[3]. */
  1533. LDF [BO + 0 * SIZE], a1
  1534. LDF [BO + 1 * SIZE], a2
  1535. LDF [BO + 2 * SIZE], a3
  1536. LDF [BO + 3 * SIZE], a4
  1537. FMUL a1, c01, c01
  1538. FMUL a1, c02, c02
  1539. FMUL a1, c03, c03
  1540. FMUL a1, c04, c04
  1541. FMUL a2, c01, t1
  1542. FMUL a2, c02, t2
  1543. FMUL a2, c03, t3
  1544. FMUL a2, c04, t4
  1545. FSUB c05, t1, c05
  1546. FSUB c06, t2, c06
  1547. FSUB c07, t3, c07
  1548. FSUB c08, t4, c08
  1549. FMUL a3, c01, t1
  1550. FMUL a3, c02, t2
  1551. FMUL a3, c03, t3
  1552. FMUL a3, c04, t4
  1553. FSUB c09, t1, c09
  1554. FSUB c10, t2, c10
  1555. FSUB c11, t3, c11
  1556. FSUB c12, t4, c12
  1557. FMUL a4, c01, t1
  1558. FMUL a4, c02, t2
  1559. FMUL a4, c03, t3
  1560. FMUL a4, c04, t4
  1561. FSUB c13, t1, c13
  1562. FSUB c14, t2, c14
  1563. FSUB c15, t3, c15
  1564. FSUB c16, t4, c16
  /* Step 2: scale c05..c08 by BO[5]; eliminate from c09..c12 and c13..c16 */
  /* with BO[6] and BO[7]. */
  1565. LDF [BO + 5 * SIZE], a1
  1566. LDF [BO + 6 * SIZE], a2
  1567. LDF [BO + 7 * SIZE], a3
  1568. FMUL a1, c05, c05
  1569. FMUL a1, c06, c06
  1570. FMUL a1, c07, c07
  1571. FMUL a1, c08, c08
  1572. FMUL a2, c05, t1
  1573. FMUL a2, c06, t2
  1574. FMUL a2, c07, t3
  1575. FMUL a2, c08, t4
  1576. FSUB c09, t1, c09
  1577. FSUB c10, t2, c10
  1578. FSUB c11, t3, c11
  1579. FSUB c12, t4, c12
  1580. FMUL a3, c05, t1
  1581. FMUL a3, c06, t2
  1582. FMUL a3, c07, t3
  1583. FMUL a3, c08, t4
  1584. FSUB c13, t1, c13
  1585. FSUB c14, t2, c14
  1586. FSUB c15, t3, c15
  1587. FSUB c16, t4, c16
  /* Step 3: scale c09..c12 by BO[10]; eliminate from c13..c16 with BO[11]. */
  1588. LDF [BO + 10 * SIZE], a1
  1589. LDF [BO + 11 * SIZE], a2
  1590. FMUL a1, c09, c09
  1591. FMUL a1, c10, c10
  1592. FMUL a1, c11, c11
  1593. FMUL a1, c12, c12
  1594. FMUL a2, c09, t1
  1595. FMUL a2, c10, t2
  1596. FMUL a2, c11, t3
  1597. FMUL a2, c12, t4
  1598. FSUB c13, t1, c13
  1599. FSUB c14, t2, c14
  1600. FSUB c15, t3, c15
  1601. FSUB c16, t4, c16
  /* Step 4: scale c13..c16 by BO[15]. */
  1602. LDF [BO + 15 * SIZE], a1
  1603. FMUL a1, c13, c13
  1604. FMUL a1, c14, c14
  1605. FMUL a1, c15, c15
  1606. FMUL a1, c16, c16
  1607. #endif
  /* RT variant: substitution with the 4x4 coefficient block at BO, working */
  /* from c13..c16 back to c01..c04. BO[15], BO[10], BO[5], BO[0] are applied */
  /* by multiplication. */
  /* NOTE(review): presumably these are stored as reciprocals -- confirm. */
  1608. #ifdef RT
  /* Step 1: scale c13..c16 by BO[15]; eliminate from c09..c12, c05..c08 and */
  /* c01..c04 with BO[14], BO[13], BO[12]. */
  1609. LDF [BO + 15 * SIZE], a1
  1610. LDF [BO + 14 * SIZE], a2
  1611. LDF [BO + 13 * SIZE], a3
  1612. LDF [BO + 12 * SIZE], a4
  1613. FMUL a1, c13, c13
  1614. FMUL a1, c14, c14
  1615. FMUL a1, c15, c15
  1616. FMUL a1, c16, c16
  1617. FMUL a2, c13, t1
  1618. FMUL a2, c14, t2
  1619. FMUL a2, c15, t3
  1620. FMUL a2, c16, t4
  1621. FSUB c09, t1, c09
  1622. FSUB c10, t2, c10
  1623. FSUB c11, t3, c11
  1624. FSUB c12, t4, c12
  1625. FMUL a3, c13, t1
  1626. FMUL a3, c14, t2
  1627. FMUL a3, c15, t3
  1628. FMUL a3, c16, t4
  1629. FSUB c05, t1, c05
  1630. FSUB c06, t2, c06
  1631. FSUB c07, t3, c07
  1632. FSUB c08, t4, c08
  1633. FMUL a4, c13, t1
  1634. FMUL a4, c14, t2
  1635. FMUL a4, c15, t3
  1636. FMUL a4, c16, t4
  1637. FSUB c01, t1, c01
  1638. FSUB c02, t2, c02
  1639. FSUB c03, t3, c03
  1640. FSUB c04, t4, c04
  /* Step 2: scale c09..c12 by BO[10]; eliminate from c05..c08 and c01..c04 */
  /* with BO[9] and BO[8]. */
  1641. LDF [BO + 10 * SIZE], a1
  1642. LDF [BO + 9 * SIZE], a2
  1643. LDF [BO + 8 * SIZE], a3
  1644. FMUL a1, c09, c09
  1645. FMUL a1, c10, c10
  1646. FMUL a1, c11, c11
  1647. FMUL a1, c12, c12
  1648. FMUL a2, c09, t1
  1649. FMUL a2, c10, t2
  1650. FMUL a2, c11, t3
  1651. FMUL a2, c12, t4
  1652. FSUB c05, t1, c05
  1653. FSUB c06, t2, c06
  1654. FSUB c07, t3, c07
  1655. FSUB c08, t4, c08
  1656. FMUL a3, c09, t1
  1657. FMUL a3, c10, t2
  1658. FMUL a3, c11, t3
  1659. FMUL a3, c12, t4
  1660. FSUB c01, t1, c01
  1661. FSUB c02, t2, c02
  1662. FSUB c03, t3, c03
  1663. FSUB c04, t4, c04
  /* Step 3: scale c05..c08 by BO[5]; eliminate from c01..c04 with BO[4]. */
  1664. LDF [BO + 5 * SIZE], a1
  1665. LDF [BO + 4 * SIZE], a2
  1666. FMUL a1, c05, c05
  1667. FMUL a1, c06, c06
  1668. FMUL a1, c07, c07
  1669. FMUL a1, c08, c08
  1670. FMUL a2, c05, t1
  1671. FMUL a2, c06, t2
  1672. FMUL a2, c07, t3
  1673. FMUL a2, c08, t4
  1674. FSUB c01, t1, c01
  1675. FSUB c02, t2, c02
  1676. FSUB c03, t3, c03
  1677. FSUB c04, t4, c04
  /* Step 4: scale c01..c04 by BO[0]. */
  1678. LDF [BO + 0 * SIZE], a1
  1679. FMUL a1, c01, c01
  1680. FMUL a1, c02, c02
  1681. FMUL a1, c03, c03
  1682. FMUL a1, c04, c04
  1683. #endif
  /* Write-back for the 4x4 tile: store the sixteen solved values to the */
  /* packed buffer and to C, then advance pointers and counters and loop to */
  /* the next tile. */
  1684. #ifdef LN
  /* LN moves the four C pointers back by four elements before storing. */
  1685. add C1, -4 * SIZE, C1
  1686. add C2, -4 * SIZE, C2
  1687. add C3, -4 * SIZE, C3
  1688. add C4, -4 * SIZE, C4
  1689. #endif
  1690. #if defined(LN) || defined(LT)
  /* LN/LT: write the results back to BO, using the same interleaved order */
  /* (c01,c05,c09,c13, c02,...) in which the values were read from BO. */
  1691. STF c01, [BO + 0 * SIZE]
  1692. STF c05, [BO + 1 * SIZE]
  1693. STF c09, [BO + 2 * SIZE]
  1694. STF c13, [BO + 3 * SIZE]
  1695. STF c02, [BO + 4 * SIZE]
  1696. STF c06, [BO + 5 * SIZE]
  1697. STF c10, [BO + 6 * SIZE]
  1698. STF c14, [BO + 7 * SIZE]
  1699. STF c03, [BO + 8 * SIZE]
  1700. STF c07, [BO + 9 * SIZE]
  1701. STF c11, [BO + 10 * SIZE]
  1702. STF c15, [BO + 11 * SIZE]
  1703. STF c04, [BO + 12 * SIZE]
  1704. STF c08, [BO + 13 * SIZE]
  1705. STF c12, [BO + 14 * SIZE]
  1706. STF c16, [BO + 15 * SIZE]
  1707. #else
  /* RN/RT: write the results back to AO in c01..c16 order. */
  1708. STF c01, [AO + 0 * SIZE]
  1709. STF c02, [AO + 1 * SIZE]
  1710. STF c03, [AO + 2 * SIZE]
  1711. STF c04, [AO + 3 * SIZE]
  1712. STF c05, [AO + 4 * SIZE]
  1713. STF c06, [AO + 5 * SIZE]
  1714. STF c07, [AO + 6 * SIZE]
  1715. STF c08, [AO + 7 * SIZE]
  1716. STF c09, [AO + 8 * SIZE]
  1717. STF c10, [AO + 9 * SIZE]
  1718. STF c11, [AO + 10 * SIZE]
  1719. STF c12, [AO + 11 * SIZE]
  1720. STF c13, [AO + 12 * SIZE]
  1721. STF c14, [AO + 13 * SIZE]
  1722. STF c15, [AO + 14 * SIZE]
  1723. STF c16, [AO + 15 * SIZE]
  1724. #endif
  /* Store to C: four consecutive elements through each of C1..C4. */
  1725. STF c01, [C1 + 0 * SIZE]
  1726. STF c02, [C1 + 1 * SIZE]
  1727. STF c03, [C1 + 2 * SIZE]
  1728. STF c04, [C1 + 3 * SIZE]
  1729. STF c05, [C2 + 0 * SIZE]
  1730. STF c06, [C2 + 1 * SIZE]
  1731. STF c07, [C2 + 2 * SIZE]
  1732. STF c08, [C2 + 3 * SIZE]
  1733. STF c09, [C3 + 0 * SIZE]
  1734. STF c10, [C3 + 1 * SIZE]
  1735. STF c11, [C3 + 2 * SIZE]
  1736. STF c12, [C3 + 3 * SIZE]
  1737. STF c13, [C4 + 0 * SIZE]
  1738. STF c14, [C4 + 1 * SIZE]
  1739. STF c15, [C4 + 2 * SIZE]
  1740. STF c16, [C4 + 3 * SIZE]
  /* Clear the pipeline temporaries so the next tile's first FADDs add zero. */
  1741. FMOV FZERO, t1
  1742. FMOV FZERO, t2
  1743. FMOV FZERO, t3
  1744. FMOV FZERO, t4
  1745. #ifndef LN
  /* All variants except LN step the C pointers forward by four elements. */
  1746. add C1, 4 * SIZE, C1
  1747. add C2, 4 * SIZE, C2
  1748. add C3, 4 * SIZE, C3
  1749. add C4, 4 * SIZE, C4
  1750. #endif
  1751. #ifdef RT
  /* RT: AORIG += K << (2 + BASE_SHIFT). */
  1752. sll K, 2 + BASE_SHIFT, TEMP1
  1753. add AORIG, TEMP1, AORIG
  1754. #endif
  1755. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO and BO by (K - KK) << (2 + BASE_SHIFT). */
  1756. sub K, KK, TEMP1
  1757. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  1758. add AO, TEMP1, AO
  1759. add BO, TEMP1, BO
  1760. #endif
  1761. #ifdef LT
  1762. add KK, 4, KK
  1763. #endif
  1764. #ifdef LN
  1765. sub KK, 4, KK
  1766. #endif
  /* Next tile: decrement I and loop to .LL21 while I > 0. The sra between */
  /* cmp and bg is not a condition-code-setting form, and the FMOV in the */
  /* delay slot clears c01 on both the taken and fall-through paths. */
  1767. add I, -1, I
  1768. cmp I, 0
  1769. sra K, 2, L
  1770. bg,pt %icc, .LL21
  1771. FMOV FZERO, c01
  /* .LL99: end of one pass of the loop counted by J. Update B and KK for */
  /* the active variant, then loop back to .LL11 while J > 0. */
  1772. .LL99:
  1773. #ifdef LN
  /* LN: B += K << (2 + BASE_SHIFT). */
  1774. sll K, 2 + BASE_SHIFT, TEMP1
  1775. add B, TEMP1, B
  1776. #endif
  1777. #if defined(LT) || defined(RN)
  /* LT/RN: B takes over the current value of BO. */
  1778. mov BO, B
  1779. #endif
  1780. #ifdef RN
  1781. add KK, 4, KK
  1782. #endif
  1783. #ifdef RT
  1784. sub KK, 4, KK
  1785. #endif
  1786. add J, -1, J
  1787. cmp J, 0
  1788. bg,pt %icc, .LL11
  1789. nop
  /* .LL100: handle the two-wide remainder of N (bit 1 of N). Sets up C1/C2, */
  /* KK and the A pointer for this panel, then starts the M & 1 tile (one */
  /* value per C pointer); that tile is skipped to .LL150 when bit 0 of M is */
  /* clear. */
  1790. .LL100: /* n & 2 */
  1791. and N, 2, J
  1792. cmp J, 0
  /* J is either 0 or 2 here, so this skips to .LL200 when bit 1 of N is 0. */
  1793. ble,pn %icc, .LL200
  1794. nop
  1795. #ifdef RT
  /* RT: B -= K << (1 + BASE_SHIFT) and C -= LDC << 1. */
  1796. sll K, 1 + BASE_SHIFT, TEMP1
  1797. sub B, TEMP1, B
  1798. sll LDC, 1, TEMP1
  1799. sub C, TEMP1, C
  1800. #endif
  /* C1 = C and C2 = C + LDC. */
  1801. mov C, C1
  1802. add C, LDC, C2
  1803. #ifdef LN
  1804. add M, OFFSET, KK
  1805. #endif
  1806. #ifdef LT
  1807. mov OFFSET, KK
  1808. #endif
  1809. #if defined(LN) || defined(RT)
  1810. mov A, AORIG
  1811. #else
  1812. mov A, AO
  1813. #endif
  1814. #ifndef RT
  /* All variants except RT: C = C2 + LDC. */
  1815. add C2, LDC, C
  1816. #endif
  /* M & 1 tile: skip to .LL150 when bit 0 of M is clear. */
  1817. and M, 1, I
  1818. cmp I, 0
  1819. ble,pn %icc, .LL150
  1820. nop
  1821. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK >> 2 passes of the unrolled loop, reading B from its start. */
  1822. sra KK, 2, L
  1823. mov B, BO
  1824. cmp L, 0
  1825. #else
  1826. #ifdef LN
  /* LN: AORIG -= K << BASE_SHIFT. */
  1827. sll K, 0 + BASE_SHIFT, TEMP1
  1828. sub AORIG, TEMP1, AORIG
  1829. #endif
  /* AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (1 + BASE_SHIFT)). */
  /* TEMP1 = K - KK is kept for the leftover count taken at .LL175; */
  /* L = TEMP1 >> 2. */
  1830. sll KK, 0 + BASE_SHIFT, TEMP1
  1831. sll KK, 1 + BASE_SHIFT, TEMP2
  1832. add AORIG, TEMP1, AO
  1833. add B, TEMP2, BO
  1834. sub K, KK, TEMP1
  1835. sra TEMP1, 2, L
  1836. cmp L, 0
  1837. #endif
  /* Preload a1..a4 / b1..b4 from AO[0..3] / BO[0..3] and clear the four */
  /* accumulators and four temporaries. The %icc value from "cmp L, 0" above */
  /* is consumed by the ble below; the lines in between are not expected to */
  /* modify it. */
  1838. LDF [AO + 0 * SIZE], a1
  1839. FMOV FZERO, c01
  1840. LDF [BO + 0 * SIZE], b1
  1841. FMOV FZERO, t1
  1842. LDF [AO + 1 * SIZE], a2
  1843. FMOV FZERO, c02
  1844. LDF [BO + 1 * SIZE], b2
  1845. FMOV FZERO, t2
  1846. LDF [AO + 2 * SIZE], a3
  1847. FMOV FZERO, c03
  1848. LDF [BO + 2 * SIZE], b3
  1849. FMOV FZERO, t3
  1850. LDF [AO + 3 * SIZE], a4
  1851. FMOV FZERO, c04
  1852. LDF [BO + 3 * SIZE], b4
  1853. FMOV FZERO, t4
  /* L <= 0: skip the unrolled loop and go to the leftover setup at .LL175. */
  1854. ble,pn %icc, .LL175
  1855. nop
  /* .LL172: unrolled k loop for the M & 1 tile of the two-wide panel. */
  /* One pass consumes four AO elements (a1..a4) and eight BO elements. */
  /* Products with a1 and a3 accumulate into c01/c02 and products with a2 and */
  /* a4 into c03/c04; the two pairs are summed together at .LL179. */
  /* Each FADD retires the product its t register received from an earlier */
  /* FMUL (pipelined via t1..t4). */
  1856. .LL172:
  1857. FADD c01, t1, c01
  /* AO is advanced early; the a1..a4 reloads below use the new base. */
  1858. add AO, 4 * SIZE, AO
  1859. FMUL a1, b1, t1
  1860. LDF [BO + 4 * SIZE], b1
  1861. FADD c02, t2, c02
  1862. FMUL a1, b2, t2
  1863. LDF [BO + 5 * SIZE], b2
  1864. add L, -1, L
  1865. LDF [AO + 0 * SIZE], a1
  1866. FADD c03, t3, c03
  /* Sets %icc for the bg at the bottom of the loop. */
  1867. cmp L, 0
  1868. FMUL a2, b3, t3
  1869. LDF [BO + 6 * SIZE], b3
  1870. FADD c04, t4, c04
  1871. FMUL a2, b4, t4
  1872. LDF [BO + 7 * SIZE], b4
  1873. LDF [AO + 1 * SIZE], a2
  1874. FADD c01, t1, c01
  1875. FMUL a3, b1, t1
  1876. LDF [BO + 8 * SIZE], b1
  1877. FADD c02, t2, c02
  1878. FMUL a3, b2, t2
  1879. LDF [BO + 9 * SIZE], b2
  1880. LDF [AO + 2 * SIZE], a3
  1881. FADD c03, t3, c03
  1882. FMUL a4, b3, t3
  1883. LDF [BO + 10 * SIZE], b3
  1884. FADD c04, t4, c04
  1885. FMUL a4, b4, t4
  1886. LDF [BO + 11 * SIZE], b4
  /* BO is advanced only after all of this pass's BO-relative loads. */
  1887. add BO, 8 * SIZE, BO
  /* Repeat while L > 0; the a4 reload executes in the delay slot. */
  1888. bg,pt %icc, .LL172
  1889. LDF [AO + 3 * SIZE], a4
  /* .LL175: leftover count for the M & 1 tile: L = (k count) & 3. */
  1890. .LL175:
  1891. #if defined(LT) || defined(RN)
  1892. and KK, 3, L
  1893. #else
  /* TEMP1 was set to K - KK in the tile preamble for this configuration. */
  1894. and TEMP1, 3, L
  1895. #endif
  1896. cmp L, 0
  /* Nothing left over: go to .LL179. The annul bit cancels the (nop) delay */
  /* slot when the branch is not taken. */
  1897. ble,a,pn %icc, .LL179
  1898. nop
  /* .LL176: leftover k loop for the M & 1 tile, one k step per pass. */
  /* Accumulates a1*b1 into c01 and a1*b2 into c02 (via t1/t2), advancing AO */
  /* by one element and BO by two. */
  1899. .LL176:
  1900. FADD c01, t1, c01
  1901. add L, -1, L
  1902. FMUL a1, b1, t1
  1903. add AO, 1 * SIZE, AO
  /* The next step's b1/b2 are read at +2/+3 because BO is bumped afterwards. */
  1904. LDF [BO + 2 * SIZE], b1
  1905. FADD c02, t2, c02
  1906. cmp L, 0
  1907. FMUL a1, b2, t2
  1908. LDF [BO + 3 * SIZE], b2
  1909. add BO, 2 * SIZE, BO
  /* Repeat while L > 0; the a1 reload executes in the delay slot. */
  1910. bg,pt %icc, .LL176
  1911. LDF [AO + 0 * SIZE], a1
  /* .LL179: finish the M & 1 tile of the two-wide panel: drain the pipeline, */
  /* merge the partial sums, subtract from the packed values, solve, store, */
  /* and advance the pointers. */
  1912. .LL179:
  /* Retire pending products, then fold c03 into c01 and c04 into c02 (the */
  /* unrolled loop accumulated alternate k steps in separate registers). */
  1913. FADD c01, t1, c01
  1914. FADD c02, t2, c02
  1915. FADD c03, t3, c03
  1916. FADD c04, t4, c04
  1917. FADD c01, c03, c01
  1918. FADD c02, c04, c02
  1919. #if defined(LN) || defined(RT)
  /* Rewind: TEMP1 = KK - 1 for LN, KK - 2 for RT; then */
  /* AO = AORIG + (TEMP1 << BASE_SHIFT), BO = B + (TEMP1 << (1 + BASE_SHIFT)). */
  1920. #ifdef LN
  1921. sub KK, 1, TEMP1
  1922. #else
  1923. sub KK, 2, TEMP1
  1924. #endif
  1925. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  1926. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1927. add AORIG, TEMP2, AO
  1928. add B, TEMP1, BO
  1929. #endif
  /* c = (packed value) - c; the packed values come from BO for LN/LT and */
  /* from AO otherwise. */
  1930. #if defined(LN) || defined(LT)
  1931. LDF [BO + 0 * SIZE], a1
  1932. LDF [BO + 1 * SIZE], a2
  1933. FSUB a1, c01, c01
  1934. FSUB a2, c02, c02
  1935. #else
  1936. LDF [AO + 0 * SIZE], a1
  1937. LDF [AO + 1 * SIZE], a2
  1938. FSUB a1, c01, c01
  1939. FSUB a2, c02, c02
  1940. #endif
  /* LN and LT: a single coefficient AO[0] scales both values. */
  /* NOTE(review): applied by multiplication, so presumably stored as a */
  /* reciprocal -- confirm. */
  1941. #ifdef LN
  1942. LDF [AO + 0 * SIZE], a1
  1943. FMUL a1, c01, c01
  1944. FMUL a1, c02, c02
  1945. #endif
  1946. #ifdef LT
  1947. LDF [AO + 0 * SIZE], a1
  1948. FMUL a1, c01, c01
  1949. FMUL a1, c02, c02
  1950. #endif
  /* RN: c01 *= BO[0]; c02 -= BO[1] * c01; c02 *= BO[3]. */
  1951. #ifdef RN
  1952. LDF [BO + 0 * SIZE], a1
  1953. LDF [BO + 1 * SIZE], a2
  1954. LDF [BO + 3 * SIZE], a3
  1955. FMUL a1, c01, c01
  1956. FMUL a2, c01, t1
  1957. FSUB c02, t1, c02
  1958. FMUL a3, c02, c02
  1959. #endif
  /* RT: c02 *= BO[3]; c01 -= BO[2] * c02; c01 *= BO[0]. */
  1960. #ifdef RT
  1961. LDF [BO + 3 * SIZE], a1
  1962. LDF [BO + 2 * SIZE], a2
  1963. LDF [BO + 0 * SIZE], a3
  1964. FMUL a1, c02, c02
  1965. FMUL a2, c02, t1
  1966. FSUB c01, t1, c01
  1967. FMUL a3, c01, c01
  1968. #endif
  1969. #ifdef LN
  /* LN moves the C pointers back by one element before storing. */
  1970. add C1, -1 * SIZE, C1
  1971. add C2, -1 * SIZE, C2
  1972. #endif
  /* Write the results back to the packed buffer they were read from ... */
  1973. #if defined(LN) || defined(LT)
  1974. STF c01, [BO + 0 * SIZE]
  1975. STF c02, [BO + 1 * SIZE]
  1976. #else
  1977. STF c01, [AO + 0 * SIZE]
  1978. STF c02, [AO + 1 * SIZE]
  1979. #endif
  /* ... and to C: one element through each of C1 and C2. */
  1980. STF c01, [C1 + 0 * SIZE]
  1981. STF c02, [C2 + 0 * SIZE]
  /* Clear the pipeline temporaries for the next tile. */
  1982. FMOV FZERO, t1
  1983. FMOV FZERO, t2
  1984. FMOV FZERO, t3
  1985. FMOV FZERO, t4
  1986. #ifndef LN
  1987. add C1, 1 * SIZE, C1
  1988. add C2, 1 * SIZE, C2
  1989. #endif
  1990. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT. */
  1991. sll K, 0 + BASE_SHIFT, TEMP1
  1992. add AORIG, TEMP1, AORIG
  1993. #endif
  1994. #if defined(LT) || defined(RN)
  /* LT/RN: AO += (K - KK) << BASE_SHIFT, BO += (K - KK) << (1 + BASE_SHIFT). */
  1995. sub K, KK, TEMP1
  1996. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  1997. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1998. add AO, TEMP2, AO
  1999. add BO, TEMP1, BO
  2000. #endif
  2001. #ifdef LT
  2002. add KK, 1, KK
  2003. #endif
  2004. #ifdef LN
  2005. sub KK, 1, KK
  2006. #endif
  /* .LL150: M & 2 tile of the two-wide panel (two values per C pointer). */
  /* Skips to .LL170 when bit 1 of M is clear; otherwise sets up AO/BO and */
  /* the loop count, preloads the operands and clears the accumulators. */
  2007. .LL150:
  2008. and M, 2, I
  2009. cmp I, 0
  2010. ble,pn %icc, .LL170
  2011. nop
  2012. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK >> 2 passes of the unrolled loop, reading B from its start. */
  2013. sra KK, 2, L
  2014. mov B, BO
  2015. cmp L, 0
  2016. #else
  2017. #ifdef LN
  /* LN: AORIG -= K << (1 + BASE_SHIFT). */
  2018. sll K, 1 + BASE_SHIFT, TEMP1
  2019. sub AORIG, TEMP1, AORIG
  2020. #endif
  /* AO = AORIG + (KK << (1 + BASE_SHIFT)), BO = B + the same offset. */
  /* TEMP1 = K - KK is kept for the leftover count at .LL155; L = TEMP1 >> 2. */
  2021. sll KK, 1 + BASE_SHIFT, TEMP1
  2022. sll KK, 1 + BASE_SHIFT, TEMP2
  2023. add AORIG, TEMP1, AO
  2024. add B, TEMP2, BO
  2025. sub K, KK, TEMP1
  2026. sra TEMP1, 2, L
  2027. cmp L, 0
  2028. #endif
  /* Preload a1..a4 / b1..b4 and clear c01..c04 and t1..t4. */
  2029. LDF [AO + 0 * SIZE], a1
  2030. FMOV FZERO, c01
  2031. LDF [BO + 0 * SIZE], b1
  2032. FMOV FZERO, t1
  2033. LDF [AO + 1 * SIZE], a2
  /* This compare repeats the "cmp L, 0" that ends both preprocessor */
  /* branches above; it yields the same condition codes. */
  2034. cmp L, 0
  2035. FMOV FZERO, c02
  2036. LDF [BO + 1 * SIZE], b2
  2037. FMOV FZERO, t2
  2038. LDF [AO + 2 * SIZE], a3
  2039. FMOV FZERO, c03
  2040. LDF [BO + 2 * SIZE], b3
  2041. FMOV FZERO, t3
  2042. LDF [AO + 3 * SIZE], a4
  2043. FMOV FZERO, c04
  2044. LDF [BO + 3 * SIZE], b4
  2045. FMOV FZERO, t4
  /* L <= 0: skip the unrolled loop and go to the leftover setup at .LL155. */
  2046. ble,pn %icc, .LL155
  2047. nop
  /* .LL152: unrolled k loop for the 2x2 tile; one pass covers four k steps */
  /* and consumes eight AO and eight BO elements. */
  /* For each k step the four products are (first A value)*(first B value) */
  /* into c01, (first A)*(second B) into c02, (second A)*(first B) into c03 */
  /* and (second A)*(second B) into c04. Steps alternate between the */
  /* a1/a2/b1/b2 and a3/a4/b3/b4 register sets. Each FADD retires the product */
  /* its t register received four FMULs earlier. */
  2048. .LL152:
  2049. FADD c01, t1, c01
  2050. add L, -1, L
  2051. FMUL a1, b1, t1
  2052. prefetch [AO + APREFETCHSIZE * SIZE], 0
  2053. FADD c02, t2, c02
  /* BO is advanced early, so the reloads of b1..b4 for the middle steps use */
  /* negative offsets from the new BO. */
  2054. add BO, 8 * SIZE, BO
  2055. FMUL a1, b2, t2
  2056. LDF [AO + 4 * SIZE], a1
  2057. FADD c03, t3, c03
  /* Sets %icc for the bg at the bottom of the loop. */
  2058. cmp L, 0
  2059. FMUL a2, b1, t3
  2060. LDF [BO - 4 * SIZE], b1
  2061. FADD c04, t4, c04
  2062. nop
  2063. FMUL a2, b2, t4
  2064. LDF [AO + 5 * SIZE], a2
  2065. FADD c01, t1, c01
  2066. nop
  2067. FMUL a3, b3, t1
  2068. LDF [BO - 3 * SIZE], b2
  2069. FADD c02, t2, c02
  2070. nop
  2071. FMUL a3, b4, t2
  2072. LDF [AO + 6 * SIZE], a3
  2073. FADD c03, t3, c03
  2074. nop
  2075. FMUL a4, b3, t3
  2076. LDF [BO - 2 * SIZE], b3
  2077. FADD c04, t4, c04
  2078. nop
  2079. FMUL a4, b4, t4
  2080. LDF [AO + 7 * SIZE], a4
  2081. FADD c01, t1, c01
  2082. nop
  2083. FMUL a1, b1, t1
  2084. LDF [BO - 1 * SIZE], b4
  2085. FADD c02, t2, c02
  2086. FMUL a1, b2, t2
  /* Offsets 8..11 from the old AO, and 0..3 from the new BO, preload the */
  /* operands for the next pass. */
  2087. LDF [AO + 8 * SIZE], a1
  2088. FADD c03, t3, c03
  2089. FMUL a2, b1, t3
  2090. LDF [BO + 0 * SIZE], b1
  2091. FADD c04, t4, c04
  2092. FMUL a2, b2, t4
  2093. LDF [AO + 9 * SIZE], a2
  2094. FADD c01, t1, c01
  2095. FMUL a3, b3, t1
  2096. LDF [BO + 1 * SIZE], b2
  2097. FADD c02, t2, c02
  2098. FMUL a3, b4, t2
  2099. LDF [AO + 10 * SIZE], a3
  2100. FADD c03, t3, c03
  2101. FMUL a4, b3, t3
  2102. LDF [BO + 2 * SIZE], b3
  2103. FADD c04, t4, c04
  2104. FMUL a4, b4, t4
  2105. LDF [AO + 11 * SIZE], a4
  2106. add AO, 8 * SIZE, AO
  /* Repeat while L > 0; the b4 reload executes in the delay slot. */
  2107. bg,pt %icc, .LL152
  2108. LDF [BO + 3 * SIZE], b4
  /* .LL155: leftover count for the 2x2 tile: L = (k count) & 3. */
  2109. .LL155:
  2110. #if defined(LT) || defined(RN)
  2111. and KK, 3, L
  2112. #else
  /* TEMP1 was set to K - KK in the .LL150 preamble for this configuration. */
  2113. and TEMP1, 3, L
  2114. #endif
  2115. cmp L, 0
  /* Nothing left over: go to .LL159. The annul bit cancels the (nop) delay */
  /* slot when the branch is not taken. */
  2116. ble,a,pn %icc, .LL159
  2117. nop
  /* .LL156: leftover k loop for the 2x2 tile, one k step per pass. */
  /* Unlike the unrolled loop, the operands are reloaded at the top of every */
  /* pass. The FADDs first retire the products pending in t1..t4, then the */
  /* four new products are computed; AO and BO each advance two elements. */
  2118. .LL156:
  2119. LDF [AO + 0 * SIZE], a1
  2120. LDF [AO + 1 * SIZE], a2
  2121. LDF [BO + 0 * SIZE], b1
  2122. LDF [BO + 1 * SIZE], b2
  2123. FADD c01, t1, c01
  2124. FADD c02, t2, c02
  2125. FADD c03, t3, c03
  2126. FADD c04, t4, c04
  2127. FMUL a1, b1, t1
  2128. FMUL a1, b2, t2
  2129. FMUL a2, b1, t3
  2130. FMUL a2, b2, t4
  2131. add AO, 2 * SIZE, AO
  2132. add BO, 2 * SIZE, BO
  2133. add L, -1, L
  2134. cmp L, 0
  /* Repeat while L > 0. */
  2135. bg,pt %icc, .LL156
  2136. nop
  /* .LL159: finish the 2x2 tile: drain the pipeline, subtract from the */
  /* packed values, solve with the 2x2 coefficient block, store, and advance. */
  /* Register roles as used by the stores below: c01 -> C1[0], c03 -> C1[1], */
  /* c02 -> C2[0], c04 -> C2[1]. */
  2137. .LL159:
  2138. FADD c01, t1, c01
  2139. FADD c02, t2, c02
  2140. FADD c03, t3, c03
  2141. FADD c04, t4, c04
  2142. #if defined(LN) || defined(RT)
  /* Rewind: TEMP1 = KK - 2 in both variants (the two branches are */
  /* identical); then AO = AORIG + (TEMP1 << (1 + BASE_SHIFT)) and */
  /* BO = B + the same offset. */
  2143. #ifdef LN
  2144. sub KK, 2, TEMP1
  2145. #else
  2146. sub KK, 2, TEMP1
  2147. #endif
  2148. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  2149. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2150. add AORIG, TEMP2, AO
  2151. add B, TEMP1, BO
  2152. #endif
  2153. #if defined(LN) || defined(LT)
  /* LN/LT: BO[0..3] pair with c01, c02, c03, c04. */
  2154. LDF [BO + 0 * SIZE], a1
  2155. LDF [BO + 1 * SIZE], a2
  2156. LDF [BO + 2 * SIZE], a3
  2157. LDF [BO + 3 * SIZE], a4
  2158. FSUB a1, c01, c01
  2159. FSUB a2, c02, c02
  2160. FSUB a3, c03, c03
  2161. FSUB a4, c04, c04
  2162. #else
  /* RN/RT: AO[0..3] pair with c01, c03, c02, c04 (note the swapped middle). */
  2163. LDF [AO + 0 * SIZE], a1
  2164. LDF [AO + 1 * SIZE], a2
  2165. LDF [AO + 2 * SIZE], a3
  2166. LDF [AO + 3 * SIZE], a4
  2167. FSUB a1, c01, c01
  2168. FSUB a2, c03, c03
  2169. FSUB a3, c02, c02
  2170. FSUB a4, c04, c04
  2171. #endif
  /* In all four solves the "diagonal" coefficients are applied by */
  /* multiplication. */
  /* NOTE(review): presumably they are stored as reciprocals -- confirm. */
  2172. #ifdef LN
  /* LN: (c03,c04) *= AO[3]; (c01,c02) -= AO[2]*(c03,c04); (c01,c02) *= AO[0]. */
  2173. LDF [AO + 3 * SIZE], a1
  2174. LDF [AO + 2 * SIZE], a2
  2175. LDF [AO + 0 * SIZE], a3
  2176. FMUL a1, c03, c03
  2177. FMUL a1, c04, c04
  2178. FMUL a2, c03, t1
  2179. FMUL a2, c04, t2
  2180. FSUB c01, t1, c01
  2181. FSUB c02, t2, c02
  2182. FMUL a3, c01, c01
  2183. FMUL a3, c02, c02
  2184. #endif
  2185. #ifdef LT
  /* LT: (c01,c02) *= AO[0]; (c03,c04) -= AO[1]*(c01,c02); (c03,c04) *= AO[3]. */
  2186. LDF [AO + 0 * SIZE], a1
  2187. LDF [AO + 1 * SIZE], a2
  2188. LDF [AO + 3 * SIZE], a3
  2189. FMUL a1, c01, c01
  2190. FMUL a1, c02, c02
  2191. FMUL a2, c01, t1
  2192. FMUL a2, c02, t2
  2193. FSUB c03, t1, c03
  2194. FSUB c04, t2, c04
  2195. FMUL a3, c03, c03
  2196. FMUL a3, c04, c04
  2197. #endif
  2198. #ifdef RN
  /* RN: (c01,c03) *= BO[0]; (c02,c04) -= BO[1]*(c01,c03); (c02,c04) *= BO[3]. */
  2199. LDF [BO + 0 * SIZE], a1
  2200. LDF [BO + 1 * SIZE], a2
  2201. LDF [BO + 3 * SIZE], a3
  2202. FMUL a1, c01, c01
  2203. FMUL a1, c03, c03
  2204. FMUL a2, c01, t1
  2205. FMUL a2, c03, t2
  2206. FSUB c02, t1, c02
  2207. FSUB c04, t2, c04
  2208. FMUL a3, c02, c02
  2209. FMUL a3, c04, c04
  2210. #endif
  2211. #ifdef RT
  /* RT: (c02,c04) *= BO[3]; (c01,c03) -= BO[2]*(c02,c04); (c01,c03) *= BO[0]. */
  2212. LDF [BO + 3 * SIZE], a1
  2213. LDF [BO + 2 * SIZE], a2
  2214. LDF [BO + 0 * SIZE], a3
  2215. FMUL a1, c02, c02
  2216. FMUL a1, c04, c04
  2217. FMUL a2, c02, t1
  2218. FMUL a2, c04, t2
  2219. FSUB c01, t1, c01
  2220. FSUB c03, t2, c03
  2221. FMUL a3, c01, c01
  2222. FMUL a3, c03, c03
  2223. #endif
  2224. #ifdef LN
  /* LN moves the C pointers back by two elements before storing. */
  2225. add C1, -2 * SIZE, C1
  2226. add C2, -2 * SIZE, C2
  2227. #endif
  /* Write the results back to the packed buffer in the order they were read. */
  2228. #if defined(LN) || defined(LT)
  2229. STF c01, [BO + 0 * SIZE]
  2230. STF c02, [BO + 1 * SIZE]
  2231. STF c03, [BO + 2 * SIZE]
  2232. STF c04, [BO + 3 * SIZE]
  2233. #else
  2234. STF c01, [AO + 0 * SIZE]
  2235. STF c03, [AO + 1 * SIZE]
  2236. STF c02, [AO + 2 * SIZE]
  2237. STF c04, [AO + 3 * SIZE]
  2238. #endif
  /* Store to C: two consecutive elements through each of C1 and C2. */
  2239. STF c01, [C1 + 0 * SIZE]
  2240. STF c03, [C1 + 1 * SIZE]
  2241. STF c02, [C2 + 0 * SIZE]
  2242. STF c04, [C2 + 1 * SIZE]
  /* Clear the pipeline temporaries for the next tile. */
  2243. FMOV FZERO, t1
  2244. FMOV FZERO, t2
  2245. FMOV FZERO, t3
  2246. FMOV FZERO, t4
  2247. #ifndef LN
  2248. add C1, 2 * SIZE, C1
  2249. add C2, 2 * SIZE, C2
  2250. #endif
  2251. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  2252. sll K, 1 + BASE_SHIFT, TEMP1
  2253. add AORIG, TEMP1, AORIG
  2254. #endif
  2255. #if defined(LT) || defined(RN)
  /* LT/RN: AO and BO both advance by (K - KK) << (1 + BASE_SHIFT). */
  2256. sub K, KK, TEMP1
  2257. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  2258. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2259. add AO, TEMP2, AO
  2260. add BO, TEMP1, BO
  2261. #endif
  2262. #ifdef LT
  2263. add KK, 2, KK
  2264. #endif
  2265. #ifdef LN
  2266. sub KK, 2, KK
  2267. #endif
  /* .LL170: number of four-element tiles in the two-wide panel: I = M >> 2. */
  /* If there are none, branch to .LL199. The delay-slot FMOV clears c03 and, */
  /* with no annul bit, executes on both the taken and fall-through paths. */
  2268. .LL170:
  2269. sra M, 2, I
  2270. cmp I, 0
  2271. ble,pn %icc, .LL199
  2272. FMOV FZERO, c03
  /* .LL121: preamble for one 4x2 tile of the two-wide panel: set AO/BO and */
  /* the loop count, preload operands, clear accumulators and prefetch C. */
  /* c03 is not cleared here; on first entry it was cleared in the delay slot */
  /* just before this label. */
  /* NOTE(review): on loop-back it is presumably cleared by the back-branch */
  /* delay slot, which is outside this block -- confirm. */
  2273. .LL121:
  2274. #if defined(LT) || defined(RN)
  /* LT/RN: L = KK >> 2 passes of the unrolled loop, reading B from its start. */
  2275. sra KK, 2, L
  2276. mov B, BO
  2277. cmp L, 0
  2278. #else
  2279. #ifdef LN
  /* LN: AORIG -= K << (2 + BASE_SHIFT). */
  2280. sll K, 2 + BASE_SHIFT, TEMP1
  2281. sub AORIG, TEMP1, AORIG
  2282. #endif
  /* AO = AORIG + (KK << (2 + BASE_SHIFT)), BO = B + (KK << (1 + BASE_SHIFT)). */
  /* TEMP1 = K - KK is kept for the leftover count at .LL125; L = TEMP1 >> 2. */
  2283. sll KK, 2 + BASE_SHIFT, TEMP1
  2284. sll KK, 1 + BASE_SHIFT, TEMP2
  2285. add AORIG, TEMP1, AO
  2286. add B, TEMP2, BO
  2287. sub K, KK, TEMP1
  2288. sra TEMP1, 2, L
  2289. cmp L, 0
  2290. #endif
  /* Preload a1..a4 / b1..b4; clear t1..t4 and the accumulators. The %icc */
  /* value from "cmp L, 0" is consumed by the ble at the end of the block. */
  2291. LDF [AO + 0 * SIZE], a1
  2292. FMOV FZERO, t1
  2293. LDF [BO + 0 * SIZE], b1
  2294. FMOV FZERO, c07
  2295. LDF [AO + 1 * SIZE], a2
  2296. FMOV FZERO, t2
  2297. LDF [BO + 1 * SIZE], b2
  2298. FMOV FZERO, c04
  2299. LDF [AO + 2 * SIZE], a3
  2300. FMOV FZERO, t3
  2301. LDF [BO + 2 * SIZE], b3
  2302. FMOV FZERO, c08
  2303. LDF [AO + 3 * SIZE], a4
  2304. FMOV FZERO, t4
  2305. LDF [BO + 3 * SIZE], b4
  2306. FMOV FZERO, c01
  /* Prefetch the C locations about to be written: at -3*SIZE for LN and at */
  /* +3*SIZE for the other variants. */
  2307. #ifdef LN
  2308. prefetch [C1 - 3 * SIZE], 2
  2309. FMOV FZERO, c05
  2310. prefetch [C2 - 3 * SIZE], 2
  2311. FMOV FZERO, c02
  2312. #else
  2313. prefetch [C1 + 3 * SIZE], 2
  2314. FMOV FZERO, c05
  2315. prefetch [C2 + 3 * SIZE], 2
  2316. FMOV FZERO, c02
  2317. #endif
  /* L <= 0: skip the unrolled loop. c06 is cleared in the delay slot on both */
  /* paths (no annul bit). */
  2318. ble,pn %icc, .LL125
  2319. FMOV FZERO, c06
  /* .LL122: unrolled k loop for the 4x2 tile; one pass covers four k steps, */
  /* consuming sixteen AO elements and eight BO elements. */
  /* Per k step the eight products are a1..a4 times the step's two B values: */
  /* first B value -> c01, c02, c03, c04; second B value -> c05, c06, c07, */
  /* c08 (for a1, a2, a3, a4 respectively). Steps alternate between b1/b2 and */
  /* b3/b4. The loop is pipelined through t1..t4: each FADD retires the */
  /* product written four FMULs earlier, so the leading FADDs into c03, c07, */
  /* c04 and c08 consume products left pending by the previous pass. */
  2320. .LL122:
  2321. FADD c03, t1, c03
  2322. add L, -1, L
  2323. FMUL a1, b1, t1
  2324. prefetch [AO + APREFETCHSIZE * SIZE], 0
  2325. FADD c07, t2, c07
  /* BO and AO are advanced early; later reloads use offsets relative to the */
  /* new bases (negative offsets refer to this pass's data). */
  2326. add BO, 8 * SIZE, BO
  2327. FMUL a1, b2, t2
  2328. LDF [AO + 4 * SIZE], a1
  2329. FADD c04, t3, c04
  2330. add AO, 16 * SIZE, AO
  2331. FMUL a2, b1, t3
  /* Sets %icc for the bg at the bottom of the loop. */
  2332. cmp L, 0
  2333. FADD c08, t4, c08
  2334. nop
  2335. FMUL a2, b2, t4
  2336. LDF [AO - 11 * SIZE], a2
  2337. FADD c01, t1, c01
  2338. nop
  2339. FMUL a3, b1, t1
  2340. nop
  2341. FADD c05, t2, c05
  2342. nop
  2343. FMUL a3, b2, t2
  2344. LDF [AO - 10 * SIZE], a3
  2345. FADD c02, t3, c02
  2346. nop
  2347. FMUL a4, b1, t3
  2348. LDF [BO - 4 * SIZE], b1
  2349. FADD c06, t4, c06
  2350. nop
  2351. FMUL a4, b2, t4
  2352. LDF [BO - 3 * SIZE], b2
  /* Second k step (b3/b4). */
  2353. FADD c03, t1, c03
  2354. nop
  2355. FMUL a1, b3, t1
  2356. LDF [AO - 9 * SIZE], a4
  2357. FADD c07, t2, c07
  2358. nop
  2359. FMUL a1, b4, t2
  2360. LDF [AO - 8 * SIZE], a1
  2361. FADD c04, t3, c04
  2362. nop
  2363. FMUL a2, b3, t3
  2364. nop
  2365. FADD c08, t4, c08
  2366. nop
  2367. FMUL a2, b4, t4
  2368. LDF [AO - 7 * SIZE], a2
  2369. FADD c01, t1, c01
  2370. nop
  2371. FMUL a3, b3, t1
  2372. nop
  2373. FADD c05, t2, c05
  2374. nop
  2375. FMUL a3, b4, t2
  2376. LDF [AO - 6 * SIZE], a3
  2377. FADD c02, t3, c02
  2378. nop
  2379. FMUL a4, b3, t3
  2380. LDF [BO - 2 * SIZE], b3
  2381. FADD c06, t4, c06
  2382. nop
  2383. FMUL a4, b4, t4
  2384. LDF [BO - 1 * SIZE], b4
  /* Third k step (b1/b2 reloaded from BO - 4 and BO - 3). */
  2385. FADD c03, t1, c03
  2386. nop
  2387. FMUL a1, b1, t1
  2388. LDF [AO - 5 * SIZE], a4
  2389. FADD c07, t2, c07
  2390. nop
  2391. FMUL a1, b2, t2
  2392. LDF [AO - 4 * SIZE], a1
  2393. FADD c04, t3, c04
  2394. nop
  2395. FMUL a2, b1, t3
  2396. nop
  2397. FADD c08, t4, c08
  2398. nop
  2399. FMUL a2, b2, t4
  2400. LDF [AO - 3 * SIZE], a2
  2401. FADD c01, t1, c01
  2402. nop
  2403. FMUL a3, b1, t1
  2404. nop
  2405. FADD c05, t2, c05
  2406. nop
  2407. FMUL a3, b2, t2
  2408. LDF [AO - 2 * SIZE], a3
  2409. FADD c02, t3, c02
  2410. nop
  2411. FMUL a4, b1, t3
  2412. LDF [BO + 0 * SIZE], b1
  2413. FADD c06, t4, c06
  2414. nop
  2415. FMUL a4, b2, t4
  2416. LDF [BO + 1 * SIZE], b2
  /* Fourth k step (b3/b4 reloaded from BO - 2 and BO - 1); the loads at */
  /* non-negative offsets preload the operands for the next pass. */
  2417. FADD c03, t1, c03
  2418. nop
  2419. FMUL a1, b3, t1
  2420. LDF [AO - 1 * SIZE], a4
  2421. FADD c07, t2, c07
  2422. nop
  2423. FMUL a1, b4, t2
  2424. LDF [AO + 0 * SIZE], a1
  2425. FADD c04, t3, c04
  2426. nop
  2427. FMUL a2, b3, t3
  2428. nop
  2429. FADD c08, t4, c08
  2430. nop
  2431. FMUL a2, b4, t4
  2432. LDF [AO + 1 * SIZE], a2
  2433. FADD c01, t1, c01
  2434. nop
  2435. FMUL a3, b3, t1
  2436. nop
  2437. FADD c05, t2, c05
  2438. nop
  2439. FMUL a3, b4, t2
  2440. LDF [AO + 2 * SIZE], a3
  2441. FADD c02, t3, c02
  2442. nop
  2443. FMUL a4, b3, t3
  2444. LDF [BO + 2 * SIZE], b3
  2445. FADD c06, t4, c06
  2446. FMUL a4, b4, t4
  2447. LDF [AO + 3 * SIZE], a4
  /* Repeat while L > 0; the b4 reload executes in the delay slot. */
  2448. bg,pt %icc, .LL122
  2449. LDF [BO + 3 * SIZE], b4
  /* .LL125: leftover count for the 4x2 tile: L = (k count) & 3. */
  2450. .LL125:
  2451. #if defined(LT) || defined(RN)
  2452. and KK, 3, L
  2453. #else
  /* TEMP1 was set to K - KK in the .LL121 preamble for this configuration. */
  2454. and TEMP1, 3, L
  2455. #endif
  2456. cmp L, 0
  /* Nothing left over: go to .LL129. The annul bit cancels the (nop) delay */
  /* slot when the branch is not taken. */
  2457. ble,a,pn %icc, .LL129
  2458. nop
  /* .LL126: leftover k loop for the 4x2 tile, one k step per pass. */
  /* Computes a1..a4 times b1 (into c01..c04) and times b2 (into c05..c08), */
  /* pipelined through t1..t4 as in the unrolled loop: the leading FADDs into */
  /* c03, c07, c04, c08 retire products pending from the previous pass. */
  /* AO advances four elements and BO two elements per pass. */
  2459. .LL126:
  2460. FADD c03, t1, c03
  2461. add AO, 4 * SIZE, AO
  2462. FMUL a1, b1, t1
  2463. add BO, 2 * SIZE, BO
  2464. FADD c07, t2, c07
  2465. add L, -1, L
  2466. FMUL a1, b2, t2
  /* Reloads below fetch the next step's operands from the advanced pointers. */
  2467. LDF [AO + 0 * SIZE], a1
  2468. FADD c04, t3, c04
  /* Sets %icc for the bg at the bottom of the loop. */
  2469. cmp L, 0
  2470. FMUL a2, b1, t3
  2471. FADD c08, t4, c08
  2472. FMUL a2, b2, t4
  2473. LDF [AO + 1 * SIZE], a2
  2474. FADD c01, t1, c01
  2475. FMUL a3, b1, t1
  2476. FADD c05, t2, c05
  2477. FMUL a3, b2, t2
  2478. LDF [AO + 2 * SIZE], a3
  2479. FADD c02, t3, c02
  2480. FMUL a4, b1, t3
  2481. LDF [BO + 0 * SIZE], b1
  2482. FADD c06, t4, c06
  2483. FMUL a4, b2, t4
  2484. LDF [BO + 1 * SIZE], b2
  /* Repeat while L > 0; the a4 reload executes in the delay slot. */
  2485. bg,pt %icc, .LL126
  2486. LDF [AO + 3 * SIZE], a4
  /* .LL129: finish one 4x2 tile.                                        */
  /*   1. drain the last pending products t1..t4 into c03/c07/c04/c08;   */
  /*   2. LN/RT only: re-point AO/BO at the block used for the solve;    */
  /*   3. replace each accumulator by (stored value - accumulator);      */
  /*   4. run the LN / LT / RN / RT substitution on the 4x2 result;      */
  /*   5. write the result back to the packed buffer and to C1/C2;       */
  /*   6. advance pointers and KK, then loop to .LL121 while I > 0.      */
  /* c01..c04 hold the C1 column, c05..c08 the C2 column (see the final  */
  /* STF sequence).                                                      */
  2487. .LL129:
  2488. FADD c03, t1, c03
  2489. FADD c07, t2, c07
  2490. FADD c04, t3, c04
  2491. FADD c08, t4, c08
  2492. #if defined(LN) || defined(RT)
  /* TEMP1 = KK - 4 (LN) or KK - 2 (RT); AO = AORIG + TEMP1 scaled by    */
  /* 4 elements, BO = B + TEMP1 scaled by 2 elements (BASE_SHIFT is      */
  /* presumably log2 of the element size -- defined outside this view).  */
  2493. #ifdef LN
  2494. sub KK, 4, TEMP1
  2495. #else
  2496. sub KK, 2, TEMP1
  2497. #endif
  2498. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  2499. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2500. add AORIG, TEMP2, AO
  2501. add B, TEMP1, BO
  2502. #endif
  2503. #if defined(LN) || defined(LT)
  /* Values come from BO interleaved by column pair:                     */
  /* c01,c05,c02,c06,c03,c07,c04,c08.                                    */
  2504. LDF [BO + 0 * SIZE], a1
  2505. LDF [BO + 1 * SIZE], a2
  2506. LDF [BO + 2 * SIZE], a3
  2507. LDF [BO + 3 * SIZE], a4
  2508. LDF [BO + 4 * SIZE], b1
  2509. LDF [BO + 5 * SIZE], b2
  2510. LDF [BO + 6 * SIZE], b3
  2511. LDF [BO + 7 * SIZE], b4
  /* cXX = loaded - cXX, assuming FSUB expands to the SPARC fsub form    */
  /* (rd = rs1 - rs2) -- the macro is defined outside this view.         */
  2512. FSUB a1, c01, c01
  2513. FSUB a2, c05, c05
  2514. FSUB a3, c02, c02
  2515. FSUB a4, c06, c06
  2516. FSUB b1, c03, c03
  2517. FSUB b2, c07, c07
  2518. FSUB b3, c04, c04
  2519. FSUB b4, c08, c08
  2520. #else
  /* RN/RT: values come from AO in plain c01..c08 order.                 */
  2521. LDF [AO + 0 * SIZE], a1
  2522. LDF [AO + 1 * SIZE], a2
  2523. LDF [AO + 2 * SIZE], a3
  2524. LDF [AO + 3 * SIZE], a4
  2525. LDF [AO + 4 * SIZE], b1
  2526. LDF [AO + 5 * SIZE], b2
  2527. LDF [AO + 6 * SIZE], b3
  2528. LDF [AO + 7 * SIZE], b4
  2529. FSUB a1, c01, c01
  2530. FSUB a2, c02, c02
  2531. FSUB a3, c03, c03
  2532. FSUB a4, c04, c04
  2533. FSUB b1, c05, c05
  2534. FSUB b2, c06, c06
  2535. FSUB b3, c07, c07
  2536. FSUB b4, c08, c08
  2537. #endif
  2538. #ifdef LN
  /* LN: backward substitution over the 4x4 block at AO, rows 4 -> 1,    */
  /* applied to both columns. The entries at offsets 15, 10, 5 and 0 are */
  /* multiplied in (not divided by), so they are presumably stored       */
  /* pre-inverted -- confirm against the packing routine.                */
  2539. LDF [AO + 15 * SIZE], a1
  2540. LDF [AO + 14 * SIZE], a2
  2541. LDF [AO + 13 * SIZE], a3
  2542. LDF [AO + 12 * SIZE], a4
  2543. FMUL a1, c04, c04
  2544. FMUL a1, c08, c08
  2545. FMUL a2, c04, t1
  2546. FMUL a2, c08, t2
  2547. FSUB c03, t1, c03
  2548. FSUB c07, t2, c07
  2549. FMUL a3, c04, t1
  2550. FMUL a3, c08, t2
  2551. FSUB c02, t1, c02
  2552. FSUB c06, t2, c06
  2553. FMUL a4, c04, t1
  2554. FMUL a4, c08, t2
  2555. FSUB c01, t1, c01
  2556. FSUB c05, t2, c05
  2557. LDF [AO + 10 * SIZE], a1
  2558. LDF [AO + 9 * SIZE], a2
  2559. LDF [AO + 8 * SIZE], a3
  2560. FMUL a1, c03, c03
  2561. FMUL a1, c07, c07
  2562. FMUL a2, c03, t1
  2563. FMUL a2, c07, t2
  2564. FSUB c02, t1, c02
  2565. FSUB c06, t2, c06
  2566. FMUL a3, c03, t1
  2567. FMUL a3, c07, t2
  2568. FSUB c01, t1, c01
  2569. FSUB c05, t2, c05
  2570. LDF [AO + 5 * SIZE], a1
  2571. LDF [AO + 4 * SIZE], a2
  2572. FMUL a1, c02, c02
  2573. FMUL a1, c06, c06
  2574. FMUL a2, c02, t1
  2575. FMUL a2, c06, t2
  2576. FSUB c01, t1, c01
  2577. FSUB c05, t2, c05
  2578. LDF [AO + 0 * SIZE], a1
  2579. FMUL a1, c01, c01
  2580. FMUL a1, c05, c05
  2581. #endif
  2582. #ifdef LT
  /* LT: forward substitution over the same 4x4 block, rows 1 -> 4,      */
  /* using offsets 0..3, 5..7, 10..11 and 15.                            */
  2583. LDF [AO + 0 * SIZE], a1
  2584. LDF [AO + 1 * SIZE], a2
  2585. LDF [AO + 2 * SIZE], a3
  2586. LDF [AO + 3 * SIZE], a4
  2587. FMUL a1, c01, c01
  2588. FMUL a1, c05, c05
  2589. FMUL a2, c01, t1
  2590. FMUL a2, c05, t2
  2591. FSUB c02, t1, c02
  2592. FSUB c06, t2, c06
  2593. FMUL a3, c01, t1
  2594. FMUL a3, c05, t2
  2595. FSUB c03, t1, c03
  2596. FSUB c07, t2, c07
  2597. FMUL a4, c01, t1
  2598. FMUL a4, c05, t2
  2599. FSUB c04, t1, c04
  2600. FSUB c08, t2, c08
  2601. LDF [AO + 5 * SIZE], a1
  2602. LDF [AO + 6 * SIZE], a2
  2603. LDF [AO + 7 * SIZE], a3
  2604. FMUL a1, c02, c02
  2605. FMUL a1, c06, c06
  2606. FMUL a2, c02, t1
  2607. FMUL a2, c06, t2
  2608. FSUB c03, t1, c03
  2609. FSUB c07, t2, c07
  2610. FMUL a3, c02, t1
  2611. FMUL a3, c06, t2
  2612. FSUB c04, t1, c04
  2613. FSUB c08, t2, c08
  2614. LDF [AO + 10 * SIZE], a1
  2615. LDF [AO + 11 * SIZE], a2
  2616. FMUL a1, c03, c03
  2617. FMUL a1, c07, c07
  2618. FMUL a2, c03, t1
  2619. FMUL a2, c07, t2
  2620. FSUB c04, t1, c04
  2621. FSUB c08, t2, c08
  2622. LDF [AO + 15 * SIZE], a1
  2623. FMUL a1, c04, c04
  2624. FMUL a1, c08, c08
  2625. #endif
  2626. #ifdef RN
  /* RN: 2x2 block at BO (offsets 0, 1, 3). Scale column 1 by BO[0],     */
  /* eliminate it from column 2 with BO[1], then scale column 2 by BO[3]. */
  2627. LDF [BO + 0 * SIZE], a1
  2628. LDF [BO + 1 * SIZE], a2
  2629. LDF [BO + 3 * SIZE], a3
  2630. FMUL a1, c01, c01
  2631. FMUL a1, c02, c02
  2632. FMUL a1, c03, c03
  2633. FMUL a1, c04, c04
  2634. FMUL a2, c01, t1
  2635. FMUL a2, c02, t2
  2636. FMUL a2, c03, t3
  2637. FMUL a2, c04, t4
  2638. FSUB c05, t1, c05
  2639. FSUB c06, t2, c06
  2640. FSUB c07, t3, c07
  2641. FSUB c08, t4, c08
  2642. FMUL a3, c05, c05
  2643. FMUL a3, c06, c06
  2644. FMUL a3, c07, c07
  2645. FMUL a3, c08, c08
  2646. #endif
  2647. #ifdef RT
  /* RT: same 2x2 block walked in reverse (offsets 3, 2, 0): column 2    */
  /* is scaled first and then eliminated from column 1.                  */
  2648. LDF [BO + 3 * SIZE], a1
  2649. LDF [BO + 2 * SIZE], a2
  2650. LDF [BO + 0 * SIZE], a3
  2651. FMUL a1, c05, c05
  2652. FMUL a1, c06, c06
  2653. FMUL a1, c07, c07
  2654. FMUL a1, c08, c08
  2655. FMUL a2, c05, t1
  2656. FMUL a2, c06, t2
  2657. FMUL a2, c07, t3
  2658. FMUL a2, c08, t4
  2659. FSUB c01, t1, c01
  2660. FSUB c02, t2, c02
  2661. FSUB c03, t3, c03
  2662. FSUB c04, t4, c04
  2663. FMUL a3, c01, c01
  2664. FMUL a3, c02, c02
  2665. FMUL a3, c03, c03
  2666. FMUL a3, c04, c04
  2667. #endif
  2668. #ifdef LN
  /* LN walks C backwards: step C1/C2 back by 4 elements before storing. */
  2669. add C1, -4 * SIZE, C1
  2670. add C2, -4 * SIZE, C2
  2671. #endif
  /* Write the solved values back to the buffer they were loaded from    */
  /* (same layout as the loads before the FSUBs).                        */
  2672. #if defined(LN) || defined(LT)
  2673. STF c01, [BO + 0 * SIZE]
  2674. STF c05, [BO + 1 * SIZE]
  2675. STF c02, [BO + 2 * SIZE]
  2676. STF c06, [BO + 3 * SIZE]
  2677. STF c03, [BO + 4 * SIZE]
  2678. STF c07, [BO + 5 * SIZE]
  2679. STF c04, [BO + 6 * SIZE]
  2680. STF c08, [BO + 7 * SIZE]
  2681. #else
  2682. STF c01, [AO + 0 * SIZE]
  2683. STF c02, [AO + 1 * SIZE]
  2684. STF c03, [AO + 2 * SIZE]
  2685. STF c04, [AO + 3 * SIZE]
  2686. STF c05, [AO + 4 * SIZE]
  2687. STF c06, [AO + 5 * SIZE]
  2688. STF c07, [AO + 6 * SIZE]
  2689. STF c08, [AO + 7 * SIZE]
  2690. #endif
  /* ... and to the output columns C1 and C2.                            */
  2691. STF c01, [C1 + 0 * SIZE]
  2692. STF c02, [C1 + 1 * SIZE]
  2693. STF c03, [C1 + 2 * SIZE]
  2694. STF c04, [C1 + 3 * SIZE]
  2695. STF c05, [C2 + 0 * SIZE]
  2696. STF c06, [C2 + 1 * SIZE]
  2697. STF c07, [C2 + 2 * SIZE]
  2698. STF c08, [C2 + 3 * SIZE]
  /* Clear the pending-product registers for the next tile.              */
  2699. FMOV FZERO, t1
  2700. FMOV FZERO, t2
  2701. FMOV FZERO, t3
  2702. FMOV FZERO, t4
  2703. #ifndef LN
  2704. add C1, 4 * SIZE, C1
  2705. add C2, 4 * SIZE, C2
  2706. #endif
  2707. #ifdef RT
  /* RT: AORIG moves forward by K scaled by 4 elements.                  */
  2708. sll K, 2 + BASE_SHIFT, TEMP1
  2709. add AORIG, TEMP1, AORIG
  2710. #endif
  2711. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: 4 elements per step in AO, 2 in BO. */
  2712. sub K, KK, TEMP1
  2713. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  2714. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  2715. add AO, TEMP2, AO
  2716. add BO, TEMP1, BO
  2717. #endif
  2718. #ifdef LT
  2719. add KK, 4, KK
  2720. #endif
  2721. #ifdef LN
  2722. sub KK, 4, KK
  2723. #endif
  2724. add I, -1, I
  2725. cmp I, 0
  2726. bg,pt %icc, .LL121
  /* Delay slot (not annulled): c03 is zeroed whether or not we loop.    */
  2727. FMOV FZERO, c03
  /* .LL199: bookkeeping after all row tiles of the two-column (C1/C2)   */
  /* pass have been processed.                                           */
  /*   LN    : B advances by K scaled by 2 elements.                     */
  /*   LT/RN : B takes over the current BO position.                     */
  /*   RN/RT : KK moves forward/backward by the 2 columns just handled.  */
  2728. .LL199:
  2729. #ifdef LN
  2730. sll K, 1 + BASE_SHIFT, TEMP1
  2731. add B, TEMP1, B
  2732. #endif
  2733. #if defined(LT) || defined(RN)
  2734. mov BO, B
  2735. #endif
  2736. #ifdef RN
  2737. add KK, 2, KK
  2738. #endif
  2739. #ifdef RT
  2740. sub KK, 2, KK
  2741. #endif
  /* .LL200: single-column pass, taken only when N is odd (N & 1).       */
  /* Sets up C1/KK/AORIG/AO for the column, then handles the M & 1       */
  /* (one-row) tile first: pointer setup, operand preload and            */
  /* accumulator clearing up to the entry of the unrolled loop .LL272.   */
  2742. .LL200:
  2743. and N, 1, J
  2744. cmp J, 0
  2745. ble,pn %icc, .LL999
  2746. nop
  2747. #ifdef RT
  /* RT walks backwards: step B back by K elements and C back by LDC.    */
  2748. sll K, 0 + BASE_SHIFT, TEMP1
  2749. sub B, TEMP1, B
  2750. sub C, LDC, C
  2751. #endif
  2752. mov C, C1
  2753. #ifdef LN
  2754. add M, OFFSET, KK
  2755. #endif
  2756. #ifdef LT
  2757. mov OFFSET, KK
  2758. #endif
  2759. #if defined(LN) || defined(RT)
  2760. mov A, AORIG
  2761. #else
  2762. mov A, AO
  2763. #endif
  2764. #ifndef RT
  2765. add C, LDC, C
  2766. #endif
  /* One-row tile: skipped (to .LL250) when bit 0 of M is clear.         */
  2767. and M, 1, I
  2768. cmp I, 0
  2769. ble,pn %icc, .LL250
  2770. nop
  2771. #if defined(LT) || defined(RN)
  /* L = number of 4x-unrolled k iterations (KK / 4); BO restarts at B.  */
  2772. sra KK, 2, L
  2773. mov B, BO
  2774. cmp L, 0
  2775. #else
  2776. #ifdef LN
  2777. sll K, 0 + BASE_SHIFT, TEMP1
  2778. sub AORIG, TEMP1, AORIG
  2779. #endif
  /* LN/RT: start KK elements into both buffers and iterate over the     */
  /* remaining K - KK steps (TEMP1 is reused by .LL275 for the & 3 part). */
  2780. sll KK, 0 + BASE_SHIFT, TEMP1
  2781. add AORIG, TEMP1, AO
  2782. add B, TEMP1, BO
  2783. sub K, KK, TEMP1
  2784. sra TEMP1, 2, L
  2785. cmp L, 0
  2786. #endif
  /* Preload four A and four B operands and clear accumulators. None of  */
  /* these FP instructions touch %icc, so the cmp above still governs    */
  /* the ble below.                                                      */
  2787. LDF [AO + 0 * SIZE], a1
  2788. FMOV FZERO, t1
  2789. LDF [AO + 1 * SIZE], a2
  2790. FMOV FZERO, c01
  2791. LDF [AO + 2 * SIZE], a3
  2792. FMOV FZERO, t2
  2793. LDF [AO + 3 * SIZE], a4
  2794. FMOV FZERO, c02
  2795. LDF [BO + 0 * SIZE], b1
  2796. FMOV FZERO, t3
  2797. LDF [BO + 1 * SIZE], b2
  2798. FMOV FZERO, t4
  2799. LDF [BO + 2 * SIZE], b3
  2800. ble,pn %icc, .LL275
  /* Delay slot: b4 is loaded on both the taken and fall-through paths.  */
  2801. LDF [BO + 3 * SIZE], b4
  /* .LL272: 1x1 tile, k loop unrolled by 4. Products are delayed by one */
  /* stage in t1..t4 and folded alternately into c01 (t1, t3) and c02    */
  /* (t2, t4); the two partial sums are combined later at .LL279.        */
  /* Each pass consumes 4 elements from AO and 4 from BO.                */
  2802. .LL272:
  2803. FADD c01, t1, c01
  2804. add L, -1, L
  2805. add AO, 4 * SIZE, AO
  2806. FMUL a1, b1, t1
  2807. add BO, 4 * SIZE, BO
  2808. LDF [AO + 0 * SIZE], a1
  2809. FADD c02, t2, c02
  2810. cmp L, 0
  2811. LDF [BO + 0 * SIZE], b1
  2812. FMUL a2, b2, t2
  2813. LDF [AO + 1 * SIZE], a2
  2814. FADD c01, t3, c01
  2815. LDF [BO + 1 * SIZE], b2
  2816. FMUL a3, b3, t3
  2817. LDF [AO + 2 * SIZE], a3
  2818. FADD c02, t4, c02
  2819. LDF [BO + 2 * SIZE], b3
  2820. FMUL a4, b4, t4
  2821. LDF [AO + 3 * SIZE], a4
  2822. bg,pt %icc, .LL272
  /* Delay slot: executed on every pass.                                 */
  2823. LDF [BO + 3 * SIZE], b4
  /* .LL275: remaining (count & 3) single-k steps. TEMP1 in the LN/RT    */
  /* branch is the K - KK value computed during the tile setup.          */
  2824. .LL275:
  2825. #if defined(LT) || defined(RN)
  2826. and KK, 3, L
  2827. #else
  2828. and TEMP1, 3, L
  2829. #endif
  2830. cmp L, 0
  /* Annulled branch: the nop only executes when the branch is taken.    */
  2831. ble,a,pn %icc, .LL279
  2832. nop
  /* .LL276: one k step per pass, using only a1/b1/t1 and accumulating   */
  /* into c01; AO and BO each advance by one element.                    */
  2833. .LL276:
  2834. FADD c01, t1, c01
  2835. add L, -1, L
  2836. FMUL a1, b1, t1
  2837. LDF [AO + 1 * SIZE], a1
  2838. LDF [BO + 1 * SIZE], b1
  2839. add BO, 1 * SIZE, BO
  2840. cmp L, 0
  2841. bg,pt %icc, .LL276
  /* Delay slot: AO is bumped after the load above used the old AO.      */
  2842. add AO, 1 * SIZE, AO
  /* .LL279: finish the 1x1 tile: drain pending products, merge the two  */
  /* partial sums into c01, form (stored value - c01), scale by the      */
  /* single triangular entry, store to the packed buffer and to C1, then */
  /* advance pointers and KK.                                            */
  2843. .LL279:
  2844. FADD c01, t1, c01
  2845. FADD c02, t2, c02
  2846. FADD c01, t3, c01
  2847. FADD c02, t4, c02
  2848. FADD c01, c02, c01
  2849. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at element KK - 1 of AORIG / B.                      */
  2850. sub KK, 1, TEMP1
  2851. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  2852. add AORIG, TEMP1, AO
  2853. add B, TEMP1, BO
  2854. #endif
  /* c01 = loaded - c01, assuming FSUB expands to the SPARC fsub form    */
  /* (rd = rs1 - rs2) -- the macro is defined outside this view.         */
  2855. #if defined(LN) || defined(LT)
  2856. LDF [BO + 0 * SIZE], a1
  2857. FSUB a1, c01, c01
  2858. #else
  2859. LDF [AO + 0 * SIZE], a1
  2860. FSUB a1, c01, c01
  2861. #endif
  /* The 1x1 "solve" is a single multiply; the factor is presumably the  */
  /* pre-inverted diagonal entry -- confirm against the packing routine. */
  2862. #ifdef LN
  2863. LDF [AO + 0 * SIZE], a1
  2864. FMUL a1, c01, c01
  2865. #endif
  2866. #ifdef LT
  2867. LDF [AO + 0 * SIZE], a1
  2868. FMUL a1, c01, c01
  2869. #endif
  2870. #ifdef RN
  2871. LDF [BO + 0 * SIZE], a1
  2872. FMUL a1, c01, c01
  2873. #endif
  2874. #ifdef RT
  2875. LDF [BO + 0 * SIZE], a1
  2876. FMUL a1, c01, c01
  2877. #endif
  2878. #ifdef LN
  /* LN walks C backwards: step C1 back before storing.                  */
  2879. add C1, -1 * SIZE, C1
  2880. #endif
  2881. #if defined(LN) || defined(LT)
  2882. STF c01, [BO + 0 * SIZE]
  2883. #else
  2884. STF c01, [AO + 0 * SIZE]
  2885. #endif
  2886. STF c01, [C1 + 0 * SIZE]
  2887. FMOV FZERO, t1
  2888. FMOV FZERO, t2
  2889. FMOV FZERO, t3
  2890. FMOV FZERO, t4
  2891. #ifndef LN
  2892. add C1, 1 * SIZE, C1
  2893. #endif
  2894. #ifdef RT
  2895. sll K, 0 + BASE_SHIFT, TEMP1
  2896. add AORIG, TEMP1, AORIG
  2897. #endif
  2898. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK elements in both AO and BO.               */
  2899. sub K, KK, TEMP1
  2900. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  2901. add AO, TEMP1, AO
  2902. add BO, TEMP1, BO
  2903. #endif
  2904. #ifdef LT
  2905. add KK, 1, KK
  2906. #endif
  2907. #ifdef LN
  2908. sub KK, 1, KK
  2909. #endif
  /* .LL250: two-row tile of the single-column pass, taken when bit 1 of */
  /* M is set. Computes the loop count and AO/BO start, preloads four A  */
  /* and four B operands and clears c01..c04 / t1..t4.                   */
  2910. .LL250:
  2911. and M, 2, I
  2912. cmp I, 0
  2913. ble,pn %icc, .LL270
  2914. nop
  2915. #if defined(LT) || defined(RN)
  /* L = number of 4x-unrolled k iterations (KK / 4); BO restarts at B.  */
  2916. sra KK, 2, L
  2917. mov B, BO
  2918. cmp L, 0
  2919. #else
  2920. #ifdef LN
  /* LN: AORIG steps back by K scaled by 2 elements.                     */
  2921. sll K, 1 + BASE_SHIFT, TEMP1
  2922. sub AORIG, TEMP1, AORIG
  2923. #endif
  /* AO starts KK*2 elements into AORIG, BO starts KK elements into B;   */
  /* TEMP1 = K - KK is the k count (reused by .LL255 for the & 3 part).  */
  2924. sll KK, 1 + BASE_SHIFT, TEMP1
  2925. sll KK, 0 + BASE_SHIFT, TEMP2
  2926. add AORIG, TEMP1, AO
  2927. add B, TEMP2, BO
  2928. sub K, KK, TEMP1
  2929. sra TEMP1, 2, L
  2930. cmp L, 0
  2931. #endif
  /* The loads and FMOVs below do not modify %icc, so the ble at the end */
  /* still tests the cmp L, 0 above.                                     */
  2932. LDF [AO + 0 * SIZE], a1
  2933. FMOV FZERO, c01
  2934. LDF [BO + 0 * SIZE], b1
  2935. FMOV FZERO, t1
  2936. LDF [AO + 1 * SIZE], a2
  2937. FMOV FZERO, c02
  2938. LDF [BO + 1 * SIZE], b2
  2939. FMOV FZERO, t2
  2940. LDF [AO + 2 * SIZE], a3
  2941. FMOV FZERO, c03
  2942. LDF [BO + 2 * SIZE], b3
  2943. FMOV FZERO, t3
  2944. LDF [AO + 3 * SIZE], a4
  2945. FMOV FZERO, c04
  2946. LDF [BO + 3 * SIZE], b4
  2947. FMOV FZERO, t4
  2948. ble,pn %icc, .LL255
  2949. nop
  /* .LL252: 2x1 tile, k loop unrolled by 4 (8 A elements and 4 B        */
  /* elements per pass). Even k steps (b1, b3) accumulate into c01/c02   */
  /* through t1/t2, odd k steps (b2, b4) into c03/c04 through t3/t4;     */
  /* the two pairs are merged at .LL259. Products are folded one stage   */
  /* after they are computed.                                            */
  2950. .LL252:
  2951. FADD c01, t1, c01
  2952. add L, -1, L
  2953. FMUL a1, b1, t1
  2954. LDF [AO + 4 * SIZE], a1
  2955. FADD c02, t2, c02
  2956. FMUL a2, b1, t2
  2957. LDF [AO + 5 * SIZE], a2
  2958. LDF [BO + 4 * SIZE], b1
  2959. FADD c03, t3, c03
  /* Sets %icc for the bg at the bottom of the loop.                     */
  2960. cmp L, 0
  2961. FMUL a3, b2, t3
  2962. LDF [AO + 6 * SIZE], a3
  2963. FADD c04, t4, c04
  2964. FMUL a4, b2, t4
  2965. LDF [AO + 7 * SIZE], a4
  2966. LDF [BO + 5 * SIZE], b2
  2967. FADD c01, t1, c01
  2968. FMUL a1, b3, t1
  2969. LDF [AO + 8 * SIZE], a1
  2970. FADD c02, t2, c02
  2971. FMUL a2, b3, t2
  2972. LDF [AO + 9 * SIZE], a2
  2973. LDF [BO + 6 * SIZE], b3
  2974. FADD c03, t3, c03
  2975. FMUL a3, b4, t3
  2976. LDF [AO + 10 * SIZE], a3
  2977. FADD c04, t4, c04
  2978. FMUL a4, b4, t4
  2979. LDF [AO + 11 * SIZE], a4
  2980. add AO, 8 * SIZE, AO
  2981. LDF [BO + 7 * SIZE], b4
  2982. bg,pt %icc, .LL252
  /* Delay slot: BO advances on every pass.                              */
  2983. add BO, 4 * SIZE, BO
  /* .LL255: remaining (count & 3) single-k steps. TEMP1 in the LN/RT    */
  /* branch is the K - KK value computed during the tile setup.          */
  2984. .LL255:
  2985. #if defined(LT) || defined(RN)
  2986. and KK, 3, L
  2987. #else
  2988. and TEMP1, 3, L
  2989. #endif
  2990. cmp L, 0
  /* Annulled branch: the nop only executes when the branch is taken.    */
  2991. ble,a,pn %icc, .LL259
  2992. nop
  /* .LL256: one k step per pass: c01 += a1*b1, c02 += a2*b1 (delayed    */
  /* through t1/t2); AO advances by 2 elements, BO by 1.                 */
  2993. .LL256:
  2994. FADD c01, t1, c01
  2995. add L, -1, L
  2996. FMUL a1, b1, t1
  2997. LDF [AO + 2 * SIZE], a1
  2998. FADD c02, t2, c02
  2999. cmp L, 0
  3000. FMUL a2, b1, t2
  3001. LDF [AO + 3 * SIZE], a2
  3002. LDF [BO + 1 * SIZE], b1
  3003. add AO, 2 * SIZE, AO
  3004. bg,pt %icc, .LL256
  3005. add BO, 1 * SIZE, BO
  /* .LL259: finish the 2x1 tile: drain pending products, merge the      */
  /* even/odd partial sums (c03 -> c01, c04 -> c02), form                */
  /* (stored value - accumulator), run the 2x2 (LN/LT) or 1x1 (RN/RT)    */
  /* substitution, store to the packed buffer and C1, then advance       */
  /* pointers and KK.                                                    */
  3006. .LL259:
  3007. FADD c01, t1, c01
  3008. FADD c02, t2, c02
  3009. FADD c03, t3, c03
  3010. FADD c04, t4, c04
  3011. FADD c01, c03, c01
  3012. FADD c02, c04, c02
  3013. #if defined(LN) || defined(RT)
  /* TEMP1 = KK - 2 (LN) or KK - 1 (RT); AO = AORIG + TEMP1 scaled by 2  */
  /* elements, BO = B + TEMP1 scaled by 1 element.                       */
  3014. #ifdef LN
  3015. sub KK, 2, TEMP1
  3016. #else
  3017. sub KK, 1, TEMP1
  3018. #endif
  3019. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  3020. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  3021. add AORIG, TEMP2, AO
  3022. add B, TEMP1, BO
  3023. #endif
  /* cXX = loaded - cXX, assuming FSUB expands to the SPARC fsub form    */
  /* (rd = rs1 - rs2) -- the macro is defined outside this view.         */
  3024. #if defined(LN) || defined(LT)
  3025. LDF [BO + 0 * SIZE], a1
  3026. LDF [BO + 1 * SIZE], a2
  3027. FSUB a1, c01, c01
  3028. FSUB a2, c02, c02
  3029. #else
  3030. LDF [AO + 0 * SIZE], a1
  3031. LDF [AO + 1 * SIZE], a2
  3032. FSUB a1, c01, c01
  3033. FSUB a2, c02, c02
  3034. #endif
  3035. #ifdef LN
  /* LN: backward substitution on the 2x2 block at AO (offsets 3, 2, 0). */
  /* Diagonal entries are multiplied in, so they are presumably stored   */
  /* pre-inverted -- confirm against the packing routine.                */
  3036. LDF [AO + 3 * SIZE], a1
  3037. LDF [AO + 2 * SIZE], a2
  3038. LDF [AO + 0 * SIZE], a3
  3039. FMUL a1, c02, c02
  3040. FMUL a2, c02, t1
  3041. FSUB c01, t1, c01
  3042. FMUL a3, c01, c01
  3043. #endif
  3044. #ifdef LT
  /* LT: forward substitution on the same block (offsets 0, 1, 3).       */
  3045. LDF [AO + 0 * SIZE], a1
  3046. LDF [AO + 1 * SIZE], a2
  3047. LDF [AO + 3 * SIZE], a3
  3048. FMUL a1, c01, c01
  3049. FMUL a2, c01, t1
  3050. FSUB c02, t1, c02
  3051. FMUL a3, c02, c02
  3052. #endif
  3053. #ifdef RN
  /* RN/RT: a single column, so both rows are just scaled by BO[0].      */
  3054. LDF [BO + 0 * SIZE], a1
  3055. FMUL a1, c01, c01
  3056. FMUL a1, c02, c02
  3057. #endif
  3058. #ifdef RT
  3059. LDF [BO + 0 * SIZE], a1
  3060. FMUL a1, c01, c01
  3061. FMUL a1, c02, c02
  3062. #endif
  3063. #ifdef LN
  /* LN walks C backwards: step C1 back by 2 elements before storing.    */
  3064. add C1, -2 * SIZE, C1
  3065. #endif
  3066. #if defined(LN) || defined(LT)
  3067. STF c01, [BO + 0 * SIZE]
  3068. STF c02, [BO + 1 * SIZE]
  3069. #else
  3070. STF c01, [AO + 0 * SIZE]
  3071. STF c02, [AO + 1 * SIZE]
  3072. #endif
  3073. STF c01, [C1 + 0 * SIZE]
  3074. STF c02, [C1 + 1 * SIZE]
  3075. FMOV FZERO, t1
  3076. FMOV FZERO, t2
  3077. FMOV FZERO, t3
  3078. FMOV FZERO, t4
  3079. #ifndef LN
  3080. add C1, 2 * SIZE, C1
  3081. #endif
  3082. #ifdef RT
  3083. sll K, 1 + BASE_SHIFT, TEMP1
  3084. add AORIG, TEMP1, AORIG
  3085. #endif
  3086. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: 2 elements per step in AO, 1 in BO. */
  3087. sub K, KK, TEMP1
  3088. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  3089. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  3090. add AO, TEMP2, AO
  3091. add BO, TEMP1, BO
  3092. #endif
  3093. #ifdef LT
  3094. add KK, 2, KK
  3095. #endif
  3096. #ifdef LN
  3097. sub KK, 2, KK
  3098. #endif
  /* .LL270: four-row tiles of the single-column pass. I = M / 4 tiles   */
  /* are processed by the loop whose head is .LL221; skip to .LL299 when */
  /* there are none.                                                     */
  3099. .LL270:
  3100. sra M, 2, I
  3101. cmp I, 0
  3102. ble,pn %icc, .LL299
  3103. nop
  /* .LL221: per-tile setup: loop count, AO/BO start, operand preload,   */
  /* accumulator clearing and prefetch of the C1 destination.            */
  3104. .LL221:
  3105. #if defined(LT) || defined(RN)
  /* L = number of 4x-unrolled k iterations (KK / 4); BO restarts at B.  */
  3106. sra KK, 2, L
  3107. mov B, BO
  3108. cmp L, 0
  3109. #else
  3110. #ifdef LN
  /* LN: AORIG steps back by K scaled by 4 elements.                     */
  3111. sll K, 2 + BASE_SHIFT, TEMP1
  3112. sub AORIG, TEMP1, AORIG
  3113. #endif
  /* AO starts KK*4 elements into AORIG, BO starts KK elements into B;   */
  /* TEMP1 = K - KK is the k count (reused by .LL225 for the & 3 part).  */
  3114. sll KK, 2 + BASE_SHIFT, TEMP1
  3115. sll KK, 0 + BASE_SHIFT, TEMP2
  3116. add AORIG, TEMP1, AO
  3117. add B, TEMP2, BO
  3118. sub K, KK, TEMP1
  3119. sra TEMP1, 2, L
  3120. cmp L, 0
  3121. #endif
  /* Nothing from here to the ble modifies %icc, so the branch still     */
  /* tests the cmp L, 0 above.                                           */
  3122. LDF [AO + 0 * SIZE], a1
  3123. FMOV FZERO, c01
  3124. LDF [BO + 0 * SIZE], b1
  3125. FMOV FZERO, t1
  3126. LDF [AO + 1 * SIZE], a2
  3127. FMOV FZERO, c02
  3128. LDF [BO + 1 * SIZE], b2
  3129. FMOV FZERO, t2
  3130. LDF [AO + 2 * SIZE], a3
  3131. FMOV FZERO, c03
  3132. LDF [BO + 2 * SIZE], b3
  3133. FMOV FZERO, t3
  3134. LDF [AO + 3 * SIZE], a4
  3135. FMOV FZERO, c04
  3136. LDF [BO + 3 * SIZE], b4
  3137. FMOV FZERO, t4
  /* Prefetch (function code 2) around C1; LN looks below C1 because it  */
  /* steps C1 back by 4 elements before storing.                         */
  3138. #ifdef LN
  3139. prefetch [C1 - 3 * SIZE], 2
  3140. #else
  3141. prefetch [C1 + 3 * SIZE], 2
  3142. #endif
  3143. ble,pn %icc, .LL225
  /* Delay slot (not annulled): issued on both paths, for all variants.  */
  3144. prefetch [C1 + 4 * SIZE], 2
  /* .LL222: 4x1 tile, k loop unrolled by 4 (16 A elements and 4 B       */
  /* elements per pass). For each k step: c01..c04 += a1..a4 * b_k, with */
  /* every product held one stage in t1..t4 before being added.          */
  /* BO is advanced at the top, so the b loads use offsets 0..3 for the  */
  /* next pass; AO is advanced at the bottom, so the a loads use 4..19.  */
  3145. .LL222:
  3146. FADD c01, t1, c01
  3147. add BO, 4 * SIZE, BO
  3148. FMUL a1, b1, t1
  3149. LDF [AO + 4 * SIZE], a1
  3150. FADD c02, t2, c02
  3151. FMUL a2, b1, t2
  3152. LDF [AO + 5 * SIZE], a2
  3153. FADD c03, t3, c03
  3154. add L, -1, L
  3155. FMUL a3, b1, t3
  3156. LDF [AO + 6 * SIZE], a3
  3157. FADD c04, t4, c04
  3158. FMUL a4, b1, t4
  3159. LDF [AO + 7 * SIZE], a4
  3160. LDF [BO + 0 * SIZE], b1
  3161. FADD c01, t1, c01
  /* Sets %icc for the bg at the bottom of the loop.                     */
  3162. cmp L, 0
  3163. FMUL a1, b2, t1
  3164. LDF [AO + 8 * SIZE], a1
  3165. FADD c02, t2, c02
  3166. FMUL a2, b2, t2
  3167. LDF [AO + 9 * SIZE], a2
  3168. FADD c03, t3, c03
  3169. FMUL a3, b2, t3
  3170. LDF [AO + 10 * SIZE], a3
  3171. FADD c04, t4, c04
  3172. FMUL a4, b2, t4
  3173. LDF [AO + 11 * SIZE], a4
  3174. LDF [BO + 1 * SIZE], b2
  3175. FADD c01, t1, c01
  3176. FMUL a1, b3, t1
  3177. LDF [AO + 12 * SIZE], a1
  3178. FADD c02, t2, c02
  3179. FMUL a2, b3, t2
  3180. LDF [AO + 13 * SIZE], a2
  3181. FADD c03, t3, c03
  3182. FMUL a3, b3, t3
  3183. LDF [AO + 14 * SIZE], a3
  3184. FADD c04, t4, c04
  3185. FMUL a4, b3, t4
  3186. LDF [AO + 15 * SIZE], a4
  3187. LDF [BO + 2 * SIZE], b3
  3188. FADD c01, t1, c01
  3189. FMUL a1, b4, t1
  3190. LDF [AO + 16 * SIZE], a1
  3191. FADD c02, t2, c02
  3192. FMUL a2, b4, t2
  3193. LDF [AO + 17 * SIZE], a2
  3194. FADD c03, t3, c03
  3195. FMUL a3, b4, t3
  3196. LDF [AO + 18 * SIZE], a3
  3197. FADD c04, t4, c04
  3198. FMUL a4, b4, t4
  3199. LDF [AO + 19 * SIZE], a4
  3200. add AO, 16 * SIZE, AO
  3201. bg,pt %icc, .LL222
  /* Delay slot: b4 for the next pass is loaded on every iteration.      */
  3202. LDF [BO + 3 * SIZE], b4
  /* .LL225: remaining (count & 3) single-k steps. TEMP1 in the LN/RT    */
  /* branch is the K - KK value computed at .LL221.                      */
  3203. .LL225:
  3204. #if defined(LT) || defined(RN)
  3205. and KK, 3, L
  3206. #else
  3207. and TEMP1, 3, L
  3208. #endif
  3209. cmp L, 0
  /* Annulled branch: the nop only executes when the branch is taken.    */
  3210. ble,a,pn %icc, .LL229
  3211. nop
  /* .LL226: one k step per pass: c01..c04 += a1..a4 * b1 (delayed via   */
  /* t1..t4); AO advances by 4 elements, BO by 1.                        */
  3212. .LL226:
  3213. FADD c01, t1, c01
  3214. add BO, 1 * SIZE, BO
  3215. FMUL a1, b1, t1
  3216. LDF [AO + 4 * SIZE], a1
  3217. FADD c02, t2, c02
  3218. add L, -1, L
  3219. FMUL a2, b1, t2
  3220. LDF [AO + 5 * SIZE], a2
  3221. FADD c03, t3, c03
  3222. cmp L, 0
  3223. FMUL a3, b1, t3
  3224. LDF [AO + 6 * SIZE], a3
  3225. FADD c04, t4, c04
  3226. FMUL a4, b1, t4
  3227. LDF [AO + 7 * SIZE], a4
  3228. add AO, 4 * SIZE, AO
  3229. bg,pt %icc, .LL226
  /* Delay slot: BO was already advanced above, so offset 0 is the next b. */
  3230. LDF [BO + 0 * SIZE], b1
  /* .LL229: finish one 4x1 tile: drain pending products into c01..c04,  */
  /* form (stored value - accumulator), run the 4x4 (LN/LT) or 1x1       */
  /* (RN/RT) substitution, store to the packed buffer and C1, advance    */
  /* pointers and KK, and loop to .LL221 while I > 0.                    */
  3231. .LL229:
  3232. FADD c01, t1, c01
  3233. FADD c02, t2, c02
  3234. FADD c03, t3, c03
  3235. FADD c04, t4, c04
  3236. #if defined(LN) || defined(RT)
  /* TEMP1 = KK - 4 (LN) or KK - 1 (RT); AO = AORIG + TEMP1 scaled by 4  */
  /* elements, BO = B + TEMP1 scaled by 1 element.                       */
  3237. #ifdef LN
  3238. sub KK, 4, TEMP1
  3239. #else
  3240. sub KK, 1, TEMP1
  3241. #endif
  3242. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  3243. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  3244. add AORIG, TEMP2, AO
  3245. add B, TEMP1, BO
  3246. #endif
  /* cXX = loaded - cXX, assuming FSUB expands to the SPARC fsub form    */
  /* (rd = rs1 - rs2) -- the macro is defined outside this view.         */
  3247. #if defined(LN) || defined(LT)
  3248. LDF [BO + 0 * SIZE], a1
  3249. LDF [BO + 1 * SIZE], a2
  3250. LDF [BO + 2 * SIZE], a3
  3251. LDF [BO + 3 * SIZE], a4
  3252. FSUB a1, c01, c01
  3253. FSUB a2, c02, c02
  3254. FSUB a3, c03, c03
  3255. FSUB a4, c04, c04
  3256. #else
  3257. LDF [AO + 0 * SIZE], a1
  3258. LDF [AO + 1 * SIZE], a2
  3259. LDF [AO + 2 * SIZE], a3
  3260. LDF [AO + 3 * SIZE], a4
  3261. FSUB a1, c01, c01
  3262. FSUB a2, c02, c02
  3263. FSUB a3, c03, c03
  3264. FSUB a4, c04, c04
  3265. #endif
  3266. #ifdef LN
  /* LN: backward substitution over the 4x4 block at AO, rows 4 -> 1.    */
  /* The entries at offsets 15, 10, 5 and 0 are multiplied in, so they   */
  /* are presumably stored pre-inverted -- confirm against the packing   */
  /* routine.                                                            */
  3267. LDF [AO + 15 * SIZE], a1
  3268. LDF [AO + 14 * SIZE], a2
  3269. LDF [AO + 13 * SIZE], a3
  3270. LDF [AO + 12 * SIZE], a4
  3271. FMUL a1, c04, c04
  3272. FMUL a2, c04, t1
  3273. FSUB c03, t1, c03
  3274. FMUL a3, c04, t1
  3275. FSUB c02, t1, c02
  3276. FMUL a4, c04, t1
  3277. FSUB c01, t1, c01
  3278. LDF [AO + 10 * SIZE], a1
  3279. LDF [AO + 9 * SIZE], a2
  3280. LDF [AO + 8 * SIZE], a3
  3281. FMUL a1, c03, c03
  3282. FMUL a2, c03, t1
  3283. FSUB c02, t1, c02
  3284. FMUL a3, c03, t1
  3285. FSUB c01, t1, c01
  3286. LDF [AO + 5 * SIZE], a1
  3287. LDF [AO + 4 * SIZE], a2
  3288. FMUL a1, c02, c02
  3289. FMUL a2, c02, t1
  3290. FSUB c01, t1, c01
  3291. LDF [AO + 0 * SIZE], a1
  3292. FMUL a1, c01, c01
  3293. #endif
  3294. #ifdef LT
  /* LT: forward substitution over the same block, rows 1 -> 4.          */
  3295. LDF [AO + 0 * SIZE], a1
  3296. LDF [AO + 1 * SIZE], a2
  3297. LDF [AO + 2 * SIZE], a3
  3298. LDF [AO + 3 * SIZE], a4
  3299. FMUL a1, c01, c01
  3300. FMUL a2, c01, t1
  3301. FSUB c02, t1, c02
  3302. FMUL a3, c01, t1
  3303. FSUB c03, t1, c03
  3304. FMUL a4, c01, t1
  3305. FSUB c04, t1, c04
  3306. LDF [AO + 5 * SIZE], a1
  3307. LDF [AO + 6 * SIZE], a2
  3308. LDF [AO + 7 * SIZE], a3
  3309. FMUL a1, c02, c02
  3310. FMUL a2, c02, t1
  3311. FSUB c03, t1, c03
  3312. FMUL a3, c02, t1
  3313. FSUB c04, t1, c04
  3314. LDF [AO + 10 * SIZE], a1
  3315. LDF [AO + 11 * SIZE], a2
  3316. FMUL a1, c03, c03
  3317. FMUL a2, c03, t1
  3318. FSUB c04, t1, c04
  3319. LDF [AO + 15 * SIZE], a1
  3320. FMUL a1, c04, c04
  3321. #endif
  3322. #ifdef RN
  /* RN/RT: a single column, so all four rows are just scaled by BO[0].  */
  3323. LDF [BO + 0 * SIZE], a1
  3324. FMUL a1, c01, c01
  3325. FMUL a1, c02, c02
  3326. FMUL a1, c03, c03
  3327. FMUL a1, c04, c04
  3328. #endif
  3329. #ifdef RT
  3330. LDF [BO + 0 * SIZE], a1
  3331. FMUL a1, c01, c01
  3332. FMUL a1, c02, c02
  3333. FMUL a1, c03, c03
  3334. FMUL a1, c04, c04
  3335. #endif
  3336. #ifdef LN
  /* LN walks C backwards: step C1 back by 4 elements before storing.    */
  3337. add C1, -4 * SIZE, C1
  3338. #endif
  3339. #if defined(LN) || defined(LT)
  3340. STF c01, [BO + 0 * SIZE]
  3341. STF c02, [BO + 1 * SIZE]
  3342. STF c03, [BO + 2 * SIZE]
  3343. STF c04, [BO + 3 * SIZE]
  3344. #else
  3345. STF c01, [AO + 0 * SIZE]
  3346. STF c02, [AO + 1 * SIZE]
  3347. STF c03, [AO + 2 * SIZE]
  3348. STF c04, [AO + 3 * SIZE]
  3349. #endif
  3350. STF c01, [C1 + 0 * SIZE]
  3351. STF c02, [C1 + 1 * SIZE]
  3352. STF c03, [C1 + 2 * SIZE]
  3353. STF c04, [C1 + 3 * SIZE]
  /* Clear the pending-product registers; the accumulators themselves    */
  /* are cleared again at .LL221.                                        */
  3354. FMOV FZERO, t1
  3355. FMOV FZERO, t2
  3356. FMOV FZERO, t3
  3357. FMOV FZERO, t4
  3358. #ifndef LN
  3359. add C1, 4 * SIZE, C1
  3360. #endif
  3361. #ifdef RT
  3362. sll K, 2 + BASE_SHIFT, TEMP1
  3363. add AORIG, TEMP1, AORIG
  3364. #endif
  3365. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: 4 elements per step in AO, 1 in BO. */
  3366. sub K, KK, TEMP1
  3367. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  3368. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  3369. add AO, TEMP2, AO
  3370. add BO, TEMP1, BO
  3371. #endif
  3372. #ifdef LT
  3373. add KK, 4, KK
  3374. #endif
  3375. #ifdef LN
  3376. sub KK, 4, KK
  3377. #endif
  3378. add I, -1, I
  3379. cmp I, 0
  3380. bg,pt %icc, .LL221
  3381. nop
  /* .LL299: bookkeeping after the single-column pass.                   */
  /*   LN    : B advances by K elements.                                 */
  /*   LT/RN : B takes over the current BO position.                     */
  /*   RN/RT : KK moves forward/backward by the one column just handled. */
  3382. .LL299:
  3383. #ifdef LN
  3384. sll K, 0 + BASE_SHIFT, TEMP1
  3385. add B, TEMP1, B
  3386. #endif
  3387. #if defined(LT) || defined(RN)
  3388. mov BO, B
  3389. #endif
  3390. #ifdef RN
  3391. add KK, 1, KK
  3392. #endif
  3393. #ifdef RT
  3394. sub KK, 1, KK
  3395. #endif
  /* .LL999: common exit. `return` jumps to %i7 + 8 and restores the     */
  /* caller's register window; the clr in its delay slot therefore       */
  /* writes the caller-visible %o0, i.e. the routine returns 0.          */
  3396. .LL999:
  3397. return %i7 + 8
  3398. clr %o0
  3399. EPILOGUE