You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

trsm_kernel_RT.S 65 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Symbolic names for the registers used by this kernel.
     M, N and K arrive in the first three incoming registers; M drives the
     row-tile loops, N selects the column passes and K bounds the inner
     accumulation loops. */
  40. #define M %i0
  41. #define N %i1
  42. #define K %i2
  /* A and B trade registers for 32-bit double builds; in that configuration
     the entry code also reloads B from the stack. */
  43. #if defined(DOUBLE) && !defined(__64BIT__)
  44. #define A %i5
  45. #define B %i4
  46. #else
  47. #define A %i4
  48. #define B %i5
  49. #endif
  /* C and LDC are filled from the stack by the entry code. */
  50. #define C %o4
  51. #define LDC %o5
  /* AO/BO: running read pointers into the A and B data for the current tile. */
  52. #define AO %l0
  53. #define BO %l1
  /* I: row-tile counter, J: column-pass test value, L: inner (K) loop counter. */
  54. #define I %l2
  55. #define J %l3
  56. #define L %l4
  /* C1..C4: per-column output pointers (only C1 and C2 are used in the part
     of the file documented here). */
  57. #define C1 %o0
  58. #define C2 %o1
  59. #define C3 %o2
  60. #define C4 %o3
  /* OFFSET is loaded from the stack; KK is derived from it and splits the K
     range between the accumulation loop and the triangular solve. */
  61. #define OFFSET %l5
  62. #define KK %l6
  /* Scratch registers for address arithmetic. */
  63. #define TEMP1 %l7
  64. #define TEMP2 %i3
  /* AORIG: base of the current A tile, used by the LN and RT variants. */
  65. #define AORIG %g1
  /* Floating-point register map.  Double precision uses the even-numbered
     registers %f0..%f62; single precision (the #else branch) uses
     %f0..%f31. */
  66. #ifdef DOUBLE
  /* c01..c16: result accumulators. */
  67. #define c01 %f0
  68. #define c02 %f2
  69. #define c03 %f4
  70. #define c04 %f6
  71. #define c05 %f8
  72. #define c06 %f10
  73. #define c07 %f12
  74. #define c08 %f14
  75. #define c09 %f16
  76. #define c10 %f18
  77. #define c11 %f20
  78. #define c12 %f22
  79. #define c13 %f24
  80. #define c14 %f26
  81. #define c15 %f28
  82. #define c16 %f30
  /* t1..t4: product temporaries that sit between each FMUL and its FADD/FSUB. */
  83. #define t1 %f32
  84. #define t2 %f34
  85. #define t3 %f36
  86. #define t4 %f38
  /* a1..a5 / b1..b5: operand registers loaded from AO and BO. */
  87. #define a1 %f40
  88. #define a2 %f42
  89. #define a3 %f44
  90. #define a4 %f46
  91. #define a5 %f58
  92. #define b1 %f48
  93. #define b2 %f50
  94. #define b3 %f52
  95. #define b4 %f54
  96. #define b5 %f56
  /* FZERO is the source for every "FMOV FZERO, x" register clear.
     NOTE(review): the entry code issues FCLR(29) while FZERO is %f60 here;
     presumably FCLR takes the 5-bit encoded register number - confirm in
     common.h.  ALPHA is not referenced in the part of the file seen here. */
  97. #define FZERO %f60
  98. #define ALPHA %f62
  99. #else
  100. #define c01 %f0
  101. #define c02 %f1
  102. #define c03 %f2
  103. #define c04 %f3
  104. #define c05 %f4
  105. #define c06 %f5
  106. #define c07 %f6
  107. #define c08 %f7
  108. #define c09 %f8
  109. #define c10 %f9
  110. #define c11 %f10
  111. #define c12 %f11
  112. #define c13 %f12
  113. #define c14 %f13
  114. #define c15 %f14
  115. #define c16 %f15
  116. #define t1 %f16
  117. #define t2 %f17
  118. #define t3 %f18
  119. #define t4 %f19
  120. #define a1 %f20
  121. #define a2 %f21
  122. #define a3 %f22
  123. #define a4 %f23
  124. #define a5 %f31
  125. #define b1 %f24
  126. #define b2 %f25
  127. #define b3 %f26
  128. #define b4 %f27
  129. #define b5 %f28
  130. #define FZERO %f29
  131. #define ALPHA %f30
  132. #endif
  /* Prefetch tuning.  APREFETCHSIZE is the element distance ahead of AO used
     by the prefetch in the 4x2 inner loop; the other three constants are not
     referenced in the part of the file seen here. */
  133. #define APREFETCHSIZE 40
  134. #define BPREFETCHSIZE 40
  135. #define APREFETCH_CATEGORY 0
  136. #define BPREFETCH_CATEGORY 0
  /* Kernel entry: fetch the stack-passed arguments, turn LDC into a byte
     stride and pre-position A/B/C/KK for the variant being built.  LN, LT,
     RN and RT are compile-time switches.  PROLOGUE and SAVESP are macros
     supplied by common.h and are not visible here. */
  137. PROLOGUE
  138. SAVESP
  139. nop
  /* 32-bit ABI: remaining arguments are 4-byte stack slots.  For DOUBLE the
     B pointer is also taken from the stack. */
  140. #ifndef __64BIT__
  141. #ifdef DOUBLE
  142. ld [%sp + STACK_START + 28], B
  143. ld [%sp + STACK_START + 32], C
  144. ld [%sp + STACK_START + 36], LDC
  145. ld [%sp + STACK_START + 40], OFFSET
  146. #else
  147. ld [%sp + STACK_START + 28], C
  148. ld [%sp + STACK_START + 32], LDC
  149. ld [%sp + STACK_START + 36], OFFSET
  150. #endif
  151. #else
  /* 64-bit ABI: 8-byte stack slots. */
  152. ldx [%sp+ STACK_START + 56], C
  153. ldx [%sp+ STACK_START + 64], LDC
  154. ldx [%sp+ STACK_START + 72], OFFSET
  155. #endif
  /* NOTE(review): presumably zeroes FZERO (%f29 single / %f60 double via its
     5-bit encoding) - confirm the FCLR definition in common.h. */
  156. FCLR(29)
  /* LDC <<= BASE_SHIFT: element count to byte stride, assuming BASE_SHIFT is
     log2 of the element size - confirm in common.h. */
  157. sll LDC, BASE_SHIFT, LDC
  158. #ifdef LN
  /* LN walks the rows backwards: A += M*K elements, C += M elements. */
  159. smul M, K, TEMP1
  160. sll TEMP1, BASE_SHIFT, TEMP1
  161. add A, TEMP1, A
  162. sll M, BASE_SHIFT, TEMP1
  163. add C, TEMP1, C
  164. #endif
  165. #ifdef RN
  /* KK = -OFFSET */
  166. neg OFFSET, KK
  167. #endif
  168. #ifdef RT
  /* RT walks the columns backwards: B += N*K elements, C += N*LDC bytes,
     KK = N - OFFSET. */
  169. smul N, K, TEMP1
  170. sll TEMP1, BASE_SHIFT, TEMP1
  171. add B, TEMP1, B
  172. smul N, LDC, TEMP1
  173. add C, TEMP1, C
  174. sub N, OFFSET, KK
  175. #endif
  /* Single-column pass: taken only when bit 0 of N is set, otherwise control
     goes straight to .LL100. */
  176. and N, 1, J
  177. cmp J, 0
  178. ble,pn %icc, .LL100
  179. nop
  180. #ifdef RT
  /* Step B back by one column of K elements and C back by one column. */
  181. sll K, 0 + BASE_SHIFT, TEMP1
  182. sub B, TEMP1, B
  183. sub C, LDC, C
  184. #endif
  /* C1 = output pointer for this column. */
  185. mov C, C1
  186. #ifdef LN
  187. add M, OFFSET, KK
  188. #endif
  189. #ifdef LT
  190. mov OFFSET, KK
  191. #endif
  /* LN/RT recompute AO from AORIG for every tile; the other variants simply
     advance AO. */
  192. #if defined(LN) || defined(RT)
  193. mov A, AORIG
  194. #else
  195. mov A, AO
  196. #endif
  197. #ifndef RT
  /* Advance C past this column for the next pass. */
  198. add C, LDC, C
  199. #endif
  /* I = M / 4: number of 4-row tiles; skip to the 2-row tile if there are none. */
  200. sra M, 2, I
  201. cmp I, 0
  202. ble,pn %icc, .LL250
  203. nop
  /* 4-row x 1-column tile, repeated I times.  Accumulates the products of the
     A and B data into c01..c04, subtracts that from the stored right-hand
     side, runs the variant-specific triangular solve and writes the result
     both to the packed buffer and to C1. */
  204. .LL221:
  205. #if defined(LT) || defined(RN)
  /* LT/RN: accumulate over the first KK k-steps, starting at the head of B. */
  206. sra KK, 2, L
  207. mov B, BO
  208. cmp L, 0
  209. #else
  210. #ifdef LN
  /* LN: move AORIG back by one 4-row tile (4*K elements). */
  211. sll K, 2 + BASE_SHIFT, TEMP1
  212. sub AORIG, TEMP1, AORIG
  213. #endif
  /* LN/RT: accumulate over the last K-KK k-steps; AO/BO start KK steps in. */
  214. sll KK, 2 + BASE_SHIFT, TEMP1
  215. sll KK, 0 + BASE_SHIFT, TEMP2
  216. add AORIG, TEMP1, AO
  217. add B, TEMP2, BO
  218. sub K, KK, TEMP1
  219. sra TEMP1, 2, L
  220. cmp L, 0
  221. #endif
  /* Preload the first operands and clear accumulators and temporaries.  The
     condition codes from the cmp above stay live until the ble below. */
  222. LDF [AO + 0 * SIZE], a1
  223. FMOV FZERO, c01
  224. LDF [BO + 0 * SIZE], b1
  225. FMOV FZERO, t1
  226. LDF [AO + 1 * SIZE], a2
  227. FMOV FZERO, c02
  228. LDF [BO + 1 * SIZE], b2
  229. FMOV FZERO, t2
  230. LDF [AO + 2 * SIZE], a3
  231. FMOV FZERO, c03
  232. LDF [BO + 2 * SIZE], b3
  233. FMOV FZERO, t3
  234. LDF [AO + 3 * SIZE], a4
  235. FMOV FZERO, c04
  236. LDF [BO + 3 * SIZE], b4
  237. FMOV FZERO, t4
  238. ble,pn %icc, .LL225
  /* delay slot: prefetch the output line for writing */
  239. prefetch [C1 + 4 * SIZE], 2
  /* Main loop: four k-steps per iteration.  Each FADD consumes the product
     from the previous step, so the final products are drained at .LL229. */
  240. .LL222:
  241. FADD c01, t1, c01
  242. add BO, 4 * SIZE, BO
  243. FMUL a1, b1, t1
  244. LDF [AO + 4 * SIZE], a1
  245. FADD c02, t2, c02
  246. FMUL a2, b1, t2
  247. LDF [AO + 5 * SIZE], a2
  248. FADD c03, t3, c03
  249. add L, -1, L
  250. FMUL a3, b1, t3
  251. LDF [AO + 6 * SIZE], a3
  252. FADD c04, t4, c04
  253. FMUL a4, b1, t4
  254. LDF [AO + 7 * SIZE], a4
  255. LDF [BO + 0 * SIZE], b1
  256. FADD c01, t1, c01
  /* sets the condition codes tested by the bg at the bottom of the loop */
  257. cmp L, 0
  258. FMUL a1, b2, t1
  259. LDF [AO + 8 * SIZE], a1
  260. FADD c02, t2, c02
  261. FMUL a2, b2, t2
  262. LDF [AO + 9 * SIZE], a2
  263. FADD c03, t3, c03
  264. FMUL a3, b2, t3
  265. LDF [AO + 10 * SIZE], a3
  266. FADD c04, t4, c04
  267. FMUL a4, b2, t4
  268. LDF [AO + 11 * SIZE], a4
  269. LDF [BO + 1 * SIZE], b2
  270. FADD c01, t1, c01
  271. FMUL a1, b3, t1
  272. LDF [AO + 12 * SIZE], a1
  273. FADD c02, t2, c02
  274. FMUL a2, b3, t2
  275. LDF [AO + 13 * SIZE], a2
  276. FADD c03, t3, c03
  277. FMUL a3, b3, t3
  278. LDF [AO + 14 * SIZE], a3
  279. FADD c04, t4, c04
  280. FMUL a4, b3, t4
  281. LDF [AO + 15 * SIZE], a4
  282. LDF [BO + 2 * SIZE], b3
  283. FADD c01, t1, c01
  284. FMUL a1, b4, t1
  285. LDF [AO + 16 * SIZE], a1
  286. FADD c02, t2, c02
  287. FMUL a2, b4, t2
  288. LDF [AO + 17 * SIZE], a2
  289. FADD c03, t3, c03
  290. FMUL a3, b4, t3
  291. LDF [AO + 18 * SIZE], a3
  292. FADD c04, t4, c04
  293. FMUL a4, b4, t4
  294. LDF [AO + 19 * SIZE], a4
  295. add AO, 16 * SIZE, AO
  296. bg,pt %icc, .LL222
  /* delay slot */
  297. LDF [BO + 3 * SIZE], b4
  /* Remainder: the 0..3 k-steps left after the unrolled loop. */
  298. .LL225:
  299. #if defined(LT) || defined(RN)
  300. and KK, 3, L
  301. #else
  302. and TEMP1, 3, L
  303. #endif
  304. cmp L, 0
  305. ble,a,pn %icc, .LL229
  306. nop
  /* One k-step per iteration. */
  307. .LL226:
  308. FADD c01, t1, c01
  309. add BO, 1 * SIZE, BO
  310. FMUL a1, b1, t1
  311. LDF [AO + 4 * SIZE], a1
  312. FADD c02, t2, c02
  313. add L, -1, L
  314. FMUL a2, b1, t2
  315. LDF [AO + 5 * SIZE], a2
  316. FADD c03, t3, c03
  317. cmp L, 0
  318. FMUL a3, b1, t3
  319. LDF [AO + 6 * SIZE], a3
  320. FADD c04, t4, c04
  321. FMUL a4, b1, t4
  322. LDF [AO + 7 * SIZE], a4
  323. add AO, 4 * SIZE, AO
  324. bg,pt %icc, .LL226
  /* delay slot */
  325. LDF [BO + 0 * SIZE], b1
  /* Drain the products still held in t1..t4. */
  326. .LL229:
  327. FADD c01, t1, c01
  328. FADD c02, t2, c02
  329. FADD c03, t3, c03
  330. FADD c04, t4, c04
  331. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at the block to be solved: KK-4 steps in for LN, KK-1
     steps in for RT. */
  332. #ifdef LN
  333. sub KK, 4, TEMP1
  334. #else
  335. sub KK, 1, TEMP1
  336. #endif
  337. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  338. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  339. add AORIG, TEMP2, AO
  340. add B, TEMP1, BO
  341. #endif
  /* c = stored right-hand side - accumulated product.  LN/LT read it from
     BO, RN/RT read it from AO. */
  342. #if defined(LN) || defined(LT)
  343. LDF [BO + 0 * SIZE], a1
  344. LDF [BO + 1 * SIZE], a2
  345. LDF [BO + 2 * SIZE], a3
  346. LDF [BO + 3 * SIZE], a4
  347. FSUB a1, c01, c01
  348. FSUB a2, c02, c02
  349. FSUB a3, c03, c03
  350. FSUB a4, c04, c04
  351. #else
  352. LDF [AO + 0 * SIZE], a1
  353. LDF [AO + 1 * SIZE], a2
  354. LDF [AO + 2 * SIZE], a3
  355. LDF [AO + 3 * SIZE], a4
  356. FSUB a1, c01, c01
  357. FSUB a2, c02, c02
  358. FSUB a3, c03, c03
  359. FSUB a4, c04, c04
  360. #endif
  361. #ifdef LN
  /* Backward substitution over the 4x4 block at AO, from c04 up to c01.
     Diagonal entries are multiplied rather than divided, so the packed block
     presumably stores reciprocals - confirm against the packing routine. */
  362. LDF [AO + 15 * SIZE], a1
  363. LDF [AO + 14 * SIZE], a2
  364. LDF [AO + 13 * SIZE], a3
  365. LDF [AO + 12 * SIZE], a4
  366. FMUL a1, c04, c04
  367. FMUL a2, c04, t1
  368. FSUB c03, t1, c03
  369. FMUL a3, c04, t1
  370. FSUB c02, t1, c02
  371. FMUL a4, c04, t1
  372. FSUB c01, t1, c01
  373. LDF [AO + 10 * SIZE], a1
  374. LDF [AO + 9 * SIZE], a2
  375. LDF [AO + 8 * SIZE], a3
  376. FMUL a1, c03, c03
  377. FMUL a2, c03, t1
  378. FSUB c02, t1, c02
  379. FMUL a3, c03, t1
  380. FSUB c01, t1, c01
  381. LDF [AO + 5 * SIZE], a1
  382. LDF [AO + 4 * SIZE], a2
  383. FMUL a1, c02, c02
  384. FMUL a2, c02, t1
  385. FSUB c01, t1, c01
  386. LDF [AO + 0 * SIZE], a1
  387. FMUL a1, c01, c01
  388. #endif
  389. #ifdef LT
  /* Forward substitution over the 4x4 block at AO, from c01 down to c04. */
  390. LDF [AO + 0 * SIZE], a1
  391. LDF [AO + 1 * SIZE], a2
  392. LDF [AO + 2 * SIZE], a3
  393. LDF [AO + 3 * SIZE], a4
  394. FMUL a1, c01, c01
  395. FMUL a2, c01, t1
  396. FSUB c02, t1, c02
  397. FMUL a3, c01, t1
  398. FSUB c03, t1, c03
  399. FMUL a4, c01, t1
  400. FSUB c04, t1, c04
  401. LDF [AO + 5 * SIZE], a1
  402. LDF [AO + 6 * SIZE], a2
  403. LDF [AO + 7 * SIZE], a3
  404. FMUL a1, c02, c02
  405. FMUL a2, c02, t1
  406. FSUB c03, t1, c03
  407. FMUL a3, c02, t1
  408. FSUB c04, t1, c04
  409. LDF [AO + 10 * SIZE], a1
  410. LDF [AO + 11 * SIZE], a2
  411. FMUL a1, c03, c03
  412. FMUL a2, c03, t1
  413. FSUB c04, t1, c04
  414. LDF [AO + 15 * SIZE], a1
  415. FMUL a1, c04, c04
  416. #endif
  417. #ifdef RN
  /* One column: scale all four rows by the single entry at BO. */
  418. LDF [BO + 0 * SIZE], a1
  419. FMUL a1, c01, c01
  420. FMUL a1, c02, c02
  421. FMUL a1, c03, c03
  422. FMUL a1, c04, c04
  423. #endif
  424. #ifdef RT
  425. LDF [BO + 0 * SIZE], a1
  426. FMUL a1, c01, c01
  427. FMUL a1, c02, c02
  428. FMUL a1, c03, c03
  429. FMUL a1, c04, c04
  430. #endif
  431. #ifdef LN
  /* LN writes C backwards: step C1 back before storing. */
  432. add C1, -4 * SIZE, C1
  433. #endif
  /* Write the solved values back to the packed buffer they were read from... */
  434. #if defined(LN) || defined(LT)
  435. STF c01, [BO + 0 * SIZE]
  436. STF c02, [BO + 1 * SIZE]
  437. STF c03, [BO + 2 * SIZE]
  438. STF c04, [BO + 3 * SIZE]
  439. #else
  440. STF c01, [AO + 0 * SIZE]
  441. STF c02, [AO + 1 * SIZE]
  442. STF c03, [AO + 2 * SIZE]
  443. STF c04, [AO + 3 * SIZE]
  444. #endif
  /* ...and to the output column. */
  445. STF c01, [C1 + 0 * SIZE]
  446. STF c02, [C1 + 1 * SIZE]
  447. STF c03, [C1 + 2 * SIZE]
  448. STF c04, [C1 + 3 * SIZE]
  /* The solve used t1 as scratch; clear the temporaries again. */
  449. FMOV FZERO, t1
  450. FMOV FZERO, t2
  451. FMOV FZERO, t3
  452. FMOV FZERO, t4
  453. #ifndef LN
  454. add C1, 4 * SIZE, C1
  455. #endif
  456. #ifdef RT
  /* RT: advance AORIG to the next 4-row tile (4*K elements). */
  457. sll K, 2 + BASE_SHIFT, TEMP1
  458. add AORIG, TEMP1, AORIG
  459. #endif
  460. #if defined(LT) || defined(RN)
  /* Skip the K-KK k-steps that were not consumed by the accumulation loop. */
  461. sub K, KK, TEMP1
  462. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  463. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  464. add AO, TEMP2, AO
  465. add BO, TEMP1, BO
  466. #endif
  467. #ifdef LT
  468. add KK, 4, KK
  469. #endif
  470. #ifdef LN
  471. sub KK, 4, KK
  472. #endif
  /* Next 4-row tile. */
  473. add I, -1, I
  474. cmp I, 0
  475. bg,pt %icc, .LL221
  476. nop
  /* 2-row x 1-column tile, executed once when bit 1 of M is set. */
  477. .LL250:
  478. and M, 2, I
  479. cmp I, 0
  480. ble,pn %icc, .LL270
  481. nop
  482. #if defined(LT) || defined(RN)
  483. sra KK, 2, L
  484. mov B, BO
  485. cmp L, 0
  486. #else
  487. #ifdef LN
  /* LN: move AORIG back by one 2-row tile (2*K elements). */
  488. sll K, 1 + BASE_SHIFT, TEMP1
  489. sub AORIG, TEMP1, AORIG
  490. #endif
  /* LN/RT: start KK k-steps into the tile and accumulate over K-KK steps. */
  491. sll KK, 1 + BASE_SHIFT, TEMP1
  492. sll KK, 0 + BASE_SHIFT, TEMP2
  493. add AORIG, TEMP1, AO
  494. add B, TEMP2, BO
  495. sub K, KK, TEMP1
  496. sra TEMP1, 2, L
  497. cmp L, 0
  498. #endif
  /* Preload operands for two k-steps (a1,a2 and a3,a4) and clear the
     accumulators and temporaries. */
  499. LDF [AO + 0 * SIZE], a1
  500. FMOV FZERO, c01
  501. LDF [BO + 0 * SIZE], b1
  502. FMOV FZERO, t1
  503. LDF [AO + 1 * SIZE], a2
  504. FMOV FZERO, c02
  505. LDF [BO + 1 * SIZE], b2
  506. FMOV FZERO, t2
  507. LDF [AO + 2 * SIZE], a3
  508. FMOV FZERO, c03
  509. LDF [BO + 2 * SIZE], b3
  510. FMOV FZERO, t3
  511. LDF [AO + 3 * SIZE], a4
  512. FMOV FZERO, c04
  513. LDF [BO + 3 * SIZE], b4
  514. FMOV FZERO, t4
  515. ble,pn %icc, .LL255
  516. nop
  /* Main loop: four k-steps per iteration.  Even steps accumulate into
     c01/c02 and odd steps into c03/c04; the two pairs are summed at .LL259. */
  517. .LL252:
  518. FADD c01, t1, c01
  519. add L, -1, L
  520. FMUL a1, b1, t1
  521. LDF [AO + 4 * SIZE], a1
  522. FADD c02, t2, c02
  523. FMUL a2, b1, t2
  524. LDF [AO + 5 * SIZE], a2
  525. LDF [BO + 4 * SIZE], b1
  526. FADD c03, t3, c03
  527. cmp L, 0
  528. FMUL a3, b2, t3
  529. LDF [AO + 6 * SIZE], a3
  530. FADD c04, t4, c04
  531. FMUL a4, b2, t4
  532. LDF [AO + 7 * SIZE], a4
  533. LDF [BO + 5 * SIZE], b2
  534. FADD c01, t1, c01
  535. FMUL a1, b3, t1
  536. LDF [AO + 8 * SIZE], a1
  537. FADD c02, t2, c02
  538. FMUL a2, b3, t2
  539. LDF [AO + 9 * SIZE], a2
  540. LDF [BO + 6 * SIZE], b3
  541. FADD c03, t3, c03
  542. FMUL a3, b4, t3
  543. LDF [AO + 10 * SIZE], a3
  544. FADD c04, t4, c04
  545. FMUL a4, b4, t4
  546. LDF [AO + 11 * SIZE], a4
  547. add AO, 8 * SIZE, AO
  548. LDF [BO + 7 * SIZE], b4
  549. bg,pt %icc, .LL252
  /* delay slot */
  550. add BO, 4 * SIZE, BO
  /* Remainder: 0..3 leftover k-steps. */
  551. .LL255:
  552. #if defined(LT) || defined(RN)
  553. and KK, 3, L
  554. #else
  555. and TEMP1, 3, L
  556. #endif
  557. cmp L, 0
  558. ble,a,pn %icc, .LL259
  559. nop
  /* One k-step per iteration, using only c01/c02. */
  560. .LL256:
  561. FADD c01, t1, c01
  562. add L, -1, L
  563. FMUL a1, b1, t1
  564. LDF [AO + 2 * SIZE], a1
  565. FADD c02, t2, c02
  566. cmp L, 0
  567. FMUL a2, b1, t2
  568. LDF [AO + 3 * SIZE], a2
  569. LDF [BO + 1 * SIZE], b1
  570. add AO, 2 * SIZE, AO
  571. bg,pt %icc, .LL256
  /* delay slot */
  572. add BO, 1 * SIZE, BO
  /* Drain the pending products, then fold the odd-step partial sums into
     c01/c02. */
  573. .LL259:
  574. FADD c01, t1, c01
  575. FADD c02, t2, c02
  576. FADD c03, t3, c03
  577. FADD c04, t4, c04
  578. FADD c01, c03, c01
  579. FADD c02, c04, c02
  580. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at the block to be solved: KK-2 steps in for LN, KK-1
     steps in for RT. */
  581. #ifdef LN
  582. sub KK, 2, TEMP1
  583. #else
  584. sub KK, 1, TEMP1
  585. #endif
  586. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  587. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  588. add AORIG, TEMP2, AO
  589. add B, TEMP1, BO
  590. #endif
  /* c = stored right-hand side - accumulated product. */
  591. #if defined(LN) || defined(LT)
  592. LDF [BO + 0 * SIZE], a1
  593. LDF [BO + 1 * SIZE], a2
  594. FSUB a1, c01, c01
  595. FSUB a2, c02, c02
  596. #else
  597. LDF [AO + 0 * SIZE], a1
  598. LDF [AO + 1 * SIZE], a2
  599. FSUB a1, c01, c01
  600. FSUB a2, c02, c02
  601. #endif
  602. #ifdef LN
  /* Backward substitution over the 2x2 block at AO (c02 first). */
  603. LDF [AO + 3 * SIZE], a1
  604. LDF [AO + 2 * SIZE], a2
  605. LDF [AO + 0 * SIZE], a3
  606. FMUL a1, c02, c02
  607. FMUL a2, c02, t1
  608. FSUB c01, t1, c01
  609. FMUL a3, c01, c01
  610. #endif
  611. #ifdef LT
  /* Forward substitution over the 2x2 block at AO (c01 first). */
  612. LDF [AO + 0 * SIZE], a1
  613. LDF [AO + 1 * SIZE], a2
  614. LDF [AO + 3 * SIZE], a3
  615. FMUL a1, c01, c01
  616. FMUL a2, c01, t1
  617. FSUB c02, t1, c02
  618. FMUL a3, c02, c02
  619. #endif
  620. #ifdef RN
  /* One column: scale both rows by the single entry at BO. */
  621. LDF [BO + 0 * SIZE], a1
  622. FMUL a1, c01, c01
  623. FMUL a1, c02, c02
  624. #endif
  625. #ifdef RT
  626. LDF [BO + 0 * SIZE], a1
  627. FMUL a1, c01, c01
  628. FMUL a1, c02, c02
  629. #endif
  630. #ifdef LN
  631. add C1, -2 * SIZE, C1
  632. #endif
  /* Store the solved values to the packed buffer and to the output column. */
  633. #if defined(LN) || defined(LT)
  634. STF c01, [BO + 0 * SIZE]
  635. STF c02, [BO + 1 * SIZE]
  636. #else
  637. STF c01, [AO + 0 * SIZE]
  638. STF c02, [AO + 1 * SIZE]
  639. #endif
  640. STF c01, [C1 + 0 * SIZE]
  641. STF c02, [C1 + 1 * SIZE]
  642. FMOV FZERO, t1
  643. FMOV FZERO, t2
  644. FMOV FZERO, t3
  645. FMOV FZERO, t4
  646. #ifndef LN
  647. add C1, 2 * SIZE, C1
  648. #endif
  649. #ifdef RT
  /* RT: advance AORIG past this 2-row tile (2*K elements). */
  650. sll K, 1 + BASE_SHIFT, TEMP1
  651. add AORIG, TEMP1, AORIG
  652. #endif
  653. #if defined(LT) || defined(RN)
  /* Skip the K-KK k-steps not consumed by the accumulation loop. */
  654. sub K, KK, TEMP1
  655. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  656. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  657. add AO, TEMP2, AO
  658. add BO, TEMP1, BO
  659. #endif
  660. #ifdef LT
  661. add KK, 2, KK
  662. #endif
  663. #ifdef LN
  664. sub KK, 2, KK
  665. #endif
  /* 1-row x 1-column tile, executed once when bit 0 of M is set. */
  666. .LL270:
  667. and M, 1, I
  668. cmp I, 0
  669. ble,pn %icc, .LL299
  670. nop
  671. #if defined(LT) || defined(RN)
  672. sra KK, 2, L
  673. mov B, BO
  674. cmp L, 0
  675. #else
  676. #ifdef LN
  /* LN: move AORIG back by one row (K elements). */
  677. sll K, 0 + BASE_SHIFT, TEMP1
  678. sub AORIG, TEMP1, AORIG
  679. #endif
  /* LN/RT: A and B both advance one element per k-step here, so the same
     byte offset positions AO and BO. */
  680. sll KK, 0 + BASE_SHIFT, TEMP1
  681. add AORIG, TEMP1, AO
  682. add B, TEMP1, BO
  683. sub K, KK, TEMP1
  684. sra TEMP1, 2, L
  685. cmp L, 0
  686. #endif
  /* Preload four k-steps of operands and clear c01, c02 and t1..t4. */
  687. LDF [AO + 0 * SIZE], a1
  688. FMOV FZERO, t1
  689. LDF [AO + 1 * SIZE], a2
  690. FMOV FZERO, c01
  691. LDF [AO + 2 * SIZE], a3
  692. FMOV FZERO, t2
  693. LDF [AO + 3 * SIZE], a4
  694. FMOV FZERO, c02
  695. LDF [BO + 0 * SIZE], b1
  696. FMOV FZERO, t3
  697. LDF [BO + 1 * SIZE], b2
  698. FMOV FZERO, t4
  699. LDF [BO + 2 * SIZE], b3
  700. ble,pn %icc, .LL275
  /* delay slot */
  701. LDF [BO + 3 * SIZE], b4
  /* Main loop: four k-steps per iteration; c01 collects t1 and t3, c02
     collects t2 and t4.  The two sums are merged at .LL279. */
  702. .LL272:
  703. FADD c01, t1, c01
  704. add L, -1, L
  705. add AO, 4 * SIZE, AO
  706. FMUL a1, b1, t1
  707. add BO, 4 * SIZE, BO
  708. LDF [AO + 0 * SIZE], a1
  709. FADD c02, t2, c02
  710. cmp L, 0
  711. LDF [BO + 0 * SIZE], b1
  712. FMUL a2, b2, t2
  713. LDF [AO + 1 * SIZE], a2
  714. FADD c01, t3, c01
  715. LDF [BO + 1 * SIZE], b2
  716. FMUL a3, b3, t3
  717. LDF [AO + 2 * SIZE], a3
  718. FADD c02, t4, c02
  719. LDF [BO + 2 * SIZE], b3
  720. FMUL a4, b4, t4
  721. LDF [AO + 3 * SIZE], a4
  722. bg,pt %icc, .LL272
  /* delay slot */
  723. LDF [BO + 3 * SIZE], b4
  /* Remainder: 0..3 leftover k-steps. */
  724. .LL275:
  725. #if defined(LT) || defined(RN)
  726. and KK, 3, L
  727. #else
  728. and TEMP1, 3, L
  729. #endif
  730. cmp L, 0
  731. ble,a,pn %icc, .LL279
  732. nop
  /* One k-step per iteration. */
  733. .LL276:
  734. FADD c01, t1, c01
  735. add L, -1, L
  736. FMUL a1, b1, t1
  737. LDF [AO + 1 * SIZE], a1
  738. LDF [BO + 1 * SIZE], b1
  739. add BO, 1 * SIZE, BO
  740. cmp L, 0
  741. bg,pt %icc, .LL276
  /* delay slot */
  742. add AO, 1 * SIZE, AO
  /* Drain the pending products and reduce everything into c01. */
  743. .LL279:
  744. FADD c01, t1, c01
  745. FADD c02, t2, c02
  746. FADD c01, t3, c01
  747. FADD c02, t4, c02
  748. FADD c01, c02, c01
  749. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at the element to be solved, KK-1 steps in. */
  750. sub KK, 1, TEMP1
  751. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  752. add AORIG, TEMP1, AO
  753. add B, TEMP1, BO
  754. #endif
  /* c01 = stored right-hand side - accumulated product. */
  755. #if defined(LN) || defined(LT)
  756. LDF [BO + 0 * SIZE], a1
  757. FSUB a1, c01, c01
  758. #else
  759. LDF [AO + 0 * SIZE], a1
  760. FSUB a1, c01, c01
  761. #endif
  /* 1x1 solve: a single multiply by the entry at AO (LN/LT) or BO (RN/RT). */
  762. #ifdef LN
  763. LDF [AO + 0 * SIZE], a1
  764. FMUL a1, c01, c01
  765. #endif
  766. #ifdef LT
  767. LDF [AO + 0 * SIZE], a1
  768. FMUL a1, c01, c01
  769. #endif
  770. #ifdef RN
  771. LDF [BO + 0 * SIZE], a1
  772. FMUL a1, c01, c01
  773. #endif
  774. #ifdef RT
  775. LDF [BO + 0 * SIZE], a1
  776. FMUL a1, c01, c01
  777. #endif
  778. #ifdef LN
  779. add C1, -1 * SIZE, C1
  780. #endif
  /* Store the result to the packed buffer and to the output column. */
  781. #if defined(LN) || defined(LT)
  782. STF c01, [BO + 0 * SIZE]
  783. #else
  784. STF c01, [AO + 0 * SIZE]
  785. #endif
  786. STF c01, [C1 + 0 * SIZE]
  787. FMOV FZERO, t1
  788. FMOV FZERO, t2
  789. FMOV FZERO, t3
  790. FMOV FZERO, t4
  791. #ifndef LN
  792. add C1, 1 * SIZE, C1
  793. #endif
  794. #ifdef RT
  /* RT: advance AORIG past this row (K elements). */
  795. sll K, 0 + BASE_SHIFT, TEMP1
  796. add AORIG, TEMP1, AORIG
  797. #endif
  798. #if defined(LT) || defined(RN)
  /* Skip the K-KK k-steps not consumed by the accumulation loop. */
  799. sub K, KK, TEMP1
  800. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  801. add AO, TEMP1, AO
  802. add BO, TEMP1, BO
  803. #endif
  804. #ifdef LT
  805. add KK, 1, KK
  806. #endif
  807. #ifdef LN
  808. sub KK, 1, KK
  809. #endif
  /* End of the single-column pass: move B and KK on to the next column group. */
  810. .LL299:
  811. #ifdef LN
  /* LN: B advances by one column of K elements. */
  812. sll K, 0 + BASE_SHIFT, TEMP1
  813. add B, TEMP1, B
  814. #endif
  815. #if defined(LT) || defined(RN)
  /* LT/RN: BO was advanced past the column by the tiles above; adopt it as
     the new B. */
  816. mov BO, B
  817. #endif
  818. #ifdef RN
  819. add KK, 1, KK
  820. #endif
  821. #ifdef RT
  822. sub KK, 1, KK
  823. #endif
  /* Two-column pass: taken only when bit 1 of N is set, otherwise control
     goes to .LL200 (outside the part of the file seen here). */
  824. .LL100: /* n & 2 */
  825. and N, 2, J
  826. cmp J, 0
  827. ble,pn %icc, .LL200
  828. nop
  829. #ifdef RT
  /* Step B back by two columns (2*K elements) and C back by two columns. */
  830. sll K, 1 + BASE_SHIFT, TEMP1
  831. sub B, TEMP1, B
  832. sll LDC, 1, TEMP1
  833. sub C, TEMP1, C
  834. #endif
  /* C1/C2 = output pointers for the two columns of this pass. */
  835. mov C, C1
  836. add C, LDC, C2
  837. #ifdef LN
  838. add M, OFFSET, KK
  839. #endif
  840. #ifdef LT
  841. mov OFFSET, KK
  842. #endif
  843. #if defined(LN) || defined(RT)
  844. mov A, AORIG
  845. #else
  846. mov A, AO
  847. #endif
  848. #ifndef RT
  /* Advance C past both columns for the next pass. */
  849. add C2, LDC, C
  850. #endif
  /* I = M / 4: number of 4-row tiles. */
  851. sra M, 2, I
  852. cmp I, 0
  853. ble,pn %icc, .LL150
  /* delay slot: c03 is cleared here rather than in the tile preamble */
  854. FMOV FZERO, c03
  /* 4-row x 2-column tile, repeated I times.  c01..c04 hold the four rows of
     the first column (stored to C1) and c05..c08 the four rows of the second
     column (stored to C2).  c03 is cleared by the delay slot of the branch
     that enters or repeats this tile. */
  855. .LL121:
  856. #if defined(LT) || defined(RN)
  /* LT/RN: accumulate over the first KK k-steps, starting at the head of B. */
  857. sra KK, 2, L
  858. mov B, BO
  859. cmp L, 0
  860. #else
  861. #ifdef LN
  /* LN: move AORIG back by one 4-row tile (4*K elements). */
  862. sll K, 2 + BASE_SHIFT, TEMP1
  863. sub AORIG, TEMP1, AORIG
  864. #endif
  /* LN/RT: start KK k-steps in (4 elements of A and 2 of B per step) and
     accumulate over K-KK steps. */
  865. sll KK, 2 + BASE_SHIFT, TEMP1
  866. sll KK, 1 + BASE_SHIFT, TEMP2
  867. add AORIG, TEMP1, AO
  868. add B, TEMP2, BO
  869. sub K, KK, TEMP1
  870. sra TEMP1, 2, L
  871. cmp L, 0
  872. #endif
  /* Preload operands, clear the remaining accumulators and temporaries, and
     prefetch both output columns.  The condition codes from the cmp above
     stay live until the ble below. */
  873. LDF [AO + 0 * SIZE], a1
  874. FMOV FZERO, t1
  875. LDF [BO + 0 * SIZE], b1
  876. FMOV FZERO, c07
  877. LDF [AO + 1 * SIZE], a2
  878. FMOV FZERO, t2
  879. LDF [BO + 1 * SIZE], b2
  880. FMOV FZERO, c04
  881. LDF [AO + 2 * SIZE], a3
  882. FMOV FZERO, t3
  883. LDF [BO + 2 * SIZE], b3
  884. FMOV FZERO, c08
  885. LDF [AO + 3 * SIZE], a4
  886. FMOV FZERO, t4
  887. LDF [BO + 3 * SIZE], b4
  888. FMOV FZERO, c01
  889. prefetch [C1 + 3 * SIZE], 2
  890. FMOV FZERO, c05
  891. prefetch [C2 + 3 * SIZE], 2
  892. FMOV FZERO, c02
  893. ble,pn %icc, .LL125
  /* delay slot */
  894. FMOV FZERO, c06
  /* Main loop: four k-steps per iteration.  Each FADD consumes a product
     issued four FMULs earlier, so the first four FADDs of an iteration
     finish the previous k-step; the last products are drained at .LL129.
     AO and BO are advanced early, which is why later loads use negative
     offsets. */
  895. .LL122:
  896. FADD c03, t1, c03
  897. add L, -1, L
  898. FMUL a1, b1, t1
  899. prefetch [AO + APREFETCHSIZE * SIZE], 0
  900. FADD c07, t2, c07
  901. add BO, 8 * SIZE, BO
  902. FMUL a1, b2, t2
  903. LDF [AO + 4 * SIZE], a1
  904. FADD c04, t3, c04
  905. add AO, 16 * SIZE, AO
  906. FMUL a2, b1, t3
  /* sets the condition codes tested by the bg at the bottom of the loop */
  907. cmp L, 0
  908. FADD c08, t4, c08
  909. nop
  910. FMUL a2, b2, t4
  911. LDF [AO - 11 * SIZE], a2
  912. FADD c01, t1, c01
  913. nop
  914. FMUL a3, b1, t1
  915. nop
  916. FADD c05, t2, c05
  917. nop
  918. FMUL a3, b2, t2
  919. LDF [AO - 10 * SIZE], a3
  920. FADD c02, t3, c02
  921. nop
  922. FMUL a4, b1, t3
  923. LDF [BO - 4 * SIZE], b1
  924. FADD c06, t4, c06
  925. nop
  926. FMUL a4, b2, t4
  927. LDF [BO - 3 * SIZE], b2
  /* second k-step (operands b3/b4) */
  928. FADD c03, t1, c03
  929. nop
  930. FMUL a1, b3, t1
  931. LDF [AO - 9 * SIZE], a4
  932. FADD c07, t2, c07
  933. nop
  934. FMUL a1, b4, t2
  935. LDF [AO - 8 * SIZE], a1
  936. FADD c04, t3, c04
  937. nop
  938. FMUL a2, b3, t3
  939. nop
  940. FADD c08, t4, c08
  941. nop
  942. FMUL a2, b4, t4
  943. LDF [AO - 7 * SIZE], a2
  944. FADD c01, t1, c01
  945. nop
  946. FMUL a3, b3, t1
  947. nop
  948. FADD c05, t2, c05
  949. nop
  950. FMUL a3, b4, t2
  951. LDF [AO - 6 * SIZE], a3
  952. FADD c02, t3, c02
  953. nop
  954. FMUL a4, b3, t3
  955. LDF [BO - 2 * SIZE], b3
  956. FADD c06, t4, c06
  957. nop
  958. FMUL a4, b4, t4
  959. LDF [BO - 1 * SIZE], b4
  /* third k-step (operands b1/b2) */
  960. FADD c03, t1, c03
  961. nop
  962. FMUL a1, b1, t1
  963. LDF [AO - 5 * SIZE], a4
  964. FADD c07, t2, c07
  965. nop
  966. FMUL a1, b2, t2
  967. LDF [AO - 4 * SIZE], a1
  968. FADD c04, t3, c04
  969. nop
  970. FMUL a2, b1, t3
  971. nop
  972. FADD c08, t4, c08
  973. nop
  974. FMUL a2, b2, t4
  975. LDF [AO - 3 * SIZE], a2
  976. FADD c01, t1, c01
  977. nop
  978. FMUL a3, b1, t1
  979. nop
  980. FADD c05, t2, c05
  981. nop
  982. FMUL a3, b2, t2
  983. LDF [AO - 2 * SIZE], a3
  984. FADD c02, t3, c02
  985. nop
  986. FMUL a4, b1, t3
  987. LDF [BO + 0 * SIZE], b1
  988. FADD c06, t4, c06
  989. nop
  990. FMUL a4, b2, t4
  991. LDF [BO + 1 * SIZE], b2
  /* fourth k-step (operands b3/b4) */
  992. FADD c03, t1, c03
  993. nop
  994. FMUL a1, b3, t1
  995. LDF [AO - 1 * SIZE], a4
  996. FADD c07, t2, c07
  997. nop
  998. FMUL a1, b4, t2
  999. LDF [AO + 0 * SIZE], a1
  1000. FADD c04, t3, c04
  1001. nop
  1002. FMUL a2, b3, t3
  1003. nop
  1004. FADD c08, t4, c08
  1005. nop
  1006. FMUL a2, b4, t4
  1007. LDF [AO + 1 * SIZE], a2
  1008. FADD c01, t1, c01
  1009. nop
  1010. FMUL a3, b3, t1
  1011. nop
  1012. FADD c05, t2, c05
  1013. nop
  1014. FMUL a3, b4, t2
  1015. LDF [AO + 2 * SIZE], a3
  1016. FADD c02, t3, c02
  1017. nop
  1018. FMUL a4, b3, t3
  1019. LDF [BO + 2 * SIZE], b3
  1020. FADD c06, t4, c06
  1021. FMUL a4, b4, t4
  1022. LDF [AO + 3 * SIZE], a4
  1023. bg,pt %icc, .LL122
  /* delay slot */
  1024. LDF [BO + 3 * SIZE], b4
  /* Remainder: 0..3 leftover k-steps. */
  1025. .LL125:
  1026. #if defined(LT) || defined(RN)
  1027. and KK, 3, L
  1028. #else
  1029. and TEMP1, 3, L
  1030. #endif
  1031. cmp L, 0
  1032. ble,a,pn %icc, .LL129
  1033. nop
  /* One k-step per iteration, same late-accumulate scheme as .LL122. */
  1034. .LL126:
  1035. FADD c03, t1, c03
  1036. add AO, 4 * SIZE, AO
  1037. FMUL a1, b1, t1
  1038. add BO, 2 * SIZE, BO
  1039. FADD c07, t2, c07
  1040. add L, -1, L
  1041. FMUL a1, b2, t2
  1042. LDF [AO + 0 * SIZE], a1
  1043. FADD c04, t3, c04
  1044. cmp L, 0
  1045. FMUL a2, b1, t3
  1046. FADD c08, t4, c08
  1047. FMUL a2, b2, t4
  1048. LDF [AO + 1 * SIZE], a2
  1049. FADD c01, t1, c01
  1050. FMUL a3, b1, t1
  1051. FADD c05, t2, c05
  1052. FMUL a3, b2, t2
  1053. LDF [AO + 2 * SIZE], a3
  1054. FADD c02, t3, c02
  1055. FMUL a4, b1, t3
  1056. LDF [BO + 0 * SIZE], b1
  1057. FADD c06, t4, c06
  1058. FMUL a4, b2, t4
  1059. LDF [BO + 1 * SIZE], b2
  1060. bg,pt %icc, .LL126
  /* delay slot */
  1061. LDF [AO + 3 * SIZE], a4
  /* Drain the last four products into c03, c07, c04 and c08. */
  1062. .LL129:
  1063. FADD c03, t1, c03
  1064. FADD c07, t2, c07
  1065. FADD c04, t3, c04
  1066. FADD c08, t4, c08
  1067. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at the block to be solved: KK-4 steps in for LN, KK-2
     steps in for RT. */
  1068. #ifdef LN
  1069. sub KK, 4, TEMP1
  1070. #else
  1071. sub KK, 2, TEMP1
  1072. #endif
  1073. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  1074. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1075. add AORIG, TEMP2, AO
  1076. add B, TEMP1, BO
  1077. #endif
  /* c = stored right-hand side - accumulated product.  The BO buffer
     interleaves the two columns row by row (c01,c05,c02,c06,...); the AO
     buffer holds one column after the other (c01..c04, then c05..c08). */
  1078. #if defined(LN) || defined(LT)
  1079. LDF [BO + 0 * SIZE], a1
  1080. LDF [BO + 1 * SIZE], a2
  1081. LDF [BO + 2 * SIZE], a3
  1082. LDF [BO + 3 * SIZE], a4
  1083. LDF [BO + 4 * SIZE], b1
  1084. LDF [BO + 5 * SIZE], b2
  1085. LDF [BO + 6 * SIZE], b3
  1086. LDF [BO + 7 * SIZE], b4
  1087. FSUB a1, c01, c01
  1088. FSUB a2, c05, c05
  1089. FSUB a3, c02, c02
  1090. FSUB a4, c06, c06
  1091. FSUB b1, c03, c03
  1092. FSUB b2, c07, c07
  1093. FSUB b3, c04, c04
  1094. FSUB b4, c08, c08
  1095. #else
  1096. LDF [AO + 0 * SIZE], a1
  1097. LDF [AO + 1 * SIZE], a2
  1098. LDF [AO + 2 * SIZE], a3
  1099. LDF [AO + 3 * SIZE], a4
  1100. LDF [AO + 4 * SIZE], b1
  1101. LDF [AO + 5 * SIZE], b2
  1102. LDF [AO + 6 * SIZE], b3
  1103. LDF [AO + 7 * SIZE], b4
  1104. FSUB a1, c01, c01
  1105. FSUB a2, c02, c02
  1106. FSUB a3, c03, c03
  1107. FSUB a4, c04, c04
  1108. FSUB b1, c05, c05
  1109. FSUB b2, c06, c06
  1110. FSUB b3, c07, c07
  1111. FSUB b4, c08, c08
  1112. #endif
  1113. #ifdef LN
  /* Backward substitution over the 4x4 block at AO, applied to both columns
     in parallel (row 4 first).  Diagonal entries are multiplied rather than
     divided, so the packed block presumably stores reciprocals - confirm
     against the packing routine. */
  1114. LDF [AO + 15 * SIZE], a1
  1115. LDF [AO + 14 * SIZE], a2
  1116. LDF [AO + 13 * SIZE], a3
  1117. LDF [AO + 12 * SIZE], a4
  1118. FMUL a1, c04, c04
  1119. FMUL a1, c08, c08
  1120. FMUL a2, c04, t1
  1121. FMUL a2, c08, t2
  1122. FSUB c03, t1, c03
  1123. FSUB c07, t2, c07
  1124. FMUL a3, c04, t1
  1125. FMUL a3, c08, t2
  1126. FSUB c02, t1, c02
  1127. FSUB c06, t2, c06
  1128. FMUL a4, c04, t1
  1129. FMUL a4, c08, t2
  1130. FSUB c01, t1, c01
  1131. FSUB c05, t2, c05
  1132. LDF [AO + 10 * SIZE], a1
  1133. LDF [AO + 9 * SIZE], a2
  1134. LDF [AO + 8 * SIZE], a3
  1135. FMUL a1, c03, c03
  1136. FMUL a1, c07, c07
  1137. FMUL a2, c03, t1
  1138. FMUL a2, c07, t2
  1139. FSUB c02, t1, c02
  1140. FSUB c06, t2, c06
  1141. FMUL a3, c03, t1
  1142. FMUL a3, c07, t2
  1143. FSUB c01, t1, c01
  1144. FSUB c05, t2, c05
  1145. LDF [AO + 5 * SIZE], a1
  1146. LDF [AO + 4 * SIZE], a2
  1147. FMUL a1, c02, c02
  1148. FMUL a1, c06, c06
  1149. FMUL a2, c02, t1
  1150. FMUL a2, c06, t2
  1151. FSUB c01, t1, c01
  1152. FSUB c05, t2, c05
  1153. LDF [AO + 0 * SIZE], a1
  1154. FMUL a1, c01, c01
  1155. FMUL a1, c05, c05
  1156. #endif
  1157. #ifdef LT
  /* Forward substitution over the 4x4 block at AO, applied to both columns
     in parallel (row 1 first). */
  1158. LDF [AO + 0 * SIZE], a1
  1159. LDF [AO + 1 * SIZE], a2
  1160. LDF [AO + 2 * SIZE], a3
  1161. LDF [AO + 3 * SIZE], a4
  1162. FMUL a1, c01, c01
  1163. FMUL a1, c05, c05
  1164. FMUL a2, c01, t1
  1165. FMUL a2, c05, t2
  1166. FSUB c02, t1, c02
  1167. FSUB c06, t2, c06
  1168. FMUL a3, c01, t1
  1169. FMUL a3, c05, t2
  1170. FSUB c03, t1, c03
  1171. FSUB c07, t2, c07
  1172. FMUL a4, c01, t1
  1173. FMUL a4, c05, t2
  1174. FSUB c04, t1, c04
  1175. FSUB c08, t2, c08
  1176. LDF [AO + 5 * SIZE], a1
  1177. LDF [AO + 6 * SIZE], a2
  1178. LDF [AO + 7 * SIZE], a3
  1179. FMUL a1, c02, c02
  1180. FMUL a1, c06, c06
  1181. FMUL a2, c02, t1
  1182. FMUL a2, c06, t2
  1183. FSUB c03, t1, c03
  1184. FSUB c07, t2, c07
  1185. FMUL a3, c02, t1
  1186. FMUL a3, c06, t2
  1187. FSUB c04, t1, c04
  1188. FSUB c08, t2, c08
  1189. LDF [AO + 10 * SIZE], a1
  1190. LDF [AO + 11 * SIZE], a2
  1191. FMUL a1, c03, c03
  1192. FMUL a1, c07, c07
  1193. FMUL a2, c03, t1
  1194. FMUL a2, c07, t2
  1195. FSUB c04, t1, c04
  1196. FSUB c08, t2, c08
  1197. LDF [AO + 15 * SIZE], a1
  1198. FMUL a1, c04, c04
  1199. FMUL a1, c08, c08
  1200. #endif
  1201. #ifdef RN
  /* 2x2 solve across the columns using BO[0], BO[1] and BO[3]: first column
     is scaled, eliminated from the second, then the second is scaled. */
  1202. LDF [BO + 0 * SIZE], a1
  1203. LDF [BO + 1 * SIZE], a2
  1204. LDF [BO + 3 * SIZE], a3
  1205. FMUL a1, c01, c01
  1206. FMUL a1, c02, c02
  1207. FMUL a1, c03, c03
  1208. FMUL a1, c04, c04
  1209. FMUL a2, c01, t1
  1210. FMUL a2, c02, t2
  1211. FMUL a2, c03, t3
  1212. FMUL a2, c04, t4
  1213. FSUB c05, t1, c05
  1214. FSUB c06, t2, c06
  1215. FSUB c07, t3, c07
  1216. FSUB c08, t4, c08
  1217. FMUL a3, c05, c05
  1218. FMUL a3, c06, c06
  1219. FMUL a3, c07, c07
  1220. FMUL a3, c08, c08
  1221. #endif
  1222. #ifdef RT
  /* Mirror image of the RN case using BO[3], BO[2] and BO[0]: second column
     is scaled first, eliminated from the first, then the first is scaled. */
  1223. LDF [BO + 3 * SIZE], a1
  1224. LDF [BO + 2 * SIZE], a2
  1225. LDF [BO + 0 * SIZE], a3
  1226. FMUL a1, c05, c05
  1227. FMUL a1, c06, c06
  1228. FMUL a1, c07, c07
  1229. FMUL a1, c08, c08
  1230. FMUL a2, c05, t1
  1231. FMUL a2, c06, t2
  1232. FMUL a2, c07, t3
  1233. FMUL a2, c08, t4
  1234. FSUB c01, t1, c01
  1235. FSUB c02, t2, c02
  1236. FSUB c03, t3, c03
  1237. FSUB c04, t4, c04
  1238. FMUL a3, c01, c01
  1239. FMUL a3, c02, c02
  1240. FMUL a3, c03, c03
  1241. FMUL a3, c04, c04
  1242. #endif
  1243. #ifdef LN
  /* LN writes C backwards: step both column pointers back before storing. */
  1244. add C1, -4 * SIZE, C1
  1245. add C2, -4 * SIZE, C2
  1246. #endif
  /* Write the solved values back to the packed buffer in the same layout
     they were read from... */
  1247. #if defined(LN) || defined(LT)
  1248. STF c01, [BO + 0 * SIZE]
  1249. STF c05, [BO + 1 * SIZE]
  1250. STF c02, [BO + 2 * SIZE]
  1251. STF c06, [BO + 3 * SIZE]
  1252. STF c03, [BO + 4 * SIZE]
  1253. STF c07, [BO + 5 * SIZE]
  1254. STF c04, [BO + 6 * SIZE]
  1255. STF c08, [BO + 7 * SIZE]
  1256. #else
  1257. STF c01, [AO + 0 * SIZE]
  1258. STF c02, [AO + 1 * SIZE]
  1259. STF c03, [AO + 2 * SIZE]
  1260. STF c04, [AO + 3 * SIZE]
  1261. STF c05, [AO + 4 * SIZE]
  1262. STF c06, [AO + 5 * SIZE]
  1263. STF c07, [AO + 6 * SIZE]
  1264. STF c08, [AO + 7 * SIZE]
  1265. #endif
  /* ...and to the two output columns. */
  1266. STF c01, [C1 + 0 * SIZE]
  1267. STF c02, [C1 + 1 * SIZE]
  1268. STF c03, [C1 + 2 * SIZE]
  1269. STF c04, [C1 + 3 * SIZE]
  1270. STF c05, [C2 + 0 * SIZE]
  1271. STF c06, [C2 + 1 * SIZE]
  1272. STF c07, [C2 + 2 * SIZE]
  1273. STF c08, [C2 + 3 * SIZE]
  /* The solve used t1..t4 as scratch; clear them again. */
  1274. FMOV FZERO, t1
  1275. FMOV FZERO, t2
  1276. FMOV FZERO, t3
  1277. FMOV FZERO, t4
  1278. #ifndef LN
  1279. add C1, 4 * SIZE, C1
  1280. add C2, 4 * SIZE, C2
  1281. #endif
  1282. #ifdef RT
  /* RT: advance AORIG to the next 4-row tile (4*K elements). */
  1283. sll K, 2 + BASE_SHIFT, TEMP1
  1284. add AORIG, TEMP1, AORIG
  1285. #endif
  1286. #if defined(LT) || defined(RN)
  /* Skip the K-KK k-steps not consumed by the accumulation loop. */
  1287. sub K, KK, TEMP1
  1288. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  1289. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1290. add AO, TEMP2, AO
  1291. add BO, TEMP1, BO
  1292. #endif
  1293. #ifdef LT
  1294. add KK, 4, KK
  1295. #endif
  1296. #ifdef LN
  1297. sub KK, 4, KK
  1298. #endif
  /* Next 4-row tile. */
  1299. add I, -1, I
  1300. cmp I, 0
  1301. bg,pt %icc, .LL121
  /* delay slot: clear c03 for the next tile */
  1302. FMOV FZERO, c03
  /*
   * .LL150: set-up for the remainder section that is taken when bit 1 of M
   * is set (and M, 2).  If the bit is clear, control goes straight to .LL170.
   *
   *   LT / RN : L = KK >> 2, BO restarts at B; AO is not reloaded here and
   *             keeps whatever value the preceding code left in it.
   *   LN / RT : (LN only) AORIG is first moved back by K << (1 + BASE_SHIFT);
   *             then AO = AORIG + (KK << (1 + BASE_SHIFT)),
   *                  BO = B     + (KK << (1 + BASE_SHIFT)),
   *             TEMP1 = K - KK and L = TEMP1 >> 2.
   *
   * L is therefore the number of 4-step passes of the unrolled loop .LL152.
   * The first four A values (a1..a4) and B values (b1..b4) are preloaded and
   * c01..c04 / t1..t4 are cleared from FZERO before the loop is entered.
   * NOTE(review): BASE_SHIFT is presumably log2 of the element size in bytes;
   * it is defined outside this chunk.
   */
  1303. .LL150:
  1304. and M, 2, I
  1305. cmp I, 0
  1306. ble,pn %icc, .LL170
  /* branch delay slot */
  1307. nop
  1308. #if defined(LT) || defined(RN)
  1309. sra KK, 2, L
  1310. mov B, BO
  1311. cmp L, 0
  1312. #else
  1313. #ifdef LN
  1314. sll K, 1 + BASE_SHIFT, TEMP1
  1315. sub AORIG, TEMP1, AORIG
  1316. #endif
  1317. sll KK, 1 + BASE_SHIFT, TEMP1
  1318. sll KK, 1 + BASE_SHIFT, TEMP2
  1319. add AORIG, TEMP1, AO
  1320. add B, TEMP2, BO
  /* TEMP1 = K - KK is kept: .LL155 reads it again for the K remainder. */
  1321. sub K, KK, TEMP1
  1322. sra TEMP1, 2, L
  1323. cmp L, 0
  1324. #endif
  /* Preload operands and clear accumulators; loads are interleaved with FMOVs. */
  1325. LDF [AO + 0 * SIZE], a1
  1326. FMOV FZERO, c01
  1327. LDF [BO + 0 * SIZE], b1
  1328. FMOV FZERO, t1
  1329. LDF [AO + 1 * SIZE], a2
  /* Repeats the compare already done in both preprocessor branches above. */
  1330. cmp L, 0
  1331. FMOV FZERO, c02
  1332. LDF [BO + 1 * SIZE], b2
  1333. FMOV FZERO, t2
  1334. LDF [AO + 2 * SIZE], a3
  1335. FMOV FZERO, c03
  1336. LDF [BO + 2 * SIZE], b3
  1337. FMOV FZERO, t3
  1338. LDF [AO + 3 * SIZE], a4
  1339. FMOV FZERO, c04
  1340. LDF [BO + 3 * SIZE], b4
  1341. FMOV FZERO, t4
  /* No full 4-step pass to run: go to the K remainder handling. */
  1342. ble,pn %icc, .LL155
  1343. nop
  /*
   * .LL152: main K loop for the two-by-two accumulator set c01..c04.
   * Each pass consumes four K steps: AO and BO both advance by 8 * SIZE,
   * i.e. two A values and two B values per step.
   *
   * The loop is software-pipelined: every FADD folds the product that the
   * matching FMUL produced one step earlier (held in t1..t4) into its
   * accumulator, so one set of products is still pending in t1..t4 when the
   * loop exits.  Loads for the following steps are interleaved with the
   * arithmetic; BO is bumped early, which is why B operands are addressed
   * with negative offsets in the first half of the body.
   *
   * L is decremented at the top and compared against zero mid-body; none of
   * the later instructions in the body writes the integer condition codes,
   * so the bg at the bottom still sees that compare.
   */
  1344. .LL152:
  /* K step 1 of 4: a1,a2 x b1,b2 */
  1345. FADD c01, t1, c01
  1346. add L, -1, L
  1347. FMUL a1, b1, t1
  1348. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1349. FADD c02, t2, c02
  1350. add BO, 8 * SIZE, BO
  1351. FMUL a1, b2, t2
  1352. LDF [AO + 4 * SIZE], a1
  1353. FADD c03, t3, c03
  1354. cmp L, 0
  1355. FMUL a2, b1, t3
  1356. LDF [BO - 4 * SIZE], b1
  1357. FADD c04, t4, c04
  1358. nop
  1359. FMUL a2, b2, t4
  1360. LDF [AO + 5 * SIZE], a2
  /* K step 2 of 4: a3,a4 x b3,b4 */
  1361. FADD c01, t1, c01
  1362. nop
  1363. FMUL a3, b3, t1
  1364. LDF [BO - 3 * SIZE], b2
  1365. FADD c02, t2, c02
  1366. nop
  1367. FMUL a3, b4, t2
  1368. LDF [AO + 6 * SIZE], a3
  1369. FADD c03, t3, c03
  1370. nop
  1371. FMUL a4, b3, t3
  1372. LDF [BO - 2 * SIZE], b3
  1373. FADD c04, t4, c04
  1374. nop
  1375. FMUL a4, b4, t4
  1376. LDF [AO + 7 * SIZE], a4
  /* K step 3 of 4: reloaded a1,a2 x b1,b2 */
  1377. FADD c01, t1, c01
  1378. nop
  1379. FMUL a1, b1, t1
  1380. LDF [BO - 1 * SIZE], b4
  1381. FADD c02, t2, c02
  1382. FMUL a1, b2, t2
  1383. LDF [AO + 8 * SIZE], a1
  1384. FADD c03, t3, c03
  1385. FMUL a2, b1, t3
  1386. LDF [BO + 0 * SIZE], b1
  1387. FADD c04, t4, c04
  1388. FMUL a2, b2, t4
  1389. LDF [AO + 9 * SIZE], a2
  /* K step 4 of 4: reloaded a3,a4 x b3,b4 */
  1390. FADD c01, t1, c01
  1391. FMUL a3, b3, t1
  1392. LDF [BO + 1 * SIZE], b2
  1393. FADD c02, t2, c02
  1394. FMUL a3, b4, t2
  1395. LDF [AO + 10 * SIZE], a3
  1396. FADD c03, t3, c03
  1397. FMUL a4, b3, t3
  1398. LDF [BO + 2 * SIZE], b3
  1399. FADD c04, t4, c04
  1400. FMUL a4, b4, t4
  1401. LDF [AO + 11 * SIZE], a4
  1402. add AO, 8 * SIZE, AO
  1403. bg,pt %icc, .LL152
  /* branch delay slot: last operand reload for the next pass */
  1404. LDF [BO + 3 * SIZE], b4
  /*
   * .LL155 / .LL156: K remainder for the two-by-two accumulators.
   * L = (KK & 3) for LT / RN, otherwise (TEMP1 & 3), where TEMP1 still holds
   * K - KK from the set-up at .LL150.  If L <= 0 the loop is skipped.
   *
   * Each pass of .LL156 handles a single K step: it reloads a1,a2 / b1,b2,
   * folds the pending products t1..t4 into c01..c04, forms the next four
   * products and advances AO and BO by 2 * SIZE.  The products of the last
   * pass remain in t1..t4 and are added at .LL159.
   */
  1405. .LL155:
  1406. #if defined(LT) || defined(RN)
  1407. and KK, 3, L
  1408. #else
  1409. and TEMP1, 3, L
  1410. #endif
  1411. cmp L, 0
  /* Annulled branch: the delay-slot nop is only executed when the branch is taken. */
  1412. ble,a,pn %icc, .LL159
  1413. nop
  1414. .LL156:
  1415. LDF [AO + 0 * SIZE], a1
  1416. LDF [AO + 1 * SIZE], a2
  1417. LDF [BO + 0 * SIZE], b1
  1418. LDF [BO + 1 * SIZE], b2
  1419. FADD c01, t1, c01
  1420. FADD c02, t2, c02
  1421. FADD c03, t3, c03
  1422. FADD c04, t4, c04
  1423. FMUL a1, b1, t1
  1424. FMUL a1, b2, t2
  1425. FMUL a2, b1, t3
  1426. FMUL a2, b2, t4
  1427. add AO, 2 * SIZE, AO
  1428. add BO, 2 * SIZE, BO
  1429. add L, -1, L
  1430. cmp L, 0
  1431. bg,pt %icc, .LL156
  1432. nop
  /*
   * .LL159: finish the two-by-two tile.
   *
   *  1. Drain the pipeline: add the last pending products t1..t4 to c01..c04.
   *     Accumulator roles, as fixed by the FMULs above:
   *       c01 = a1*b1, c02 = a1*b2, c03 = a2*b1, c04 = a2*b2.
   *  2. LN / RT: point AO and BO at offset (KK - 2) << (1 + BASE_SHIFT) from
   *     AORIG and B respectively (both preprocessor arms compute KK - 2).
   *  3. Load the four right-hand-side values (from BO for LN / LT, from AO
   *     for RN / RT) and replace each accumulator by "loaded - accumulator".
   *     NOTE(review): assumes the FSUB macro follows the SPARC fsub operand
   *     order (rd = rs1 - rs2); the macro is defined outside this chunk.
   *  4. Apply the 2x2 triangular solve selected by LN / LT / RN / RT using
   *     three coefficients read from AO (LN, LT) or BO (RN, RT).  The
   *     coefficients at offsets 0 and 3 are applied with FMUL; presumably the
   *     packed diagonal already holds reciprocals, since no divide is used,
   *     but confirm against the packing routine.
   *  5. Store the solved values back into the packed buffer (BO for LN / LT,
   *     AO otherwise) and into C1[0..1] / C2[0..1].
   *  6. Clear t1..t4 and update C1, C2, AORIG, AO, BO and KK for the next
   *     section according to the mode.
   */
  1433. .LL159:
  1434. FADD c01, t1, c01
  1435. FADD c02, t2, c02
  1436. FADD c03, t3, c03
  1437. FADD c04, t4, c04
  1438. #if defined(LN) || defined(RT)
  /* Both arms below are identical: TEMP1 = KK - 2 in either mode. */
  1439. #ifdef LN
  1440. sub KK, 2, TEMP1
  1441. #else
  1442. sub KK, 2, TEMP1
  1443. #endif
  1444. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  1445. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1446. add AORIG, TEMP2, AO
  1447. add B, TEMP1, BO
  1448. #endif
  1449. #if defined(LN) || defined(LT)
  /* Packed B order: c01, c02, c03, c04. */
  1450. LDF [BO + 0 * SIZE], a1
  1451. LDF [BO + 1 * SIZE], a2
  1452. LDF [BO + 2 * SIZE], a3
  1453. LDF [BO + 3 * SIZE], a4
  1454. FSUB a1, c01, c01
  1455. FSUB a2, c02, c02
  1456. FSUB a3, c03, c03
  1457. FSUB a4, c04, c04
  1458. #else
  /* Packed A order: c01, c03, c02, c04. */
  1459. LDF [AO + 0 * SIZE], a1
  1460. LDF [AO + 1 * SIZE], a2
  1461. LDF [AO + 2 * SIZE], a3
  1462. LDF [AO + 3 * SIZE], a4
  1463. FSUB a1, c01, c01
  1464. FSUB a2, c03, c03
  1465. FSUB a3, c02, c02
  1466. FSUB a4, c04, c04
  1467. #endif
  1468. #ifdef LN
  /* LN: scale c03,c04 first, eliminate them from c01,c02, then scale c01,c02. */
  1469. LDF [AO + 3 * SIZE], a1
  1470. LDF [AO + 2 * SIZE], a2
  1471. LDF [AO + 0 * SIZE], a3
  1472. FMUL a1, c03, c03
  1473. FMUL a1, c04, c04
  1474. FMUL a2, c03, t1
  1475. FMUL a2, c04, t2
  1476. FSUB c01, t1, c01
  1477. FSUB c02, t2, c02
  1478. FMUL a3, c01, c01
  1479. FMUL a3, c02, c02
  1480. #endif
  1481. #ifdef LT
  /* LT: scale c01,c02 first, eliminate them from c03,c04, then scale c03,c04. */
  1482. LDF [AO + 0 * SIZE], a1
  1483. LDF [AO + 1 * SIZE], a2
  1484. LDF [AO + 3 * SIZE], a3
  1485. FMUL a1, c01, c01
  1486. FMUL a1, c02, c02
  1487. FMUL a2, c01, t1
  1488. FMUL a2, c02, t2
  1489. FSUB c03, t1, c03
  1490. FSUB c04, t2, c04
  1491. FMUL a3, c03, c03
  1492. FMUL a3, c04, c04
  1493. #endif
  1494. #ifdef RN
  /* RN: scale c01,c03 first, eliminate them from c02,c04, then scale c02,c04. */
  1495. LDF [BO + 0 * SIZE], a1
  1496. LDF [BO + 1 * SIZE], a2
  1497. LDF [BO + 3 * SIZE], a3
  1498. FMUL a1, c01, c01
  1499. FMUL a1, c03, c03
  1500. FMUL a2, c01, t1
  1501. FMUL a2, c03, t2
  1502. FSUB c02, t1, c02
  1503. FSUB c04, t2, c04
  1504. FMUL a3, c02, c02
  1505. FMUL a3, c04, c04
  1506. #endif
  1507. #ifdef RT
  /* RT: scale c02,c04 first, eliminate them from c01,c03, then scale c01,c03. */
  1508. LDF [BO + 3 * SIZE], a1
  1509. LDF [BO + 2 * SIZE], a2
  1510. LDF [BO + 0 * SIZE], a3
  1511. FMUL a1, c02, c02
  1512. FMUL a1, c04, c04
  1513. FMUL a2, c02, t1
  1514. FMUL a2, c04, t2
  1515. FSUB c01, t1, c01
  1516. FSUB c03, t2, c03
  1517. FMUL a3, c01, c01
  1518. FMUL a3, c03, c03
  1519. #endif
  1520. #ifdef LN
  /* LN steps the C pointers back before storing instead of forward after. */
  1521. add C1, -2 * SIZE, C1
  1522. add C2, -2 * SIZE, C2
  1523. #endif
  /* Write the solved values back to the packed buffer, same order as the loads above. */
  1524. #if defined(LN) || defined(LT)
  1525. STF c01, [BO + 0 * SIZE]
  1526. STF c02, [BO + 1 * SIZE]
  1527. STF c03, [BO + 2 * SIZE]
  1528. STF c04, [BO + 3 * SIZE]
  1529. #else
  1530. STF c01, [AO + 0 * SIZE]
  1531. STF c03, [AO + 1 * SIZE]
  1532. STF c02, [AO + 2 * SIZE]
  1533. STF c04, [AO + 3 * SIZE]
  1534. #endif
  /* Write the result to C: c01,c03 go to C1, c02,c04 go to C2. */
  1535. STF c01, [C1 + 0 * SIZE]
  1536. STF c03, [C1 + 1 * SIZE]
  1537. STF c02, [C2 + 0 * SIZE]
  1538. STF c04, [C2 + 1 * SIZE]
  /* t1..t4 were used as scratch by the solve; clear them again. */
  1539. FMOV FZERO, t1
  1540. FMOV FZERO, t2
  1541. FMOV FZERO, t3
  1542. FMOV FZERO, t4
  1543. #ifndef LN
  1544. add C1, 2 * SIZE, C1
  1545. add C2, 2 * SIZE, C2
  1546. #endif
  1547. #ifdef RT
  1548. sll K, 1 + BASE_SHIFT, TEMP1
  1549. add AORIG, TEMP1, AORIG
  1550. #endif
  1551. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps of both packed panels. */
  1552. sub K, KK, TEMP1
  1553. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  1554. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1555. add AO, TEMP2, AO
  1556. add BO, TEMP1, BO
  1557. #endif
  1558. #ifdef LT
  1559. add KK, 2, KK
  1560. #endif
  1561. #ifdef LN
  1562. sub KK, 2, KK
  1563. #endif
  /*
   * .LL170: set-up for the remainder section that is taken when bit 0 of M
   * is set (and M, 1).  If the bit is clear, control goes to .LL199.
   *
   *   LT / RN : L = KK >> 2, BO restarts at B; AO is not reloaded here.
   *   LN / RT : (LN only) AORIG is first moved back by K << (0 + BASE_SHIFT);
   *             then AO = AORIG + (KK << (0 + BASE_SHIFT)),
   *                  BO = B     + (KK << (1 + BASE_SHIFT)),
   *             TEMP1 = K - KK and L = TEMP1 >> 2.
   *
   * A is scaled by one element per K step and B by two, matching one A value
   * and two B values per step in the loops below.  a1..a4 are preloaded with
   * four consecutive A values and b1..b4 with four consecutive B values;
   * c01..c04 and t1..t4 are cleared from FZERO.
   */
  1564. .LL170:
  1565. and M, 1, I
  1566. cmp I, 0
  1567. ble,pn %icc, .LL199
  /* branch delay slot */
  1568. nop
  1569. #if defined(LT) || defined(RN)
  1570. sra KK, 2, L
  1571. mov B, BO
  1572. cmp L, 0
  1573. #else
  1574. #ifdef LN
  1575. sll K, 0 + BASE_SHIFT, TEMP1
  1576. sub AORIG, TEMP1, AORIG
  1577. #endif
  1578. sll KK, 0 + BASE_SHIFT, TEMP1
  1579. sll KK, 1 + BASE_SHIFT, TEMP2
  1580. add AORIG, TEMP1, AO
  1581. add B, TEMP2, BO
  /* TEMP1 = K - KK is kept: .LL175 reads it again for the K remainder. */
  1582. sub K, KK, TEMP1
  1583. sra TEMP1, 2, L
  1584. cmp L, 0
  1585. #endif
  /* The compare above stays valid: loads and FMOVs do not touch %icc. */
  1586. LDF [AO + 0 * SIZE], a1
  1587. FMOV FZERO, c01
  1588. LDF [BO + 0 * SIZE], b1
  1589. FMOV FZERO, t1
  1590. LDF [AO + 1 * SIZE], a2
  1591. FMOV FZERO, c02
  1592. LDF [BO + 1 * SIZE], b2
  1593. FMOV FZERO, t2
  1594. LDF [AO + 2 * SIZE], a3
  1595. FMOV FZERO, c03
  1596. LDF [BO + 2 * SIZE], b3
  1597. FMOV FZERO, t3
  1598. LDF [AO + 3 * SIZE], a4
  1599. FMOV FZERO, c04
  1600. LDF [BO + 3 * SIZE], b4
  1601. FMOV FZERO, t4
  /* No full 4-step pass to run: go to the K remainder handling. */
  1602. ble,pn %icc, .LL175
  1603. nop
  /*
   * .LL172: main K loop for the one-row, two-column case.
   * Each pass consumes four K steps: AO advances by 4 * SIZE (one A value per
   * step) and BO by 8 * SIZE (two B values per step).
   *
   * Two accumulator pairs are used alternately so consecutive steps do not
   * depend on each other: steps using a1 / a3 feed c01,c02 and steps using
   * a2 / a4 feed c03,c04.  The pairs are summed together at .LL179.
   * As in the other loops, each FADD adds the product left in t1..t4 by the
   * previous use of that temporary.
   *
   * AO is bumped at the top, so the A reloads use offsets 0..3 of the new AO;
   * BO is bumped at the bottom, so the B reloads use offsets 4..11 of the old BO.
   */
  1604. .LL172:
  1605. FADD c01, t1, c01
  1606. add AO, 4 * SIZE, AO
  1607. FMUL a1, b1, t1
  1608. LDF [BO + 4 * SIZE], b1
  1609. FADD c02, t2, c02
  1610. FMUL a1, b2, t2
  1611. LDF [BO + 5 * SIZE], b2
  1612. add L, -1, L
  1613. LDF [AO + 0 * SIZE], a1
  1614. FADD c03, t3, c03
  /* Sets %icc for the bg at the bottom; nothing in between rewrites it. */
  1615. cmp L, 0
  1616. FMUL a2, b3, t3
  1617. LDF [BO + 6 * SIZE], b3
  1618. FADD c04, t4, c04
  1619. FMUL a2, b4, t4
  1620. LDF [BO + 7 * SIZE], b4
  1621. LDF [AO + 1 * SIZE], a2
  1622. FADD c01, t1, c01
  1623. FMUL a3, b1, t1
  1624. LDF [BO + 8 * SIZE], b1
  1625. FADD c02, t2, c02
  1626. FMUL a3, b2, t2
  1627. LDF [BO + 9 * SIZE], b2
  1628. LDF [AO + 2 * SIZE], a3
  1629. FADD c03, t3, c03
  1630. FMUL a4, b3, t3
  1631. LDF [BO + 10 * SIZE], b3
  1632. FADD c04, t4, c04
  1633. FMUL a4, b4, t4
  1634. LDF [BO + 11 * SIZE], b4
  1635. add BO, 8 * SIZE, BO
  1636. bg,pt %icc, .LL172
  /* branch delay slot: last operand reload for the next pass */
  1637. LDF [AO + 3 * SIZE], a4
  /*
   * .LL175 / .LL176: K remainder for the one-row, two-column case.
   * L = (KK & 3) for LT / RN, otherwise (TEMP1 & 3) with TEMP1 = K - KK from
   * the set-up at .LL170.  If L <= 0 the loop is skipped.
   *
   * Each pass of .LL176 handles one K step using only a1, b1, b2 and the
   * accumulators c01, c02; it relies on a1, b1 and b2 already holding the
   * operands for the current step (preloaded by the set-up or reloaded by
   * the previous pass).  AO advances by 1 * SIZE and BO by 2 * SIZE.
   */
  1638. .LL175:
  1639. #if defined(LT) || defined(RN)
  1640. and KK, 3, L
  1641. #else
  1642. and TEMP1, 3, L
  1643. #endif
  1644. cmp L, 0
  /* Annulled branch: the delay-slot nop is only executed when the branch is taken. */
  1645. ble,a,pn %icc, .LL179
  1646. nop
  1647. .LL176:
  1648. FADD c01, t1, c01
  1649. add L, -1, L
  1650. FMUL a1, b1, t1
  1651. add AO, 1 * SIZE, AO
  1652. LDF [BO + 2 * SIZE], b1
  1653. FADD c02, t2, c02
  1654. cmp L, 0
  1655. FMUL a1, b2, t2
  1656. LDF [BO + 3 * SIZE], b2
  1657. add BO, 2 * SIZE, BO
  1658. bg,pt %icc, .LL176
  /* branch delay slot: A operand for the next step (AO was already advanced) */
  1659. LDF [AO + 0 * SIZE], a1
  /*
   * .LL179: finish the one-row, two-column tile.
   *
   *  1. Drain the pipeline (t1..t4 into c01..c04) and merge the two partial
   *     accumulator pairs: c01 += c03, c02 += c04.
   *  2. LN / RT: reposition AO and BO.  TEMP1 = KK - 1 for LN and KK - 2 for
   *     RT; AO = AORIG + (TEMP1 << (0 + BASE_SHIFT)),
   *         BO = B     + (TEMP1 << (1 + BASE_SHIFT)).
   *  3. Load two right-hand-side values (from BO for LN / LT, from AO for
   *     RN / RT) and replace c01, c02 by "loaded - accumulator".
   *     NOTE(review): assumes FSUB follows the SPARC fsub operand order
   *     (rd = rs1 - rs2); the macro is defined outside this chunk.
   *  4. Solve: LN and LT scale both values by the single coefficient at
   *     AO[0]; RN and RT apply a 2x2 triangular solve with three coefficients
   *     from BO.  Only FMUL is used, so presumably the packed diagonal holds
   *     reciprocals -- confirm against the packing routine.
   *  5. Store c01, c02 back to the packed buffer and to C1[0] / C2[0].
   *  6. Clear t1..t4 and update C1, C2, AORIG, AO, BO and KK per mode.
   */
  1660. .LL179:
  1661. FADD c01, t1, c01
  1662. FADD c02, t2, c02
  1663. FADD c03, t3, c03
  1664. FADD c04, t4, c04
  1665. FADD c01, c03, c01
  1666. FADD c02, c04, c02
  1667. #if defined(LN) || defined(RT)
  1668. #ifdef LN
  1669. sub KK, 1, TEMP1
  1670. #else
  1671. sub KK, 2, TEMP1
  1672. #endif
  1673. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  1674. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1675. add AORIG, TEMP2, AO
  1676. add B, TEMP1, BO
  1677. #endif
  1678. #if defined(LN) || defined(LT)
  1679. LDF [BO + 0 * SIZE], a1
  1680. LDF [BO + 1 * SIZE], a2
  1681. FSUB a1, c01, c01
  1682. FSUB a2, c02, c02
  1683. #else
  1684. LDF [AO + 0 * SIZE], a1
  1685. LDF [AO + 1 * SIZE], a2
  1686. FSUB a1, c01, c01
  1687. FSUB a2, c02, c02
  1688. #endif
  1689. #ifdef LN
  1690. LDF [AO + 0 * SIZE], a1
  1691. FMUL a1, c01, c01
  1692. FMUL a1, c02, c02
  1693. #endif
  1694. #ifdef LT
  1695. LDF [AO + 0 * SIZE], a1
  1696. FMUL a1, c01, c01
  1697. FMUL a1, c02, c02
  1698. #endif
  1699. #ifdef RN
  /* RN: scale c01, eliminate it from c02, then scale c02. */
  1700. LDF [BO + 0 * SIZE], a1
  1701. LDF [BO + 1 * SIZE], a2
  1702. LDF [BO + 3 * SIZE], a3
  1703. FMUL a1, c01, c01
  1704. FMUL a2, c01, t1
  1705. FSUB c02, t1, c02
  1706. FMUL a3, c02, c02
  1707. #endif
  1708. #ifdef RT
  /* RT: scale c02, eliminate it from c01, then scale c01. */
  1709. LDF [BO + 3 * SIZE], a1
  1710. LDF [BO + 2 * SIZE], a2
  1711. LDF [BO + 0 * SIZE], a3
  1712. FMUL a1, c02, c02
  1713. FMUL a2, c02, t1
  1714. FSUB c01, t1, c01
  1715. FMUL a3, c01, c01
  1716. #endif
  1717. #ifdef LN
  /* LN steps the C pointers back before storing instead of forward after. */
  1718. add C1, -1 * SIZE, C1
  1719. add C2, -1 * SIZE, C2
  1720. #endif
  1721. #if defined(LN) || defined(LT)
  1722. STF c01, [BO + 0 * SIZE]
  1723. STF c02, [BO + 1 * SIZE]
  1724. #else
  1725. STF c01, [AO + 0 * SIZE]
  1726. STF c02, [AO + 1 * SIZE]
  1727. #endif
  1728. STF c01, [C1 + 0 * SIZE]
  1729. STF c02, [C2 + 0 * SIZE]
  /* t1 may have been used as scratch by the solve; clear all temporaries. */
  1730. FMOV FZERO, t1
  1731. FMOV FZERO, t2
  1732. FMOV FZERO, t3
  1733. FMOV FZERO, t4
  1734. #ifndef LN
  1735. add C1, 1 * SIZE, C1
  1736. add C2, 1 * SIZE, C2
  1737. #endif
  1738. #ifdef RT
  1739. sll K, 0 + BASE_SHIFT, TEMP1
  1740. add AORIG, TEMP1, AORIG
  1741. #endif
  1742. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps of both packed panels. */
  1743. sub K, KK, TEMP1
  1744. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  1745. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1746. add AO, TEMP2, AO
  1747. add BO, TEMP1, BO
  1748. #endif
  1749. #ifdef LT
  1750. add KK, 1, KK
  1751. #endif
  1752. #ifdef LN
  1753. sub KK, 1, KK
  1754. #endif
  /*
   * .LL199: bookkeeping once all M sections of the two-column panel are done.
   *   LN      : B += K << (1 + BASE_SHIFT)
   *   LT / RN : B  = BO  (BO already points past the panel just consumed)
   *   RN      : KK += 2
   *   RT      : KK -= 2
   * RT leaves B unchanged here.
   */
  1755. .LL199:
  1756. #ifdef LN
  1757. sll K, 1 + BASE_SHIFT, TEMP1
  1758. add B, TEMP1, B
  1759. #endif
  1760. #if defined(LT) || defined(RN)
  1761. mov BO, B
  1762. #endif
  1763. #ifdef RN
  1764. add KK, 2, KK
  1765. #endif
  1766. #ifdef RT
  1767. sub KK, 2, KK
  1768. #endif
  /*
   * .LL200 / .LL11: entry to the loop over four-column panels.
   * J = N >> 2; if J <= 0 there is no such panel and control goes to .LL999.
   *
   * Per panel (.LL11):
   *   RT only : B -= K << (2 + BASE_SHIFT) and C -= LDC << 2, i.e. both are
   *             moved back by one panel before use.
   *   C1 = C, C2 = C1 + LDC, C3 = C2 + LDC, C4 = C3 + LDC.
   *   I  = M >> 2 (number of 4-row tiles handled by .LL21).
   *   LN : KK = M + OFFSET.    LT : KK = OFFSET.
   *   LN / RT : AORIG = A.     otherwise : AO = A.
   *   not RT  : C = C4 + LDC (advance C past this panel).
   * t1..t4 are cleared from FZERO on the way; the nops are scheduling
   * padding.  If I <= 0 the 4-row tiles are skipped by branching to .LL50.
   */
  1769. .LL200:
  1770. sra N, 2, J
  1771. cmp J, 0
  1772. ble,pn %icc, .LL999
  /* branch delay slot */
  1773. nop
  1774. .LL11:
  1775. #ifdef RT
  1776. sll K, 2 + BASE_SHIFT, TEMP1
  1777. sub B, TEMP1, B
  1778. sll LDC, 2, TEMP1
  1779. sub C, TEMP1, C
  1780. #endif
  1781. add C, LDC, C2
  1782. FMOV FZERO, t1
  1783. nop
  1784. mov C, C1
  1785. add C2, LDC, C3
  1786. FMOV FZERO, t2
  1787. nop
  /* AO is set unconditionally here and again in the non-LN/RT arm below. */
  1788. mov A, AO
  1789. sra M, 2, I
  1790. add C3, LDC, C4
  1791. FMOV FZERO, t3
  1792. #ifdef LN
  1793. add M, OFFSET, KK
  1794. #endif
  1795. #ifdef LT
  1796. mov OFFSET, KK
  1797. #endif
  1798. #if defined(LN) || defined(RT)
  1799. mov A, AORIG
  1800. #else
  1801. mov A, AO
  1802. #endif
  /* The add and FMOV below do not modify %icc, so the ble still tests I. */
  1803. cmp I, 0
  1804. #ifndef RT
  1805. add C4, LDC, C
  1806. #endif
  1807. FMOV FZERO, t4
  1808. ble,pn %icc, .LL50
  /* branch delay slot (not annulled): c01 is cleared on both paths */
  1809. FMOV FZERO, c01
  /*
   * .LL21: set-up for one four-by-four tile (sixteen accumulators c01..c16).
   *
   *   LT / RN : L = KK >> 2, BO restarts at B; AO is not reloaded here.
   *   LN / RT : (LN only) AORIG is first moved back by K << (2 + BASE_SHIFT);
   *             then AO = AORIG + (KK << (2 + BASE_SHIFT)),
   *                  BO = B     + (KK << (2 + BASE_SHIFT)),
   *             TEMP1 = K - KK and L = TEMP1 >> 2.
   *
   * c02..c16 are cleared here; c01 is not written in this block and is
   * expected to be zero on entry (the branch delay slot just before .LL21
   * clears it on first entry).
   * a1..a4 / b1..b4 are preloaded with the first K step; a5 and b5 are
   * preloaded from offset 4, i.e. the first A and B value of the second
   * K step.  The four prefetches touch the C rows that will be stored.
   */
  1810. .LL21:
  1811. FMOV FZERO, c02
  1812. FMOV FZERO, c03
  1813. #if defined(LT) || defined(RN)
  1814. sra KK, 2, L
  1815. mov B, BO
  1816. cmp L, 0
  1817. #else
  1818. #ifdef LN
  1819. sll K, 2 + BASE_SHIFT, TEMP1
  1820. sub AORIG, TEMP1, AORIG
  1821. #endif
  /* A and B use the same byte offset here. */
  1822. sll KK, 2 + BASE_SHIFT, TEMP1
  1823. add AORIG, TEMP1, AO
  1824. add B, TEMP1, BO
  /* TEMP1 = K - KK is kept: .LL25 reads it again for the K remainder. */
  1825. sub K, KK, TEMP1
  1826. sra TEMP1, 2, L
  1827. cmp L, 0
  1828. #endif
  /* The compare above stays valid through the loads, FMOVs and prefetches. */
  1829. LDF [AO + 0 * SIZE], a1
  1830. FMOV FZERO, c04
  1831. LDF [BO + 0 * SIZE], b1
  1832. FMOV FZERO, c05
  1833. LDF [AO + 1 * SIZE], a2
  1834. FMOV FZERO, c06
  1835. LDF [BO + 1 * SIZE], b2
  1836. FMOV FZERO, c07
  1837. LDF [AO + 2 * SIZE], a3
  1838. FMOV FZERO, c08
  1839. LDF [BO + 2 * SIZE], b3
  1840. FMOV FZERO, c09
  1841. LDF [AO + 3 * SIZE], a4
  1842. FMOV FZERO, c10
  1843. LDF [BO + 3 * SIZE], b4
  1844. FMOV FZERO, c11
  1845. LDF [BO + 4 * SIZE], b5 /* ***** */
  1846. LDF [AO + 4 * SIZE], a5 /* ***** */
  1847. prefetch [C1 + 3 * SIZE], 3
  1848. FMOV FZERO, c12
  1849. prefetch [C2 + 3 * SIZE], 3
  1850. FMOV FZERO, c13
  1851. prefetch [C3 + 3 * SIZE], 3
  1852. FMOV FZERO, c14
  1853. prefetch [C4 + 3 * SIZE], 3
  1854. FMOV FZERO, c15
  /* No full 4-step pass to run: go to the K remainder handling. */
  1855. ble,pn %icc, .LL25
  /* branch delay slot (not annulled): c16 is cleared on both paths */
  1856. FMOV FZERO, c16
  /*
   * .LL22: main K loop for the four-by-four tile.
   * Each pass consumes four K steps; AO and BO both advance by 16 * SIZE
   * (four A values and four B values per step).
   *
   * Accumulator roles, as fixed by the FMUL / FADD pairing below:
   *   a1 row -> c01 c05 c09 c13     a2 row -> c02 c06 c10 c14
   *   a3 row -> c03 c07 c11 c15     a4 row -> c04 c08 c12 c16
   * with the four accumulators of a row matching b1, b2, b3, b4 in order.
   *
   * The loop is software-pipelined: each FADD folds in the product that the
   * FMUL using the same temporary (t1..t4) produced four multiplies earlier,
   * so four products are still pending in t1..t4 when the loop exits.
   *
   * a5 / b5 stand in for a1 / b1 on the second and fourth K step, which lets
   * the next a1 / b1 be loaded while the current ones are still in use.
   *
   * AO and BO are bumped early in the body, so most operand reloads use
   * negative offsets; the reloads in the final step use positive offsets and
   * fetch the first operands of the next pass.  L is decremented in step 1
   * and compared in step 4; nothing between that compare and the closing bg
   * writes the integer condition codes.
   */
  1857. .LL22:
  /* K step 1 of 4: a1..a4 x b1..b4 */
  1858. FADD c04, t1, c04
  1859. prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
  1860. FMUL a1, b1, t1
  1861. nop
  1862. FADD c08, t2, c08
  1863. prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
  1864. FMUL a1, b2, t2
  1865. add AO, 16 * SIZE, AO
  1866. FADD c12, t3, c12
  1867. LDF [AO - 13 * SIZE], a4
  1868. FMUL a1, b3, t3
  1869. add BO, 16 * SIZE, BO
  1870. FADD c16, t4, c16
  1871. nop
  1872. FMUL a1, b4, t4
  1873. LDF [AO - 8 * SIZE], a1
  1874. FADD c01, t1, c01
  1875. nop
  1876. FMUL a2, b1, t1
  1877. nop
  1878. FADD c05, t2, c05
  1879. nop
  1880. FMUL a2, b2, t2
  1881. nop
  1882. FADD c09, t3, c09
  1883. nop
  1884. FMUL a2, b3, t3
  1885. nop
  1886. FADD c13, t4, c13
  1887. add L, -1, L
  1888. FMUL a2, b4, t4
  1889. LDF [AO - 11 * SIZE], a2
  1890. FADD c02, t1, c02
  1891. nop
  1892. FMUL a3, b1, t1
  1893. nop
  1894. FADD c06, t2, c06
  1895. nop
  1896. FMUL a3, b2, t2
  1897. nop
  1898. FADD c10, t3, c10
  1899. nop
  1900. FMUL a3, b3, t3
  1901. nop
  1902. FADD c14, t4, c14
  1903. nop
  1904. FMUL a3, b4, t4
  1905. LDF [AO - 10 * SIZE], a3
  1906. FADD c03, t1, c03
  1907. nop
  1908. FMUL a4, b1, t1
  1909. LDF [BO - 8 * SIZE], b1
  1910. FADD c07, t2, c07
  1911. nop
  1912. FMUL a4, b2, t2
  1913. LDF [BO - 11 * SIZE], b2
  1914. FADD c11, t3, c11
  1915. nop
  1916. FMUL a4, b3, t3
  1917. LDF [BO - 10 * SIZE], b3
  1918. FADD c15, t4, c15
  1919. nop
  1920. FMUL a4, b4, t4
  1921. LDF [BO - 9 * SIZE], b4
  /* K step 2 of 4: a5 replaces a1, b5 replaces b1 */
  1922. FADD c04, t1, c04
  1923. nop
  1924. FMUL a5, b5, t1
  1925. LDF [AO - 9 * SIZE], a4
  1926. FADD c08, t2, c08
  1927. nop
  1928. FMUL a5, b2, t2
  1929. nop
  1930. FADD c12, t3, c12
  1931. nop
  1932. FMUL a5, b3, t3
  1933. nop
  1934. FADD c16, t4, c16
  1935. nop
  1936. FMUL a5, b4, t4
  1937. LDF [AO - 4 * SIZE], a5
  1938. FADD c01, t1, c01
  1939. nop
  1940. FMUL a2, b5, t1
  1941. nop
  1942. FADD c05, t2, c05
  1943. nop
  1944. FMUL a2, b2, t2
  1945. nop
  1946. FADD c09, t3, c09
  1947. nop
  1948. FMUL a2, b3, t3
  1949. nop
  1950. FADD c13, t4, c13
  1951. nop
  1952. FMUL a2, b4, t4
  1953. LDF [AO - 7 * SIZE], a2
  1954. FADD c02, t1, c02
  1955. nop
  1956. FMUL a3, b5, t1
  1957. nop
  1958. FADD c06, t2, c06
  1959. nop
  1960. FMUL a3, b2, t2
  1961. nop
  1962. FADD c10, t3, c10
  1963. nop
  1964. FMUL a3, b3, t3
  1965. nop
  1966. FADD c14, t4, c14
  1967. nop
  1968. FMUL a3, b4, t4
  1969. LDF [AO - 6 * SIZE], a3
  1970. FADD c03, t1, c03
  1971. nop
  1972. FMUL a4, b5, t1
  1973. LDF [BO - 4 * SIZE], b5
  1974. FADD c07, t2, c07
  1975. nop
  1976. FMUL a4, b2, t2
  1977. LDF [BO - 7 * SIZE], b2
  1978. FADD c11, t3, c11
  1979. nop
  1980. FMUL a4, b3, t3
  1981. LDF [BO - 6 * SIZE], b3
  1982. FADD c15, t4, c15
  1983. nop
  1984. FMUL a4, b4, t4
  1985. LDF [BO - 5 * SIZE], b4
  /* K step 3 of 4: back to a1 / b1 */
  1986. FADD c04, t1, c04
  1987. nop
  1988. FMUL a1, b1, t1
  1989. LDF [AO - 5 * SIZE], a4
  1990. FADD c08, t2, c08
  1991. nop
  1992. FMUL a1, b2, t2
  1993. nop
  1994. FADD c12, t3, c12
  1995. nop
  1996. FMUL a1, b3, t3
  1997. nop
  1998. FADD c16, t4, c16
  1999. nop
  2000. FMUL a1, b4, t4
  2001. LDF [AO - 0 * SIZE], a1
  2002. FADD c01, t1, c01
  2003. nop
  2004. FMUL a2, b1, t1
  2005. nop
  /* Second A prefetch of the pass is only issued in DOUBLE builds. */
  2006. #ifdef DOUBLE
  2007. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  2008. #else
  2009. nop
  2010. #endif
  2011. FADD c05, t2, c05
  2012. nop
  2013. FMUL a2, b2, t2
  2014. FADD c09, t3, c09
  2015. nop
  2016. FMUL a2, b3, t3
  2017. nop
  2018. FADD c13, t4, c13
  2019. nop
  2020. FMUL a2, b4, t4
  2021. nop
  2022. FADD c02, t1, c02
  2023. nop
  2024. FMUL a3, b1, t1
  2025. LDF [AO - 3 * SIZE], a2
  2026. FADD c06, t2, c06
  /* Second B prefetch of the pass is only issued in DOUBLE builds. */
  2027. #ifdef DOUBLE
  2028. prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
  2029. #else
  2030. nop
  2031. #endif
  2032. FMUL a3, b2, t2
  2033. nop
  2034. FADD c10, t3, c10
  2035. nop
  2036. FMUL a3, b3, t3
  2037. nop
  2038. FADD c14, t4, c14
  2039. nop
  2040. FMUL a3, b4, t4
  2041. LDF [AO - 2 * SIZE], a3
  2042. FADD c03, t1, c03
  2043. nop
  2044. FMUL a4, b1, t1
  2045. LDF [BO - 0 * SIZE], b1
  2046. FADD c07, t2, c07
  2047. nop
  2048. FMUL a4, b2, t2
  2049. LDF [BO - 3 * SIZE], b2
  2050. FADD c11, t3, c11
  2051. nop
  2052. FMUL a4, b3, t3
  2053. LDF [BO - 2 * SIZE], b3
  2054. FADD c15, t4, c15
  2055. nop
  2056. FMUL a4, b4, t4
  2057. LDF [BO - 1 * SIZE], b4
  /* K step 4 of 4: a5 / b5 again; reloads now fetch operands for the next pass */
  2058. FADD c04, t1, c04
  2059. nop
  2060. FMUL a5, b5, t1
  2061. LDF [AO - 1 * SIZE], a4
  2062. FADD c08, t2, c08
  2063. FMUL a5, b2, t2
  2064. FADD c12, t3, c12
  2065. FMUL a5, b3, t3
  2066. FADD c16, t4, c16
  2067. nop
  2068. FMUL a5, b4, t4
  2069. LDF [AO + 4 * SIZE], a5
  2070. FADD c01, t1, c01
  2071. nop
  2072. FMUL a2, b5, t1
  2073. nop
  2074. FADD c05, t2, c05
  2075. nop
  2076. FMUL a2, b2, t2
  2077. nop
  2078. FADD c09, t3, c09
  2079. nop
  2080. FMUL a2, b3, t3
  2081. nop
  2082. FADD c13, t4, c13
  2083. nop
  2084. FMUL a2, b4, t4
  2085. LDF [AO + 1 * SIZE], a2
  2086. FADD c02, t1, c02
  2087. nop
  2088. FMUL a3, b5, t1
  2089. nop
  2090. FADD c06, t2, c06
  2091. nop
  2092. FMUL a3, b2, t2
  2093. nop
  2094. FADD c10, t3, c10
  2095. nop
  2096. FMUL a3, b3, t3
  2097. nop
  2098. FADD c14, t4, c14
  2099. nop
  2100. FMUL a3, b4, t4
  2101. LDF [AO + 2 * SIZE], a3
  2102. FADD c03, t1, c03
  /* Loop test: sets %icc for the bg at the bottom. */
  2103. cmp L, 0
  2104. FMUL a4, b5, t1
  2105. LDF [BO + 4 * SIZE], b5
  2106. FADD c07, t2, c07
  2107. nop
  2108. FMUL a4, b2, t2
  2109. LDF [BO + 1 * SIZE], b2
  2110. FADD c11, t3, c11
  2111. nop
  2112. FMUL a4, b3, t3
  2113. LDF [BO + 2 * SIZE], b3
  2114. FADD c15, t4, c15
  2115. FMUL a4, b4, t4
  2116. bg,pt %icc, .LL22
  /* branch delay slot: last operand reload for the next pass */
  2117. LDF [BO + 3 * SIZE], b4
  /*
   * .LL25 / .LL26: K remainder for the four-by-four tile.
   * L = (KK & 3) for LT / RN, otherwise (TEMP1 & 3) with TEMP1 = K - KK from
   * the set-up at .LL21.  If L <= 0 the loop is skipped.
   *
   * Each pass of .LL26 handles one K step with the same accumulator layout
   * and the same delayed-add pipelining as .LL22, using only a1..a4 and
   * b1..b4 (a5 / b5 are not used here).  a4 is reloaded at the top of every
   * pass from AO + 3 before AO is advanced; a1..a3 and b1..b4 are reloaded
   * for the following step after AO and BO have each moved by 4 * SIZE.
   * The last four products remain in t1..t4 on exit.
   */
  2118. .LL25:
  2119. #if defined(LT) || defined(RN)
  2120. and KK, 3, L
  2121. #else
  2122. and TEMP1, 3, L
  2123. #endif
  2124. cmp L, 0
  /* Annulled branch: the delay-slot nop is only executed when the branch is taken. */
  2125. ble,a,pn %icc, .LL29
  2126. nop
  2127. .LL26:
  2128. FADD c04, t1, c04
  2129. LDF [AO + 3 * SIZE], a4
  2130. FMUL a1, b1, t1
  2131. add AO, 4 * SIZE, AO
  2132. FADD c08, t2, c08
  2133. add BO, 4 * SIZE, BO
  2134. FMUL a1, b2, t2
  2135. add L, -1, L
  2136. FADD c12, t3, c12
  2137. nop
  2138. FMUL a1, b3, t3
  /* Loop test: sets %icc for the bg at the bottom. */
  2139. cmp L, 0
  2140. FADD c16, t4, c16
  2141. nop
  2142. FMUL a1, b4, t4
  2143. LDF [AO + 0 * SIZE], a1
  2144. FADD c01, t1, c01
  2145. nop
  2146. FMUL a2, b1, t1
  2147. nop
  2148. FADD c05, t2, c05
  2149. nop
  2150. FMUL a2, b2, t2
  2151. nop
  2152. FADD c09, t3, c09
  2153. nop
  2154. FMUL a2, b3, t3
  2155. nop
  2156. FADD c13, t4, c13
  2157. nop
  2158. FMUL a2, b4, t4
  2159. LDF [AO + 1 * SIZE], a2
  2160. FADD c02, t1, c02
  2161. nop
  2162. FMUL a3, b1, t1
  2163. nop
  2164. FADD c06, t2, c06
  2165. nop
  2166. FMUL a3, b2, t2
  2167. nop
  2168. FADD c10, t3, c10
  2169. nop
  2170. FMUL a3, b3, t3
  2171. nop
  2172. FADD c14, t4, c14
  2173. nop
  2174. FMUL a3, b4, t4
  2175. LDF [AO + 2 * SIZE], a3
  2176. FADD c03, t1, c03
  2177. nop
  2178. FMUL a4, b1, t1
  2179. LDF [BO + 0 * SIZE], b1
  2180. FADD c07, t2, c07
  2181. nop
  2182. FMUL a4, b2, t2
  2183. LDF [BO + 1 * SIZE], b2
  2184. FADD c11, t3, c11
  2185. nop
  2186. FMUL a4, b3, t3
  2187. LDF [BO + 2 * SIZE], b3
  2188. FADD c15, t4, c15
  2189. FMUL a4, b4, t4
  2190. bg,pt %icc, .LL26
  /* branch delay slot: last operand reload for the next step */
  2191. LDF [BO + 3 * SIZE], b4
  2192. .LL29:
  2193. #if defined(LN) || defined(RT)
  2194. sub KK, 4, TEMP1
  2195. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  2196. add AORIG, TEMP1, AO
  2197. add B, TEMP1, BO
  2198. #endif
  2199. FADD c04, t1, c04
  2200. FADD c08, t2, c08
  2201. FADD c12, t3, c12
  2202. FADD c16, t4, c16
  2203. #if defined(LN) || defined(LT)
  2204. LDF [BO + 0 * SIZE], a1
  2205. LDF [BO + 1 * SIZE], a2
  2206. LDF [BO + 2 * SIZE], a3
  2207. LDF [BO + 3 * SIZE], a4
  2208. LDF [BO + 4 * SIZE], b1
  2209. LDF [BO + 5 * SIZE], b2
  2210. LDF [BO + 6 * SIZE], b3
  2211. LDF [BO + 7 * SIZE], b4
  2212. FSUB a1, c01, c01
  2213. FSUB a2, c05, c05
  2214. FSUB a3, c09, c09
  2215. FSUB a4, c13, c13
  2216. FSUB b1, c02, c02
  2217. FSUB b2, c06, c06
  2218. FSUB b3, c10, c10
  2219. FSUB b4, c14, c14
  2220. LDF [BO + 8 * SIZE], a1
  2221. LDF [BO + 9 * SIZE], a2
  2222. LDF [BO + 10 * SIZE], a3
  2223. LDF [BO + 11 * SIZE], a4
  2224. LDF [BO + 12 * SIZE], b1
  2225. LDF [BO + 13 * SIZE], b2
  2226. LDF [BO + 14 * SIZE], b3
  2227. LDF [BO + 15 * SIZE], b4
  2228. FSUB a1, c03, c03
  2229. FSUB a2, c07, c07
  2230. FSUB a3, c11, c11
  2231. FSUB a4, c15, c15
  2232. FSUB b1, c04, c04
  2233. FSUB b2, c08, c08
  2234. FSUB b3, c12, c12
  2235. FSUB b4, c16, c16
  2236. #else
  2237. LDF [AO + 0 * SIZE], a1
  2238. LDF [AO + 1 * SIZE], a2
  2239. LDF [AO + 2 * SIZE], a3
  2240. LDF [AO + 3 * SIZE], a4
  2241. LDF [AO + 4 * SIZE], b1
  2242. LDF [AO + 5 * SIZE], b2
  2243. LDF [AO + 6 * SIZE], b3
  2244. LDF [AO + 7 * SIZE], b4
  2245. FSUB a1, c01, c01
  2246. FSUB a2, c02, c02
  2247. FSUB a3, c03, c03
  2248. FSUB a4, c04, c04
  2249. FSUB b1, c05, c05
  2250. FSUB b2, c06, c06
  2251. FSUB b3, c07, c07
  2252. FSUB b4, c08, c08
  2253. LDF [AO + 8 * SIZE], a1
  2254. LDF [AO + 9 * SIZE], a2
  2255. LDF [AO + 10 * SIZE], a3
  2256. LDF [AO + 11 * SIZE], a4
  2257. LDF [AO + 12 * SIZE], b1
  2258. LDF [AO + 13 * SIZE], b2
  2259. LDF [AO + 14 * SIZE], b3
  2260. LDF [AO + 15 * SIZE], b4
  2261. FSUB a1, c09, c09
  2262. FSUB a2, c10, c10
  2263. FSUB a3, c11, c11
  2264. FSUB a4, c12, c12
  2265. FSUB b1, c13, c13
  2266. FSUB b2, c14, c14
  2267. FSUB b3, c15, c15
  2268. FSUB b4, c16, c16
  2269. #endif
  2270. #ifdef LN
  2271. LDF [AO + 15 * SIZE], a1
  2272. LDF [AO + 14 * SIZE], a2
  2273. LDF [AO + 13 * SIZE], a3
  2274. LDF [AO + 12 * SIZE], a4
  2275. FMUL a1, c04, c04
  2276. FMUL a1, c08, c08
  2277. FMUL a1, c12, c12
  2278. FMUL a1, c16, c16
  2279. FMUL a2, c04, t1
  2280. FMUL a2, c08, t2
  2281. FMUL a2, c12, t3
  2282. FMUL a2, c16, t4
  2283. FSUB c03, t1, c03
  2284. FSUB c07, t2, c07
  2285. FSUB c11, t3, c11
  2286. FSUB c15, t4, c15
  2287. FMUL a3, c04, t1
  2288. FMUL a3, c08, t2
  2289. FMUL a3, c12, t3
  2290. FMUL a3, c16, t4
  2291. FSUB c02, t1, c02
  2292. FSUB c06, t2, c06
  2293. FSUB c10, t3, c10
  2294. FSUB c14, t4, c14
  2295. FMUL a4, c04, t1
  2296. FMUL a4, c08, t2
  2297. FMUL a4, c12, t3
  2298. FMUL a4, c16, t4
  2299. FSUB c01, t1, c01
  2300. FSUB c05, t2, c05
  2301. FSUB c09, t3, c09
  2302. FSUB c13, t4, c13
  2303. LDF [AO + 10 * SIZE], a1
  2304. LDF [AO + 9 * SIZE], a2
  2305. LDF [AO + 8 * SIZE], a3
  2306. FMUL a1, c03, c03
  2307. FMUL a1, c07, c07
  2308. FMUL a1, c11, c11
  2309. FMUL a1, c15, c15
  2310. FMUL a2, c03, t1
  2311. FMUL a2, c07, t2
  2312. FMUL a2, c11, t3
  2313. FMUL a2, c15, t4
  2314. FSUB c02, t1, c02
  2315. FSUB c06, t2, c06
  2316. FSUB c10, t3, c10
  2317. FSUB c14, t4, c14
  2318. FMUL a3, c03, t1
  2319. FMUL a3, c07, t2
  2320. FMUL a3, c11, t3
  2321. FMUL a3, c15, t4
  2322. FSUB c01, t1, c01
  2323. FSUB c05, t2, c05
  2324. FSUB c09, t3, c09
  2325. FSUB c13, t4, c13
  2326. LDF [AO + 5 * SIZE], a1
  2327. LDF [AO + 4 * SIZE], a2
  2328. FMUL a1, c02, c02
  2329. FMUL a1, c06, c06
  2330. FMUL a1, c10, c10
  2331. FMUL a1, c14, c14
  2332. FMUL a2, c02, t1
  2333. FMUL a2, c06, t2
  2334. FMUL a2, c10, t3
  2335. FMUL a2, c14, t4
  2336. FSUB c01, t1, c01
  2337. FSUB c05, t2, c05
  2338. FSUB c09, t3, c09
  2339. FSUB c13, t4, c13
  2340. LDF [AO + 0 * SIZE], a1
  2341. FMUL a1, c01, c01
  2342. FMUL a1, c05, c05
  2343. FMUL a1, c09, c09
  2344. FMUL a1, c13, c13
  2345. #endif
  2346. #ifdef LT
  2347. LDF [AO + 0 * SIZE], a1
  2348. LDF [AO + 1 * SIZE], a2
  2349. LDF [AO + 2 * SIZE], a3
  2350. LDF [AO + 3 * SIZE], a4
  2351. FMUL a1, c01, c01
  2352. FMUL a1, c05, c05
  2353. FMUL a1, c09, c09
  2354. FMUL a1, c13, c13
  2355. FMUL a2, c01, t1
  2356. FMUL a2, c05, t2
  2357. FMUL a2, c09, t3
  2358. FMUL a2, c13, t4
  2359. FSUB c02, t1, c02
  2360. FSUB c06, t2, c06
  2361. FSUB c10, t3, c10
  2362. FSUB c14, t4, c14
  2363. FMUL a3, c01, t1
  2364. FMUL a3, c05, t2
  2365. FMUL a3, c09, t3
  2366. FMUL a3, c13, t4
  2367. FSUB c03, t1, c03
  2368. FSUB c07, t2, c07
  2369. FSUB c11, t3, c11
  2370. FSUB c15, t4, c15
  2371. FMUL a4, c01, t1
  2372. FMUL a4, c05, t2
  2373. FMUL a4, c09, t3
  2374. FMUL a4, c13, t4
  2375. FSUB c04, t1, c04
  2376. FSUB c08, t2, c08
  2377. FSUB c12, t3, c12
  2378. FSUB c16, t4, c16
  2379. LDF [AO + 5 * SIZE], a1
  2380. LDF [AO + 6 * SIZE], a2
  2381. LDF [AO + 7 * SIZE], a3
  2382. FMUL a1, c02, c02
  2383. FMUL a1, c06, c06
  2384. FMUL a1, c10, c10
  2385. FMUL a1, c14, c14
  2386. FMUL a2, c02, t1
  2387. FMUL a2, c06, t2
  2388. FMUL a2, c10, t3
  2389. FMUL a2, c14, t4
  2390. FSUB c03, t1, c03
  2391. FSUB c07, t2, c07
  2392. FSUB c11, t3, c11
  2393. FSUB c15, t4, c15
  2394. FMUL a3, c02, t1
  2395. FMUL a3, c06, t2
  2396. FMUL a3, c10, t3
  2397. FMUL a3, c14, t4
  2398. FSUB c04, t1, c04
  2399. FSUB c08, t2, c08
  2400. FSUB c12, t3, c12
  2401. FSUB c16, t4, c16
  2402. LDF [AO + 10 * SIZE], a1
  2403. LDF [AO + 11 * SIZE], a2
  2404. FMUL a1, c03, c03
  2405. FMUL a1, c07, c07
  2406. FMUL a1, c11, c11
  2407. FMUL a1, c15, c15
  2408. FMUL a2, c03, t1
  2409. FMUL a2, c07, t2
  2410. FMUL a2, c11, t3
  2411. FMUL a2, c15, t4
  2412. FSUB c04, t1, c04
  2413. FSUB c08, t2, c08
  2414. FSUB c12, t3, c12
  2415. FSUB c16, t4, c16
  2416. LDF [AO + 15 * SIZE], a1
  2417. FMUL a1, c04, c04
  2418. FMUL a1, c08, c08
  2419. FMUL a1, c12, c12
  2420. FMUL a1, c16, c16
  2421. #endif
  2422. #ifdef RN
  2423. LDF [BO + 0 * SIZE], a1
  2424. LDF [BO + 1 * SIZE], a2
  2425. LDF [BO + 2 * SIZE], a3
  2426. LDF [BO + 3 * SIZE], a4
  2427. FMUL a1, c01, c01
  2428. FMUL a1, c02, c02
  2429. FMUL a1, c03, c03
  2430. FMUL a1, c04, c04
  2431. FMUL a2, c01, t1
  2432. FMUL a2, c02, t2
  2433. FMUL a2, c03, t3
  2434. FMUL a2, c04, t4
  2435. FSUB c05, t1, c05
  2436. FSUB c06, t2, c06
  2437. FSUB c07, t3, c07
  2438. FSUB c08, t4, c08
  2439. FMUL a3, c01, t1
  2440. FMUL a3, c02, t2
  2441. FMUL a3, c03, t3
  2442. FMUL a3, c04, t4
  2443. FSUB c09, t1, c09
  2444. FSUB c10, t2, c10
  2445. FSUB c11, t3, c11
  2446. FSUB c12, t4, c12
  2447. FMUL a4, c01, t1
  2448. FMUL a4, c02, t2
  2449. FMUL a4, c03, t3
  2450. FMUL a4, c04, t4
  2451. FSUB c13, t1, c13
  2452. FSUB c14, t2, c14
  2453. FSUB c15, t3, c15
  2454. FSUB c16, t4, c16
  2455. LDF [BO + 5 * SIZE], a1
  2456. LDF [BO + 6 * SIZE], a2
  2457. LDF [BO + 7 * SIZE], a3
  2458. FMUL a1, c05, c05
  2459. FMUL a1, c06, c06
  2460. FMUL a1, c07, c07
  2461. FMUL a1, c08, c08
  2462. FMUL a2, c05, t1
  2463. FMUL a2, c06, t2
  2464. FMUL a2, c07, t3
  2465. FMUL a2, c08, t4
  2466. FSUB c09, t1, c09
  2467. FSUB c10, t2, c10
  2468. FSUB c11, t3, c11
  2469. FSUB c12, t4, c12
  2470. FMUL a3, c05, t1
  2471. FMUL a3, c06, t2
  2472. FMUL a3, c07, t3
  2473. FMUL a3, c08, t4
  2474. FSUB c13, t1, c13
  2475. FSUB c14, t2, c14
  2476. FSUB c15, t3, c15
  2477. FSUB c16, t4, c16
  2478. LDF [BO + 10 * SIZE], a1
  2479. LDF [BO + 11 * SIZE], a2
  2480. FMUL a1, c09, c09
  2481. FMUL a1, c10, c10
  2482. FMUL a1, c11, c11
  2483. FMUL a1, c12, c12
  2484. FMUL a2, c09, t1
  2485. FMUL a2, c10, t2
  2486. FMUL a2, c11, t3
  2487. FMUL a2, c12, t4
  2488. FSUB c13, t1, c13
  2489. FSUB c14, t2, c14
  2490. FSUB c15, t3, c15
  2491. FSUB c16, t4, c16
  2492. LDF [BO + 15 * SIZE], a1
  2493. FMUL a1, c13, c13
  2494. FMUL a1, c14, c14
  2495. FMUL a1, c15, c15
  2496. FMUL a1, c16, c16
  2497. #endif
  /* RT variant of the solve step for the 4x4 tile held in c01..c16.
     Operand order is assumed to be SPARC's (src1, src2, dest) for the
     FMUL/FSUB macros, which are defined outside this excerpt.
     The sixteen coefficients at BO[0..15] are used from the last one
     backwards: c13..c16 are finished first, then c09..c12, c05..c08 and
     finally c01..c04 (a backward substitution). */
  2498. #ifdef RT
  /* Coefficients 15 (scale) and 14, 13, 12 (elimination). */
  2499. LDF [BO + 15 * SIZE], a1
  2500. LDF [BO + 14 * SIZE], a2
  2501. LDF [BO + 13 * SIZE], a3
  2502. LDF [BO + 12 * SIZE], a4
  /* The diagonal entry is applied with a multiply, not a divide;
     presumably the packed diagonal already holds reciprocals - confirm
     against the packing code. */
  2503. FMUL a1, c13, c13
  2504. FMUL a1, c14, c14
  2505. FMUL a1, c15, c15
  2506. FMUL a1, c16, c16
  /* c09..c12 -= BO[14] * c13..c16 */
  2507. FMUL a2, c13, t1
  2508. FMUL a2, c14, t2
  2509. FMUL a2, c15, t3
  2510. FMUL a2, c16, t4
  2511. FSUB c09, t1, c09
  2512. FSUB c10, t2, c10
  2513. FSUB c11, t3, c11
  2514. FSUB c12, t4, c12
  /* c05..c08 -= BO[13] * c13..c16 */
  2515. FMUL a3, c13, t1
  2516. FMUL a3, c14, t2
  2517. FMUL a3, c15, t3
  2518. FMUL a3, c16, t4
  2519. FSUB c05, t1, c05
  2520. FSUB c06, t2, c06
  2521. FSUB c07, t3, c07
  2522. FSUB c08, t4, c08
  /* c01..c04 -= BO[12] * c13..c16 */
  2523. FMUL a4, c13, t1
  2524. FMUL a4, c14, t2
  2525. FMUL a4, c15, t3
  2526. FMUL a4, c16, t4
  2527. FSUB c01, t1, c01
  2528. FSUB c02, t2, c02
  2529. FSUB c03, t3, c03
  2530. FSUB c04, t4, c04
  /* Next step: scale c09..c12 by BO[10], eliminate with BO[9], BO[8]. */
  2531. LDF [BO + 10 * SIZE], a1
  2532. LDF [BO + 9 * SIZE], a2
  2533. LDF [BO + 8 * SIZE], a3
  2534. FMUL a1, c09, c09
  2535. FMUL a1, c10, c10
  2536. FMUL a1, c11, c11
  2537. FMUL a1, c12, c12
  /* c05..c08 -= BO[9] * c09..c12 */
  2538. FMUL a2, c09, t1
  2539. FMUL a2, c10, t2
  2540. FMUL a2, c11, t3
  2541. FMUL a2, c12, t4
  2542. FSUB c05, t1, c05
  2543. FSUB c06, t2, c06
  2544. FSUB c07, t3, c07
  2545. FSUB c08, t4, c08
  /* c01..c04 -= BO[8] * c09..c12 */
  2546. FMUL a3, c09, t1
  2547. FMUL a3, c10, t2
  2548. FMUL a3, c11, t3
  2549. FMUL a3, c12, t4
  2550. FSUB c01, t1, c01
  2551. FSUB c02, t2, c02
  2552. FSUB c03, t3, c03
  2553. FSUB c04, t4, c04
  /* Next step: scale c05..c08 by BO[5], eliminate with BO[4]. */
  2554. LDF [BO + 5 * SIZE], a1
  2555. LDF [BO + 4 * SIZE], a2
  2556. FMUL a1, c05, c05
  2557. FMUL a1, c06, c06
  2558. FMUL a1, c07, c07
  2559. FMUL a1, c08, c08
  /* c01..c04 -= BO[4] * c05..c08 */
  2560. FMUL a2, c05, t1
  2561. FMUL a2, c06, t2
  2562. FMUL a2, c07, t3
  2563. FMUL a2, c08, t4
  2564. FSUB c01, t1, c01
  2565. FSUB c02, t2, c02
  2566. FSUB c03, t3, c03
  2567. FSUB c04, t4, c04
  /* Last step: scale c01..c04 by BO[0]. */
  2568. LDF [BO + 0 * SIZE], a1
  2569. FMUL a1, c01, c01
  2570. FMUL a1, c02, c02
  2571. FMUL a1, c03, c03
  2572. FMUL a1, c04, c04
  2573. #endif
  /* Write-back of the solved 4x4 tile (c01..c16) and tail of the loop
     that branches back to .LL21 (label outside this excerpt).
     For LN the C pointers are moved back by four elements before the
     stores; for every other variant they are advanced after the stores. */
  2574. #ifdef LN
  2575. add C1, -4 * SIZE, C1
  2576. add C2, -4 * SIZE, C2
  2577. add C3, -4 * SIZE, C3
  2578. add C4, -4 * SIZE, C4
  2579. #endif
  /* The solved values are also written back into the packed buffer:
     LN/LT store to BO with the four values of one row position adjacent
     (c01, c05, c09, c13, ...); RN/RT store to AO in register order. */
  2580. #if defined(LN) || defined(LT)
  2581. STF c01, [BO + 0 * SIZE]
  2582. STF c05, [BO + 1 * SIZE]
  2583. STF c09, [BO + 2 * SIZE]
  2584. STF c13, [BO + 3 * SIZE]
  2585. STF c02, [BO + 4 * SIZE]
  2586. STF c06, [BO + 5 * SIZE]
  2587. STF c10, [BO + 6 * SIZE]
  2588. STF c14, [BO + 7 * SIZE]
  2589. STF c03, [BO + 8 * SIZE]
  2590. STF c07, [BO + 9 * SIZE]
  2591. STF c11, [BO + 10 * SIZE]
  2592. STF c15, [BO + 11 * SIZE]
  2593. STF c04, [BO + 12 * SIZE]
  2594. STF c08, [BO + 13 * SIZE]
  2595. STF c12, [BO + 14 * SIZE]
  2596. STF c16, [BO + 15 * SIZE]
  2597. #else
  2598. STF c01, [AO + 0 * SIZE]
  2599. STF c02, [AO + 1 * SIZE]
  2600. STF c03, [AO + 2 * SIZE]
  2601. STF c04, [AO + 3 * SIZE]
  2602. STF c05, [AO + 4 * SIZE]
  2603. STF c06, [AO + 5 * SIZE]
  2604. STF c07, [AO + 6 * SIZE]
  2605. STF c08, [AO + 7 * SIZE]
  2606. STF c09, [AO + 8 * SIZE]
  2607. STF c10, [AO + 9 * SIZE]
  2608. STF c11, [AO + 10 * SIZE]
  2609. STF c12, [AO + 11 * SIZE]
  2610. STF c13, [AO + 12 * SIZE]
  2611. STF c14, [AO + 13 * SIZE]
  2612. STF c15, [AO + 14 * SIZE]
  2613. STF c16, [AO + 15 * SIZE]
  2614. #endif
  /* Store the tile to the output: four consecutive elements through each
     of C1, C2, C3 and C4. */
  2615. STF c01, [C1 + 0 * SIZE]
  2616. STF c02, [C1 + 1 * SIZE]
  2617. STF c03, [C1 + 2 * SIZE]
  2618. STF c04, [C1 + 3 * SIZE]
  2619. STF c05, [C2 + 0 * SIZE]
  2620. STF c06, [C2 + 1 * SIZE]
  2621. STF c07, [C2 + 2 * SIZE]
  2622. STF c08, [C2 + 3 * SIZE]
  2623. STF c09, [C3 + 0 * SIZE]
  2624. STF c10, [C3 + 1 * SIZE]
  2625. STF c11, [C3 + 2 * SIZE]
  2626. STF c12, [C3 + 3 * SIZE]
  2627. STF c13, [C4 + 0 * SIZE]
  2628. STF c14, [C4 + 1 * SIZE]
  2629. STF c15, [C4 + 2 * SIZE]
  2630. STF c16, [C4 + 3 * SIZE]
  /* Clear the product temporaries before the next tile. */
  2631. FMOV FZERO, t1
  2632. FMOV FZERO, t2
  2633. FMOV FZERO, t3
  2634. FMOV FZERO, t4
  2635. #ifndef LN
  2636. add C1, 4 * SIZE, C1
  2637. add C2, 4 * SIZE, C2
  2638. add C3, 4 * SIZE, C3
  2639. add C4, 4 * SIZE, C4
  2640. #endif
  /* RT: AORIG += K << (2 + BASE_SHIFT). */
  2641. #ifdef RT
  2642. sll K, 2 + BASE_SHIFT, TEMP1
  2643. add AORIG, TEMP1, AORIG
  2644. #endif
  /* LT/RN: advance AO and BO by (K - KK) << (2 + BASE_SHIFT) bytes. */
  2645. #if defined(LT) || defined(RN)
  2646. sub K, KK, TEMP1
  2647. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  2648. add AO, TEMP1, AO
  2649. add BO, TEMP1, BO
  2650. #endif
  /* KK moves by the tile height: +4 for LT, -4 for LN. */
  2651. #ifdef LT
  2652. add KK, 4, KK
  2653. #endif
  2654. #ifdef LN
  2655. sub KK, 4, KK
  2656. #endif
  /* Count down I and loop while I > 0.  The sra sits between the cmp and
     the branch; it does not touch the condition codes.  The FMOV after the
     branch is in its delay slot and is executed on both paths. */
  2657. add I, -1, I
  2658. cmp I, 0
  2659. sra K, 2, L
  2660. bg,pt %icc, .LL21
  2661. FMOV FZERO, c01
  /* .LL50: remainder tile for bit 1 of M (two elements through each of
     C1..C4).  Skipped to .LL70 when (M & 2) is zero.  This part sets up
     AO/BO and the trip count L, clears the accumulators and preloads the
     first operands. */
  2662. .LL50:
  2663. and M, 2, I
  2664. cmp I, 0
  2665. ble,pn %icc, .LL70
  2666. nop
  /* LT/RN: L = KK / 4 and BO restarts at B (AO is left as it is).
     LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)),
            BO = B + (KK << (2 + BASE_SHIFT)),
            TEMP1 = K - KK, L = TEMP1 / 4;
            for LN, AORIG is first moved back by K << (1 + BASE_SHIFT).
     TEMP1 is read again at .LL55 for the k remainder. */
  2667. #if defined(LT) || defined(RN)
  2668. sra KK, 2, L
  2669. mov B, BO
  2670. cmp L, 0
  2671. #else
  2672. #ifdef LN
  2673. sll K, 1 + BASE_SHIFT, TEMP1
  2674. sub AORIG, TEMP1, AORIG
  2675. #endif
  2676. sll KK, 1 + BASE_SHIFT, TEMP1
  2677. sll KK, 2 + BASE_SHIFT, TEMP2
  2678. add AORIG, TEMP1, AO
  2679. add B, TEMP2, BO
  2680. sub K, KK, TEMP1
  2681. sra TEMP1, 2, L
  2682. cmp L, 0
  2683. #endif
  /* Zero c01..c08 and t1..t4, interleaved with the first loads of
     AO[0..3] into a1..a4 and BO[0..3] into b1..b4.  None of these are
     integer instructions, so the result of "cmp L, 0" above is still
     what the ble below tests. */
  2684. FMOV FZERO, c02
  2685. FMOV FZERO, t1
  2686. FMOV FZERO, c04
  2687. LDF [AO + 0 * SIZE], a1
  2688. FMOV FZERO, t2
  2689. LDF [BO + 0 * SIZE], b1
  2690. FMOV FZERO, c06
  2691. LDF [AO + 1 * SIZE], a2
  2692. FMOV FZERO, t3
  2693. LDF [BO + 1 * SIZE], b2
  2694. FMOV FZERO, c08
  2695. LDF [AO + 2 * SIZE], a3
  2696. FMOV FZERO, t4
  2697. LDF [BO + 2 * SIZE], b3
  2698. FMOV FZERO, c01
  2699. LDF [AO + 3 * SIZE], a4
  2700. FMOV FZERO, c03
  2701. LDF [BO + 3 * SIZE], b4
  2702. FMOV FZERO, c05
  /* No full group of four k-steps: go straight to the remainder at .LL55.
     The FMOV in the delay slot (no annul bit) runs on both paths. */
  2703. ble,pn %icc, .LL55
  2704. FMOV FZERO, c07
  /* .LL52: multiply-accumulate loop for the 2x4 tile, four k-steps per
     iteration (AO advances 8 elements, BO 16 elements, L counts down by 1).
     The code is software-pipelined: each FADD folds in the product that an
     earlier FMUL left in t1..t4, so the order of instructions matters.
     Accumulator mapping as visible in the FMUL/FADD pairs:
       c01 += a1*b1   c02 += a2*b1
       c03 += a1*b2   c04 += a2*b2
       c05 += a1*b3   c06 += a2*b3
       c07 += a1*b4   c08 += a2*b4
     with a3/a4 playing the role of a1/a2 on every second k-step.
     AO and BO are bumped at the top, so the loads for the current
     iteration use negative offsets and the loads at +0..+3 prefetch the
     operands of the next iteration. */
  2705. .LL52:
  2706. FADD c02, t1, c02
  2707. add AO, 8 * SIZE, AO
  2708. prefetch [AO + APREFETCHSIZE * SIZE], 0
  2709. FMUL a1, b1, t1
  2710. add BO, 16 * SIZE, BO
  2711. FADD c04, t2, c04
  2712. add L, -1, L
  2713. FMUL a1, b2, t2
  2714. FADD c06, t3, c06
  /* Loop condition is evaluated here; it is consumed by the bg at the
     bottom (no later instruction in the body writes the integer flags). */
  2715. cmp L, 0
  2716. FMUL a1, b3, t3
  2717. FADD c08, t4, c08
  2718. FMUL a1, b4, t4
  2719. LDF [AO - 4 * SIZE], a1
  2720. FADD c01, t1, c01
  2721. FMUL a2, b1, t1
  2722. LDF [BO - 12 * SIZE], b1
  2723. FADD c03, t2, c03
  2724. FMUL a2, b2, t2
  2725. LDF [BO - 11 * SIZE], b2
  2726. FADD c05, t3, c05
  2727. FMUL a2, b3, t3
  2728. LDF [BO - 10 * SIZE], b3
  2729. FADD c07, t4, c07
  2730. FMUL a2, b4, t4
  2731. LDF [BO - 9 * SIZE], b4
  /* Second k-step: a3/a4 against the reloaded b1..b4. */
  2732. FADD c02, t1, c02
  2733. FMUL a3, b1, t1
  2734. LDF [AO - 3 * SIZE], a2
  2735. FADD c04, t2, c04
  2736. FMUL a3, b2, t2
  2737. FADD c06, t3, c06
  2738. FMUL a3, b3, t3
  2739. FADD c08, t4, c08
  2740. FMUL a3, b4, t4
  2741. LDF [AO - 2 * SIZE], a3
  2742. FADD c01, t1, c01
  2743. FMUL a4, b1, t1
  2744. LDF [BO - 8 * SIZE], b1
  2745. FADD c03, t2, c03
  2746. FMUL a4, b2, t2
  2747. LDF [BO - 7 * SIZE], b2
  2748. FADD c05, t3, c05
  2749. FMUL a4, b3, t3
  2750. LDF [BO - 6 * SIZE], b3
  2751. FADD c07, t4, c07
  2752. FMUL a4, b4, t4
  2753. LDF [BO - 5 * SIZE], b4
  /* Third k-step. */
  2754. FADD c02, t1, c02
  2755. FMUL a1, b1, t1
  2756. LDF [AO - 1 * SIZE], a4
  2757. FADD c04, t2, c04
  2758. FMUL a1, b2, t2
  2759. FADD c06, t3, c06
  2760. FMUL a1, b3, t3
  2761. FADD c08, t4, c08
  2762. FMUL a1, b4, t4
  2763. LDF [AO + 0 * SIZE], a1
  2764. FADD c01, t1, c01
  2765. FMUL a2, b1, t1
  2766. LDF [BO - 4 * SIZE], b1
  2767. FADD c03, t2, c03
  2768. FMUL a2, b2, t2
  2769. LDF [BO - 3 * SIZE], b2
  2770. FADD c05, t3, c05
  2771. FMUL a2, b3, t3
  2772. LDF [BO - 2 * SIZE], b3
  2773. FADD c07, t4, c07
  2774. FMUL a2, b4, t4
  2775. LDF [BO - 1 * SIZE], b4
  /* Fourth k-step; the loads at offsets +0..+3 feed the next iteration
     (or the code at .LL55/.LL56). */
  2776. FADD c02, t1, c02
  2777. FMUL a3, b1, t1
  2778. LDF [AO + 1 * SIZE], a2
  2779. FADD c04, t2, c04
  2780. FMUL a3, b2, t2
  2781. FADD c06, t3, c06
  2782. FMUL a3, b3, t3
  2783. FADD c08, t4, c08
  2784. FMUL a3, b4, t4
  2785. LDF [AO + 2 * SIZE], a3
  2786. FADD c01, t1, c01
  2787. FMUL a4, b1, t1
  2788. LDF [BO + 0 * SIZE], b1
  2789. FADD c03, t2, c03
  2790. FMUL a4, b2, t2
  2791. LDF [BO + 1 * SIZE], b2
  2792. FADD c05, t3, c05
  2793. FMUL a4, b3, t3
  2794. LDF [BO + 2 * SIZE], b3
  2795. FADD c07, t4, c07
  2796. FMUL a4, b4, t4
  2797. LDF [BO + 3 * SIZE], b4
  /* Loop while L > 0; the load of a4 is in the branch delay slot. */
  2798. bg,pt %icc, .LL52
  2799. LDF [AO + 3 * SIZE], a4
  /* .LL55: k remainder for the 2x4 tile.  L = (KK & 3) for LT/RN, or
     (TEMP1 & 3) otherwise, where TEMP1 was set to K - KK in the .LL50
     setup.  When L is zero the loop is skipped. */
  2800. .LL55:
  2801. #if defined(LT) || defined(RN)
  2802. and KK, 3, L
  2803. #else
  2804. and TEMP1, 3, L
  2805. #endif
  2806. cmp L, 0
  2807. ble,a,pn %icc, .LL59
  2808. nop
  /* .LL56: one k-step per iteration (AO += 2 elements, BO += 4 elements),
     using the same pipelined FADD/FMUL pattern and the same accumulator
     mapping as .LL52.  The products left in t1..t4 after the last pass are
     folded in at .LL59. */
  2809. .LL56:
  2810. FADD c02, t1, c02
  2811. add AO, 2 * SIZE, AO
  2812. FMUL a1, b1, t1
  2813. add L, -1, L
  2814. add BO, 4 * SIZE, BO
  2815. FADD c04, t2, c04
  2816. cmp L, 0
  2817. FMUL a1, b2, t2
  2818. FADD c06, t3, c06
  2819. FMUL a1, b3, t3
  2820. FADD c08, t4, c08
  2821. FMUL a1, b4, t4
  2822. LDF [AO + 0 * SIZE], a1
  2823. FADD c01, t1, c01
  2824. FMUL a2, b1, t1
  2825. LDF [BO + 0 * SIZE], b1
  2826. FADD c03, t2, c03
  2827. FMUL a2, b2, t2
  2828. LDF [BO + 1 * SIZE], b2
  2829. FADD c05, t3, c05
  2830. FMUL a2, b3, t3
  2831. LDF [BO + 2 * SIZE], b3
  2832. FADD c07, t4, c07
  2833. FMUL a2, b4, t4
  2834. LDF [BO + 3 * SIZE], b4
  /* Loop while L > 0; the load of a2 is in the branch delay slot. */
  2835. bg,pt %icc, .LL56
  2836. LDF [AO + 1 * SIZE], a2
  /* .LL59: finish the 2x4 accumulation and form (packed value - product).
     For LN/RT, AO and BO are first re-pointed:
       TEMP1 = KK - 2 (LN) or KK - 4 (RT),
       AO = AORIG + (TEMP1 << (1 + BASE_SHIFT)),
       BO = B     + (TEMP1 << (2 + BASE_SHIFT)). */
  2837. .LL59:
  2838. #if defined(LN) || defined(RT)
  2839. #ifdef LN
  2840. sub KK, 2, TEMP1
  2841. #else
  2842. sub KK, 4, TEMP1
  2843. #endif
  2844. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  2845. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  2846. add AORIG, TEMP2, AO
  2847. add B, TEMP1, BO
  2848. #endif
  /* Fold in the last pending products from the pipelined loops. */
  2849. FADD c02, t1, c02
  2850. FADD c04, t2, c04
  2851. FADD c06, t3, c06
  2852. FADD c08, t4, c08
  /* LN/LT: the eight values come from BO, ordered c01, c03, c05, c07,
     c02, c04, c06, c08 (the same order in which they are stored back to BO
     after the solve).  RN/RT: they come from AO in register order.
     Each accumulator becomes (loaded value - accumulator), assuming the
     SPARC src1 - src2 -> dest order for the FSUB macro. */
  2853. #if defined(LN) || defined(LT)
  2854. LDF [BO + 0 * SIZE], a1
  2855. LDF [BO + 1 * SIZE], a2
  2856. LDF [BO + 2 * SIZE], a3
  2857. LDF [BO + 3 * SIZE], a4
  2858. LDF [BO + 4 * SIZE], b1
  2859. LDF [BO + 5 * SIZE], b2
  2860. LDF [BO + 6 * SIZE], b3
  2861. LDF [BO + 7 * SIZE], b4
  2862. FSUB a1, c01, c01
  2863. FSUB a2, c03, c03
  2864. FSUB a3, c05, c05
  2865. FSUB a4, c07, c07
  2866. FSUB b1, c02, c02
  2867. FSUB b2, c04, c04
  2868. FSUB b3, c06, c06
  2869. FSUB b4, c08, c08
  2870. #else
  2871. LDF [AO + 0 * SIZE], a1
  2872. LDF [AO + 1 * SIZE], a2
  2873. LDF [AO + 2 * SIZE], a3
  2874. LDF [AO + 3 * SIZE], a4
  2875. LDF [AO + 4 * SIZE], b1
  2876. LDF [AO + 5 * SIZE], b2
  2877. LDF [AO + 6 * SIZE], b3
  2878. LDF [AO + 7 * SIZE], b4
  2879. FSUB a1, c01, c01
  2880. FSUB a2, c02, c02
  2881. FSUB a3, c03, c03
  2882. FSUB a4, c04, c04
  2883. FSUB b1, c05, c05
  2884. FSUB b2, c06, c06
  2885. FSUB b3, c07, c07
  2886. FSUB b4, c08, c08
  2887. #endif
  /* LN/LT solve for the 2x4 tile using a 2x2 coefficient block at AO.
     The odd-numbered registers (c01, c03, c05, c07) are the first element
     of each C1..C4 pair and the even-numbered ones the second, as the
     later stores to C1..C4 show.
     LN: second element first - scale by AO[3], eliminate with AO[2],
         then scale the first element by AO[0] (AO[1] is not read). */
  2888. #ifdef LN
  2889. LDF [AO + 3 * SIZE], a1
  2890. LDF [AO + 2 * SIZE], a2
  2891. LDF [AO + 0 * SIZE], a3
  2892. FMUL a1, c02, c02
  2893. FMUL a1, c04, c04
  2894. FMUL a1, c06, c06
  2895. FMUL a1, c08, c08
  2896. FMUL a2, c02, t1
  2897. FMUL a2, c04, t2
  2898. FMUL a2, c06, t3
  2899. FMUL a2, c08, t4
  2900. FSUB c01, t1, c01
  2901. FSUB c03, t2, c03
  2902. FSUB c05, t3, c05
  2903. FSUB c07, t4, c07
  2904. FMUL a3, c01, c01
  2905. FMUL a3, c03, c03
  2906. FMUL a3, c05, c05
  2907. FMUL a3, c07, c07
  2908. #endif
  /* LT: first element first - scale by AO[0], eliminate with AO[1],
         then scale the second element by AO[3] (AO[2] is not read). */
  2909. #ifdef LT
  2910. LDF [AO + 0 * SIZE], a1
  2911. LDF [AO + 1 * SIZE], a2
  2912. LDF [AO + 3 * SIZE], a3
  2913. FMUL a1, c01, c01
  2914. FMUL a1, c03, c03
  2915. FMUL a1, c05, c05
  2916. FMUL a1, c07, c07
  2917. FMUL a2, c01, t1
  2918. FMUL a2, c03, t2
  2919. FMUL a2, c05, t3
  2920. FMUL a2, c07, t4
  2921. FSUB c02, t1, c02
  2922. FSUB c04, t2, c04
  2923. FSUB c06, t3, c06
  2924. FSUB c08, t4, c08
  2925. FMUL a3, c02, c02
  2926. FMUL a3, c04, c04
  2927. FMUL a3, c06, c06
  2928. FMUL a3, c08, c08
  2929. #endif
  /* RN/RT solve for the 2x4 tile using the 4x4 coefficient block at
     BO[0..15].  Register pairs: (c01,c02) go to C1, (c03,c04) to C2,
     (c05,c06) to C3 and (c07,c08) to C4 in the later stores.
     RN: forward order - pair 1 is scaled by BO[0] and eliminated from
         pairs 2..4 with BO[1..3]; then BO[5] / BO[6..7]; BO[10] / BO[11];
         finally BO[15]. */
  2930. #ifdef RN
  2931. LDF [BO + 0 * SIZE], a1
  2932. LDF [BO + 1 * SIZE], a2
  2933. LDF [BO + 2 * SIZE], a3
  2934. LDF [BO + 3 * SIZE], a4
  2935. FMUL a1, c01, c01
  2936. FMUL a1, c02, c02
  2937. FMUL a2, c01, t1
  2938. FMUL a2, c02, t2
  2939. FSUB c03, t1, c03
  2940. FSUB c04, t2, c04
  2941. FMUL a3, c01, t1
  2942. FMUL a3, c02, t2
  2943. FSUB c05, t1, c05
  2944. FSUB c06, t2, c06
  2945. FMUL a4, c01, t1
  2946. FMUL a4, c02, t2
  2947. FSUB c07, t1, c07
  2948. FSUB c08, t2, c08
  2949. LDF [BO + 5 * SIZE], a1
  2950. LDF [BO + 6 * SIZE], a2
  2951. LDF [BO + 7 * SIZE], a3
  2952. FMUL a1, c03, c03
  2953. FMUL a1, c04, c04
  2954. FMUL a2, c03, t1
  2955. FMUL a2, c04, t2
  2956. FSUB c05, t1, c05
  2957. FSUB c06, t2, c06
  2958. FMUL a3, c03, t1
  2959. FMUL a3, c04, t2
  2960. FSUB c07, t1, c07
  2961. FSUB c08, t2, c08
  2962. LDF [BO + 10 * SIZE], a1
  2963. LDF [BO + 11 * SIZE], a2
  2964. FMUL a1, c05, c05
  2965. FMUL a1, c06, c06
  2966. FMUL a2, c05, t1
  2967. FMUL a2, c06, t2
  2968. FSUB c07, t1, c07
  2969. FSUB c08, t2, c08
  2970. LDF [BO + 15 * SIZE], a1
  2971. FMUL a1, c07, c07
  2972. FMUL a1, c08, c08
  2973. #endif
  /* RT: the mirror image - pair 4 is scaled by BO[15] and eliminated
         from pairs 3..1 with BO[14..12]; then BO[10] / BO[9..8];
         BO[5] / BO[4]; finally BO[0]. */
  2974. #ifdef RT
  2975. LDF [BO + 15 * SIZE], a1
  2976. LDF [BO + 14 * SIZE], a2
  2977. LDF [BO + 13 * SIZE], a3
  2978. LDF [BO + 12 * SIZE], a4
  2979. FMUL a1, c07, c07
  2980. FMUL a1, c08, c08
  2981. FMUL a2, c07, t1
  2982. FMUL a2, c08, t2
  2983. FSUB c05, t1, c05
  2984. FSUB c06, t2, c06
  2985. FMUL a3, c07, t1
  2986. FMUL a3, c08, t2
  2987. FSUB c03, t1, c03
  2988. FSUB c04, t2, c04
  2989. FMUL a4, c07, t1
  2990. FMUL a4, c08, t2
  2991. FSUB c01, t1, c01
  2992. FSUB c02, t2, c02
  2993. LDF [BO + 10 * SIZE], a1
  2994. LDF [BO + 9 * SIZE], a2
  2995. LDF [BO + 8 * SIZE], a3
  2996. FMUL a1, c05, c05
  2997. FMUL a1, c06, c06
  2998. FMUL a2, c05, t1
  2999. FMUL a2, c06, t2
  3000. FSUB c03, t1, c03
  3001. FSUB c04, t2, c04
  3002. FMUL a3, c05, t1
  3003. FMUL a3, c06, t2
  3004. FSUB c01, t1, c01
  3005. FSUB c02, t2, c02
  3006. LDF [BO + 5 * SIZE], a1
  3007. LDF [BO + 4 * SIZE], a2
  3008. FMUL a1, c03, c03
  3009. FMUL a1, c04, c04
  3010. FMUL a2, c03, t1
  3011. FMUL a2, c04, t2
  3012. FSUB c01, t1, c01
  3013. FSUB c02, t2, c02
  3014. LDF [BO + 0 * SIZE], a1
  3015. FMUL a1, c01, c01
  3016. FMUL a1, c02, c02
  3017. #endif
  /* Write-back of the solved 2x4 tile (c01..c08) and pointer bookkeeping.
     For LN the C pointers are moved back by two elements before the
     stores; for every other variant they are advanced after the stores. */
  3018. #ifdef LN
  3019. add C1, -2 * SIZE, C1
  3020. add C2, -2 * SIZE, C2
  3021. add C3, -2 * SIZE, C3
  3022. add C4, -2 * SIZE, C4
  3023. #endif
  /* Copy the solved values back into the packed buffer: BO for LN/LT
     (first elements c01, c03, c05, c07, then second elements), AO for
     RN/RT (register order). */
  3024. #if defined(LN) || defined(LT)
  3025. STF c01, [BO + 0 * SIZE]
  3026. STF c03, [BO + 1 * SIZE]
  3027. STF c05, [BO + 2 * SIZE]
  3028. STF c07, [BO + 3 * SIZE]
  3029. STF c02, [BO + 4 * SIZE]
  3030. STF c04, [BO + 5 * SIZE]
  3031. STF c06, [BO + 6 * SIZE]
  3032. STF c08, [BO + 7 * SIZE]
  3033. #else
  3034. STF c01, [AO + 0 * SIZE]
  3035. STF c02, [AO + 1 * SIZE]
  3036. STF c03, [AO + 2 * SIZE]
  3037. STF c04, [AO + 3 * SIZE]
  3038. STF c05, [AO + 4 * SIZE]
  3039. STF c06, [AO + 5 * SIZE]
  3040. STF c07, [AO + 6 * SIZE]
  3041. STF c08, [AO + 7 * SIZE]
  3042. #endif
  /* Two consecutive elements through each of C1..C4. */
  3043. STF c01, [C1 + 0 * SIZE]
  3044. STF c02, [C1 + 1 * SIZE]
  3045. STF c03, [C2 + 0 * SIZE]
  3046. STF c04, [C2 + 1 * SIZE]
  3047. STF c05, [C3 + 0 * SIZE]
  3048. STF c06, [C3 + 1 * SIZE]
  3049. STF c07, [C4 + 0 * SIZE]
  3050. STF c08, [C4 + 1 * SIZE]
  /* Clear the product temporaries for the next tile. */
  3051. FMOV FZERO, t1
  3052. FMOV FZERO, t2
  3053. FMOV FZERO, t3
  3054. FMOV FZERO, t4
  3055. #ifndef LN
  3056. add C1, 2 * SIZE, C1
  3057. add C2, 2 * SIZE, C2
  3058. add C3, 2 * SIZE, C3
  3059. add C4, 2 * SIZE, C4
  3060. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  3061. #ifdef RT
  3062. sll K, 1 + BASE_SHIFT, TEMP1
  3063. add AORIG, TEMP1, AORIG
  3064. #endif
  /* LT/RN: skip the remaining K - KK steps:
     AO += (K - KK) << (1 + BASE_SHIFT), BO += (K - KK) << (2 + BASE_SHIFT). */
  3065. #if defined(LT) || defined(RN)
  3066. sub K, KK, TEMP1
  3067. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  3068. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  3069. add AO, TEMP2, AO
  3070. add BO, TEMP1, BO
  3071. #endif
  /* KK moves by the tile height: +2 for LT, -2 for LN. */
  3072. #ifdef LT
  3073. add KK, 2, KK
  3074. #endif
  3075. #ifdef LN
  3076. sub KK, 2, KK
  3077. #endif
  /* .LL70: remainder tile for bit 0 of M (one element through each of
     C1..C4).  Skipped to .LL99 when (M & 1) is zero.  This part sets up
     AO/BO and the trip count L, clears the accumulators and preloads the
     first operands. */
  3078. .LL70:
  3079. and M, 1, I
  3080. cmp I, 0
  3081. ble,pn %icc, .LL99
  3082. nop
  /* LT/RN: L = KK / 4 and BO restarts at B (AO is left as it is).
     LN/RT: AO = AORIG + (KK << BASE_SHIFT),
            BO = B + (KK << (2 + BASE_SHIFT)),
            TEMP1 = K - KK, L = TEMP1 / 4;
            for LN, AORIG is first moved back by K << BASE_SHIFT.
     TEMP1 is read again at .LL75 for the k remainder. */
  3083. #if defined(LT) || defined(RN)
  3084. sra KK, 2, L
  3085. mov B, BO
  3086. cmp L, 0
  3087. #else
  3088. #ifdef LN
  3089. sll K, 0 + BASE_SHIFT, TEMP1
  3090. sub AORIG, TEMP1, AORIG
  3091. #endif
  3092. sll KK, 0 + BASE_SHIFT, TEMP1
  3093. sll KK, 2 + BASE_SHIFT, TEMP2
  3094. add AORIG, TEMP1, AO
  3095. add B, TEMP2, BO
  3096. sub K, KK, TEMP1
  3097. sra TEMP1, 2, L
  3098. cmp L, 0
  3099. #endif
  /* Preload AO[0..3] into a1..a4 and BO[0..3] into b1..b4 while zeroing
     c01..c04 and t1..t4.  The integer flags from "cmp L, 0" are not
     modified by these floating-point loads and moves. */
  3100. LDF [AO + 0 * SIZE], a1
  3101. FMOV FZERO, c01
  3102. LDF [BO + 0 * SIZE], b1
  3103. FMOV FZERO, t1
  3104. LDF [AO + 1 * SIZE], a2
  3105. FMOV FZERO, c02
  3106. LDF [BO + 1 * SIZE], b2
  3107. FMOV FZERO, t2
  3108. LDF [AO + 2 * SIZE], a3
  3109. FMOV FZERO, c03
  3110. LDF [BO + 2 * SIZE], b3
  3111. FMOV FZERO, t3
  3112. LDF [AO + 3 * SIZE], a4
  3113. FMOV FZERO, c04
  3114. LDF [BO + 3 * SIZE], b4
  3115. FMOV FZERO, t4
  /* No full group of four k-steps: go straight to the remainder. */
  3116. ble,pn %icc, .LL75
  3117. nop
  /* .LL72: multiply-accumulate loop for the 1x4 tile, four k-steps per
     iteration (AO advances 4 elements, BO 16 elements, L counts down by 1).
     Each k-step multiplies one A value by b1..b4 and accumulates into
     c01..c04; a1, a2, a3, a4 are the A values of four consecutive k-steps.
     As in the wider tiles, every FADD consumes the product left in
     t1..t4 by an earlier FMUL, so instruction order matters.
     AO is bumped in the middle of the body and BO only at the end, which
     is why the B loads reach up to offset +19 while the A reloads use
     +1..+3 after the bump. */
  3118. .LL72:
  3119. FADD c01, t1, c01
  3120. add L, -1, L
  3121. FMUL a1, b1, t1
  3122. LDF [BO + 4 * SIZE], b1
  3123. FADD c02, t2, c02
  /* Loop condition; consumed by the bg at the bottom. */
  3124. cmp L, 0
  3125. FMUL a1, b2, t2
  3126. LDF [BO + 5 * SIZE], b2
  3127. FADD c03, t3, c03
  3128. FMUL a1, b3, t3
  3129. LDF [BO + 6 * SIZE], b3
  3130. FADD c04, t4, c04
  3131. FMUL a1, b4, t4
  3132. LDF [BO + 7 * SIZE], b4
  /* a1 for the next iteration is fetched before AO is advanced. */
  3133. LDF [AO + 4 * SIZE], a1
  3134. FADD c01, t1, c01
  3135. add AO, 4 * SIZE, AO
  3136. FMUL a2, b1, t1
  3137. LDF [BO + 8 * SIZE], b1
  3138. FADD c02, t2, c02
  3139. FMUL a2, b2, t2
  3140. LDF [BO + 9 * SIZE], b2
  3141. FADD c03, t3, c03
  3142. FMUL a2, b3, t3
  3143. LDF [BO + 10 * SIZE], b3
  3144. FADD c04, t4, c04
  3145. FMUL a2, b4, t4
  3146. LDF [BO + 11 * SIZE], b4
  3147. LDF [AO + 1 * SIZE], a2
  3148. FADD c01, t1, c01
  3149. FMUL a3, b1, t1
  3150. LDF [BO + 12 * SIZE], b1
  3151. FADD c02, t2, c02
  3152. FMUL a3, b2, t2
  3153. LDF [BO + 13 * SIZE], b2
  3154. FADD c03, t3, c03
  3155. FMUL a3, b3, t3
  3156. LDF [BO + 14 * SIZE], b3
  3157. FADD c04, t4, c04
  3158. FMUL a3, b4, t4
  3159. LDF [BO + 15 * SIZE], b4
  3160. LDF [AO + 2 * SIZE], a3
  3161. FADD c01, t1, c01
  3162. FMUL a4, b1, t1
  3163. LDF [BO + 16 * SIZE], b1
  3164. FADD c02, t2, c02
  3165. FMUL a4, b2, t2
  3166. LDF [BO + 17 * SIZE], b2
  3167. FADD c03, t3, c03
  3168. FMUL a4, b3, t3
  3169. LDF [BO + 18 * SIZE], b3
  3170. FADD c04, t4, c04
  3171. FMUL a4, b4, t4
  3172. LDF [BO + 19 * SIZE], b4
  3173. add BO, 16 * SIZE, BO
  /* Loop while L > 0; the load of a4 is in the branch delay slot. */
  3174. bg,pt %icc, .LL72
  3175. LDF [AO + 3 * SIZE], a4
  /* .LL75: k remainder for the 1x4 tile.  L = (KK & 3) for LT/RN, or
     (TEMP1 & 3) otherwise, where TEMP1 was set to K - KK in the .LL70
     setup.  When L is zero the loop is skipped. */
  3176. .LL75:
  3177. #if defined(LT) || defined(RN)
  3178. and KK, 3, L
  3179. #else
  3180. and TEMP1, 3, L
  3181. #endif
  3182. cmp L, 0
  3183. ble,a,pn %icc, .LL79
  3184. nop
  /* .LL76: one k-step per iteration (AO += 1 element, BO += 4 elements).
     b1..b3 are reloaded from offsets +4..+6 before BO is advanced; b4 is
     reloaded from +3 after the advance, in the branch delay slot, which
     addresses the same group of four. */
  3185. .LL76:
  3186. FADD c01, t1, c01
  3187. add AO, 1 * SIZE, AO
  3188. FMUL a1, b1, t1
  3189. LDF [BO + 4 * SIZE], b1
  3190. FADD c02, t2, c02
  3191. add L, -1, L
  3192. FMUL a1, b2, t2
  3193. LDF [BO + 5 * SIZE], b2
  3194. FADD c03, t3, c03
  3195. cmp L, 0
  3196. FMUL a1, b3, t3
  3197. LDF [BO + 6 * SIZE], b3
  3198. FADD c04, t4, c04
  3199. add BO, 4 * SIZE, BO
  3200. FMUL a1, b4, t4
  3201. LDF [AO + 0 * SIZE], a1
  3202. bg,pt %icc, .LL76
  3203. LDF [BO + 3 * SIZE], b4
  /* .LL79: finish the 1x4 accumulation and form (packed value - product).
     First fold in the products still pending in t1..t4. */
  3204. .LL79:
  3205. FADD c01, t1, c01
  3206. FADD c02, t2, c02
  3207. FADD c03, t3, c03
  3208. FADD c04, t4, c04
  /* LN/RT: re-point AO and BO:
       TEMP1 = KK - 1 (LN) or KK - 4 (RT),
       AO = AORIG + (TEMP1 << BASE_SHIFT),
       BO = B     + (TEMP1 << (2 + BASE_SHIFT)). */
  3209. #if defined(LN) || defined(RT)
  3210. #ifdef LN
  3211. sub KK, 1, TEMP1
  3212. #else
  3213. sub KK, 4, TEMP1
  3214. #endif
  3215. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  3216. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  3217. add AORIG, TEMP2, AO
  3218. add B, TEMP1, BO
  3219. #endif
  /* The four values are read from BO (LN/LT) or AO (RN/RT) and each
     accumulator becomes (loaded value - accumulator), assuming the SPARC
     src1 - src2 -> dest order for the FSUB macro. */
  3220. #if defined(LN) || defined(LT)
  3221. LDF [BO + 0 * SIZE], a1
  3222. LDF [BO + 1 * SIZE], a2
  3223. LDF [BO + 2 * SIZE], a3
  3224. LDF [BO + 3 * SIZE], a4
  3225. FSUB a1, c01, c01
  3226. FSUB a2, c02, c02
  3227. FSUB a3, c03, c03
  3228. FSUB a4, c04, c04
  3229. #else
  3230. LDF [AO + 0 * SIZE], a1
  3231. LDF [AO + 1 * SIZE], a2
  3232. LDF [AO + 2 * SIZE], a3
  3233. LDF [AO + 3 * SIZE], a4
  3234. FSUB a1, c01, c01
  3235. FSUB a2, c02, c02
  3236. FSUB a3, c03, c03
  3237. FSUB a4, c04, c04
  3238. #endif
  /* Solve step for the 1x4 tile (c01..c04 go to C1..C4 respectively in
     the later stores).
     LN and LT are identical here: with a single A element the solve is
     just a scale of all four values by AO[0]. */
  3239. #ifdef LN
  3240. LDF [AO + 0 * SIZE], a1
  3241. FMUL a1, c01, c01
  3242. FMUL a1, c02, c02
  3243. FMUL a1, c03, c03
  3244. FMUL a1, c04, c04
  3245. #endif
  3246. #ifdef LT
  3247. LDF [AO + 0 * SIZE], a1
  3248. FMUL a1, c01, c01
  3249. FMUL a1, c02, c02
  3250. FMUL a1, c03, c03
  3251. FMUL a1, c04, c04
  3252. #endif
  /* RN: forward order through the 4x4 block at BO - c01 is scaled by
     BO[0] and eliminated from c02..c04 with BO[1..3]; then BO[5] /
     BO[6..7]; BO[10] / BO[11]; finally BO[15].  Only t1 is needed as a
     temporary because each step handles a single value. */
  3253. #ifdef RN
  3254. LDF [BO + 0 * SIZE], a1
  3255. LDF [BO + 1 * SIZE], a2
  3256. LDF [BO + 2 * SIZE], a3
  3257. LDF [BO + 3 * SIZE], a4
  3258. FMUL a1, c01, c01
  3259. FMUL a2, c01, t1
  3260. FSUB c02, t1, c02
  3261. FMUL a3, c01, t1
  3262. FSUB c03, t1, c03
  3263. FMUL a4, c01, t1
  3264. FSUB c04, t1, c04
  3265. LDF [BO + 5 * SIZE], a1
  3266. LDF [BO + 6 * SIZE], a2
  3267. LDF [BO + 7 * SIZE], a3
  3268. FMUL a1, c02, c02
  3269. FMUL a2, c02, t1
  3270. FSUB c03, t1, c03
  3271. FMUL a3, c02, t1
  3272. FSUB c04, t1, c04
  3273. LDF [BO + 10 * SIZE], a1
  3274. LDF [BO + 11 * SIZE], a2
  3275. FMUL a1, c03, c03
  3276. FMUL a2, c03, t1
  3277. FSUB c04, t1, c04
  3278. LDF [BO + 15 * SIZE], a1
  3279. FMUL a1, c04, c04
  3280. #endif
  /* RT: the mirror image - c04 is scaled by BO[15] and eliminated from
     c03..c01 with BO[14..12]; then BO[10] / BO[9..8]; BO[5] / BO[4];
     finally BO[0]. */
  3281. #ifdef RT
  3282. LDF [BO + 15 * SIZE], a1
  3283. LDF [BO + 14 * SIZE], a2
  3284. LDF [BO + 13 * SIZE], a3
  3285. LDF [BO + 12 * SIZE], a4
  3286. FMUL a1, c04, c04
  3287. FMUL a2, c04, t1
  3288. FSUB c03, t1, c03
  3289. FMUL a3, c04, t1
  3290. FSUB c02, t1, c02
  3291. FMUL a4, c04, t1
  3292. FSUB c01, t1, c01
  3293. LDF [BO + 10 * SIZE], a1
  3294. LDF [BO + 9 * SIZE], a2
  3295. LDF [BO + 8 * SIZE], a3
  3296. FMUL a1, c03, c03
  3297. FMUL a2, c03, t1
  3298. FSUB c02, t1, c02
  3299. FMUL a3, c03, t1
  3300. FSUB c01, t1, c01
  3301. LDF [BO + 5 * SIZE], a1
  3302. LDF [BO + 4 * SIZE], a2
  3303. FMUL a1, c02, c02
  3304. FMUL a2, c02, t1
  3305. FSUB c01, t1, c01
  3306. LDF [BO + 0 * SIZE], a1
  3307. FMUL a1, c01, c01
  3308. #endif
  /* Write-back of the solved 1x4 tile (c01..c04) and pointer bookkeeping.
     For LN the C pointers are moved back by one element before the
     stores; for every other variant they are advanced after the stores. */
  3309. #ifdef LN
  3310. add C1, -1 * SIZE, C1
  3311. add C2, -1 * SIZE, C2
  3312. add C3, -1 * SIZE, C3
  3313. add C4, -1 * SIZE, C4
  3314. #endif
  /* Copy the solved values back into the packed buffer: BO for LN/LT,
     AO for RN/RT. */
  3315. #if defined(LN) || defined(LT)
  3316. STF c01, [BO + 0 * SIZE]
  3317. STF c02, [BO + 1 * SIZE]
  3318. STF c03, [BO + 2 * SIZE]
  3319. STF c04, [BO + 3 * SIZE]
  3320. #else
  3321. STF c01, [AO + 0 * SIZE]
  3322. STF c02, [AO + 1 * SIZE]
  3323. STF c03, [AO + 2 * SIZE]
  3324. STF c04, [AO + 3 * SIZE]
  3325. #endif
  /* One element through each of C1..C4. */
  3326. STF c01, [C1 + 0 * SIZE]
  3327. STF c02, [C2 + 0 * SIZE]
  3328. STF c03, [C3 + 0 * SIZE]
  3329. STF c04, [C4 + 0 * SIZE]
  /* Clear the product temporaries. */
  3330. FMOV FZERO, t1
  3331. FMOV FZERO, t2
  3332. FMOV FZERO, t3
  3333. FMOV FZERO, t4
  3334. #ifndef LN
  3335. add C1, 1 * SIZE, C1
  3336. add C2, 1 * SIZE, C2
  3337. add C3, 1 * SIZE, C3
  3338. add C4, 1 * SIZE, C4
  3339. #endif
  /* RT: AORIG += K << BASE_SHIFT. */
  3340. #ifdef RT
  3341. sll K, 0 + BASE_SHIFT, TEMP1
  3342. add AORIG, TEMP1, AORIG
  3343. #endif
  /* LT/RN: skip the remaining K - KK steps:
     AO += (K - KK) << BASE_SHIFT, BO += (K - KK) << (2 + BASE_SHIFT). */
  3344. #if defined(LT) || defined(RN)
  3345. sub K, KK, TEMP1
  3346. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  3347. sll TEMP1, 2 + BASE_SHIFT, TEMP1
  3348. add AO, TEMP2, AO
  3349. add BO, TEMP1, BO
  3350. #endif
  /* KK moves by the tile height: +1 for LT, -1 for LN. */
  3351. #ifdef LT
  3352. add KK, 1, KK
  3353. #endif
  3354. #ifdef LN
  3355. sub KK, 1, KK
  3356. #endif
  /* .LL99: end of one pass of the loop that branches back to .LL11
     (label outside this excerpt).  Updates B and KK for the variant in
     use, then counts J down. */
  3357. .LL99:
  /* LN: B += K << (2 + BASE_SHIFT). */
  3358. #ifdef LN
  3359. sll K, 2 + BASE_SHIFT, TEMP1
  3360. add B, TEMP1, B
  3361. #endif
  /* LT/RN: B continues from where BO ended up. */
  3362. #if defined(LT) || defined(RN)
  3363. mov BO, B
  3364. #endif
  /* KK moves by four: +4 for RN, -4 for RT. */
  3365. #ifdef RN
  3366. add KK, 4, KK
  3367. #endif
  3368. #ifdef RT
  3369. sub KK, 4, KK
  3370. #endif
  /* Loop while J > 0 (nop in the delay slot). */
  3371. add J, -1, J
  3372. cmp J, 0
  3373. bg,pt %icc, .LL11
  3374. nop
  /* .LL999: common exit.  "clr %o0" is in the delay slot of the return,
     so the routine hands back 0 in %o0.  EPILOGUE is a macro defined
     outside this excerpt. */
  3375. .LL999:
  3376. return %i7 + 8
  3377. clr %o0
  3378. EPILOGUE