You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm_kernel.S 50 kB


/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * Integer register roles used by the kernel below.
   * M, N, K and the A/B panel pointers are read from %i registers; the
   * remaining aliases are %l/%o registers (plus %i3) that the routine
   * fills in itself.
   */
#define ASSEMBLER
#include "common.h"

/* Problem dimensions. */
#define M	%i0
#define N	%i1
#define K	%i2

/*
 * Panel pointers.  In the 32-bit DOUBLE build the entry code spills
 * %i3/%i4 as the two words of ALPHA and then reloads B from the stack,
 * so A is taken from %i5 and B is aliased onto %i4.
 */
#if defined(DOUBLE) && !defined(__64BIT__)
#define A	%i5
#define B	%i4
#else
#define A	%i4
#define B	%i5
#endif

/* Output matrix pointer and its leading dimension (loaded from the stack at entry). */
#define C	%o4
#define LDC	%o5

/* Running pointers into the A and B panels. */
#define AO	%l0
#define BO	%l1

/* Loop counters: I walks row tiles, J walks column tiles, L walks k. */
#define I	%l2
#define J	%l3
#define L	%l4

/* Pointers to the (up to four) columns of C being updated. */
#define C1	%o0
#define C2	%o1
#define C3	%o2
#define C4	%o3

/* TRMMKERNEL-only offset bookkeeping, and address scratch registers. */
#define OFFSET	%l5
#define KK	%l6
#define TEMP1	%l7
#define TEMP2	%i3
  /*
   * Floating-point register roles.
   *   c01..c16  accumulators for the C tile
   *   t1..t4    products waiting to be added into an accumulator
   *   a1..a5    values loaded from the A panel
   *   b1..b5    values loaded from the B panel
   *   FZERO     source of zero for clearing registers
   *   ALPHA     scaling factor applied before the result is stored
   * The DOUBLE build uses even-numbered registers %f0..%f62; the
   * single-precision build packs the same roles into %f0..%f31.
   */
#ifdef DOUBLE
#define c01	%f0
#define c02	%f2
#define c03	%f4
#define c04	%f6
#define c05	%f8
#define c06	%f10
#define c07	%f12
#define c08	%f14
#define c09	%f16
#define c10	%f18
#define c11	%f20
#define c12	%f22
#define c13	%f24
#define c14	%f26
#define c15	%f28
#define c16	%f30
#define t1	%f32
#define t2	%f34
#define t3	%f36
#define t4	%f38
#define a1	%f40
#define a2	%f42
#define a3	%f44
#define a4	%f46
#define a5	%f58
#define b1	%f48
#define b2	%f50
#define b3	%f52
#define b4	%f54
#define b5	%f56
#define FZERO	%f60
#define ALPHA	%f62
#else
#define c01	%f0
#define c02	%f1
#define c03	%f2
#define c04	%f3
#define c05	%f4
#define c06	%f5
#define c07	%f6
#define c08	%f7
#define c09	%f8
#define c10	%f9
#define c11	%f10
#define c12	%f11
#define c13	%f12
#define c14	%f13
#define c15	%f14
#define c16	%f15
#define t1	%f16
#define t2	%f17
#define t3	%f18
#define t4	%f19
#define a1	%f20
#define a2	%f21
#define a3	%f22
#define a4	%f23
#define a5	%f31
#define b1	%f24
#define b2	%f25
#define b3	%f26
#define b4	%f27
#define b5	%f28
#define FZERO	%f29
#define ALPHA	%f30
#endif
  /*
   * Kernel entry: fetch the arguments that are not already in the
   * aliased registers, set up ALPHA and the zero register, and start
   * the loop over groups of four columns (J = N >> 2).
   *
   * PROLOGUE, SAVESP, STACK_START, LDF, FMOV and FCLR come from
   * common.h and are not visible here.
   */
	PROLOGUE
	SAVESP
	nop

#ifndef __64BIT__
#ifdef DOUBLE
	/* Spill both halves of ALPHA so it can be reloaded as one FP value. */
	st	%i3, [%sp + STACK_START + 16]	/* ALPHA */
	st	%i4, [%sp + STACK_START + 20]
	ld	[%sp + STACK_START + 28], B
	ld	[%sp + STACK_START + 32], C
	ld	[%sp + STACK_START + 36], LDC
#ifdef TRMMKERNEL
	ld	[%sp + STACK_START + 40], OFFSET
#endif
#else
	st	%i3, [%sp + STACK_START + 16]	/* ALPHA */
	ld	[%sp + STACK_START + 28], C
	ld	[%sp + STACK_START + 32], LDC
#ifdef TRMMKERNEL
	ld	[%sp + STACK_START + 36], OFFSET
#endif
#endif
	LDF	[%sp + STACK_START + 16], ALPHA
#else
	ldx	[%sp + STACK_START + 56], C
	ldx	[%sp + STACK_START + 64], LDC
#ifdef TRMMKERNEL
	ldx	[%sp + STACK_START + 72], OFFSET
#endif
	/* 64-bit build: ALPHA is copied from an FP register instead of the stack. */
#ifdef DOUBLE
	FMOV	%f6, ALPHA
#else
	FMOV	%f7, ALPHA
#endif
#endif

	/* NOTE(review): presumably clears the register aliased as FZERO -- confirm against FCLR in common.h. */
	FCLR(29)

#if defined(TRMMKERNEL) && !defined(LEFT)
	neg	OFFSET, KK
#endif

	sra	N, 2, J
	cmp	J, 0
	ble,pn	%icc, .LL100
	sll	LDC, BASE_SHIFT, LDC		/* delay slot (not annulled): scale LDC on both paths */
  /*
   * .LL11 -- head of the loop over groups of four columns of C.
   * Sets C1..C4 to four consecutive columns (LDC apart), advances C past
   * them, rewinds AO to the start of A, clears t1..t4 and computes
   * L = K >> 2 and I = M >> 2.  If there is no full group of four rows
   * (I <= 0) control goes to the 2-row case at .LL50.
   */
.LL11:
	add	C, LDC, C2
	FMOV	FZERO, t1
	nop
	mov	C, C1
	add	C2, LDC, C3
	FMOV	FZERO, t2
	sra	K, 2, L
	mov	A, AO
	sra	M, 2, I
	add	C3, LDC, C4
	FMOV	FZERO, t3
#if defined(TRMMKERNEL) && defined(LEFT)
	mov	OFFSET, KK
#endif
	cmp	I, 0
	add	C4, LDC, C
	FMOV	FZERO, t4
	ble,pn	%icc, .LL50
	FMOV	FZERO, c01			/* delay slot */
  /*
   * .LL21 -- set up one 4x4 tile of C.
   * Selects the B panel position (and, for TRMMKERNEL, the AO/BO offset
   * and trip count derived from KK), clears c02..c16, preloads a1..a5 and
   * b1..b5, and prefetches the four C columns.  The "cmp L, 0" issued
   * here is consumed by the branch at the end: no instruction in between
   * writes the integer condition codes.
   */
.LL21:
#if !defined(TRMMKERNEL)
	FMOV	FZERO, c02
	mov	B, BO
	FMOV	FZERO, c03
	cmp	L, 0
#else
	FMOV	FZERO, c02
	FMOV	FZERO, c03
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	/* Skip KK k-steps in both panels (4 elements per step each). */
	sll	KK, 2 + BASE_SHIFT, TEMP1
	add	AO, TEMP1, AO
	add	B, TEMP1, BO
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 4, L
#else
	add	KK, 4, L
#endif
	sra	L, 2, L
	cmp	L, 0
#endif
	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, c04
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, c05
	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, c06
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, c07
	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c08
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, c09
	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c10
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, c11
	LDF	[BO + 4 * SIZE], b5	/* ***** */
	LDF	[AO + 4 * SIZE], a5	/* ***** */
	prefetch [C1 + 3 * SIZE], 3
	FMOV	FZERO, c12
	prefetch [C2 + 3 * SIZE], 3
	FMOV	FZERO, c13
	prefetch [C3 + 3 * SIZE], 3
	FMOV	FZERO, c14
	prefetch [C4 + 3 * SIZE], 3
	FMOV	FZERO, c15
	ble,pn	%icc, .LL25
	FMOV	FZERO, c16			/* delay slot */

/* Prefetch distance (in elements) and prefetch function code for the A/B streams. */
#define APREFETCHSIZE 40
#define BPREFETCHSIZE 40
#define APREFETCH_CATEGORY 0
#define BPREFETCH_CATEGORY 0
  /*
   * .LL22 -- 4x4 main loop, four k-steps per pass.
   * Each pass advances AO and BO by 16 * SIZE and decrements L once.
   * The code is software-pipelined: every FMUL writes one of t1..t4 and
   * the matching FADD that folds it into an accumulator appears four
   * instructions later (the last four products of a pass are folded in
   * at the top of the next pass, or after the loop exits).
   * Accumulator layout: c01..c04 = a1..a4 * b1, c05..c08 = a1..a4 * b2,
   * c09..c12 = a1..a4 * b3, c13..c16 = a1..a4 * b4.
   * a1/b1 and a5/b5 alternate as the first A/B operand of successive
   * k-steps so the next step's values can be loaded early.
   * Instruction order is deliberate; do not reorder.
   */
.LL22:
	/* k-step 0: leading operands a1, b1 */
	FADD	c04, t1, c04
	prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
	FMUL	a1, b1, t1
	nop
	FADD	c08, t2, c08
	prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
	FMUL	a1, b2, t2
	add	AO, 16 * SIZE, AO
	FADD	c12, t3, c12
	LDF	[AO - 13 * SIZE], a4
	FMUL	a1, b3, t3
	add	BO, 16 * SIZE, BO
	FADD	c16, t4, c16
	nop
	FMUL	a1, b4, t4
	LDF	[AO - 8 * SIZE], a1
	FADD	c01, t1, c01
	nop
	FMUL	a2, b1, t1
	nop
	FADD	c05, t2, c05
	nop
	FMUL	a2, b2, t2
	nop
	FADD	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop
	FADD	c13, t4, c13
	add	L, -1, L
	FMUL	a2, b4, t4
	LDF	[AO - 11 * SIZE], a2
	FADD	c02, t1, c02
	nop
	FMUL	a3, b1, t1
	nop
	FADD	c06, t2, c06
	nop
	FMUL	a3, b2, t2
	nop
	FADD	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop
	FADD	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO - 10 * SIZE], a3
	FADD	c03, t1, c03
	nop
	FMUL	a4, b1, t1
	LDF	[BO - 8 * SIZE], b1
	FADD	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO - 11 * SIZE], b2
	FADD	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 10 * SIZE], b3
	FADD	c15, t4, c15
	nop
	FMUL	a4, b4, t4
	LDF	[BO - 9 * SIZE], b4

	/* k-step 1: leading operands a5, b5 */
	FADD	c04, t1, c04
	nop
	FMUL	a5, b5, t1
	LDF	[AO - 9 * SIZE], a4
	FADD	c08, t2, c08
	nop
	FMUL	a5, b2, t2
	nop
	FADD	c12, t3, c12
	nop
	FMUL	a5, b3, t3
	nop
	FADD	c16, t4, c16
	nop
	FMUL	a5, b4, t4
	LDF	[AO - 4 * SIZE], a5
	FADD	c01, t1, c01
	nop
	FMUL	a2, b5, t1
	nop
	FADD	c05, t2, c05
	nop
	FMUL	a2, b2, t2
	nop
	FADD	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop
	FADD	c13, t4, c13
	nop
	FMUL	a2, b4, t4
	LDF	[AO - 7 * SIZE], a2
	FADD	c02, t1, c02
	nop
	FMUL	a3, b5, t1
	nop
	FADD	c06, t2, c06
	nop
	FMUL	a3, b2, t2
	nop
	FADD	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop
	FADD	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO - 6 * SIZE], a3
	FADD	c03, t1, c03
	nop
	FMUL	a4, b5, t1
	LDF	[BO - 4 * SIZE], b5
	FADD	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO - 7 * SIZE], b2
	FADD	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 6 * SIZE], b3
	FADD	c15, t4, c15
	nop
	FMUL	a4, b4, t4
	LDF	[BO - 5 * SIZE], b4

	/* k-step 2: leading operands a1, b1 */
	FADD	c04, t1, c04
	nop
	FMUL	a1, b1, t1
	LDF	[AO - 5 * SIZE], a4
	FADD	c08, t2, c08
	nop
	FMUL	a1, b2, t2
	nop
	FADD	c12, t3, c12
	nop
	FMUL	a1, b3, t3
	nop
	FADD	c16, t4, c16
	nop
	FMUL	a1, b4, t4
	LDF	[AO - 0 * SIZE], a1
	FADD	c01, t1, c01
	nop
	FMUL	a2, b1, t1
	nop
#ifdef DOUBLE
	prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
#else
	nop
#endif
	FADD	c05, t2, c05
	nop
	FMUL	a2, b2, t2
	FADD	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop
	FADD	c13, t4, c13
	nop
	FMUL	a2, b4, t4
	nop
	FADD	c02, t1, c02
	nop
	FMUL	a3, b1, t1
	LDF	[AO - 3 * SIZE], a2
	FADD	c06, t2, c06
#ifdef DOUBLE
	prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
#else
	nop
#endif
	FMUL	a3, b2, t2
	nop
	FADD	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop
	FADD	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO - 2 * SIZE], a3
	FADD	c03, t1, c03
	nop
	FMUL	a4, b1, t1
	LDF	[BO - 0 * SIZE], b1
	FADD	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO - 3 * SIZE], b2
	FADD	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 2 * SIZE], b3
	FADD	c15, t4, c15
	nop
	FMUL	a4, b4, t4
	LDF	[BO - 1 * SIZE], b4

	/* k-step 3: leading operands a5, b5 */
	FADD	c04, t1, c04
	nop
	FMUL	a5, b5, t1
	LDF	[AO - 1 * SIZE], a4
	FADD	c08, t2, c08
	FMUL	a5, b2, t2
	FADD	c12, t3, c12
	FMUL	a5, b3, t3
	FADD	c16, t4, c16
	nop
	FMUL	a5, b4, t4
	LDF	[AO + 4 * SIZE], a5
	FADD	c01, t1, c01
	nop
	FMUL	a2, b5, t1
	nop
	FADD	c05, t2, c05
	nop
	FMUL	a2, b2, t2
	nop
	FADD	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop
	FADD	c13, t4, c13
	nop
	FMUL	a2, b4, t4
	LDF	[AO + 1 * SIZE], a2
	FADD	c02, t1, c02
	nop
	FMUL	a3, b5, t1
	nop
	FADD	c06, t2, c06
	nop
	FMUL	a3, b2, t2
	nop
	FADD	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop
	FADD	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO + 2 * SIZE], a3
	FADD	c03, t1, c03
	cmp	L, 0
	FMUL	a4, b5, t1
	LDF	[BO + 4 * SIZE], b5
	FADD	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO + 1 * SIZE], b2
	FADD	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3
	FADD	c15, t4, c15
	FMUL	a4, b4, t4
	bg,pt	%icc, .LL22
	LDF	[BO + 3 * SIZE], b4		/* delay slot */
  /*
   * .LL25 / .LL26 -- 4x4 k-remainder.
   * L is recomputed as the low two bits of the k trip count (K, or the
   * TRMMKERNEL count derived from KK); .LL26 then performs one k-step
   * per pass, advancing AO and BO by 4 * SIZE each.  Pipelining through
   * t1..t4 is the same as in the unrolled loop.
   */
.LL25:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 4, L
#else
	add	KK, 4, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL29
	nop

.LL26:
	FADD	c04, t1, c04
	LDF	[AO + 3 * SIZE], a4
	FMUL	a1, b1, t1
	add	AO, 4 * SIZE, AO
	FADD	c08, t2, c08
	add	BO, 4 * SIZE, BO
	FMUL	a1, b2, t2
	add	L, -1, L
	FADD	c12, t3, c12
	nop
	FMUL	a1, b3, t3
	cmp	L, 0
	FADD	c16, t4, c16
	nop
	FMUL	a1, b4, t4
	LDF	[AO + 0 * SIZE], a1
	FADD	c01, t1, c01
	nop
	FMUL	a2, b1, t1
	nop
	FADD	c05, t2, c05
	nop
	FMUL	a2, b2, t2
	nop
	FADD	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop
	FADD	c13, t4, c13
	nop
	FMUL	a2, b4, t4
	LDF	[AO + 1 * SIZE], a2
	FADD	c02, t1, c02
	nop
	FMUL	a3, b1, t1
	nop
	FADD	c06, t2, c06
	nop
	FMUL	a3, b2, t2
	nop
	FADD	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop
	FADD	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO + 2 * SIZE], a3
	FADD	c03, t1, c03
	nop
	FMUL	a4, b1, t1
	LDF	[BO + 0 * SIZE], b1
	FADD	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO + 1 * SIZE], b2
	FADD	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3
	FADD	c15, t4, c15
	FMUL	a4, b4, t4
	bg,pt	%icc, .LL26
	LDF	[BO + 3 * SIZE], b4		/* delay slot */
  /*
   * .LL29 -- finish and store one 4x4 tile.
   * Folds the four outstanding products (t1..t4) into c04/c08/c12/c16 and
   * scales every accumulator by ALPHA.
   *   GEMM build: adds the existing C values before storing.
   *   TRMMKERNEL build: stores ALPHA * acc directly, then moves AO/BO
   *   and KK on to the next tile.
   * In both builds C1..C4 advance by 4 * SIZE, t1..t4 are cleared, I is
   * decremented, and the loop returns to .LL21 while I > 0.
   */
.LL29:
#ifndef TRMMKERNEL
	FADD	c04, t1, c04
	add	I, -1, I
	FMUL	c01, ALPHA, c01
	LDF	[C1 + 0 * SIZE], a1
	FADD	c08, t2, c08
	cmp	I, 0
	FMUL	c02, ALPHA, c02
	LDF	[C1 + 1 * SIZE], a2
	FADD	c12, t3, c12
	nop
	FMUL	c03, ALPHA, c03
	LDF	[C1 + 2 * SIZE], a3
	FADD	c16, t4, c16
	nop
	FMUL	c04, ALPHA, c04
	LDF	[C1 + 3 * SIZE], a4
	FMUL	c05, ALPHA, c05
	LDF	[C2 + 0 * SIZE], b1
	FMUL	c06, ALPHA, c06
	LDF	[C2 + 1 * SIZE], b2
	FMUL	c07, ALPHA, c07
	LDF	[C2 + 2 * SIZE], b3
	FMUL	c08, ALPHA, c08
	LDF	[C2 + 3 * SIZE], b4
	FMUL	c09, ALPHA, c09
	LDF	[C3 + 0 * SIZE], t1
	FMUL	c10, ALPHA, c10
	LDF	[C3 + 1 * SIZE], t2
	FMUL	c11, ALPHA, c11
	LDF	[C3 + 2 * SIZE], t3
	FMUL	c12, ALPHA, c12
	LDF	[C3 + 3 * SIZE], t4
	FMUL	c13, ALPHA, c13
	/* C1..C3 are bumped early; their stores below use negative offsets. */
	add	C1, 4 * SIZE, C1
	FADD	c01, a1, c01
	LDF	[C4 + 0 * SIZE], a1
	FMUL	c14, ALPHA, c14
	add	C2, 4 * SIZE, C2
	FADD	c02, a2, c02
	LDF	[C4 + 1 * SIZE], a2
	FMUL	c15, ALPHA, c15
	add	C3, 4 * SIZE, C3
	FADD	c03, a3, c03
	LDF	[C4 + 2 * SIZE], a3
	FMUL	c16, ALPHA, c16
	nop
	FADD	c04, a4, c04
	LDF	[C4 + 3 * SIZE], a4
	STF	c01, [C1 - 4 * SIZE]
	FADD	c05, b1, c05
	STF	c02, [C1 - 3 * SIZE]
	FADD	c06, b2, c06
	STF	c03, [C1 - 2 * SIZE]
	FADD	c07, b3, c07
	STF	c04, [C1 - 1 * SIZE]
	FADD	c08, b4, c08
	STF	c05, [C2 - 4 * SIZE]
	FADD	c09, t1, c09
	STF	c06, [C2 - 3 * SIZE]
	FADD	c10, t2, c10
	STF	c07, [C2 - 2 * SIZE]
	FADD	c11, t3, c11
	STF	c08, [C2 - 1 * SIZE]
	FADD	c12, t4, c12
	STF	c09, [C3 - 4 * SIZE]
	FADD	c13, a1, c13
	STF	c10, [C3 - 3 * SIZE]
	FADD	c14, a2, c14
	STF	c11, [C3 - 2 * SIZE]
	FADD	c15, a3, c15
	STF	c12, [C3 - 1 * SIZE]
	FADD	c16, a4, c16
	STF	c13, [C4 + 0 * SIZE]
	FMOV	FZERO, t1
	STF	c14, [C4 + 1 * SIZE]
	FMOV	FZERO, t2
	STF	c15, [C4 + 2 * SIZE]
	FMOV	FZERO, t3
	STF	c16, [C4 + 3 * SIZE]
	FMOV	FZERO, t4
	add	C4, 4 * SIZE, C4
#else
	FADD	c04, t1, c04
	FMUL	c01, ALPHA, c01
	FADD	c08, t2, c08
	FMUL	c02, ALPHA, c02
	FADD	c12, t3, c12
	FMUL	c03, ALPHA, c03
	FADD	c16, t4, c16
	FMUL	c04, ALPHA, c04
	STF	c01, [C1 + 0 * SIZE]
	FMUL	c05, ALPHA, c05
	STF	c02, [C1 + 1 * SIZE]
	FMUL	c06, ALPHA, c06
	STF	c03, [C1 + 2 * SIZE]
	FMUL	c07, ALPHA, c07
	STF	c04, [C1 + 3 * SIZE]
	FMUL	c08, ALPHA, c08
	STF	c05, [C2 + 0 * SIZE]
	FMUL	c09, ALPHA, c09
	STF	c06, [C2 + 1 * SIZE]
	FMUL	c10, ALPHA, c10
	STF	c07, [C2 + 2 * SIZE]
	FMUL	c11, ALPHA, c11
	STF	c08, [C2 + 3 * SIZE]
	FMUL	c12, ALPHA, c12
	STF	c09, [C3 + 0 * SIZE]
	FMUL	c13, ALPHA, c13
	STF	c10, [C3 + 1 * SIZE]
	FMUL	c14, ALPHA, c14
	STF	c11, [C3 + 2 * SIZE]
	FMUL	c15, ALPHA, c15
	STF	c12, [C3 + 3 * SIZE]
	FMUL	c16, ALPHA, c16
	STF	c13, [C4 + 0 * SIZE]
	STF	c14, [C4 + 1 * SIZE]
	STF	c15, [C4 + 2 * SIZE]
	STF	c16, [C4 + 3 * SIZE]
	FMOV	FZERO, t1
	FMOV	FZERO, t2
	FMOV	FZERO, t3
	FMOV	FZERO, t4
	add	C1, 4 * SIZE, C1
	add	C2, 4 * SIZE, C2
	add	C3, 4 * SIZE, C3
	add	C4, 4 * SIZE, C4
#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the (K - KK - 4) k-steps of both panels this tile did not consume. */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -4, TEMP1
#else
	add	TEMP1, -4, TEMP1
#endif
	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
	add	AO, TEMP1, AO
	add	BO, TEMP1, BO
#endif
#ifdef LEFT
	add	KK, 4, KK
#endif
	add	I, -1, I
	cmp	I, 0
#endif
	/* Trip count for the next tile; the TRMMKERNEL path recomputes L at .LL21. */
	sra	K, 2, L
	bg,pt	%icc, .LL21
	FMOV	FZERO, c01			/* delay slot */
  /*
   * .LL50 -- set up the 2x4 tile (taken when M & 2 is non-zero).
   * Clears c01..c08 and t1..t4, points BO at the B panel, computes the
   * unrolled trip count L and preloads a1..a4 / b1..b4.  Skips to the
   * 1-row case at .LL70 when bit 1 of M is clear.
   */
.LL50:
	and	M, 2, I
	FMOV	FZERO, c02
	cmp	I, 0
	FMOV	FZERO, t1
	ble,pn	%icc, .LL70
	FMOV	FZERO, c04			/* delay slot */
#if !defined(TRMMKERNEL)
	LDF	[AO + 0 * SIZE], a1
	sra	K, 2, L
	FMOV	FZERO, t2
	LDF	[B + 0 * SIZE], b1
	mov	B, BO
	FMOV	FZERO, c06
	LDF	[AO + 1 * SIZE], a2
	cmp	L, 0
	FMOV	FZERO, t3
	LDF	[B + 1 * SIZE], b2
	FMOV	FZERO, c08
	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, t4
	LDF	[B + 2 * SIZE], b3
	FMOV	FZERO, c01
	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c03
	LDF	[B + 3 * SIZE], b4
	FMOV	FZERO, c05
#else
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	/* Skip KK k-steps: 2 elements per step in A, 4 per step in B. */
	sll	KK, 1 + BASE_SHIFT, TEMP1
	sll	KK, 2 + BASE_SHIFT, TEMP2
	add	AO, TEMP1, AO
	add	B, TEMP2, BO
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 4, L
#endif
	sra	L, 2, L
	cmp	L, 0
	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, t2
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, c06
	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, t3
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, c08
	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, t4
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, c01
	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c03
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, c05
#endif
	ble,pn	%icc, .LL55
	FMOV	FZERO, c07			/* delay slot */
  /*
   * .LL52 -- 2x4 main loop, four k-steps per pass.
   * AO advances by 8 * SIZE and BO by 16 * SIZE per pass; L counts passes.
   * a1/a2 hold the A pair for even k-steps and a3/a4 the pair for odd
   * k-steps.  Accumulators: c01/c02 = (a1,a2)*b1, c03/c04 = *b2,
   * c05/c06 = *b3, c07/c08 = *b4.  Products are pipelined through t1..t4.
   */
.LL52:
	FADD	c02, t1, c02
	add	AO, 8 * SIZE, AO
	prefetch [AO + APREFETCHSIZE * SIZE], 0
	FMUL	a1, b1, t1
	add	BO, 16 * SIZE, BO
	FADD	c04, t2, c04
	add	L, -1, L
	FMUL	a1, b2, t2
	FADD	c06, t3, c06
	cmp	L, 0
	FMUL	a1, b3, t3
	FADD	c08, t4, c08
	FMUL	a1, b4, t4
	LDF	[AO - 4 * SIZE], a1
	FADD	c01, t1, c01
	FMUL	a2, b1, t1
	LDF	[BO - 12 * SIZE], b1
	FADD	c03, t2, c03
	FMUL	a2, b2, t2
	LDF	[BO - 11 * SIZE], b2
	FADD	c05, t3, c05
	FMUL	a2, b3, t3
	LDF	[BO - 10 * SIZE], b3
	FADD	c07, t4, c07
	FMUL	a2, b4, t4
	LDF	[BO - 9 * SIZE], b4
	FADD	c02, t1, c02
	FMUL	a3, b1, t1
	LDF	[AO - 3 * SIZE], a2
	FADD	c04, t2, c04
	FMUL	a3, b2, t2
	FADD	c06, t3, c06
	FMUL	a3, b3, t3
	FADD	c08, t4, c08
	FMUL	a3, b4, t4
	LDF	[AO - 2 * SIZE], a3
	FADD	c01, t1, c01
	FMUL	a4, b1, t1
	LDF	[BO - 8 * SIZE], b1
	FADD	c03, t2, c03
	FMUL	a4, b2, t2
	LDF	[BO - 7 * SIZE], b2
	FADD	c05, t3, c05
	FMUL	a4, b3, t3
	LDF	[BO - 6 * SIZE], b3
	FADD	c07, t4, c07
	FMUL	a4, b4, t4
	LDF	[BO - 5 * SIZE], b4
	FADD	c02, t1, c02
	FMUL	a1, b1, t1
	LDF	[AO - 1 * SIZE], a4
	FADD	c04, t2, c04
	FMUL	a1, b2, t2
	FADD	c06, t3, c06
	FMUL	a1, b3, t3
	FADD	c08, t4, c08
	FMUL	a1, b4, t4
	LDF	[AO + 0 * SIZE], a1
	FADD	c01, t1, c01
	FMUL	a2, b1, t1
	LDF	[BO - 4 * SIZE], b1
	FADD	c03, t2, c03
	FMUL	a2, b2, t2
	LDF	[BO - 3 * SIZE], b2
	FADD	c05, t3, c05
	FMUL	a2, b3, t3
	LDF	[BO - 2 * SIZE], b3
	FADD	c07, t4, c07
	FMUL	a2, b4, t4
	LDF	[BO - 1 * SIZE], b4
	FADD	c02, t1, c02
	FMUL	a3, b1, t1
	LDF	[AO + 1 * SIZE], a2
	FADD	c04, t2, c04
	FMUL	a3, b2, t2
	FADD	c06, t3, c06
	FMUL	a3, b3, t3
	FADD	c08, t4, c08
	FMUL	a3, b4, t4
	LDF	[AO + 2 * SIZE], a3
	FADD	c01, t1, c01
	FMUL	a4, b1, t1
	LDF	[BO + 0 * SIZE], b1
	FADD	c03, t2, c03
	FMUL	a4, b2, t2
	LDF	[BO + 1 * SIZE], b2
	FADD	c05, t3, c05
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3
	FADD	c07, t4, c07
	FMUL	a4, b4, t4
	LDF	[BO + 3 * SIZE], b4
	bg,pt	%icc, .LL52
	LDF	[AO + 3 * SIZE], a4		/* delay slot */
  /*
   * .LL55 / .LL56 -- 2x4 k-remainder.
   * L becomes the low two bits of the k trip count; .LL56 runs one
   * k-step per pass (AO += 2 * SIZE, BO += 4 * SIZE).
   */
.LL55:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 4, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	FADD	c02, t1, c02
	add	AO, 2 * SIZE, AO
	FMUL	a1, b1, t1
	add	L, -1, L
	add	BO, 4 * SIZE, BO
	FADD	c04, t2, c04
	cmp	L, 0
	FMUL	a1, b2, t2
	FADD	c06, t3, c06
	FMUL	a1, b3, t3
	FADD	c08, t4, c08
	FMUL	a1, b4, t4
	LDF	[AO + 0 * SIZE], a1
	FADD	c01, t1, c01
	FMUL	a2, b1, t1
	LDF	[BO + 0 * SIZE], b1
	FADD	c03, t2, c03
	FMUL	a2, b2, t2
	LDF	[BO + 1 * SIZE], b2
	FADD	c05, t3, c05
	FMUL	a2, b3, t3
	LDF	[BO + 2 * SIZE], b3
	FADD	c07, t4, c07
	FMUL	a2, b4, t4
	LDF	[BO + 3 * SIZE], b4
	bg,pt	%icc, .LL56
	LDF	[AO + 1 * SIZE], a2		/* delay slot */
  /*
   * .LL59 -- finish and store the 2x4 tile.
   * Folds the outstanding products into c02/c04/c06/c08, scales
   * c01..c08 by ALPHA and writes two elements to each of C1..C4
   * (c01,c02 -> C1; c03,c04 -> C2; c05,c06 -> C3; c07,c08 -> C4).
   * The GEMM build adds the existing C values first; the TRMMKERNEL
   * build stores directly and then adjusts AO/BO/KK.
   * Each column pointer advances by 2 * SIZE.
   */
.LL59:
#ifndef TRMMKERNEL
	FADD	c02, t1, c02
	FMUL	c01, ALPHA, c01
	LDF	[C1 + 0 * SIZE], a1
	FADD	c04, t2, c04
	FMUL	c03, ALPHA, c03
	LDF	[C1 + 1 * SIZE], a2
	FADD	c06, t3, c06
	FMUL	c05, ALPHA, c05
	LDF	[C2 + 0 * SIZE], a3
	FADD	c08, t4, c08
	FMUL	c07, ALPHA, c07
	LDF	[C2 + 1 * SIZE], a4
	FMUL	c02, ALPHA, c02
	FADD	c01, a1, c01
	LDF	[C3 + 0 * SIZE], b1
	FMUL	c04, ALPHA, c04
	FADD	c02, a2, c02
	LDF	[C3 + 1 * SIZE], b2
	FMUL	c06, ALPHA, c06
	FADD	c03, a3, c03
	LDF	[C4 + 0 * SIZE], b3
	FMUL	c08, ALPHA, c08
	FADD	c04, a4, c04
	LDF	[C4 + 1 * SIZE], b4
	STF	c01, [C1 + 0 * SIZE]
	FADD	c05, b1, c05
	STF	c02, [C1 + 1 * SIZE]
	FADD	c06, b2, c06
	add	C1, 2 * SIZE, C1
	STF	c03, [C2 + 0 * SIZE]
	FADD	c07, b3, c07
	STF	c04, [C2 + 1 * SIZE]
	FADD	c08, b4, c08
	add	C2, 2 * SIZE, C2
	STF	c05, [C3 + 0 * SIZE]
	STF	c06, [C3 + 1 * SIZE]
	add	C3, 2 * SIZE, C3
	STF	c07, [C4 + 0 * SIZE]
	STF	c08, [C4 + 1 * SIZE]
	add	C4, 2 * SIZE, C4
#else
	FADD	c02, t1, c02
	FADD	c04, t2, c04
	FADD	c06, t3, c06
	FADD	c08, t4, c08
	FMUL	c01, ALPHA, c01
	FMUL	c03, ALPHA, c03
	FMUL	c05, ALPHA, c05
	FMUL	c07, ALPHA, c07
	FMUL	c02, ALPHA, c02
	FMUL	c04, ALPHA, c04
	FMUL	c06, ALPHA, c06
	FMUL	c08, ALPHA, c08
	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]
	STF	c03, [C2 + 0 * SIZE]
	STF	c04, [C2 + 1 * SIZE]
	STF	c05, [C3 + 0 * SIZE]
	STF	c06, [C3 + 1 * SIZE]
	STF	c07, [C4 + 0 * SIZE]
	STF	c08, [C4 + 1 * SIZE]
	add	C1, 2 * SIZE, C1
	add	C2, 2 * SIZE, C2
	add	C3, 2 * SIZE, C3
	add	C4, 2 * SIZE, C4
#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the unconsumed k-steps: 2 elements per step in A, 4 per step in B. */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -2, TEMP1
#else
	add	TEMP1, -4, TEMP1
#endif
	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif
#ifdef LEFT
	add	KK, 2, KK
#endif
#endif
  /*
   * .LL70 / .LL71 -- set up the 1x4 tile (taken when M & 1 is non-zero).
   * Clears c01..c04 and t1..t4, points BO at the B panel, computes the
   * unrolled trip count L and preloads a1..a4 (four consecutive k-steps
   * of the single A row) and b1..b4.  Falls through to .LL99 via the
   * branch when bit 0 of M is clear.
   */
.LL70:
	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL99
	nop

.LL71:
#if !defined(TRMMKERNEL)
	LDF	[AO + 0 * SIZE], a1
	sra	K, 2, L
	FMOV	FZERO, c01
	LDF	[B + 0 * SIZE], b1
	mov	B, BO
	FMOV	FZERO, t1
	LDF	[AO + 1 * SIZE], a2
	cmp	L, 0
	FMOV	FZERO, c02
	LDF	[B + 1 * SIZE], b2
	FMOV	FZERO, t2
	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c03
	LDF	[B + 2 * SIZE], b3
	FMOV	FZERO, t3
	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c04
	LDF	[B + 3 * SIZE], b4
	FMOV	FZERO, t4
#else
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	/* Skip KK k-steps: 1 element per step in A, 4 per step in B. */
	sll	KK, 0 + BASE_SHIFT, TEMP1
	sll	KK, 2 + BASE_SHIFT, TEMP2
	add	AO, TEMP1, AO
	add	B, TEMP2, BO
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 4, L
#endif
	sra	L, 2, L
	cmp	L, 0
	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, c01
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, t1
	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, c02
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, t2
	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c03
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, t3
	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c04
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, t4
#endif
	ble,pn	%icc, .LL75
	nop
  /*
   * .LL72 -- 1x4 main loop, four k-steps per pass.
   * a1..a4 are the A values for the four k-steps; each is multiplied by
   * the current b1..b4 and accumulated into c01..c04 through t1..t4.
   * AO advances by 4 * SIZE mid-pass (so later A loads use the new base)
   * and BO by 16 * SIZE at the end of the pass.
   */
.LL72:
	FADD	c01, t1, c01
	add	L, -1, L
	FMUL	a1, b1, t1
	LDF	[BO + 4 * SIZE], b1
	FADD	c02, t2, c02
	cmp	L, 0
	FMUL	a1, b2, t2
	LDF	[BO + 5 * SIZE], b2
	FADD	c03, t3, c03
	FMUL	a1, b3, t3
	LDF	[BO + 6 * SIZE], b3
	FADD	c04, t4, c04
	FMUL	a1, b4, t4
	LDF	[BO + 7 * SIZE], b4
	LDF	[AO + 4 * SIZE], a1
	FADD	c01, t1, c01
	add	AO, 4 * SIZE, AO
	FMUL	a2, b1, t1
	LDF	[BO + 8 * SIZE], b1
	FADD	c02, t2, c02
	FMUL	a2, b2, t2
	LDF	[BO + 9 * SIZE], b2
	FADD	c03, t3, c03
	FMUL	a2, b3, t3
	LDF	[BO + 10 * SIZE], b3
	FADD	c04, t4, c04
	FMUL	a2, b4, t4
	LDF	[BO + 11 * SIZE], b4
	LDF	[AO + 1 * SIZE], a2
	FADD	c01, t1, c01
	FMUL	a3, b1, t1
	LDF	[BO + 12 * SIZE], b1
	FADD	c02, t2, c02
	FMUL	a3, b2, t2
	LDF	[BO + 13 * SIZE], b2
	FADD	c03, t3, c03
	FMUL	a3, b3, t3
	LDF	[BO + 14 * SIZE], b3
	FADD	c04, t4, c04
	FMUL	a3, b4, t4
	LDF	[BO + 15 * SIZE], b4
	LDF	[AO + 2 * SIZE], a3
	FADD	c01, t1, c01
	FMUL	a4, b1, t1
	LDF	[BO + 16 * SIZE], b1
	FADD	c02, t2, c02
	FMUL	a4, b2, t2
	LDF	[BO + 17 * SIZE], b2
	FADD	c03, t3, c03
	FMUL	a4, b3, t3
	LDF	[BO + 18 * SIZE], b3
	FADD	c04, t4, c04
	FMUL	a4, b4, t4
	LDF	[BO + 19 * SIZE], b4
	add	BO, 16 * SIZE, BO
	bg,pt	%icc, .LL72
	LDF	[AO + 3 * SIZE], a4		/* delay slot */
  /*
   * .LL75 / .LL76 -- 1x4 k-remainder.
   * L becomes the low two bits of the k trip count; .LL76 runs one
   * k-step per pass (AO += 1 * SIZE, BO += 4 * SIZE).
   */
.LL75:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 4, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL79
	nop

.LL76:
	FADD	c01, t1, c01
	add	AO, 1 * SIZE, AO
	FMUL	a1, b1, t1
	LDF	[BO + 4 * SIZE], b1
	FADD	c02, t2, c02
	add	L, -1, L
	FMUL	a1, b2, t2
	LDF	[BO + 5 * SIZE], b2
	FADD	c03, t3, c03
	cmp	L, 0
	FMUL	a1, b3, t3
	LDF	[BO + 6 * SIZE], b3
	FADD	c04, t4, c04
	add	BO, 4 * SIZE, BO
	FMUL	a1, b4, t4
	LDF	[AO + 0 * SIZE], a1
	bg,pt	%icc, .LL76
	LDF	[BO + 3 * SIZE], b4		/* delay slot: BO was already advanced */
  /*
   * .LL79 -- finish and store the 1x4 tile.
   * Folds t1..t4 into c01..c04, scales by ALPHA and writes one element to
   * each of C1..C4.  The GEMM build adds the existing C values first; the
   * TRMMKERNEL build stores directly and then adjusts AO/BO/KK.
   * The column pointers are not advanced here.
   */
.LL79:
#ifndef TRMMKERNEL
	FADD	c01, t1, c01
	LDF	[C1 + 0 * SIZE], a1
	FADD	c02, t2, c02
	LDF	[C2 + 0 * SIZE], a2
	FADD	c03, t3, c03
	LDF	[C3 + 0 * SIZE], a3
	FADD	c04, t4, c04
	LDF	[C4 + 0 * SIZE], a4
	FMUL	c01, ALPHA, c01
	FMUL	c02, ALPHA, c02
	FMUL	c03, ALPHA, c03
	FMUL	c04, ALPHA, c04
	FADD	c01, a1, c01
	FADD	c02, a2, c02
	FADD	c03, a3, c03
	FADD	c04, a4, c04
	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C2 + 0 * SIZE]
	STF	c03, [C3 + 0 * SIZE]
	STF	c04, [C4 + 0 * SIZE]
#else
	FADD	c01, t1, c01
	FADD	c02, t2, c02
	FADD	c03, t3, c03
	FADD	c04, t4, c04
	FMUL	c01, ALPHA, c01
	FMUL	c02, ALPHA, c02
	FMUL	c03, ALPHA, c03
	FMUL	c04, ALPHA, c04
	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C2 + 0 * SIZE]
	STF	c03, [C3 + 0 * SIZE]
	STF	c04, [C4 + 0 * SIZE]
#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the unconsumed k-steps: 1 element per step in A, 4 per step in B. */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -4, TEMP1
#endif
	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif
#ifdef LEFT
	add	KK, 1, KK
#endif
#endif
  /*
   * .LL99 -- end of one four-column group.
   * Decrements J, makes B point where BO stopped, and loops back to
   * .LL11 while J > 0.  The delay slot is not annulled, so in the
   * TRMMKERNEL && !LEFT build KK is advanced by 4 whether or not the
   * branch is taken.
   */
.LL99:
	add	J, -1, J
	mov	BO, B
	cmp	J, 0
	bg,pt	%icc, .LL11
#if defined(TRMMKERNEL) && !defined(LEFT)
	add	KK, 4, KK
#else
	nop
#endif
  /* .LL100: set up the panel handled when N has bit 1 set. */
  /* I = M >> 2, J = N & 2; when J is zero control goes to .LL200. */
  1248. .LL100: /* n & 2 */
  1249. sra M, 2, I
  1250. and N, 2, J
  1251. cmp J, 0
  1252. add C, LDC, C2 /* C2 = C + LDC */
  1253. ble,pn %icc, .LL200
  1254. mov A, AO /* delay slot: AO = A on both paths */
  1255. #if defined(TRMMKERNEL) && defined(LEFT)
  1256. mov OFFSET, KK
  1257. #endif
  1258. mov C, C1 /* C1 = C */
  1259. add C2, LDC, C /* C = C2 + LDC, i.e. C advances by 2 * LDC */
  1260. cmp I, 0
  /* No groups of four in M: go straight to the M & 2 tile at .LL150. */
  1261. ble,pn %icc, .LL150
  1262. FMOV FZERO, c03 /* delay slot: clear c03 before .LL121 */
  /* .LL121: one tile using 4 A values and 2 B values per k step. */
  /* Results accumulate in c01..c04 (stored to C1[0..3]) and c05..c08 */
  /* (stored to C2[0..3]); t1..t4 hold products not yet added. Repeated I */
  /* times. c03 is not cleared in this setup: it is zeroed in the delay */
  /* slot of each branch that reaches .LL121. LDF/STF/FADD/FMUL/FMOV, */
  /* SIZE, BASE_SHIFT and APREFETCHSIZE are defined outside this excerpt. */
  1263. .LL121:
  1264. #if !defined(TRMMKERNEL)
  /* GEMM setup: preload a1..a4 from AO[0..3] and b1..b4 from B[0..3], */
  /* set BO = B and L = K >> 2, and clear t1..t4 and the accumulators. */
  1265. LDF [AO + 0 * SIZE], a1
  1266. sra K, 2, L
  1267. FMOV FZERO, t1
  1268. LDF [B + 0 * SIZE], b1
  1269. mov B, BO
  1270. FMOV FZERO, c07
  1271. LDF [AO + 1 * SIZE], a2
  1272. cmp L, 0
  1273. FMOV FZERO, t2
  1274. LDF [B + 1 * SIZE], b2
  1275. FMOV FZERO, c04
  1276. LDF [AO + 2 * SIZE], a3
  1277. FMOV FZERO, t3
  1278. LDF [B + 2 * SIZE], b3
  1279. FMOV FZERO, c08
  1280. LDF [AO + 3 * SIZE], a4
  1281. FMOV FZERO, t4
  1282. LDF [B + 3 * SIZE], b4
  1283. FMOV FZERO, c01
  1284. prefetch [C1 + 3 * SIZE], 2
  1285. FMOV FZERO, c05
  1286. prefetch [C2 + 3 * SIZE], 2
  1287. FMOV FZERO, c02
  1288. #else
  /* TRMM setup: either BO = B, or skip KK k steps in both operands */
  /* (AO += KK << (2 + BASE_SHIFT), BO = B + (KK << (1 + BASE_SHIFT))). */
  1289. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1290. mov B, BO
  1291. #else
  1292. sll KK, 2 + BASE_SHIFT, TEMP1
  1293. sll KK, 1 + BASE_SHIFT, TEMP2
  1294. add AO, TEMP1, AO
  1295. add B, TEMP2, BO
  1296. #endif
  /* Trip count before unrolling: K - KK, KK + 4 (LEFT) or KK + 2. */
  1297. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1298. sub K, KK, L
  1299. #elif defined(LEFT)
  1300. add KK, 4, L
  1301. #else
  1302. add KK, 2, L
  1303. #endif
  1304. sra L, 2, L
  1305. cmp L, 0
  1306. LDF [AO + 0 * SIZE], a1
  1307. FMOV FZERO, t1
  1308. LDF [BO + 0 * SIZE], b1
  1309. FMOV FZERO, c07
  1310. LDF [AO + 1 * SIZE], a2
  1311. FMOV FZERO, t2
  1312. LDF [BO + 1 * SIZE], b2
  1313. FMOV FZERO, c04
  1314. LDF [AO + 2 * SIZE], a3
  1315. FMOV FZERO, t3
  1316. LDF [BO + 2 * SIZE], b3
  1317. FMOV FZERO, c08
  1318. LDF [AO + 3 * SIZE], a4
  1319. FMOV FZERO, t4
  1320. LDF [BO + 3 * SIZE], b4
  1321. FMOV FZERO, c01
  1322. prefetch [C1 + 3 * SIZE], 2
  1323. FMOV FZERO, c05
  1324. prefetch [C2 + 3 * SIZE], 2
  1325. FMOV FZERO, c02
  1326. #endif
  /* Skip the unrolled loop when L <= 0; c06 is cleared in the delay slot. */
  1327. ble,pn %icc, .LL125
  1328. FMOV FZERO, c06
  /* .LL122: unrolled loop, L passes. Each pass advances AO by 16 * SIZE */
  /* and BO by 8 * SIZE; loads issued after the pointer bumps therefore */
  /* use negative offsets. Every FADD consumes a product written by an */
  /* earlier FMUL, so four products are still pending in t1..t4 on exit. */
  1329. .LL122:
  1330. FADD c03, t1, c03
  1331. add L, -1, L
  1332. FMUL a1, b1, t1
  1333. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1334. FADD c07, t2, c07
  1335. add BO, 8 * SIZE, BO
  1336. FMUL a1, b2, t2
  1337. LDF [AO + 4 * SIZE], a1
  1338. FADD c04, t3, c04
  1339. add AO, 16 * SIZE, AO
  1340. FMUL a2, b1, t3
  1341. cmp L, 0 /* sets the condition codes tested by the bg at the loop end */
  1342. FADD c08, t4, c08
  1343. nop
  1344. FMUL a2, b2, t4
  1345. LDF [AO - 11 * SIZE], a2
  1346. FADD c01, t1, c01
  1347. nop
  1348. FMUL a3, b1, t1
  1349. nop
  1350. FADD c05, t2, c05
  1351. nop
  1352. FMUL a3, b2, t2
  1353. LDF [AO - 10 * SIZE], a3
  1354. FADD c02, t3, c02
  1355. nop
  1356. FMUL a4, b1, t3
  1357. LDF [BO - 4 * SIZE], b1
  1358. FADD c06, t4, c06
  1359. nop
  1360. FMUL a4, b2, t4
  1361. LDF [BO - 3 * SIZE], b2
  1362. FADD c03, t1, c03
  1363. nop
  1364. FMUL a1, b3, t1
  1365. LDF [AO - 9 * SIZE], a4
  1366. FADD c07, t2, c07
  1367. nop
  1368. FMUL a1, b4, t2
  1369. LDF [AO - 8 * SIZE], a1
  1370. FADD c04, t3, c04
  1371. nop
  1372. FMUL a2, b3, t3
  1373. nop
  1374. FADD c08, t4, c08
  1375. nop
  1376. FMUL a2, b4, t4
  1377. LDF [AO - 7 * SIZE], a2
  1378. FADD c01, t1, c01
  1379. nop
  1380. FMUL a3, b3, t1
  1381. nop
  1382. FADD c05, t2, c05
  1383. nop
  1384. FMUL a3, b4, t2
  1385. LDF [AO - 6 * SIZE], a3
  1386. FADD c02, t3, c02
  1387. nop
  1388. FMUL a4, b3, t3
  1389. LDF [BO - 2 * SIZE], b3
  1390. FADD c06, t4, c06
  1391. nop
  1392. FMUL a4, b4, t4
  1393. LDF [BO - 1 * SIZE], b4
  1394. FADD c03, t1, c03
  1395. nop
  1396. FMUL a1, b1, t1
  1397. LDF [AO - 5 * SIZE], a4
  1398. FADD c07, t2, c07
  1399. nop
  1400. FMUL a1, b2, t2
  1401. LDF [AO - 4 * SIZE], a1
  1402. FADD c04, t3, c04
  1403. nop
  1404. FMUL a2, b1, t3
  1405. nop
  1406. FADD c08, t4, c08
  1407. nop
  1408. FMUL a2, b2, t4
  1409. LDF [AO - 3 * SIZE], a2
  1410. FADD c01, t1, c01
  1411. nop
  1412. FMUL a3, b1, t1
  1413. nop
  1414. FADD c05, t2, c05
  1415. nop
  1416. FMUL a3, b2, t2
  1417. LDF [AO - 2 * SIZE], a3
  1418. FADD c02, t3, c02
  1419. nop
  1420. FMUL a4, b1, t3
  1421. LDF [BO + 0 * SIZE], b1
  1422. FADD c06, t4, c06
  1423. nop
  1424. FMUL a4, b2, t4
  1425. LDF [BO + 1 * SIZE], b2
  1426. FADD c03, t1, c03
  1427. nop
  1428. FMUL a1, b3, t1
  1429. LDF [AO - 1 * SIZE], a4
  1430. FADD c07, t2, c07
  1431. nop
  1432. FMUL a1, b4, t2
  1433. LDF [AO + 0 * SIZE], a1
  1434. FADD c04, t3, c04
  1435. nop
  1436. FMUL a2, b3, t3
  1437. nop
  1438. FADD c08, t4, c08
  1439. nop
  1440. FMUL a2, b4, t4
  1441. LDF [AO + 1 * SIZE], a2
  1442. FADD c01, t1, c01
  1443. nop
  1444. FMUL a3, b3, t1
  1445. nop
  1446. FADD c05, t2, c05
  1447. nop
  1448. FMUL a3, b4, t2
  1449. LDF [AO + 2 * SIZE], a3
  1450. FADD c02, t3, c02
  1451. nop
  1452. FMUL a4, b3, t3
  1453. LDF [BO + 2 * SIZE], b3
  1454. FADD c06, t4, c06
  1455. FMUL a4, b4, t4
  1456. LDF [AO + 3 * SIZE], a4
  1457. bg,pt %icc, .LL122
  1458. LDF [BO + 3 * SIZE], b4
  /* .LL125: remainder count L = (trip count) & 3; skip to .LL129 if zero. */
  1459. .LL125:
  1460. #ifndef TRMMKERNEL
  1461. and K, 3, L
  1462. #else
  1463. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1464. sub K, KK, L
  1465. #elif defined(LEFT)
  1466. add KK, 4, L
  1467. #else
  1468. add KK, 2, L
  1469. #endif
  1470. and L, 3, L
  1471. #endif
  1472. cmp L, 0
  1473. ble,a,pn %icc, .LL129
  1474. nop
  /* .LL126: remainder loop, one k step per pass */
  /* (AO += 4 * SIZE, BO += 2 * SIZE). */
  1475. .LL126:
  1476. FADD c03, t1, c03
  1477. add AO, 4 * SIZE, AO
  1478. FMUL a1, b1, t1
  1479. add BO, 2 * SIZE, BO
  1480. FADD c07, t2, c07
  1481. add L, -1, L
  1482. FMUL a1, b2, t2
  1483. LDF [AO + 0 * SIZE], a1
  1484. FADD c04, t3, c04
  1485. cmp L, 0
  1486. FMUL a2, b1, t3
  1487. FADD c08, t4, c08
  1488. FMUL a2, b2, t4
  1489. LDF [AO + 1 * SIZE], a2
  1490. FADD c01, t1, c01
  1491. FMUL a3, b1, t1
  1492. FADD c05, t2, c05
  1493. FMUL a3, b2, t2
  1494. LDF [AO + 2 * SIZE], a3
  1495. FADD c02, t3, c02
  1496. FMUL a4, b1, t3
  1497. LDF [BO + 0 * SIZE], b1
  1498. FADD c06, t4, c06
  1499. FMUL a4, b2, t4
  1500. LDF [BO + 1 * SIZE], b2
  1501. bg,pt %icc, .LL126
  1502. LDF [AO + 3 * SIZE], a4
  /* .LL129: write-back. The products still pending in t1..t4 are folded */
  /* into c03, c07, c04 and c08 first. */
  1503. .LL129:
  1504. #ifndef TRMMKERNEL
  /* GEMM build: C1[0..3] is loaded into a1..a4 and C2[0..3] into b1..b4; */
  /* each result is c * ALPHA + C. I is decremented and compared here; */
  /* no later instruction in this branch writes the condition codes, so */
  /* the bg at the end of the block tests that compare. */
  1505. FADD c03, t1, c03
  1506. add I, -1, I
  1507. LDF [C1 + 0 * SIZE], a1
  1508. FADD c07, t2, c07
  1509. cmp I, 0
  1510. LDF [C1 + 1 * SIZE], a2
  1511. FADD c04, t3, c04
  1512. LDF [C1 + 2 * SIZE], a3
  1513. FADD c08, t4, c08
  1514. LDF [C1 + 3 * SIZE], a4
  1515. LDF [C2 + 0 * SIZE], b1
  1516. FMUL c01, ALPHA, c01
  1517. LDF [C2 + 1 * SIZE], b2
  1518. FMUL c02, ALPHA, c02
  1519. LDF [C2 + 2 * SIZE], b3
  1520. FMUL c03, ALPHA, c03
  1521. LDF [C2 + 3 * SIZE], b4
  1522. FMUL c04, ALPHA, c04
  1523. FMUL c05, ALPHA, c05
  1524. FADD c01, a1, c01
  1525. FMUL c06, ALPHA, c06
  1526. FADD c02, a2, c02
  1527. FMUL c07, ALPHA, c07
  1528. FADD c03, a3, c03
  1529. FMUL c08, ALPHA, c08
  1530. FADD c04, a4, c04
  1531. STF c01, [C1 + 0 * SIZE]
  1532. FADD c05, b1, c05
  1533. STF c02, [C1 + 1 * SIZE]
  1534. FADD c06, b2, c06
  1535. STF c03, [C1 + 2 * SIZE]
  1536. FADD c07, b3, c07
  1537. STF c04, [C1 + 3 * SIZE]
  1538. add C1, 4 * SIZE, C1
  1539. FADD c08, b4, c08
  1540. STF c05, [C2 + 0 * SIZE]
  1541. STF c06, [C2 + 1 * SIZE]
  1542. STF c07, [C2 + 2 * SIZE]
  1543. STF c08, [C2 + 3 * SIZE]
  1544. add C2, 4 * SIZE, C2
  1545. #else
  /* TRMM build: C is not read; store c * ALPHA directly. */
  1546. FADD c03, t1, c03
  1547. FADD c07, t2, c07
  1548. FADD c04, t3, c04
  1549. FADD c08, t4, c08
  1550. FMUL c01, ALPHA, c01
  1551. FMUL c02, ALPHA, c02
  1552. FMUL c03, ALPHA, c03
  1553. FMUL c04, ALPHA, c04
  1554. FMUL c05, ALPHA, c05
  1555. FMUL c06, ALPHA, c06
  1556. FMUL c07, ALPHA, c07
  1557. FMUL c08, ALPHA, c08
  1558. STF c01, [C1 + 0 * SIZE]
  1559. STF c02, [C1 + 1 * SIZE]
  1560. STF c03, [C1 + 2 * SIZE]
  1561. STF c04, [C1 + 3 * SIZE]
  1562. STF c05, [C2 + 0 * SIZE]
  1563. STF c06, [C2 + 1 * SIZE]
  1564. STF c07, [C2 + 2 * SIZE]
  1565. STF c08, [C2 + 3 * SIZE]
  1566. add C1, 4 * SIZE, C1
  1567. add C2, 4 * SIZE, C2
  1568. #if ( defined(LEFT) && defined(TRANSA)) || \
  1569. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP1 = K - KK - 4 (LEFT) or K - KK - 2 (otherwise); then advance */
  /* AO by TEMP1 << (2 + BASE_SHIFT) and BO by TEMP1 << (1 + BASE_SHIFT). */
  1570. sub K, KK, TEMP1
  1571. #ifdef LEFT
  1572. add TEMP1, -4, TEMP1
  1573. #else
  1574. add TEMP1, -2, TEMP1
  1575. #endif
  1576. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  1577. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1578. add AO, TEMP2, AO
  1579. add BO, TEMP1, BO
  1580. #endif
  1581. #ifdef LEFT
  1582. add KK, 4, KK
  1583. #endif
  1584. add I, -1, I
  1585. cmp I, 0
  1586. #endif
  /* Next tile while I > 0; c03 is cleared in the delay slot. */
  1587. bg,pt %icc, .LL121
  1588. FMOV FZERO, c03
  /* .LL150: tile using 2 A values and 2 B values per k step; executed */
  /* only when M has bit 1 set, otherwise control goes to .LL170. */
  /* Results: c01 -> C1[0], c02 -> C2[0], c03 -> C1[1], c04 -> C2[1]. */
  /* LDF/STF/FADD/FMUL/FMOV, SIZE, BASE_SHIFT and APREFETCHSIZE are */
  /* defined outside this excerpt. */
  1589. .LL150:
  1590. and M, 2, I
  1591. cmp I, 0
  1592. ble,pn %icc, .LL170
  1593. nop
  1594. .LL151:
  1595. #if !defined(TRMMKERNEL)
  /* GEMM setup: preload a1..a4 / b1..b4 (two k steps), BO = B, */
  /* L = K >> 2, and clear c01..c04 and t1..t4. */
  1596. LDF [AO + 0 * SIZE], a1
  1597. sra K, 2, L
  1598. FMOV FZERO, c01
  1599. LDF [B + 0 * SIZE], b1
  1600. mov B, BO
  1601. FMOV FZERO, t1
  1602. LDF [AO + 1 * SIZE], a2
  1603. cmp L, 0
  1604. FMOV FZERO, c02
  1605. LDF [B + 1 * SIZE], b2
  1606. FMOV FZERO, t2
  1607. LDF [AO + 2 * SIZE], a3
  1608. FMOV FZERO, c03
  1609. LDF [B + 2 * SIZE], b3
  1610. FMOV FZERO, t3
  1611. LDF [AO + 3 * SIZE], a4
  1612. FMOV FZERO, c04
  1613. LDF [B + 3 * SIZE], b4
  1614. FMOV FZERO, t4
  1615. #else
  /* TRMM setup: either BO = B, or skip KK k steps in both operands */
  /* (both offsets are KK << (1 + BASE_SHIFT)). */
  1616. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1617. mov B, BO
  1618. #else
  1619. sll KK, 1 + BASE_SHIFT, TEMP1
  1620. sll KK, 1 + BASE_SHIFT, TEMP2
  1621. add AO, TEMP1, AO
  1622. add B, TEMP2, BO
  1623. #endif
  /* Trip count before unrolling: K - KK, or KK + 2 in both other cases. */
  1624. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1625. sub K, KK, L
  1626. #elif defined(LEFT)
  1627. add KK, 2, L
  1628. #else
  1629. add KK, 2, L
  1630. #endif
  1631. sra L, 2, L
  1632. cmp L, 0
  1633. LDF [AO + 0 * SIZE], a1
  1634. FMOV FZERO, c01
  1635. LDF [BO + 0 * SIZE], b1
  1636. FMOV FZERO, t1
  1637. LDF [AO + 1 * SIZE], a2
  1638. FMOV FZERO, c02
  1639. LDF [BO + 1 * SIZE], b2
  1640. FMOV FZERO, t2
  1641. LDF [AO + 2 * SIZE], a3
  1642. FMOV FZERO, c03
  1643. LDF [BO + 2 * SIZE], b3
  1644. FMOV FZERO, t3
  1645. LDF [AO + 3 * SIZE], a4
  1646. FMOV FZERO, c04
  1647. LDF [BO + 3 * SIZE], b4
  1648. FMOV FZERO, t4
  1649. #endif
  1650. ble,pn %icc, .LL155
  1651. nop
  /* .LL152: unrolled loop, L passes; each pass advances AO and BO by */
  /* 8 * SIZE. a1,a2 pair with b1,b2 and a3,a4 pair with b3,b4. */
  1652. .LL152:
  1653. FADD c01, t1, c01
  1654. add L, -1, L
  1655. FMUL a1, b1, t1
  1656. prefetch [AO + APREFETCHSIZE * SIZE], 0
  1657. FADD c02, t2, c02
  1658. add BO, 8 * SIZE, BO
  1659. FMUL a1, b2, t2
  1660. LDF [AO + 4 * SIZE], a1
  1661. FADD c03, t3, c03
  1662. cmp L, 0
  1663. FMUL a2, b1, t3
  1664. LDF [BO - 4 * SIZE], b1
  1665. FADD c04, t4, c04
  1666. nop
  1667. FMUL a2, b2, t4
  1668. LDF [AO + 5 * SIZE], a2
  1669. FADD c01, t1, c01
  1670. nop
  1671. FMUL a3, b3, t1
  1672. LDF [BO - 3 * SIZE], b2
  1673. FADD c02, t2, c02
  1674. nop
  1675. FMUL a3, b4, t2
  1676. LDF [AO + 6 * SIZE], a3
  1677. FADD c03, t3, c03
  1678. nop
  1679. FMUL a4, b3, t3
  1680. LDF [BO - 2 * SIZE], b3
  1681. FADD c04, t4, c04
  1682. nop
  1683. FMUL a4, b4, t4
  1684. LDF [AO + 7 * SIZE], a4
  1685. FADD c01, t1, c01
  1686. nop
  1687. FMUL a1, b1, t1
  1688. LDF [BO - 1 * SIZE], b4
  1689. FADD c02, t2, c02
  1690. FMUL a1, b2, t2
  1691. LDF [AO + 8 * SIZE], a1
  1692. FADD c03, t3, c03
  1693. FMUL a2, b1, t3
  1694. LDF [BO + 0 * SIZE], b1
  1695. FADD c04, t4, c04
  1696. FMUL a2, b2, t4
  1697. LDF [AO + 9 * SIZE], a2
  1698. FADD c01, t1, c01
  1699. FMUL a3, b3, t1
  1700. LDF [BO + 1 * SIZE], b2
  1701. FADD c02, t2, c02
  1702. FMUL a3, b4, t2
  1703. LDF [AO + 10 * SIZE], a3
  1704. FADD c03, t3, c03
  1705. FMUL a4, b3, t3
  1706. LDF [BO + 2 * SIZE], b3
  1707. FADD c04, t4, c04
  1708. FMUL a4, b4, t4
  1709. LDF [AO + 11 * SIZE], a4
  1710. add AO, 8 * SIZE, AO
  1711. bg,pt %icc, .LL152
  1712. LDF [BO + 3 * SIZE], b4
  /* .LL155: remainder count L = (trip count) & 3; skip to .LL159 if zero. */
  1713. .LL155:
  1714. #ifndef TRMMKERNEL
  1715. and K, 3, L
  1716. #else
  1717. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1718. sub K, KK, L
  1719. #elif defined(LEFT)
  1720. add KK, 2, L
  1721. #else
  1722. add KK, 2, L
  1723. #endif
  1724. and L, 3, L
  1725. #endif
  1726. cmp L, 0
  1727. ble,a,pn %icc, .LL159
  1728. nop
  /* .LL156: remainder loop, one k step per pass. Operands are reloaded */
  /* at the top of every pass, then AO and BO each advance by 2 * SIZE. */
  1729. .LL156:
  1730. LDF [AO + 0 * SIZE], a1
  1731. LDF [AO + 1 * SIZE], a2
  1732. LDF [BO + 0 * SIZE], b1
  1733. LDF [BO + 1 * SIZE], b2
  1734. FADD c01, t1, c01
  1735. FADD c02, t2, c02
  1736. FADD c03, t3, c03
  1737. FADD c04, t4, c04
  1738. FMUL a1, b1, t1
  1739. FMUL a1, b2, t2
  1740. FMUL a2, b1, t3
  1741. FMUL a2, b2, t4
  1742. add AO, 2 * SIZE, AO
  1743. add BO, 2 * SIZE, BO
  1744. add L, -1, L
  1745. cmp L, 0
  1746. bg,pt %icc, .LL156
  1747. nop
  /* .LL159: fold the pending t1..t4, scale by ALPHA and store. */
  1748. .LL159:
  1749. #ifndef TRMMKERNEL
  /* GEMM build: the current C values are loaded and added to c * ALPHA. */
  1750. LDF [C1 + 0 * SIZE], a1
  1751. LDF [C2 + 0 * SIZE], a2
  1752. LDF [C1 + 1 * SIZE], a3
  1753. LDF [C2 + 1 * SIZE], a4
  1754. FADD c01, t1, c01
  1755. FADD c02, t2, c02
  1756. FADD c03, t3, c03
  1757. FADD c04, t4, c04
  1758. FMUL c01, ALPHA, c01
  1759. FMUL c02, ALPHA, c02
  1760. FMUL c03, ALPHA, c03
  1761. FMUL c04, ALPHA, c04
  1762. FADD c01, a1, c01
  1763. FADD c02, a2, c02
  1764. FADD c03, a3, c03
  1765. FADD c04, a4, c04
  1766. STF c01, [C1 + 0 * SIZE]
  1767. STF c02, [C2 + 0 * SIZE]
  1768. STF c03, [C1 + 1 * SIZE]
  1769. add C1, 2 * SIZE, C1
  1770. STF c04, [C2 + 1 * SIZE]
  1771. add C2, 2 * SIZE, C2
  1772. #else
  /* TRMM build: C is not read; store c * ALPHA directly. */
  1773. FADD c01, t1, c01
  1774. FADD c02, t2, c02
  1775. FADD c03, t3, c03
  1776. FADD c04, t4, c04
  1777. FMUL c01, ALPHA, c01
  1778. FMUL c02, ALPHA, c02
  1779. FMUL c03, ALPHA, c03
  1780. FMUL c04, ALPHA, c04
  1781. STF c01, [C1 + 0 * SIZE]
  1782. STF c02, [C2 + 0 * SIZE]
  1783. STF c03, [C1 + 1 * SIZE]
  1784. STF c04, [C2 + 1 * SIZE]
  1785. add C1, 2 * SIZE, C1
  1786. add C2, 2 * SIZE, C2
  1787. #if ( defined(LEFT) && defined(TRANSA)) || \
  1788. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP1 = K - KK - 2 in both cases; AO and BO each advance by */
  /* TEMP1 << (1 + BASE_SHIFT). */
  1789. sub K, KK, TEMP1
  1790. #ifdef LEFT
  1791. add TEMP1, -2, TEMP1
  1792. #else
  1793. add TEMP1, -2, TEMP1
  1794. #endif
  1795. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  1796. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1797. add AO, TEMP2, AO
  1798. add BO, TEMP1, BO
  1799. #endif
  1800. #ifdef LEFT
  1801. add KK, 2, KK
  1802. #endif
  1803. #endif
  /* .LL170: tile using 1 A value and 2 B values per k step; executed */
  /* only when M is odd, otherwise control goes to .LL199. */
  /* c01/c03 and c02/c04 are paired partial sums that are combined before */
  /* the store: c01 -> C1[0], c02 -> C2[0]. */
  /* LDF/STF/FADD/FMUL/FMOV, SIZE and BASE_SHIFT are defined outside this */
  /* excerpt. */
  1804. .LL170:
  1805. and M, 1, I
  1806. cmp I, 0
  1807. ble,pn %icc, .LL199
  1808. nop
  1809. .LL171:
  1810. #if !defined(TRMMKERNEL)
  /* GEMM setup: a1..a4 hold four consecutive A values, b1..b4 the B */
  /* values for two k steps; BO = B, L = K >> 2. */
  1811. LDF [AO + 0 * SIZE], a1
  1812. sra K, 2, L
  1813. FMOV FZERO, c01
  1814. LDF [B + 0 * SIZE], b1
  1815. mov B, BO
  1816. FMOV FZERO, t1
  1817. LDF [AO + 1 * SIZE], a2
  1818. cmp L, 0
  1819. FMOV FZERO, c02
  1820. LDF [B + 1 * SIZE], b2
  1821. FMOV FZERO, t2
  1822. LDF [AO + 2 * SIZE], a3
  1823. FMOV FZERO, c03
  1824. LDF [B + 2 * SIZE], b3
  1825. FMOV FZERO, t3
  1826. LDF [AO + 3 * SIZE], a4
  1827. FMOV FZERO, c04
  1828. LDF [B + 3 * SIZE], b4
  1829. FMOV FZERO, t4
  1830. #else
  /* TRMM setup: either BO = B, or AO += KK << BASE_SHIFT and */
  /* BO = B + (KK << (1 + BASE_SHIFT)). */
  1831. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1832. mov B, BO
  1833. #else
  1834. sll KK, 0 + BASE_SHIFT, TEMP1
  1835. sll KK, 1 + BASE_SHIFT, TEMP2
  1836. add AO, TEMP1, AO
  1837. add B, TEMP2, BO
  1838. #endif
  /* Trip count before unrolling: K - KK, KK + 1 (LEFT) or KK + 2. */
  1839. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1840. sub K, KK, L
  1841. #elif defined(LEFT)
  1842. add KK, 1, L
  1843. #else
  1844. add KK, 2, L
  1845. #endif
  1846. sra L, 2, L
  1847. cmp L, 0
  1848. LDF [AO + 0 * SIZE], a1
  1849. FMOV FZERO, c01
  1850. LDF [BO + 0 * SIZE], b1
  1851. FMOV FZERO, t1
  1852. LDF [AO + 1 * SIZE], a2
  1853. FMOV FZERO, c02
  1854. LDF [BO + 1 * SIZE], b2
  1855. FMOV FZERO, t2
  1856. LDF [AO + 2 * SIZE], a3
  1857. FMOV FZERO, c03
  1858. LDF [BO + 2 * SIZE], b3
  1859. FMOV FZERO, t3
  1860. LDF [AO + 3 * SIZE], a4
  1861. FMOV FZERO, c04
  1862. LDF [BO + 3 * SIZE], b4
  1863. FMOV FZERO, t4
  1864. #endif
  1865. ble,pn %icc, .LL175
  1866. nop
  /* .LL172: unrolled loop, L passes; each pass advances AO by 4 * SIZE */
  /* (bumped at the top) and BO by 8 * SIZE (bumped at the bottom). */
  1867. .LL172:
  1868. FADD c01, t1, c01
  1869. add AO, 4 * SIZE, AO
  1870. FMUL a1, b1, t1
  1871. LDF [BO + 4 * SIZE], b1
  1872. FADD c02, t2, c02
  1873. FMUL a1, b2, t2
  1874. LDF [BO + 5 * SIZE], b2
  1875. add L, -1, L
  1876. LDF [AO + 0 * SIZE], a1
  1877. FADD c03, t3, c03
  1878. cmp L, 0
  1879. FMUL a2, b3, t3
  1880. LDF [BO + 6 * SIZE], b3
  1881. FADD c04, t4, c04
  1882. FMUL a2, b4, t4
  1883. LDF [BO + 7 * SIZE], b4
  1884. LDF [AO + 1 * SIZE], a2
  1885. FADD c01, t1, c01
  1886. FMUL a3, b1, t1
  1887. LDF [BO + 8 * SIZE], b1
  1888. FADD c02, t2, c02
  1889. FMUL a3, b2, t2
  1890. LDF [BO + 9 * SIZE], b2
  1891. LDF [AO + 2 * SIZE], a3
  1892. FADD c03, t3, c03
  1893. FMUL a4, b3, t3
  1894. LDF [BO + 10 * SIZE], b3
  1895. FADD c04, t4, c04
  1896. FMUL a4, b4, t4
  1897. LDF [BO + 11 * SIZE], b4
  1898. add BO, 8 * SIZE, BO
  1899. bg,pt %icc, .LL172
  1900. LDF [AO + 3 * SIZE], a4
  /* .LL175: remainder count L = (trip count) & 3; skip to .LL179 if zero. */
  1901. .LL175:
  1902. #ifndef TRMMKERNEL
  1903. and K, 3, L
  1904. #else
  1905. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1906. sub K, KK, L
  1907. #elif defined(LEFT)
  1908. add KK, 1, L
  1909. #else
  1910. add KK, 2, L
  1911. #endif
  1912. and L, 3, L
  1913. #endif
  1914. cmp L, 0
  1915. ble,a,pn %icc, .LL179
  1916. nop
  /* .LL176: remainder loop, one k step per pass (AO += 1 * SIZE, */
  /* BO += 2 * SIZE); only a1, b1 and b2 are used here. */
  1917. .LL176:
  1918. FADD c01, t1, c01
  1919. add L, -1, L
  1920. FMUL a1, b1, t1
  1921. add AO, 1 * SIZE, AO
  1922. LDF [BO + 2 * SIZE], b1
  1923. FADD c02, t2, c02
  1924. cmp L, 0
  1925. FMUL a1, b2, t2
  1926. LDF [BO + 3 * SIZE], b2
  1927. add BO, 2 * SIZE, BO
  1928. bg,pt %icc, .LL176
  1929. LDF [AO + 0 * SIZE], a1
  /* .LL179: fold pending t1..t4, combine the partial sums */
  /* (c01 += c03, c02 += c04), scale by ALPHA and store. */
  1930. .LL179:
  1931. #ifndef TRMMKERNEL
  /* GEMM build: the current C values are loaded and added to c * ALPHA. */
  1932. FADD c01, t1, c01
  1933. LDF [C1 + 0 * SIZE], a1
  1934. FADD c02, t2, c02
  1935. LDF [C2 + 0 * SIZE], a2
  1936. FADD c03, t3, c03
  1937. FADD c04, t4, c04
  1938. FADD c01, c03, c01
  1939. FADD c02, c04, c02
  1940. FMUL c01, ALPHA, c01
  1941. FMUL c02, ALPHA, c02
  1942. FADD c01, a1, c01
  1943. FADD c02, a2, c02
  1944. STF c01, [C1 + 0 * SIZE]
  1945. STF c02, [C2 + 0 * SIZE]
  1946. #else
  /* TRMM build: C is not read; store c * ALPHA directly. */
  1947. FADD c01, t1, c01
  1948. FADD c02, t2, c02
  1949. FADD c03, t3, c03
  1950. FADD c04, t4, c04
  1951. FADD c01, c03, c01
  1952. FADD c02, c04, c02
  1953. FMUL c01, ALPHA, c01
  1954. FMUL c02, ALPHA, c02
  1955. STF c01, [C1 + 0 * SIZE]
  1956. STF c02, [C2 + 0 * SIZE]
  1957. #if ( defined(LEFT) && defined(TRANSA)) || \
  1958. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP1 = K - KK - 1 (LEFT) or K - KK - 2 (otherwise); then advance */
  /* AO by TEMP1 << BASE_SHIFT and BO by TEMP1 << (1 + BASE_SHIFT). */
  1959. sub K, KK, TEMP1
  1960. #ifdef LEFT
  1961. add TEMP1, -1, TEMP1
  1962. #else
  1963. add TEMP1, -2, TEMP1
  1964. #endif
  1965. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  1966. sll TEMP1, 1 + BASE_SHIFT, TEMP1
  1967. add AO, TEMP2, AO
  1968. add BO, TEMP1, BO
  1969. #endif
  1970. #ifdef LEFT
  1971. add KK, 1, KK
  1972. #endif
  1973. #endif
  /* .LL199: end of the n & 2 panel. Copy BO into B; TRMM non-LEFT builds */
  /* also add 2 to KK, every other build emits a nop in that slot. */
  1974. .LL199:
  1975. mov BO, B
  1976. #if defined(TRMMKERNEL) && !defined(LEFT)
  1977. add KK, 2, KK
  1978. #else
  1979. nop
  1980. #endif
  /* .LL200: set up the panel handled when N is odd. */
  /* J = N & 1, I = M >> 2; when J is zero control goes to .LL999. */
  1981. .LL200:
  1982. and N, 1, J
  1983. sra M, 2, I
  1984. cmp J, 0
  1985. ble,pn %icc, .LL999
  1986. mov A, AO /* delay slot: AO = A on both paths */
  1987. #if defined(TRMMKERNEL) && defined(LEFT)
  1988. mov OFFSET, KK
  1989. #endif
  1990. cmp I, 0
  /* No groups of four in M: go straight to the M & 2 tile at .LL250. */
  1991. ble,pn %icc, .LL250
  1992. mov C, C1 /* delay slot: C1 = C on both paths */
  /* .LL221: tile using 4 A values and 1 B value per k step, repeated I */
  /* times. Results accumulate in c01..c04 and are stored to C1[0..3]. */
  /* LDF/STF/FADD/FMUL/FMOV, SIZE and BASE_SHIFT are defined outside this */
  /* excerpt. */
  1993. .LL221:
  1994. #if !defined(TRMMKERNEL)
  /* GEMM setup: preload a1..a4 from AO[0..3] and b1..b4 from B[0..3] */
  /* (four k steps of B), BO = B, L = K >> 2, clear c01..c04 and t1..t4. */
  1995. LDF [AO + 0 * SIZE], a1
  1996. sra K, 2, L
  1997. FMOV FZERO, c01
  1998. LDF [B + 0 * SIZE], b1
  1999. mov B, BO
  2000. FMOV FZERO, t1
  2001. LDF [AO + 1 * SIZE], a2
  2002. cmp L, 0
  2003. FMOV FZERO, c02
  2004. LDF [B + 1 * SIZE], b2
  2005. FMOV FZERO, t2
  2006. LDF [AO + 2 * SIZE], a3
  2007. FMOV FZERO, c03
  2008. LDF [B + 2 * SIZE], b3
  2009. FMOV FZERO, t3
  2010. LDF [AO + 3 * SIZE], a4
  2011. FMOV FZERO, c04
  2012. LDF [B + 3 * SIZE], b4
  2013. FMOV FZERO, t4
  2014. #else
  /* TRMM setup: either BO = B, or AO += KK << (2 + BASE_SHIFT) and */
  /* BO = B + (KK << BASE_SHIFT). */
  2015. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2016. mov B, BO
  2017. #else
  2018. sll KK, 2 + BASE_SHIFT, TEMP1
  2019. sll KK, 0 + BASE_SHIFT, TEMP2
  2020. add AO, TEMP1, AO
  2021. add B, TEMP2, BO
  2022. #endif
  /* Trip count before unrolling: K - KK, KK + 4 (LEFT) or KK + 1. */
  2023. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2024. sub K, KK, L
  2025. #elif defined(LEFT)
  2026. add KK, 4, L
  2027. #else
  2028. add KK, 1, L
  2029. #endif
  2030. sra L, 2, L
  2031. cmp L, 0
  2032. LDF [AO + 0 * SIZE], a1
  2033. FMOV FZERO, c01
  2034. LDF [BO + 0 * SIZE], b1
  2035. FMOV FZERO, t1
  2036. LDF [AO + 1 * SIZE], a2
  2037. FMOV FZERO, c02
  2038. LDF [BO + 1 * SIZE], b2
  2039. FMOV FZERO, t2
  2040. LDF [AO + 2 * SIZE], a3
  2041. FMOV FZERO, c03
  2042. LDF [BO + 2 * SIZE], b3
  2043. FMOV FZERO, t3
  2044. LDF [AO + 3 * SIZE], a4
  2045. FMOV FZERO, c04
  2046. LDF [BO + 3 * SIZE], b4
  2047. FMOV FZERO, t4
  2048. #endif
  /* Skip the unrolled loop when L <= 0; the prefetch of C1 is issued in */
  /* the delay slot on both paths. */
  2049. ble,pn %icc, .LL225
  2050. prefetch [C1 + 4 * SIZE], 2
  /* .LL222: unrolled loop, L passes; each pass advances BO by 4 * SIZE */
  /* (bumped at the top) and AO by 16 * SIZE (bumped at the bottom). */
  /* b1, b2, b3, b4 are used in turn, one per k step. */
  2051. .LL222:
  2052. FADD c01, t1, c01
  2053. add BO, 4 * SIZE, BO
  2054. FMUL a1, b1, t1
  2055. LDF [AO + 4 * SIZE], a1
  2056. FADD c02, t2, c02
  2057. FMUL a2, b1, t2
  2058. LDF [AO + 5 * SIZE], a2
  2059. FADD c03, t3, c03
  2060. add L, -1, L
  2061. FMUL a3, b1, t3
  2062. LDF [AO + 6 * SIZE], a3
  2063. FADD c04, t4, c04
  2064. FMUL a4, b1, t4
  2065. LDF [AO + 7 * SIZE], a4
  2066. LDF [BO + 0 * SIZE], b1
  2067. FADD c01, t1, c01
  2068. cmp L, 0
  2069. FMUL a1, b2, t1
  2070. LDF [AO + 8 * SIZE], a1
  2071. FADD c02, t2, c02
  2072. FMUL a2, b2, t2
  2073. LDF [AO + 9 * SIZE], a2
  2074. FADD c03, t3, c03
  2075. FMUL a3, b2, t3
  2076. LDF [AO + 10 * SIZE], a3
  2077. FADD c04, t4, c04
  2078. FMUL a4, b2, t4
  2079. LDF [AO + 11 * SIZE], a4
  2080. LDF [BO + 1 * SIZE], b2
  2081. FADD c01, t1, c01
  2082. FMUL a1, b3, t1
  2083. LDF [AO + 12 * SIZE], a1
  2084. FADD c02, t2, c02
  2085. FMUL a2, b3, t2
  2086. LDF [AO + 13 * SIZE], a2
  2087. FADD c03, t3, c03
  2088. FMUL a3, b3, t3
  2089. LDF [AO + 14 * SIZE], a3
  2090. FADD c04, t4, c04
  2091. FMUL a4, b3, t4
  2092. LDF [AO + 15 * SIZE], a4
  2093. LDF [BO + 2 * SIZE], b3
  2094. FADD c01, t1, c01
  2095. FMUL a1, b4, t1
  2096. LDF [AO + 16 * SIZE], a1
  2097. FADD c02, t2, c02
  2098. FMUL a2, b4, t2
  2099. LDF [AO + 17 * SIZE], a2
  2100. FADD c03, t3, c03
  2101. FMUL a3, b4, t3
  2102. LDF [AO + 18 * SIZE], a3
  2103. FADD c04, t4, c04
  2104. FMUL a4, b4, t4
  2105. LDF [AO + 19 * SIZE], a4
  2106. add AO, 16 * SIZE, AO
  2107. bg,pt %icc, .LL222
  2108. LDF [BO + 3 * SIZE], b4
  /* .LL225: remainder count L = (trip count) & 3; skip to .LL229 if zero. */
  2109. .LL225:
  2110. #ifndef TRMMKERNEL
  2111. and K, 3, L
  2112. #else
  2113. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2114. sub K, KK, L
  2115. #elif defined(LEFT)
  2116. add KK, 4, L
  2117. #else
  2118. add KK, 1, L
  2119. #endif
  2120. and L, 3, L
  2121. #endif
  2122. cmp L, 0
  2123. ble,a,pn %icc, .LL229
  2124. nop
  /* .LL226: remainder loop, one k step per pass */
  /* (BO += 1 * SIZE, AO += 4 * SIZE). */
  2125. .LL226:
  2126. FADD c01, t1, c01
  2127. add BO, 1 * SIZE, BO
  2128. FMUL a1, b1, t1
  2129. LDF [AO + 4 * SIZE], a1
  2130. FADD c02, t2, c02
  2131. add L, -1, L
  2132. FMUL a2, b1, t2
  2133. LDF [AO + 5 * SIZE], a2
  2134. FADD c03, t3, c03
  2135. cmp L, 0
  2136. FMUL a3, b1, t3
  2137. LDF [AO + 6 * SIZE], a3
  2138. FADD c04, t4, c04
  2139. FMUL a4, b1, t4
  2140. LDF [AO + 7 * SIZE], a4
  2141. add AO, 4 * SIZE, AO
  2142. bg,pt %icc, .LL226
  2143. LDF [BO + 0 * SIZE], b1
  /* .LL229: fold pending t1..t4, scale by ALPHA and store to C1[0..3]. */
  2144. .LL229:
  2145. #ifndef TRMMKERNEL
  /* GEMM build: I is decremented and compared here; nothing later in */
  /* this branch writes the condition codes, so the bg at the end of the */
  /* block tests that compare. The current C values are added in. */
  2146. FADD c01, t1, c01
  2147. add I, -1, I
  2148. FADD c02, t2, c02
  2149. cmp I, 0
  2150. FADD c03, t3, c03
  2151. FADD c04, t4, c04
  2152. FMUL c01, ALPHA, c01
  2153. FMUL c02, ALPHA, c02
  2154. FMUL c03, ALPHA, c03
  2155. FMUL c04, ALPHA, c04
  2156. LDF [C1 + 0 * SIZE], a1
  2157. LDF [C1 + 1 * SIZE], a2
  2158. LDF [C1 + 2 * SIZE], a3
  2159. LDF [C1 + 3 * SIZE], a4
  2160. FADD c01, a1, c01
  2161. FADD c02, a2, c02
  2162. FADD c03, a3, c03
  2163. FADD c04, a4, c04
  2164. STF c01, [C1 + 0 * SIZE]
  2165. STF c02, [C1 + 1 * SIZE]
  2166. STF c03, [C1 + 2 * SIZE]
  2167. STF c04, [C1 + 3 * SIZE]
  2168. add C1, 4 * SIZE, C1
  2169. #else
  /* TRMM build: C is not read; store c * ALPHA directly. */
  2170. FADD c01, t1, c01
  2171. FADD c02, t2, c02
  2172. FADD c03, t3, c03
  2173. FADD c04, t4, c04
  2174. FMUL c01, ALPHA, c01
  2175. FMUL c02, ALPHA, c02
  2176. FMUL c03, ALPHA, c03
  2177. FMUL c04, ALPHA, c04
  2178. STF c01, [C1 + 0 * SIZE]
  2179. STF c02, [C1 + 1 * SIZE]
  2180. STF c03, [C1 + 2 * SIZE]
  2181. STF c04, [C1 + 3 * SIZE]
  2182. add C1, 4 * SIZE, C1
  2183. #if ( defined(LEFT) && defined(TRANSA)) || \
  2184. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP1 = K - KK - 4 (LEFT) or K - KK - 1 (otherwise); then advance */
  /* AO by TEMP1 << (2 + BASE_SHIFT) and BO by TEMP1 << BASE_SHIFT. */
  2185. sub K, KK, TEMP1
  2186. #ifdef LEFT
  2187. add TEMP1, -4, TEMP1
  2188. #else
  2189. add TEMP1, -1, TEMP1
  2190. #endif
  2191. sll TEMP1, 2 + BASE_SHIFT, TEMP2
  2192. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  2193. add AO, TEMP2, AO
  2194. add BO, TEMP1, BO
  2195. #endif
  2196. #ifdef LEFT
  2197. add KK, 4, KK
  2198. #endif
  2199. add I, -1, I
  2200. cmp I, 0
  2201. #endif
  /* Next tile while I > 0. */
  2202. bg,pt %icc, .LL221
  2203. nop
  /* .LL250: tile using 2 A values and 1 B value per k step; executed */
  /* only when M has bit 1 set, otherwise control goes to .LL270. */
  /* c01/c03 and c02/c04 are paired partial sums that are combined before */
  /* the store: c01 -> C1[0], c02 -> C1[1]. */
  /* LDF/STF/FADD/FMUL/FMOV, SIZE and BASE_SHIFT are defined outside this */
  /* excerpt. */
  2204. .LL250:
  2205. and M, 2, I
  2206. cmp I, 0
  2207. ble,pn %icc, .LL270
  2208. nop
  2209. .LL251:
  2210. #if !defined(TRMMKERNEL)
  /* GEMM setup: preload a1..a4 (two k steps of A) and b1..b4 (four k */
  /* steps of B), BO = B, L = K >> 2, clear c01..c04 and t1..t4. */
  2211. LDF [AO + 0 * SIZE], a1
  2212. sra K, 2, L
  2213. FMOV FZERO, c01
  2214. LDF [B + 0 * SIZE], b1
  2215. mov B, BO
  2216. FMOV FZERO, t1
  2217. LDF [AO + 1 * SIZE], a2
  2218. cmp L, 0
  2219. FMOV FZERO, c02
  2220. LDF [B + 1 * SIZE], b2
  2221. FMOV FZERO, t2
  2222. LDF [AO + 2 * SIZE], a3
  2223. FMOV FZERO, c03
  2224. LDF [B + 2 * SIZE], b3
  2225. FMOV FZERO, t3
  2226. LDF [AO + 3 * SIZE], a4
  2227. FMOV FZERO, c04
  2228. LDF [B + 3 * SIZE], b4
  2229. FMOV FZERO, t4
  2230. #else
  /* TRMM setup: either BO = B, or AO += KK << (1 + BASE_SHIFT) and */
  /* BO = B + (KK << BASE_SHIFT). */
  2231. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2232. mov B, BO
  2233. #else
  2234. sll KK, 1 + BASE_SHIFT, TEMP1
  2235. sll KK, 0 + BASE_SHIFT, TEMP2
  2236. add AO, TEMP1, AO
  2237. add B, TEMP2, BO
  2238. #endif
  /* Trip count before unrolling: K - KK, KK + 2 (LEFT) or KK + 1. */
  2239. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2240. sub K, KK, L
  2241. #elif defined(LEFT)
  2242. add KK, 2, L
  2243. #else
  2244. add KK, 1, L
  2245. #endif
  2246. sra L, 2, L
  2247. cmp L, 0
  2248. LDF [AO + 0 * SIZE], a1
  2249. FMOV FZERO, c01
  2250. LDF [BO + 0 * SIZE], b1
  2251. FMOV FZERO, t1
  2252. LDF [AO + 1 * SIZE], a2
  2253. FMOV FZERO, c02
  2254. LDF [BO + 1 * SIZE], b2
  2255. FMOV FZERO, t2
  2256. LDF [AO + 2 * SIZE], a3
  2257. FMOV FZERO, c03
  2258. LDF [BO + 2 * SIZE], b3
  2259. FMOV FZERO, t3
  2260. LDF [AO + 3 * SIZE], a4
  2261. FMOV FZERO, c04
  2262. LDF [BO + 3 * SIZE], b4
  2263. FMOV FZERO, t4
  2264. #endif
  2265. ble,pn %icc, .LL255
  2266. nop
  /* .LL252: unrolled loop, L passes; each pass advances AO by 8 * SIZE */
  /* and BO by 4 * SIZE (the BO bump sits in the branch delay slot). */
  2267. .LL252:
  2268. FADD c01, t1, c01
  2269. add L, -1, L
  2270. FMUL a1, b1, t1
  2271. LDF [AO + 4 * SIZE], a1
  2272. FADD c02, t2, c02
  2273. FMUL a2, b1, t2
  2274. LDF [AO + 5 * SIZE], a2
  2275. LDF [BO + 4 * SIZE], b1
  2276. FADD c03, t3, c03
  2277. cmp L, 0
  2278. FMUL a3, b2, t3
  2279. LDF [AO + 6 * SIZE], a3
  2280. FADD c04, t4, c04
  2281. FMUL a4, b2, t4
  2282. LDF [AO + 7 * SIZE], a4
  2283. LDF [BO + 5 * SIZE], b2
  2284. FADD c01, t1, c01
  2285. FMUL a1, b3, t1
  2286. LDF [AO + 8 * SIZE], a1
  2287. FADD c02, t2, c02
  2288. FMUL a2, b3, t2
  2289. LDF [AO + 9 * SIZE], a2
  2290. LDF [BO + 6 * SIZE], b3
  2291. FADD c03, t3, c03
  2292. FMUL a3, b4, t3
  2293. LDF [AO + 10 * SIZE], a3
  2294. FADD c04, t4, c04
  2295. FMUL a4, b4, t4
  2296. LDF [AO + 11 * SIZE], a4
  2297. add AO, 8 * SIZE, AO
  2298. LDF [BO + 7 * SIZE], b4
  2299. bg,pt %icc, .LL252
  2300. add BO, 4 * SIZE, BO
  /* .LL255: remainder count L = (trip count) & 3; skip to .LL259 if zero. */
  2301. .LL255:
  2302. #ifndef TRMMKERNEL
  2303. and K, 3, L
  2304. #else
  2305. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2306. sub K, KK, L
  2307. #elif defined(LEFT)
  2308. add KK, 2, L
  2309. #else
  2310. add KK, 1, L
  2311. #endif
  2312. and L, 3, L
  2313. #endif
  2314. cmp L, 0
  2315. ble,a,pn %icc, .LL259
  2316. nop
  /* .LL256: remainder loop, one k step per pass (AO += 2 * SIZE, */
  /* BO += 1 * SIZE); only a1, a2 and b1 are used here. */
  2317. .LL256:
  2318. FADD c01, t1, c01
  2319. add L, -1, L
  2320. FMUL a1, b1, t1
  2321. LDF [AO + 2 * SIZE], a1
  2322. FADD c02, t2, c02
  2323. cmp L, 0
  2324. FMUL a2, b1, t2
  2325. LDF [AO + 3 * SIZE], a2
  2326. LDF [BO + 1 * SIZE], b1
  2327. add AO, 2 * SIZE, AO
  2328. bg,pt %icc, .LL256
  2329. add BO, 1 * SIZE, BO
  /* .LL259: fold pending t1..t4, combine the partial sums */
  /* (c01 += c03, c02 += c04), scale by ALPHA and store to C1[0..1]. */
  2330. .LL259:
  2331. #ifndef TRMMKERNEL
  /* GEMM build: the current C values are loaded and added to c * ALPHA. */
  2332. FADD c01, t1, c01
  2333. LDF [C1 + 0 * SIZE], a1
  2334. FADD c02, t2, c02
  2335. LDF [C1 + 1 * SIZE], a2
  2336. FADD c03, t3, c03
  2337. FADD c04, t4, c04
  2338. FADD c01, c03, c01
  2339. FADD c02, c04, c02
  2340. FMUL c01, ALPHA, c01
  2341. FMUL c02, ALPHA, c02
  2342. FADD c01, a1, c01
  2343. FADD c02, a2, c02
  2344. STF c01, [C1 + 0 * SIZE]
  2345. STF c02, [C1 + 1 * SIZE]
  2346. add C1, 2 * SIZE, C1
  2347. #else
  /* TRMM build: C is not read; store c * ALPHA directly. */
  2348. FADD c01, t1, c01
  2349. FADD c02, t2, c02
  2350. FADD c03, t3, c03
  2351. FADD c04, t4, c04
  2352. FADD c01, c03, c01
  2353. FADD c02, c04, c02
  2354. FMUL c01, ALPHA, c01
  2355. FMUL c02, ALPHA, c02
  2356. STF c01, [C1 + 0 * SIZE]
  2357. STF c02, [C1 + 1 * SIZE]
  2358. add C1, 2 * SIZE, C1
  2359. #if ( defined(LEFT) && defined(TRANSA)) || \
  2360. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP1 = K - KK - 2 (LEFT) or K - KK - 1 (otherwise); then advance */
  /* AO by TEMP1 << (1 + BASE_SHIFT) and BO by TEMP1 << BASE_SHIFT. */
  2361. sub K, KK, TEMP1
  2362. #ifdef LEFT
  2363. add TEMP1, -2, TEMP1
  2364. #else
  2365. add TEMP1, -1, TEMP1
  2366. #endif
  2367. sll TEMP1, 1 + BASE_SHIFT, TEMP2
  2368. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  2369. add AO, TEMP2, AO
  2370. add BO, TEMP1, BO
  2371. #endif
  2372. #ifdef LEFT
  2373. add KK, 2, KK
  2374. #endif
  2375. #endif
  /* .LL270: tile using 1 A value and 1 B value per k step (a dot */
  /* product); executed only when M is odd, otherwise control goes to */
  /* .LL999. c01 and c02 are two partial sums that are added together */
  /* before the single store to C1[0]. */
  /* LDF/STF/FADD/FMUL/FMOV, SIZE and BASE_SHIFT are defined outside this */
  /* excerpt. */
  2376. .LL270:
  2377. and M, 1, I
  2378. cmp I, 0
  2379. ble,pn %icc, .LL999
  2380. nop
  2381. .LL271:
  2382. #if !defined(TRMMKERNEL)
  /* GEMM setup: preload a1..a4 and b1..b3 (b4 is loaded in the delay */
  /* slot of the branch that follows the #endif), BO = B, L = K >> 2, */
  /* clear c01, c02 and t1..t4. */
  2383. LDF [AO + 0 * SIZE], a1
  2384. sra K, 2, L
  2385. FMOV FZERO, t1
  2386. LDF [AO + 1 * SIZE], a2
  2387. mov B, BO
  2388. FMOV FZERO, c01
  2389. LDF [AO + 2 * SIZE], a3
  2390. cmp L, 0
  2391. FMOV FZERO, t2
  2392. LDF [AO + 3 * SIZE], a4
  2393. FMOV FZERO, c02
  2394. LDF [BO + 0 * SIZE], b1
  2395. FMOV FZERO, t3
  2396. LDF [BO + 1 * SIZE], b2
  2397. FMOV FZERO, t4
  2398. LDF [BO + 2 * SIZE], b3
  2399. #else
  /* TRMM setup: either BO = B, or skip KK elements in both operands */
  /* (both offsets are KK << BASE_SHIFT). */
  2400. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2401. mov B, BO
  2402. #else
  2403. sll KK, 0 + BASE_SHIFT, TEMP1
  2404. sll KK, 0 + BASE_SHIFT, TEMP2
  2405. add AO, TEMP1, AO
  2406. add B, TEMP2, BO
  2407. #endif
  /* Trip count before unrolling: K - KK, or KK + 1 in both other cases. */
  2408. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2409. sub K, KK, L
  2410. #elif defined(LEFT)
  2411. add KK, 1, L
  2412. #else
  2413. add KK, 1, L
  2414. #endif
  2415. sra L, 2, L
  2416. cmp L, 0
  2417. LDF [AO + 0 * SIZE], a1
  2418. FMOV FZERO, t1
  2419. LDF [AO + 1 * SIZE], a2
  2420. FMOV FZERO, c01
  2421. LDF [AO + 2 * SIZE], a3
  2422. FMOV FZERO, t2
  2423. LDF [AO + 3 * SIZE], a4
  2424. FMOV FZERO, c02
  2425. LDF [BO + 0 * SIZE], b1
  2426. FMOV FZERO, t3
  2427. LDF [BO + 1 * SIZE], b2
  2428. FMOV FZERO, t4
  2429. LDF [BO + 2 * SIZE], b3
  2430. #endif
  2431. ble,pn %icc, .LL275
  2432. LDF [BO + 3 * SIZE], b4
  /* .LL272: unrolled loop, L passes; AO and BO each advance by 4 * SIZE. */
  /* t1 and t3 feed c01, t2 and t4 feed c02. */
  2433. .LL272:
  2434. FADD c01, t1, c01
  2435. add L, -1, L
  2436. add AO, 4 * SIZE, AO
  2437. FMUL a1, b1, t1
  2438. add BO, 4 * SIZE, BO
  2439. LDF [AO + 0 * SIZE], a1
  2440. FADD c02, t2, c02
  2441. cmp L, 0
  2442. LDF [BO + 0 * SIZE], b1
  2443. FMUL a2, b2, t2
  2444. LDF [AO + 1 * SIZE], a2
  2445. FADD c01, t3, c01
  2446. LDF [BO + 1 * SIZE], b2
  2447. FMUL a3, b3, t3
  2448. LDF [AO + 2 * SIZE], a3
  2449. FADD c02, t4, c02
  2450. LDF [BO + 2 * SIZE], b3
  2451. FMUL a4, b4, t4
  2452. LDF [AO + 3 * SIZE], a4
  2453. bg,pt %icc, .LL272
  2454. LDF [BO + 3 * SIZE], b4
  /* .LL275: remainder count L = (trip count) & 3; skip to .LL279 if zero. */
  2455. .LL275:
  2456. #ifndef TRMMKERNEL
  2457. and K, 3, L
  2458. #else
  2459. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2460. sub K, KK, L
  2461. #elif defined(LEFT)
  2462. add KK, 1, L
  2463. #else
  2464. add KK, 1, L
  2465. #endif
  2466. and L, 3, L
  2467. #endif
  2468. cmp L, 0
  2469. ble,a,pn %icc, .LL279
  2470. nop
  /* .LL276: remainder loop, one k step per pass; BO and AO each advance */
  /* by 1 * SIZE (the AO bump sits in the branch delay slot). */
  2471. .LL276:
  2472. FADD c01, t1, c01
  2473. add L, -1, L
  2474. FMUL a1, b1, t1
  2475. LDF [AO + 1 * SIZE], a1
  2476. LDF [BO + 1 * SIZE], b1
  2477. add BO, 1 * SIZE, BO
  2478. cmp L, 0
  2479. bg,pt %icc, .LL276
  2480. add AO, 1 * SIZE, AO
  /* .LL279: fold pending t1..t4, add the two partial sums */
  /* (c01 += c02), scale by ALPHA and store to C1[0]. */
  2481. .LL279:
  2482. #ifndef TRMMKERNEL
  /* GEMM build: the current C value is loaded and added to c01 * ALPHA. */
  2483. FADD c01, t1, c01
  2484. LDF [C1 + 0 * SIZE], a1
  2485. FADD c02, t2, c02
  2486. FADD c01, t3, c01
  2487. FADD c02, t4, c02
  2488. FADD c01, c02, c01
  2489. FMUL c01, ALPHA, c01
  2490. FADD c01, a1, c01
  2491. STF c01, [C1 + 0 * SIZE]
  2492. #else
  /* TRMM build: C is not read; store c01 * ALPHA directly. */
  2493. FADD c01, t1, c01
  2494. FADD c02, t2, c02
  2495. FADD c01, t3, c01
  2496. FADD c02, t4, c02
  2497. FADD c01, c02, c01
  2498. FMUL c01, ALPHA, c01
  2499. STF c01, [C1 + 0 * SIZE]
  2500. #if ( defined(LEFT) && defined(TRANSA)) || \
  2501. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP1 = K - KK - 1 in both cases; AO and BO each advance by */
  /* TEMP1 << BASE_SHIFT. */
  2502. sub K, KK, TEMP1
  2503. #ifdef LEFT
  2504. add TEMP1, -1, TEMP1
  2505. #else
  2506. add TEMP1, -1, TEMP1
  2507. #endif
  2508. sll TEMP1, 0 + BASE_SHIFT, TEMP2
  2509. sll TEMP1, 0 + BASE_SHIFT, TEMP1
  2510. add AO, TEMP2, AO
  2511. add BO, TEMP1, BO
  2512. #endif
  2513. #ifdef LEFT
  2514. add KK, 1, KK
  2515. #endif
  2516. #endif
  /* .LL999: common exit. `return %i7 + 8` transfers control back to the */
  /* caller; the clr in its delay slot zeroes %o0 (presumably the */
  /* routine's return value of 0 -- confirm against the caller). */
  /* EPILOGUE is a macro defined outside this excerpt. */
  2517. .LL999:
  2518. return %i7 + 8
  2519. clr %o0
  2520. EPILOGUE