You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel_4x4.S 42 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build configuration. The #error below fires when none of EV4, EV5 or
     EV6 is defined; each target then chooses PREFETCHSIZE and UNOP. */
  38. #define ASSEMBLER /* set before common.h is included; NOTE(review): presumably selects the assembler-side view of that header -- confirm */
  39. #include "common.h"
  40. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  41. #error "Architecture is not specified."
  42. #endif
  43. #ifdef EV6
  44. #define PREFETCHSIZE 56 /* element offset (multiplied by SIZE) of the ldq/ldl into $31 in loop $L12; NOTE(review): loads into $31 look like prefetch hints -- confirm */
  45. #define UNOP unop /* EV6: the UNOP slots in loop $L12 emit a real unop */
  46. #endif
  47. #ifdef EV5
  48. #define PREFETCHSIZE 56 /* same offset as the EV6 setting */
  49. #define UNOP /* EV5: the UNOP slots expand to nothing */
  50. #endif
  51. #ifdef EV4
  52. #define UNOP /* EV4: UNOP expands to nothing; PREFETCHSIZE is left undefined because the loads that use it are replaced by unop under EV4 */
  53. #endif
  /* Register and stack-slot names used by the kernel body that follows.
     Integer registers carry sizes, pointers and loop counters; floating
     registers carry operands, pending products and the c01..c16 accumulators. */
  54. #define STACKSIZE 80 /* frame size: $f2..$f9 are saved at 0..56($sp) and ALPHA lives at 64($sp) */
  55. #define M $16 /* checked for <= 0 at entry; walked in groups of 4, then 2, then 1 by the I loops */
  56. #define N $17 /* checked for <= 0 at entry; walked in panels of 4 (sra N, 2), then 2 (and N, 2) by the J loops */
  57. #define K $18 /* checked for <= 0 at entry; inner-loop trip count is derived from it (L = K - 2, stepped by 2; low bit tested with blbs) */
  58. #define A $20 /* base of A; copied into AO at the start of each column panel */
  59. #define B $21 /* base of the current B panel; reset from BO after each 4-column panel */
  60. #define C $22 /* loaded from 0 + STACKSIZE($sp); advanced past each finished column panel */
  61. #define LDC $23 /* loaded from 8 + STACKSIZE($sp), then passed through SXADDQ LDC, 0, LDC; NOTE(review): presumably converts the stride to bytes -- confirm in common.h */
  62. #define C1 $19 /* C1 = C: first output column of the current panel */
  63. #define C2 $24 /* C2 = C + LDC */
  64. #define C3 $25 /* C3 = C2 + LDC */
  65. #define C4 $27 /* C4 = C3 + LDC */
  66. #define AO $at /* running read pointer into A */
  67. #define BO $5 /* running read pointer into B */
  68. #define I $6 /* counter for the loops over M */
  69. #define J $7 /* counter for the loops over N */
  70. #define L $8 /* counter for the inner loops over K */
  71. #define a1 $f16 /* a1..a4: operands loaded from AO */
  72. #define a2 $f17
  73. #define a3 $f18
  74. #define a4 $f19 /* $f19 also holds the incoming value stored to ALPHA at entry, before a4 is first loaded */
  75. #define b1 $f20 /* b1..b4: operands loaded from BO */
  76. #define b2 $f21
  77. #define b3 $f22
  78. #define b4 $f23
  79. #define t1 $f24 /* t1..t4: MUL results waiting to be added into an accumulator */
  80. #define t2 $f25
  81. #define t3 $f26
  82. #define t4 $f27
  83. #define a5 $f28 /* extra A operand in the unrolled loops; later reused for values loaded from C */
  84. #define a6 $f30 /* extra A operand; in the code shown it is used only inside loop $L12 */
  85. #define b5 $f29 /* extra B operand in the unrolled loops; later reused for values loaded from C */
  86. #define alpha $f30 /* shares $f30 with a6; reloaded with ldt alpha, ALPHA after each inner loop, before it is used */
  87. #define c01 $f0 /* c01..c16: accumulators for up to a 4x4 tile of results */
  88. #define c02 $f1
  89. #define c03 $f2 /* $f2..$f9 (c03..c10) are the registers saved to the stack at entry */
  90. #define c04 $f3
  91. #define c05 $f4
  92. #define c06 $f5
  93. #define c07 $f6
  94. #define c08 $f7
  95. #define c09 $f8
  96. #define c10 $f9
  97. #define c11 $f10
  98. #define c12 $f11
  99. #define c13 $f12
  100. #define c14 $f13
  101. #define c15 $f14
  102. #define c16 $f15
  103. #define TMP1 $0 /* scratch; the entry checks also write $0 directly */
  104. #define TMP2 $1 /* scratch; the entry checks also write $1 directly */
  105. #define KK $2 /* TRMM-only running offset, initialised from OFFSET; the entry checks write $2 before it is set */
  106. #define BB $3 /* pointer set up from K and B, advanced 16 * SIZE per $L11 iteration and read by loads into $31 (EV5/EV6 only) */
  107. #define OFFSET $4 /* TRMM-only: loaded from 16 + STACKSIZE($sp) */
  108. #define ALPHA 64($sp) /* stack slot: $f19 is stored here at entry and read back into alpha */
  109. PROLOGUE
  110. PROFCODE
  111. .frame $sp, STACKSIZE, $26, 0
  112. lda $sp, -STACKSIZE($sp)
  113. ldq C, 0 + STACKSIZE($sp)
  114. ldq LDC, 8 + STACKSIZE($sp)
  115. #ifdef TRMMKERNEL
  116. ldq OFFSET, 16 + STACKSIZE($sp)
  117. #endif
  118. SXADDQ LDC, 0, LDC
  119. stt $f2, 0($sp)
  120. stt $f3, 8($sp)
  121. stt $f4, 16($sp)
  122. stt $f5, 24($sp)
  123. stt $f6, 32($sp)
  124. stt $f7, 40($sp)
  125. stt $f8, 48($sp)
  126. stt $f9, 56($sp)
  127. stt $f19, ALPHA
  128. cmple M, 0, $0
  129. cmple N, 0, $1
  130. cmple K, 0, $2
  131. or $0, $1, $0
  132. or $0, $2, $0
  133. bne $0, $L999
  134. #if defined(TRMMKERNEL) && !defined(LEFT)
  135. subq $31, OFFSET, KK
  136. #endif
  137. sra N, 2, J
  138. ble J, $L40
  139. .align 4
  140. $L01:
  141. mov C, C1
  142. addq C, LDC, C2
  143. mov A, AO
  144. s4addq K, 0, BB
  145. #if defined(TRMMKERNEL) && defined(LEFT)
  146. mov OFFSET, KK
  147. #endif
  148. addq C2, LDC, C3
  149. s4addq LDC, C, C
  150. SXADDQ BB, B, BB
  151. fclr t1
  152. addq C3, LDC, C4
  153. fclr t2
  154. sra M, 2, I
  155. fclr t3
  156. fclr t4
  157. ble I, $L20
  158. .align 4
  159. $L11:
  160. #if defined(EV5) || defined(EV6)
  161. ldl $31, 0 * SIZE(BB)
  162. ldl $31, 8 * SIZE(BB)
  163. unop
  164. lda BB, 16 * SIZE(BB)
  165. #endif
  166. #if !defined(TRMMKERNEL) || \
  167. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  168. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  169. #ifdef TRMMKERNEL
  170. #ifdef LEFT
  171. addq KK, 4, TMP1
  172. #else
  173. addq KK, 4, TMP1
  174. #endif
  175. #endif
  176. LD a1, 0 * SIZE(AO)
  177. fclr c11
  178. LD a2, 1 * SIZE(AO)
  179. fclr c12
  180. LD a3, 2 * SIZE(AO)
  181. fclr c16
  182. LD a4, 3 * SIZE(AO)
  183. fclr c15
  184. LD b1, 0 * SIZE(B)
  185. fclr c01
  186. LD b2, 1 * SIZE(B)
  187. fclr c02
  188. LD b3, 2 * SIZE(B)
  189. fclr c06
  190. LD b4, 3 * SIZE(B)
  191. fclr c05
  192. lds $f31, 4 * SIZE(C1)
  193. fclr c03
  194. #ifndef TRMMKERNEL
  195. lda L, -2(K)
  196. #else
  197. lda L, -2(TMP1)
  198. #endif
  199. fclr c04
  200. lds $f31, 7 * SIZE(C2)
  201. fclr c08
  202. lda BO, 4 * SIZE(B)
  203. fclr c13
  204. lds $f31, 4 * SIZE(C3)
  205. fclr c09
  206. lda AO, 4 * SIZE(AO)
  207. fclr c10
  208. #else
  209. sll KK, BASE_SHIFT + 2, TMP1
  210. addq AO, TMP1, AO
  211. addq B, TMP1, BO
  212. subq K, KK, TMP1
  213. LD a1, 0 * SIZE(AO)
  214. fclr c11
  215. LD a2, 1 * SIZE(AO)
  216. fclr c12
  217. LD a3, 2 * SIZE(AO)
  218. fclr c16
  219. LD a4, 3 * SIZE(AO)
  220. fclr c15
  221. LD b1, 0 * SIZE(BO)
  222. fclr c01
  223. LD b2, 1 * SIZE(BO)
  224. fclr c02
  225. LD b3, 2 * SIZE(BO)
  226. fclr c06
  227. LD b4, 3 * SIZE(BO)
  228. fclr c05
  229. lds $f31, 4 * SIZE(C1)
  230. fclr c03
  231. lda L, -2(TMP1)
  232. fclr c04
  233. lds $f31, 7 * SIZE(C2)
  234. fclr c08
  235. lda BO, 4 * SIZE(BO)
  236. fclr c13
  237. lds $f31, 4 * SIZE(C3)
  238. fclr c09
  239. lda AO, 4 * SIZE(AO)
  240. fclr c10
  241. #endif
  242. lds $f31, 7 * SIZE(C4)
  243. fclr c14
  244. fclr c07
  245. ble L, $L15
  246. .align 5
  247. $L12:
  248. /* 1 */
  249. ADD c11, t1, c11
  250. #ifndef EV4
  251. ldq $31, PREFETCHSIZE * SIZE(AO)
  252. #else
  253. unop
  254. #endif
  255. MUL b1, a1, t1
  256. #ifndef EV4
  257. ldl $31, PREFETCHSIZE * SIZE(BO)
  258. #else
  259. unop
  260. #endif
  261. ADD c12, t2, c12
  262. unop
  263. MUL b1, a2, t2
  264. unop
  265. ADD c16, t3, c16
  266. unop
  267. MUL b2, a2, t3
  268. LD a5, 0 * SIZE(AO)
  269. ADD c15, t4, c15
  270. unop
  271. MUL b2, a1, t4
  272. LD b5, 0 * SIZE(BO)
  273. /* 2 */
  274. ADD c01, t1, c01
  275. UNOP
  276. MUL b1, a3, t1
  277. UNOP
  278. ADD c02, t2, c02
  279. UNOP
  280. MUL b1, a4, t2
  281. UNOP
  282. ADD c06, t3, c06
  283. unop
  284. MUL b2, a4, t3
  285. unop
  286. ADD c05, t4, c05
  287. unop
  288. MUL b4, a1, t4
  289. unop
  290. /* 3 */
  291. ADD c03, t1, c03
  292. unop
  293. MUL b3, a1, t1
  294. unop
  295. ADD c04, t2, c04
  296. unop
  297. MUL b3, a2, t2
  298. unop
  299. ADD c08, t3, c08
  300. unop
  301. MUL b4, a2, t3
  302. LD a2, 1 * SIZE(AO)
  303. ADD c13, t4, c13
  304. unop
  305. MUL b2, a3, t4
  306. LD b2, 1 * SIZE(BO)
  307. /* 4 */
  308. ADD c09, t1, c09
  309. unop
  310. MUL b3, a3, t1
  311. LD a6, 2 * SIZE(AO)
  312. ADD c10, t2, c10
  313. unop
  314. MUL b3, a4, t2
  315. LD b3, 2 * SIZE(BO)
  316. ADD c14, t3, c14
  317. unop
  318. MUL b4, a4, t3
  319. LD a4, 3 * SIZE(AO)
  320. ADD c07, t4, c07
  321. unop
  322. MUL b4, a3, t4
  323. LD b4, 3 * SIZE(BO)
  324. /* 5 */
  325. ADD c11, t1, c11
  326. unop
  327. MUL b5, a5, t1
  328. LD a1, 4 * SIZE(AO)
  329. ADD c12, t2, c12
  330. lda L, -2(L)
  331. MUL b5, a2, t2
  332. LD b1, 4 * SIZE(BO)
  333. ADD c16, t3, c16
  334. unop
  335. MUL b2, a2, t3
  336. unop
  337. ADD c15, t4, c15
  338. unop
  339. MUL b2, a5, t4
  340. unop
  341. /* 6 */
  342. ADD c01, t1, c01
  343. unop
  344. MUL b5, a6, t1
  345. unop
  346. ADD c02, t2, c02
  347. unop
  348. MUL b5, a4, t2
  349. unop
  350. ADD c06, t3, c06
  351. unop
  352. MUL b2, a4, t3
  353. unop
  354. ADD c05, t4, c05
  355. unop
  356. MUL b4, a5, t4
  357. unop
  358. /* 7 */
  359. ADD c03, t1, c03
  360. lda AO, 8 * SIZE(AO)
  361. MUL b3, a5, t1
  362. unop
  363. ADD c04, t2, c04
  364. lda BO, 8 * SIZE(BO)
  365. MUL b3, a2, t2
  366. unop
  367. ADD c08, t3, c08
  368. unop
  369. MUL b4, a2, t3
  370. LD a2, -3 * SIZE(AO)
  371. ADD c13, t4, c13
  372. unop
  373. MUL b2, a6, t4
  374. LD b2, -3 * SIZE(BO)
  375. /* 8 */
  376. ADD c09, t1, c09
  377. unop
  378. MUL b3, a6, t1
  379. LD a3, -2 * SIZE(AO)
  380. ADD c10, t2, c10
  381. unop
  382. MUL b3, a4, t2
  383. LD b3, -2 * SIZE(BO)
  384. ADD c14, t3, c14
  385. unop
  386. MUL b4, a4, t3
  387. LD a4, -1 * SIZE(AO)
  388. ADD c07, t4, c07
  389. MUL b4, a6, t4
  390. LD b4, -1 * SIZE(BO)
  391. bgt L, $L12
  392. .align 4
  393. $L15:
  394. ADD c11, t1, c11
  395. ldt alpha, ALPHA
  396. MUL b1, a1, t1
  397. #ifndef TRMMKERNEL
  398. blbs K, $L18
  399. #else
  400. blbs TMP1, $L18
  401. #endif
  402. .align 4
  403. ADD c12, t2, c12
  404. MUL b1, a2, t2
  405. ADD c16, t3, c16
  406. MUL b2, a2, t3
  407. ADD c15, t4, c15
  408. MUL b2, a1, t4
  409. ADD c01, t1, c01
  410. MUL b1, a3, t1
  411. ADD c02, t2, c02
  412. unop
  413. MUL b1, a4, t2
  414. LD b1, 0 * SIZE(BO)
  415. ADD c06, t3, c06
  416. MUL b2, a4, t3
  417. ADD c05, t4, c05
  418. MUL b4, a1, t4
  419. ADD c03, t1, c03
  420. unop
  421. MUL b3, a1, t1
  422. LD a1, 0 * SIZE(AO)
  423. ADD c04, t2, c04
  424. unop
  425. MUL b3, a2, t2
  426. unop
  427. ADD c08, t3, c08
  428. unop
  429. MUL b4, a2, t3
  430. LD a2, 1 * SIZE(AO)
  431. ADD c13, t4, c13
  432. unop
  433. MUL b2, a3, t4
  434. LD b2, 1 * SIZE(BO)
  435. ADD c09, t1, c09
  436. unop
  437. MUL b3, a3, t1
  438. lda AO, 4 * SIZE(AO)
  439. ADD c10, t2, c10
  440. unop
  441. MUL b3, a4, t2
  442. LD b3, 2 * SIZE(BO)
  443. ADD c14, t3, c14
  444. unop
  445. MUL b4, a4, t3
  446. LD a4, -1 * SIZE(AO)
  447. ADD c07, t4, c07
  448. unop
  449. MUL b4, a3, t4
  450. LD a3, -2 * SIZE(AO)
  451. ADD c11, t1, c11
  452. LD b4, 3 * SIZE(BO)
  453. MUL b1, a1, t1
  454. lda BO, 4 * SIZE(BO)
  455. .align 4
  456. $L18:
  457. ADD c12, t2, c12
  458. unop
  459. MUL b1, a2, t2
  460. #ifndef TRMMKERNEL
  461. LD a5, 0 * SIZE(C1)
  462. #else
  463. unop
  464. #endif
  465. ADD c16, t3, c16
  466. unop
  467. MUL b2, a2, t3
  468. unop
  469. ADD c15, t4, c15
  470. unop
  471. MUL b2, a1, t4
  472. #ifndef TRMMKERNEL
  473. LD b5, 1 * SIZE(C1)
  474. #else
  475. unop
  476. #endif
  477. ADD c01, t1, c01
  478. unop
  479. MUL b1, a3, t1
  480. unop
  481. ADD c02, t2, c02
  482. unop
  483. MUL b1, a4, t2
  484. #ifndef TRMMKERNEL
  485. LD b1, 0 * SIZE(C2)
  486. #else
  487. unop
  488. #endif
  489. ADD c06, t3, c06
  490. unop
  491. MUL b2, a4, t3
  492. unop
  493. ADD c05, t4, c05
  494. unop
  495. MUL b4, a1, t4
  496. unop
  497. ADD c03, t1, c03
  498. unop
  499. MUL b3, a1, t1
  500. unop
  501. ADD c04, t2, c04
  502. unop
  503. MUL b3, a2, t2
  504. #ifndef TRMMKERNEL
  505. LD a1, 0 * SIZE(C3)
  506. #else
  507. unop
  508. #endif
  509. ADD c08, t3, c08
  510. unop
  511. MUL b4, a2, t3
  512. #ifndef TRMMKERNEL
  513. LD a2, 2 * SIZE(C1)
  514. #else
  515. unop
  516. #endif
  517. ADD c13, t4, c13
  518. unop
  519. MUL b2, a3, t4
  520. #ifndef TRMMKERNEL
  521. LD b2, 3 * SIZE(C1)
  522. #else
  523. unop
  524. #endif
  525. ADD c09, t1, c09
  526. lda I, -1(I)
  527. MUL b3, a3, t1
  528. unop
  529. ADD c10, t2, c10
  530. unop
  531. MUL b3, a4, t2
  532. #ifndef TRMMKERNEL
  533. LD b3, 0 * SIZE(C4)
  534. #else
  535. unop
  536. #endif
  537. ADD c14, t3, c14
  538. unop
  539. MUL b4, a4, t3
  540. #ifndef TRMMKERNEL
  541. LD a4, 1 * SIZE(C2)
  542. #else
  543. unop
  544. #endif
  545. ADD c07, t4, c07
  546. unop
  547. MUL b4, a3, t4
  548. #ifndef TRMMKERNEL
  549. LD a3, 2 * SIZE(C2)
  550. #else
  551. unop
  552. #endif
  553. ADD c11, t1, c11
  554. unop
  555. MUL alpha, c01, c01
  556. #ifndef TRMMKERNEL
  557. LD b4, 3 * SIZE(C2)
  558. #else
  559. unop
  560. #endif
  561. ADD c12, t2, c12
  562. unop
  563. MUL alpha, c02, c02
  564. #ifndef TRMMKERNEL
  565. LD t1, 1 * SIZE(C3)
  566. #else
  567. unop
  568. #endif
  569. ADD c16, t3, c16
  570. unop
  571. MUL alpha, c03, c03
  572. #ifndef TRMMKERNEL
  573. LD t2, 2 * SIZE(C3)
  574. #else
  575. unop
  576. #endif
  577. ADD c15, t4, c15
  578. unop
  579. MUL alpha, c04, c04
  580. #ifndef TRMMKERNEL
  581. LD t3, 3 * SIZE(C3)
  582. #else
  583. unop
  584. #endif
  585. MUL alpha, c05, c05
  586. unop
  587. #ifndef TRMMKERNEL
  588. ADD c01, a5, c01
  589. LD t4, 1 * SIZE(C4)
  590. #else
  591. unop
  592. unop
  593. #endif
  594. MUL alpha, c06, c06
  595. #ifndef TRMMKERNEL
  596. unop
  597. ADD c02, b5, c02
  598. LD a5, 2 * SIZE(C4)
  599. #endif
  600. MUL alpha, c07, c07
  601. #ifndef TRMMKERNEL
  602. unop
  603. ADD c03, a2, c03
  604. LD b5, 3 * SIZE(C4)
  605. #endif
  606. MUL alpha, c08, c08
  607. #ifndef TRMMKERNEL
  608. unop
  609. ADD c04, b2, c04
  610. unop
  611. #endif
  612. MUL alpha, c09, c09
  613. ST c01, 0 * SIZE(C1)
  614. #ifndef TRMMKERNEL
  615. ADD c05, b1, c05
  616. unop
  617. #endif
  618. MUL alpha, c10, c10
  619. ST c02, 1 * SIZE(C1)
  620. #ifndef TRMMKERNEL
  621. ADD c06, a4, c06
  622. unop
  623. #endif
  624. MUL alpha, c11, c11
  625. ST c03, 2 * SIZE(C1)
  626. #ifndef TRMMKERNEL
  627. ADD c07, a3, c07
  628. unop
  629. #endif
  630. MUL alpha, c12, c12
  631. ST c04, 3 * SIZE(C1)
  632. #ifndef TRMMKERNEL
  633. ADD c08, b4, c08
  634. #else
  635. unop
  636. #endif
  637. lda C1, 4 * SIZE(C1)
  638. MUL alpha, c13, c13
  639. ST c05, 0 * SIZE(C2)
  640. #ifndef TRMMKERNEL
  641. ADD c09, a1, c09
  642. unop
  643. #endif
  644. MUL alpha, c14, c14
  645. ST c06, 1 * SIZE(C2)
  646. #ifndef TRMMKERNEL
  647. ADD c10, t1, c10
  648. unop
  649. #endif
  650. MUL alpha, c15, c15
  651. ST c07, 2 * SIZE(C2)
  652. #ifndef TRMMKERNEL
  653. ADD c11, t2, c11
  654. unop
  655. #endif
  656. MUL alpha, c16, c16
  657. ST c08, 3 * SIZE(C2)
  658. #ifndef TRMMKERNEL
  659. ADD c12, t3, c12
  660. #else
  661. unop
  662. #endif
  663. lda C2, 4 * SIZE(C2)
  664. #ifndef TRMMKERNEL
  665. ADD c13, b3, c13
  666. #else
  667. unop
  668. #endif
  669. ST c09, 0 * SIZE(C3)
  670. fclr t1
  671. lda C4, 4 * SIZE(C4)
  672. #ifndef TRMMKERNEL
  673. ADD c14, t4, c14
  674. #else
  675. unop
  676. #endif
  677. ST c10, 1 * SIZE(C3)
  678. fclr t2
  679. unop
  680. #ifndef TRMMKERNEL
  681. ADD c15, a5, c15
  682. #else
  683. unop
  684. #endif
  685. ST c11, 2 * SIZE(C3)
  686. fclr t3
  687. unop
  688. #ifndef TRMMKERNEL
  689. ADD c16, b5, c16
  690. #else
  691. unop
  692. #endif
  693. ST c12, 3 * SIZE(C3)
  694. fclr t4
  695. lda C3, 4 * SIZE(C3)
  696. ST c13, -4 * SIZE(C4)
  697. ST c14, -3 * SIZE(C4)
  698. ST c15, -2 * SIZE(C4)
  699. ST c16, -1 * SIZE(C4)
  700. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  701. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  702. subq K, KK, TMP1
  703. #ifdef LEFT
  704. subq TMP1, 4, TMP1
  705. #else
  706. subq TMP1, 4, TMP1
  707. #endif
  708. sll TMP1, BASE_SHIFT + 2, TMP1
  709. addq AO, TMP1, AO
  710. addq BO, TMP1, BO
  711. #endif
  712. #if defined(TRMMKERNEL) && defined(LEFT)
  713. addq KK, 4, KK
  714. #endif
  715. bgt I, $L11
  716. .align 4
  717. $L20:
  718. and M, 2, I
  719. ble I, $L30
  720. #if !defined(TRMMKERNEL) || \
  721. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  722. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  723. #ifdef TRMMKERNEL
  724. #ifdef LEFT
  725. addq KK, 2, TMP1
  726. #else
  727. addq KK, 4, TMP1
  728. #endif
  729. #endif
  730. LD a1, 0 * SIZE(AO)
  731. fclr c09
  732. LD a2, 1 * SIZE(AO)
  733. fclr c13
  734. LD a3, 2 * SIZE(AO)
  735. fclr c10
  736. LD a4, 3 * SIZE(AO)
  737. fclr c14
  738. LD b1, 0 * SIZE(B)
  739. #ifndef TRMMKERNEL
  740. lda L, -2(K)
  741. #else
  742. lda L, -2(TMP1)
  743. #endif
  744. LD b2, 1 * SIZE(B)
  745. lda AO, 2 * SIZE(AO)
  746. LD b3, 2 * SIZE(B)
  747. fclr c01
  748. LD b4, 3 * SIZE(B)
  749. fclr c05
  750. lda BO, 4 * SIZE(B)
  751. fclr c02
  752. fclr c06
  753. ble L, $L25
  754. #else
  755. sll KK, BASE_SHIFT + 1, TMP1
  756. addq AO, TMP1, AO
  757. sll KK, BASE_SHIFT + 2, TMP2
  758. addq B, TMP2, BO
  759. subq K, KK, TMP1
  760. LD a1, 0 * SIZE(AO)
  761. fclr c09
  762. LD a2, 1 * SIZE(AO)
  763. fclr c13
  764. LD a3, 2 * SIZE(AO)
  765. fclr c10
  766. LD a4, 3 * SIZE(AO)
  767. fclr c14
  768. LD b1, 0 * SIZE(BO)
  769. lda L, -2(TMP1)
  770. LD b2, 1 * SIZE(BO)
  771. lda AO, 2 * SIZE(AO)
  772. LD b3, 2 * SIZE(BO)
  773. fclr c01
  774. LD b4, 3 * SIZE(BO)
  775. fclr c05
  776. lda BO, 4 * SIZE(BO)
  777. fclr c02
  778. fclr c06
  779. ble L, $L25
  780. #endif
  781. .align 4
  782. $L22:
  783. ADD c09, t1, c09
  784. unop
  785. MUL a1, b1, t1
  786. unop
  787. ADD c10, t2, c10
  788. unop
  789. MUL a2, b1, t2
  790. LD b1, 0 * SIZE(BO)
  791. ADD c13, t3, c13
  792. unop
  793. MUL a1, b2, t3
  794. lda BO, 8 * SIZE(BO)
  795. ADD c14, t4, c14
  796. unop
  797. MUL a2, b2, t4
  798. LD b2, -7 * SIZE(BO)
  799. ADD c01, t1, c01
  800. unop
  801. MUL a1, b3, t1
  802. unop
  803. ADD c02, t2, c02
  804. unop
  805. MUL a2, b3, t2
  806. LD b3, -6 * SIZE(BO)
  807. ADD c05, t3, c05
  808. unop
  809. MUL a1, b4, t3
  810. LD a1, 2 * SIZE(AO)
  811. ADD c06, t4, c06
  812. MUL a2, b4, t4
  813. LD b5, -5 * SIZE(BO)
  814. ADD c09, t1, c09
  815. unop
  816. MUL a3, b1, t1
  817. LD a2, 3 * SIZE(AO)
  818. ADD c10, t2, c10
  819. unop
  820. MUL a4, b1, t2
  821. LD b1, -4 * SIZE(BO)
  822. ADD c13, t3, c13
  823. unop
  824. MUL a3, b2, t3
  825. lda AO, 4 * SIZE(AO)
  826. ADD c14, t4, c14
  827. MUL a4, b2, t4
  828. LD b2, -3 * SIZE(BO)
  829. ADD c01, t1, c01
  830. lda L, -2(L)
  831. MUL a3, b3, t1
  832. LD b4, -1 * SIZE(BO)
  833. ADD c02, t2, c02
  834. unop
  835. MUL a4, b3, t2
  836. LD b3, -2 * SIZE(BO)
  837. ADD c05, t3, c05
  838. unop
  839. MUL a3, b5, t3
  840. LD a3, 0 * SIZE(AO)
  841. ADD c06, t4, c06
  842. MUL a4, b5, t4
  843. LD a4, 1 * SIZE(AO)
  844. bgt L, $L22
  845. .align 4
  846. $L25:
  847. ADD c09, t1, c09
  848. ldt alpha, ALPHA
  849. MUL a1, b1, t1
  850. #ifndef TRMMKERNEL
  851. blbs K, $L28
  852. #else
  853. blbs TMP1, $L28
  854. #endif
  855. ADD c10, t2, c10
  856. unop
  857. MUL a2, b1, t2
  858. LD b1, 0 * SIZE(BO)
  859. ADD c13, t3, c13
  860. unop
  861. MUL a1, b2, t3
  862. unop
  863. ADD c14, t4, c14
  864. unop
  865. MUL a2, b2, t4
  866. LD b2, 1 * SIZE(BO)
  867. ADD c01, t1, c01
  868. unop
  869. MUL a1, b3, t1
  870. lda AO, 2 * SIZE(AO)
  871. ADD c02, t2, c02
  872. unop
  873. MUL a2, b3, t2
  874. LD b3, 2 * SIZE(BO)
  875. ADD c05, t3, c05
  876. unop
  877. MUL a1, b4, t3
  878. LD a1, -2 * SIZE(AO)
  879. ADD c06, t4, c06
  880. unop
  881. MUL a2, b4, t4
  882. LD a2, -1 * SIZE(AO)
  883. ADD c09, t1, c09
  884. LD b4, 3 * SIZE(BO)
  885. MUL a1, b1, t1
  886. lda BO, 4 * SIZE(BO)
  887. .align 4
  888. $L28:
  889. ADD c10, t2, c10
  890. unop
  891. MUL a2, b1, t2
  892. #ifndef TRMMKERNEL
  893. LD a3, 0 * SIZE(C1)
  894. #else
  895. unop
  896. #endif
  897. ADD c13, t3, c13
  898. unop
  899. MUL a1, b2, t3
  900. #ifndef TRMMKERNEL
  901. LD a4, 1 * SIZE(C1)
  902. #else
  903. unop
  904. #endif
  905. ADD c14, t4, c14
  906. unop
  907. MUL a2, b2, t4
  908. #ifndef TRMMKERNEL
  909. LD a5, 0 * SIZE(C2)
  910. #else
  911. unop
  912. #endif
  913. ADD c01, t1, c01
  914. unop
  915. MUL a1, b3, t1
  916. #ifndef TRMMKERNEL
  917. LD b5, 1 * SIZE(C2)
  918. #else
  919. unop
  920. #endif
  921. ADD c02, t2, c02
  922. unop
  923. MUL a2, b3, t2
  924. #ifndef TRMMKERNEL
  925. LD b1, 0 * SIZE(C3)
  926. #else
  927. unop
  928. #endif
  929. ADD c05, t3, c05
  930. unop
  931. MUL a1, b4, t3
  932. #ifndef TRMMKERNEL
  933. LD b2, 1 * SIZE(C3)
  934. #else
  935. unop
  936. #endif
  937. ADD c06, t4, c06
  938. unop
  939. MUL a2, b4, t4
  940. #ifndef TRMMKERNEL
  941. LD b3, 0 * SIZE(C4)
  942. #else
  943. unop
  944. #endif
  945. ADD c09, t1, c09
  946. unop
  947. MUL alpha, c01, c01
  948. #ifndef TRMMKERNEL
  949. LD b4, 1 * SIZE(C4)
  950. #else
  951. unop
  952. #endif
  953. ADD c10, t2, c10
  954. unop
  955. MUL alpha, c02, c02
  956. unop
  957. ADD c13, t3, c13
  958. MUL alpha, c05, c05
  959. ADD c14, t4, c14
  960. MUL alpha, c06, c06
  961. MUL alpha, c09, c09
  962. #ifndef TRMMKERNEL
  963. ADD c01, a3, c01
  964. #endif
  965. MUL alpha, c10, c10
  966. #ifndef TRMMKERNEL
  967. ADD c02, a4, c02
  968. #endif
  969. MUL alpha, c13, c13
  970. #ifndef TRMMKERNEL
  971. ADD c05, a5, c05
  972. #endif
  973. MUL alpha, c14, c14
  974. #ifndef TRMMKERNEL
  975. ADD c06, b5, c06
  976. #endif
  977. #ifndef TRMMKERNEL
  978. ADD c09, b1, c09
  979. unop
  980. #endif
  981. ST c01, 0 * SIZE(C1)
  982. fclr t1
  983. #ifndef TRMMKERNEL
  984. ADD c10, b2, c10
  985. unop
  986. #endif
  987. ST c02, 1 * SIZE(C1)
  988. fclr t2
  989. #ifndef TRMMKERNEL
  990. ADD c13, b3, c13
  991. unop
  992. #endif
  993. ST c05, 0 * SIZE(C2)
  994. fclr t3
  995. #ifndef TRMMKERNEL
  996. ADD c14, b4, c14
  997. unop
  998. #endif
  999. ST c06, 1 * SIZE(C2)
  1000. fclr t4
  1001. ST c09, 0 * SIZE(C3)
  1002. lda C1, 2 * SIZE(C1)
  1003. ST c10, 1 * SIZE(C3)
  1004. lda C2, 2 * SIZE(C2)
  1005. ST c13, 0 * SIZE(C4)
  1006. lda C3, 2 * SIZE(C3)
  1007. ST c14, 1 * SIZE(C4)
  1008. lda C4, 2 * SIZE(C4)
  1009. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1010. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1011. subq K, KK, TMP1
  1012. #ifdef LEFT
  1013. subq TMP1, 2, TMP1
  1014. #else
  1015. subq TMP1, 4, TMP1
  1016. #endif
  1017. sll TMP1, BASE_SHIFT + 1, TMP2
  1018. addq AO, TMP2, AO
  1019. sll TMP1, BASE_SHIFT + 2, TMP2
  1020. addq BO, TMP2, BO
  1021. #endif
  1022. #if defined(TRMMKERNEL) && defined(LEFT)
  1023. addq KK, 2, KK
  1024. #endif
  1025. .align 4
  1026. $L30:
  1027. and M, 1, I
  1028. ble I, $L39
  1029. #if !defined(TRMMKERNEL) || \
  1030. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1031. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1032. #ifdef TRMMKERNEL
  1033. #ifdef LEFT
  1034. addq KK, 1, TMP1
  1035. #else
  1036. addq KK, 4, TMP1
  1037. #endif
  1038. #endif
  1039. LD a1, 0 * SIZE(AO)
  1040. fclr c01
  1041. LD a2, 1 * SIZE(AO)
  1042. fclr c05
  1043. LD b1, 0 * SIZE(B)
  1044. #ifndef TRMMKERNEL
  1045. lda L, -2(K)
  1046. #else
  1047. lda L, -2(TMP1)
  1048. #endif
  1049. LD b2, 1 * SIZE(B)
  1050. lda AO, 1 * SIZE(AO)
  1051. LD b3, 2 * SIZE(B)
  1052. fclr c09
  1053. LD b4, 3 * SIZE(B)
  1054. fclr c13
  1055. lda BO, 4 * SIZE(B)
  1056. ble L, $L35
  1057. #else
  1058. sll KK, BASE_SHIFT + 0, TMP1
  1059. addq AO, TMP1, AO
  1060. sll KK, BASE_SHIFT + 2, TMP2
  1061. addq B, TMP2, BO
  1062. subq K, KK, TMP1
  1063. LD a1, 0 * SIZE(AO)
  1064. fclr c01
  1065. LD a2, 1 * SIZE(AO)
  1066. fclr c05
  1067. LD b1, 0 * SIZE(BO)
  1068. lda L, -2(TMP1)
  1069. LD b2, 1 * SIZE(BO)
  1070. lda AO, 1 * SIZE(AO)
  1071. LD b3, 2 * SIZE(BO)
  1072. fclr c09
  1073. LD b4, 3 * SIZE(BO)
  1074. fclr c13
  1075. lda BO, 4 * SIZE(BO)
  1076. ble L, $L35
  1077. #endif
  1078. .align 4
  1079. $L32:
  1080. ADD c01, t1, c01
  1081. lda L, -2(L)
  1082. MUL a1, b1, t1
  1083. LD b1, 0 * SIZE(BO)
  1084. ADD c05, t2, c05
  1085. lda AO, 2 * SIZE(AO)
  1086. MUL a1, b2, t2
  1087. LD b2, 1 * SIZE(BO)
  1088. ADD c09, t3, c09
  1089. LD b5, 3 * SIZE(BO)
  1090. MUL a1, b3, t3
  1091. LD b3, 2 * SIZE(BO)
  1092. ADD c13, t4, c13
  1093. MUL a1, b4, t4
  1094. LD a1, -1 * SIZE(AO)
  1095. ADD c01, t1, c01
  1096. MUL a2, b1, t1
  1097. LD b1, 4 * SIZE(BO)
  1098. lda BO, 8 * SIZE(BO)
  1099. ADD c05, t2, c05
  1100. MUL a2, b2, t2
  1101. LD b2, -3 * SIZE(BO)
  1102. ADD c09, t3, c09
  1103. LD b4, -1 * SIZE(BO)
  1104. MUL a2, b3, t3
  1105. LD b3, -2 * SIZE(BO)
  1106. ADD c13, t4, c13
  1107. MUL a2, b5, t4
  1108. LD a2, 0 * SIZE(AO)
  1109. bgt L, $L32
  1110. .align 4
  1111. $L35:
  1112. ADD c01, t1, c01
  1113. ldt alpha, ALPHA
  1114. MUL a1, b1, t1
  1115. #ifndef TRMMKERNEL
  1116. blbs K, $L38
  1117. #else
  1118. blbs TMP1, $L38
  1119. #endif
  1120. .align 4
  1121. ADD c05, t2, c05
  1122. LD b1, 0 * SIZE(BO)
  1123. MUL a1, b2, t2
  1124. LD b2, 1 * SIZE(BO)
  1125. ADD c09, t3, c09
  1126. MUL a1, b3, t3
  1127. LD b3, 2 * SIZE(BO)
  1128. ADD c13, t4, c13
  1129. MUL a1, b4, t4
  1130. LD a1, 0 * SIZE(AO)
  1131. lda AO, 1 * SIZE(AO)
  1132. ADD c01, t1, c01
  1133. LD b4, 3 * SIZE(BO)
  1134. MUL a1, b1, t1
  1135. lda BO, 4 * SIZE(BO)
  1136. .align 4
  1137. $L38:
  1138. ADD c05, t2, c05
  1139. unop
  1140. MUL a1, b2, t2
  1141. #ifndef TRMMKERNEL
  1142. LD a5, 0 * SIZE(C1)
  1143. #else
  1144. unop
  1145. #endif
  1146. ADD c09, t3, c09
  1147. unop
  1148. MUL a1, b3, t3
  1149. #ifndef TRMMKERNEL
  1150. LD b5, 0 * SIZE(C2)
  1151. #else
  1152. unop
  1153. #endif
  1154. ADD c13, t4, c13
  1155. unop
  1156. MUL a1, b4, t4
  1157. #ifndef TRMMKERNEL
  1158. LD a2, 0 * SIZE(C3)
  1159. #else
  1160. unop
  1161. #endif
  1162. ADD c01, t1, c01
  1163. unop
  1164. MUL alpha, c01, c01
  1165. #ifndef TRMMKERNEL
  1166. LD a3, 0 * SIZE(C4)
  1167. #else
  1168. unop
  1169. #endif
  1170. ADD c05, t2, c05
  1171. unop
  1172. MUL alpha, c05, c05
  1173. unop
  1174. ADD c09, t3, c09
  1175. MUL alpha, c09, c09
  1176. ADD c13, t4, c13
  1177. MUL alpha, c13, c13
  1178. #ifndef TRMMKERNEL
  1179. ADD c01, a5, c01
  1180. ADD c05, b5, c05
  1181. ADD c09, a2, c09
  1182. ADD c13, a3, c13
  1183. #endif
  1184. ST c01, 0 * SIZE(C1)
  1185. ST c05, 0 * SIZE(C2)
  1186. ST c09, 0 * SIZE(C3)
  1187. ST c13, 0 * SIZE(C4)
  1188. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1189. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1190. subq K, KK, TMP1
  1191. #ifdef LEFT
  1192. subq TMP1, 1, TMP1
  1193. #else
  1194. subq TMP1, 4, TMP1
  1195. #endif
  1196. sll TMP1, BASE_SHIFT + 0, TMP2
  1197. addq AO, TMP2, AO
  1198. sll TMP1, BASE_SHIFT + 2, TMP2
  1199. addq BO, TMP2, BO
  1200. #endif
  1201. #if defined(TRMMKERNEL) && defined(LEFT)
  1202. addq KK, 1, KK
  1203. #endif
  1204. .align 4
  1205. $L39:
  1206. mov BO, B
  1207. lda J, -1(J)
  1208. #if defined(TRMMKERNEL) && !defined(LEFT)
  1209. addq KK, 4, KK
  1210. #else
  1211. unop
  1212. #endif
  1213. bgt J, $L01
  1214. .align 4
  1215. $L40:
  1216. and N, 2, J
  1217. ble J, $L80
  1218. mov C, C1
  1219. addq C, LDC, C2
  1220. mov A, AO
  1221. fclr t1
  1222. addq C2, LDC, C
  1223. fclr t2
  1224. #if defined(TRMMKERNEL) && defined(LEFT)
  1225. mov OFFSET, KK
  1226. #endif
  1227. sra M, 2, I
  1228. fclr t3
  1229. fclr t4
  1230. ble I, $L60
  1231. .align 4
  1232. $L51:
  1233. #if !defined(TRMMKERNEL) || \
  1234. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1235. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1236. #ifdef TRMMKERNEL
  1237. #ifdef LEFT
  1238. addq KK, 4, TMP1
  1239. #else
  1240. addq KK, 2, TMP1
  1241. #endif
  1242. #endif
  1243. LD a1, 0 * SIZE(AO)
  1244. fclr c03
  1245. LD a2, 1 * SIZE(AO)
  1246. fclr c07
  1247. LD a3, 2 * SIZE(AO)
  1248. fclr c04
  1249. LD a4, 3 * SIZE(AO)
  1250. fclr c08
  1251. LD b1, 0 * SIZE(B)
  1252. fclr c01
  1253. LD b2, 1 * SIZE(B)
  1254. fclr c05
  1255. LD b3, 2 * SIZE(B)
  1256. fclr c02
  1257. LD b4, 3 * SIZE(B)
  1258. fclr c06
  1259. #ifndef TRMMKERNEL
  1260. lda L, -2(K)
  1261. #else
  1262. lda L, -2(TMP1)
  1263. #endif
  1264. lda BO, 2 * SIZE(B)
  1265. lda AO, 4 * SIZE(AO)
  1266. ble L, $L55
  1267. #else
  1268. sll KK, BASE_SHIFT + 2, TMP1
  1269. addq AO, TMP1, AO
  1270. sll KK, BASE_SHIFT + 1, TMP2
  1271. addq B, TMP2, BO
  1272. subq K, KK, TMP1
  1273. LD a1, 0 * SIZE(AO)
  1274. fclr c03
  1275. LD a2, 1 * SIZE(AO)
  1276. fclr c07
  1277. LD a3, 2 * SIZE(AO)
  1278. fclr c04
  1279. LD a4, 3 * SIZE(AO)
  1280. fclr c08
  1281. LD b1, 0 * SIZE(BO)
  1282. fclr c01
  1283. LD b2, 1 * SIZE(BO)
  1284. fclr c05
  1285. LD b3, 2 * SIZE(BO)
  1286. fclr c02
  1287. LD b4, 3 * SIZE(BO)
  1288. fclr c06
  1289. lda L, -2(TMP1)
  1290. lda BO, 2 * SIZE(BO)
  1291. lda AO, 4 * SIZE(AO)
  1292. ble L, $L55
  1293. #endif
  1294. .align 4
  1295. $L52:
  1296. ADD c05, t1, c05
  1297. unop
  1298. MUL a1, b1, t1
  1299. unop
  1300. ADD c06, t2, c06
  1301. lda L, -2(L)
  1302. MUL a2, b1, t2
  1303. unop
  1304. ADD c07, t3, c07
  1305. unop
  1306. MUL a3, b1, t3
  1307. unop
  1308. ADD c08, t4, c08
  1309. unop
  1310. MUL a4, b1, t4
  1311. LD b1, 2 * SIZE(BO)
  1312. ADD c01, t1, c01
  1313. unop
  1314. MUL a1, b2, t1
  1315. LD a1, 0 * SIZE(AO)
  1316. ADD c02, t2, c02
  1317. lda BO, 4 * SIZE(BO)
  1318. MUL a2, b2, t2
  1319. LD a2, 1 * SIZE(AO)
  1320. ADD c03, t3, c03
  1321. unop
  1322. MUL a3, b2, t3
  1323. LD a3, 2 * SIZE(AO)
  1324. ADD c04, t4, c04
  1325. unop
  1326. MUL a4, b2, t4
  1327. LD a5, 3 * SIZE(AO)
  1328. ADD c05, t1, c05
  1329. unop
  1330. MUL a1, b3, t1
  1331. LD b2, -1 * SIZE(BO)
  1332. ADD c06, t2, c06
  1333. unop
  1334. MUL a2, b3, t2
  1335. unop
  1336. ADD c07, t3, c07
  1337. unop
  1338. MUL a3, b3, t3
  1339. lda AO, 8 * SIZE(AO)
  1340. ADD c08, t4, c08
  1341. unop
  1342. MUL a5, b3, t4
  1343. LD b3, 0 * SIZE(BO)
  1344. ADD c01, t1, c01
  1345. unop
  1346. MUL a1, b4, t1
  1347. LD a1, -4 * SIZE(AO)
  1348. ADD c02, t2, c02
  1349. unop
  1350. MUL a2, b4, t2
  1351. LD a2, -3 * SIZE(AO)
  1352. ADD c03, t3, c03
  1353. LD a4, -1 * SIZE(AO)
  1354. MUL a3, b4, t3
  1355. LD a3, -2 * SIZE(AO)
  1356. ADD c04, t4, c04
  1357. MUL a5, b4, t4
  1358. LD b4, 1 * SIZE(BO)
  1359. bgt L, $L52
  1360. .align 4
  1361. $L55:
  1362. ADD c05, t1, c05
  1363. ldt alpha, ALPHA
  1364. MUL a1, b1, t1
  1365. #ifndef TRMMKERNEL
  1366. blbs K, $L58
  1367. #else
  1368. blbs TMP1, $L58
  1369. #endif
  1370. .align 4
  1371. ADD c06, t2, c06
  1372. MUL a2, b1, t2
  1373. ADD c07, t3, c07
  1374. MUL a3, b1, t3
  1375. ADD c08, t4, c08
  1376. unop
  1377. MUL a4, b1, t4
  1378. LD b1, 0 * SIZE(BO)
  1379. ADD c01, t1, c01
  1380. unop
  1381. MUL a1, b2, t1
  1382. LD a1, 0 * SIZE(AO)
  1383. ADD c02, t2, c02
  1384. unop
  1385. MUL a2, b2, t2
  1386. LD a2, 1 * SIZE(AO)
  1387. ADD c03, t3, c03
  1388. unop
  1389. MUL a3, b2, t3
  1390. LD a3, 2 * SIZE(AO)
  1391. ADD c04, t4, c04
  1392. MUL a4, b2, t4
  1393. LD a4, 3 * SIZE(AO)
  1394. lda AO, 4 * SIZE(AO)
  1395. ADD c05, t1, c05
  1396. LD b2, 1 * SIZE(BO)
  1397. MUL a1, b1, t1
  1398. lda BO, 2 * SIZE(BO)
  1399. .align 4
  1400. $L58:
  1401. ADD c06, t2, c06
  1402. unop
  1403. MUL a2, b1, t2
  1404. #ifndef TRMMKERNEL
  1405. LD c09, 0 * SIZE(C1)
  1406. #else
  1407. unop
  1408. #endif
  1409. ADD c07, t3, c07
  1410. unop
  1411. MUL a3, b1, t3
  1412. #ifndef TRMMKERNEL
  1413. LD c10, 1 * SIZE(C1)
  1414. #else
  1415. unop
  1416. #endif
  1417. ADD c08, t4, c08
  1418. unop
  1419. MUL a4, b1, t4
  1420. #ifndef TRMMKERNEL
  1421. LD c11, 2 * SIZE(C1)
  1422. #else
  1423. unop
  1424. #endif
  1425. ADD c01, t1, c01
  1426. unop
  1427. MUL a1, b2, t1
  1428. #ifndef TRMMKERNEL
  1429. LD c12, 3 * SIZE(C1)
  1430. #else
  1431. unop
  1432. #endif
  1433. ADD c02, t2, c02
  1434. unop
  1435. MUL a2, b2, t2
  1436. #ifndef TRMMKERNEL
  1437. LD c13, 0 * SIZE(C2)
  1438. unop
  1439. #endif
  1440. ADD c03, t3, c03
  1441. unop
  1442. MUL a3, b2, t3
  1443. #ifndef TRMMKERNEL
  1444. LD c14, 1 * SIZE(C2)
  1445. #else
  1446. unop
  1447. #endif
  1448. ADD c04, t4, c04
  1449. unop
  1450. MUL a4, b2, t4
  1451. #ifndef TRMMKERNEL
  1452. LD c15, 2 * SIZE(C2)
  1453. #else
  1454. unop
  1455. #endif
  1456. ADD c05, t1, c05
  1457. unop
  1458. MUL alpha, c01, c01
  1459. #ifndef TRMMKERNEL
  1460. LD c16, 3 * SIZE(C2)
  1461. #else
  1462. unop
  1463. #endif
  1464. ADD c06, t2, c06
  1465. lda I, -1(I)
  1466. MUL alpha, c02, c02
  1467. unop
  1468. ADD c07, t3, c07
  1469. MUL alpha, c03, c03
  1470. ADD c08, t4, c08
  1471. MUL alpha, c04, c04
  1472. MUL alpha, c05, c05
  1473. #ifndef TRMMKERNEL
  1474. ADD c01, c09, c01
  1475. #endif
  1476. MUL alpha, c06, c06
  1477. #ifndef TRMMKERNEL
  1478. ADD c02, c10, c02
  1479. #endif
  1480. MUL alpha, c07, c07
  1481. #ifndef TRMMKERNEL
  1482. ADD c03, c11, c03
  1483. #endif
  1484. MUL alpha, c08, c08
  1485. #ifndef TRMMKERNEL
  1486. ADD c04, c12, c04
  1487. #endif
  1488. #ifndef TRMMKERNEL
  1489. ADD c05, c13, c05
  1490. #endif
  1491. ST c01, 0 * SIZE(C1)
  1492. #ifndef TRMMKERNEL
  1493. ADD c06, c14, c06
  1494. #endif
  1495. ST c02, 1 * SIZE(C1)
  1496. #ifndef TRMMKERNEL
  1497. ADD c07, c15, c07
  1498. #endif
  1499. ST c03, 2 * SIZE(C1)
  1500. #ifndef TRMMKERNEL
  1501. ADD c08, c16, c08
  1502. #endif
  1503. ST c04, 3 * SIZE(C1)
  1504. ST c05, 0 * SIZE(C2)
  1505. fclr t1
  1506. ST c06, 1 * SIZE(C2)
  1507. fclr t2
  1508. ST c07, 2 * SIZE(C2)
  1509. fclr t3
  1510. ST c08, 3 * SIZE(C2)
  1511. fclr t4
  1512. lda C1, 4 * SIZE(C1)
  1513. lda C2, 4 * SIZE(C2)
  1514. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1515. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1516. subq K, KK, TMP1
  1517. #ifdef LEFT
  1518. subq TMP1, 4, TMP1
  1519. #else
  1520. subq TMP1, 2, TMP1
  1521. #endif
  1522. sll TMP1, BASE_SHIFT + 2, TMP2
  1523. addq AO, TMP2, AO
  1524. sll TMP1, BASE_SHIFT + 1, TMP2
  1525. addq BO, TMP2, BO
  1526. #endif
  1527. #if defined(TRMMKERNEL) && defined(LEFT)
  1528. addq KK, 4, KK
  1529. #endif
  1530. bgt I, $L51
  1531. .align 4
  /*
   * $L60 - 2-row remainder tile of the two-column panel (C1 and C2).
   * Runs only when bit 1 of M is set, otherwise control goes to $L70.
   * c01/c02 accumulate the values stored to C1[0..1] and c05/c06 those
   * stored to C2[0..1]. t1..t4 carry the products of the previous step
   * and are added one step late. They are not cleared on entry here, so
   * they must already be zero (the tile code just above this label ends
   * with fclr t1..t4 - the path that skips that tile is outside this
   * excerpt, TODO confirm).
   * NOTE(review): LD, ST, ADD, MUL, SIZE, BASE_SHIFT and the register
   * aliases are defined outside this excerpt - presumably the usual
   * precision-dependent load/store/add/multiply wrappers. Confirm.
   */
  1532. $L60:
  1533. and M, 2, I
  1534. ble I, $L70
  /* First arm: plain GEMM and the TRMM variants that read AO and B
     without an offset. The k-count is K for GEMM, KK + 2 for TRMM. */
  1535. #if !defined(TRMMKERNEL) || \
  1536. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1537. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1538. #ifdef TRMMKERNEL
  /* Both LEFT arms are identical for this 2x2 tile. */
  1539. #ifdef LEFT
  1540. addq KK, 2, TMP1
  1541. #else
  1542. addq KK, 2, TMP1
  1543. #endif
  1544. #endif
  /* Preload two k-steps: a1/a2 and b1/b2 for the first, a3/a4 and
     b3/b4 for the second. The accumulators are cleared in between. */
  1545. LD a1, 0 * SIZE(AO)
  1546. fclr c01
  1547. LD a2, 1 * SIZE(AO)
  1548. fclr c05
  1549. LD a3, 2 * SIZE(AO)
  1550. fclr c02
  1551. LD a4, 3 * SIZE(AO)
  1552. fclr c06
  1553. LD b1, 0 * SIZE(B)
  /* L = k-count - 2. The main loop retires two k-steps per pass. */
  1554. #ifndef TRMMKERNEL
  1555. lda L, -2(K)
  1556. #else
  1557. lda L, -2(TMP1)
  1558. #endif
  1559. LD b2, 1 * SIZE(B)
  1560. lda AO, 2 * SIZE(AO)
  1561. LD b3, 2 * SIZE(B)
  1562. LD b4, 3 * SIZE(B)
  1563. lda BO, 2 * SIZE(B)
  1564. ble L, $L65
  /* Second arm (the remaining TRMM variants): skip the first KK k-steps.
     AO += KK * 2 elements, BO = B + KK * 2 elements (assuming BASE_SHIFT
     is log2 of the element size), and the k-count is TMP1 = K - KK. */
  1565. #else
  1566. sll KK, BASE_SHIFT + 1, TMP1
  1567. addq AO, TMP1, AO
  1568. sll KK, BASE_SHIFT + 1, TMP2
  1569. addq B, TMP2, BO
  1570. subq K, KK, TMP1
  1571. LD a1, 0 * SIZE(AO)
  1572. fclr c01
  1573. LD a2, 1 * SIZE(AO)
  1574. fclr c05
  1575. LD a3, 2 * SIZE(AO)
  1576. fclr c02
  1577. LD a4, 3 * SIZE(AO)
  1578. fclr c06
  1579. LD b1, 0 * SIZE(BO)
  1580. lda L, -2(TMP1)
  1581. LD b2, 1 * SIZE(BO)
  1582. lda AO, 2 * SIZE(AO)
  1583. LD b3, 2 * SIZE(BO)
  1584. LD b4, 3 * SIZE(BO)
  1585. lda BO, 2 * SIZE(BO)
  1586. ble L, $L65
  1587. #endif
  1588. .align 4
  /* Main loop: two k-steps per pass. The first step uses a1, a2, b1, b2
     and the second uses a3, a4, b3, b4. Each operand is reloaded for the
     next pass right after its last use. AO and BO advance by 4 elements. */
  1589. $L62:
  1590. ADD c01, t1, c01
  1591. unop
  1592. MUL a1, b1, t1
  1593. unop
  1594. ADD c02, t2, c02
  1595. lda AO, 4 * SIZE(AO)
  1596. MUL a2, b1, t2
  1597. LD b1, 2 * SIZE(BO)
  1598. ADD c05, t3, c05
  1599. lda L, -2(L)
  1600. MUL a1, b2, t3
  1601. LD a1, -2 * SIZE(AO)
  1602. ADD c06, t4, c06
  1603. unop
  1604. MUL a2, b2, t4
  1605. LD a2, -1 * SIZE(AO)
  1606. ADD c01, t1, c01
  1607. LD b2, 3 * SIZE(BO)
  1608. MUL a3, b3, t1
  1609. lda BO, 4 * SIZE(BO)
  1610. ADD c02, t2, c02
  1611. unop
  1612. MUL a4, b3, t2
  1613. LD b3, 0 * SIZE(BO)
  1614. ADD c05, t3, c05
  1615. unop
  1616. MUL a3, b4, t3
  1617. LD a3, 0 * SIZE(AO)
  1618. ADD c06, t4, c06
  1619. MUL a4, b4, t4
  1620. LD b4, 1 * SIZE(BO)
  1621. unop
  1622. LD a4, 1 * SIZE(AO)
  1623. unop
  1624. unop
  1625. bgt L, $L62
  1626. .align 4
  /* Tail: one or two k-steps are left. blbs branches when the low bit of
     the k-count is set, i.e. an odd count leaves exactly one step, which
     is finished at $L68. An even count runs one extra step first. */
  1627. $L65:
  1628. ADD c01, t1, c01
  1629. ldt alpha, ALPHA
  1630. MUL a1, b1, t1
  1631. #ifndef TRMMKERNEL
  1632. blbs K, $L68
  1633. #else
  1634. blbs TMP1, $L68
  1635. #endif
  1636. .align 4
  1637. ADD c02, t2, c02
  1638. unop
  1639. MUL a2, b1, t2
  1640. LD b1, 0 * SIZE(BO)
  1641. ADD c05, t3, c05
  1642. lda BO, 2 * SIZE(BO)
  1643. MUL a1, b2, t3
  1644. LD a1, 0 * SIZE(AO)
  1645. ADD c06, t4, c06
  1646. unop
  1647. MUL a2, b2, t4
  1648. LD a2, 1 * SIZE(AO)
  1649. ADD c01, t1, c01
  1650. LD b2, -1 * SIZE(BO)
  1651. MUL a1, b1, t1
  1652. lda AO, 2 * SIZE(AO)
  1653. .align 4
  /* Last k-step, interleaved with loading the current C1/C2 contents
     into c09..c12 (GEMM build only) and with the scaling by alpha. */
  1654. $L68:
  1655. ADD c02, t2, c02
  1656. unop
  1657. MUL a2, b1, t2
  1658. #ifndef TRMMKERNEL
  1659. LD c09, 0 * SIZE(C1)
  1660. #else
  1661. unop
  1662. #endif
  1663. ADD c05, t3, c05
  1664. unop
  1665. MUL a1, b2, t3
  1666. #ifndef TRMMKERNEL
  1667. LD c10, 1 * SIZE(C1)
  1668. #else
  1669. unop
  1670. #endif
  1671. ADD c06, t4, c06
  1672. unop
  1673. MUL a2, b2, t4
  1674. #ifndef TRMMKERNEL
  1675. LD c11, 0 * SIZE(C2)
  1676. #else
  1677. unop
  1678. #endif
  1679. ADD c01, t1, c01
  1680. unop
  1681. MUL alpha, c01, c01
  1682. #ifndef TRMMKERNEL
  1683. LD c12, 1 * SIZE(C2)
  1684. #else
  1685. unop
  1686. #endif
  1687. ADD c02, t2, c02
  1688. lda C1, 2 * SIZE(C1)  /* C1 and C2 move on early, the stores below use negative offsets */
  1689. MUL alpha, c02, c02
  1690. lda C2, 2 * SIZE(C2)
  1691. ADD c05, t3, c05
  1692. MUL alpha, c05, c05
  1693. ADD c06, t4, c06
  1694. MUL alpha, c06, c06
  /* GEMM build only: add the previously loaded C values. */
  1695. #ifndef TRMMKERNEL
  1696. ADD c01, c09, c01
  1697. ADD c02, c10, c02
  1698. ADD c05, c11, c05
  1699. ADD c06, c12, c06
  1700. #endif
  /* Store the tile and clear t1..t4 for the code that follows. */
  1701. ST c01, -2 * SIZE(C1)
  1702. fclr t1
  1703. ST c02, -1 * SIZE(C1)
  1704. fclr t2
  1705. ST c05, -2 * SIZE(C2)
  1706. fclr t3
  1707. ST c06, -1 * SIZE(C2)
  1708. fclr t4
  /* TRMM variants of the first arm: step AO and BO over the k-steps this
     tile did not read. TMP1 = K - KK - 2, then each pointer advances by
     TMP1 * 2 elements. */
  1709. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1710. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1711. subq K, KK, TMP1
  1712. #ifdef LEFT
  1713. subq TMP1, 2, TMP1
  1714. #else
  1715. subq TMP1, 2, TMP1
  1716. #endif
  1717. sll TMP1, BASE_SHIFT + 1, TMP2
  1718. addq AO, TMP2, AO
  1719. sll TMP1, BASE_SHIFT + 1, TMP2
  1720. addq BO, TMP2, BO
  1721. #endif
  /* LEFT TRMM: KK moves on by the two rows just processed. */
  1722. #if defined(TRMMKERNEL) && defined(LEFT)
  1723. addq KK, 2, KK
  1724. #endif
  1725. .align 4
  /*
   * $L70 - 1-row remainder tile of the two-column panel (C1 and C2).
   * Runs only when bit 0 of M is set, otherwise control goes to $L79.
   * Four partial sums are kept: c01 and c02 both belong to the value
   * stored at C1[0], c05 and c06 to the value stored at C2[0]. The main
   * loop feeds c01/c05 from its first k-step and c02/c06 from its second,
   * and $L78 merges the pairs. t1..t4 carry pending products and are not
   * cleared here, so they must be zero on entry.
   * NOTE(review): LD, ST, ADD, MUL, SIZE and BASE_SHIFT are defined
   * outside this excerpt - presumably precision-dependent wrappers.
   */
  1726. $L70:
  1727. and M, 1, I
  1728. ble I, $L79
  /* First arm: plain GEMM and the TRMM variants that read AO and B
     without an offset. TRMM k-count is KK + 1 (LEFT) or KK + 2. */
  1729. #if !defined(TRMMKERNEL) || \
  1730. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1731. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1732. #ifdef TRMMKERNEL
  1733. #ifdef LEFT
  1734. addq KK, 1, TMP1
  1735. #else
  1736. addq KK, 2, TMP1
  1737. #endif
  1738. #endif
  /* Preload two k-steps: a1 with b1/b2, a2 with b3/b4. */
  1739. LD a1, 0 * SIZE(AO)
  1740. fclr c01
  1741. LD a2, 1 * SIZE(AO)
  1742. fclr c05
  1743. LD b1, 0 * SIZE(B)
  1744. fclr c02
  1745. LD b2, 1 * SIZE(B)
  1746. fclr c06
  /* L = k-count - 2. The main loop retires two k-steps per pass. */
  1747. #ifndef TRMMKERNEL
  1748. lda L, -2(K)
  1749. #else
  1750. lda L, -2(TMP1)
  1751. #endif
  1752. LD b3, 2 * SIZE(B)
  1753. lda AO, 1 * SIZE(AO)
  1754. LD b4, 3 * SIZE(B)
  1755. lda BO, 2 * SIZE(B)
  1756. ble L, $L75
  /* Second arm (the remaining TRMM variants): skip the first KK k-steps.
     AO += KK elements, BO = B + KK * 2 elements (assuming BASE_SHIFT is
     log2 of the element size), and the k-count is TMP1 = K - KK. */
  1757. #else
  1758. sll KK, BASE_SHIFT + 0, TMP1
  1759. addq AO, TMP1, AO
  1760. sll KK, BASE_SHIFT + 1, TMP2
  1761. addq B, TMP2, BO
  1762. subq K, KK, TMP1
  1763. LD a1, 0 * SIZE(AO)
  1764. fclr c01
  1765. LD a2, 1 * SIZE(AO)
  1766. fclr c05
  1767. LD b1, 0 * SIZE(BO)
  1768. fclr c02
  1769. LD b2, 1 * SIZE(BO)
  1770. fclr c06
  /* NOTE(review): this arm is only compiled with TRMMKERNEL defined, so
     the K variant of the next conditional can never be selected. */
  1771. #ifndef TRMMKERNEL
  1772. lda L, -2(K)
  1773. #else
  1774. lda L, -2(TMP1)
  1775. #endif
  1776. LD b3, 2 * SIZE(BO)
  1777. lda AO, 1 * SIZE(AO)
  1778. LD b4, 3 * SIZE(BO)
  1779. lda BO, 2 * SIZE(BO)
  1780. ble L, $L75
  1781. #endif
  1782. .align 4
  /* Main loop: two k-steps per pass. a1 times b1/b2 goes to t1/t2 and on
     to c01/c05, a2 times b3/b4 goes to t3/t4 and on to c02/c06. AO moves
     by 2 elements and BO by 4 elements per pass. */
  1783. $L72:
  1784. ADD c01, t1, c01
  1785. lda L, -2(L)
  1786. MUL a1, b1, t1
  1787. LD b1, 2 * SIZE(BO)
  1788. ADD c05, t2, c05
  1789. MUL a1, b2, t2
  1790. LD a1, 1 * SIZE(AO)
  1791. LD b2, 3 * SIZE(BO)
  1792. ADD c02, t3, c02
  1793. lda AO, 2 * SIZE(AO)
  1794. MUL a2, b3, t3
  1795. LD b3, 4 * SIZE(BO)
  1796. ADD c06, t4, c06
  1797. MUL a2, b4, t4
  1798. LD a2, 0 * SIZE(AO)
  1799. LD b4, 5 * SIZE(BO)
  1800. lda BO, 4 * SIZE(BO)
  1801. unop
  1802. unop
  1803. bgt L, $L72
  1804. .align 4
  /* Tail: an odd k-count (low bit set) leaves one step, finished at
     $L78. An even count runs one extra step first. */
  1805. $L75:
  1806. ADD c01, t1, c01
  1807. ldt alpha, ALPHA
  1808. MUL a1, b1, t1
  1809. #ifndef TRMMKERNEL
  1810. blbs K, $L78
  1811. #else
  1812. blbs TMP1, $L78
  1813. #endif
  1814. .align 4
  1815. ADD c05, t2, c05
  1816. MUL a1, b2, t2
  1817. LD a1, 0 * SIZE(AO)
  1818. LD b1, 0 * SIZE(BO)
  1819. ADD c01, t1, c01
  1820. LD b2, 1 * SIZE(BO)
  1821. lda AO, 1 * SIZE(AO)
  1822. MUL a1, b1, t1
  1823. lda BO, 2 * SIZE(BO)
  1824. .align 4
  /* Last k-step and write-back. The current C1[0] and C2[0] are loaded
     into a5 and b5 in the GEMM build. */
  1825. $L78:
  1826. ADD c05, t2, c05
  1827. MUL a1, b2, t2
  1828. #ifndef TRMMKERNEL
  1829. LD a5, 0 * SIZE(C1)
  1830. #else
  1831. unop
  1832. #endif
  1833. ADD c02, t3, c02
  1834. ADD c06, t4, c06
  1835. #ifndef TRMMKERNEL
  1836. LD b5, 0 * SIZE(C2)
  1837. #else
  1838. unop
  1839. #endif
  1840. ADD c01, c02, c01  /* merge the two partial sums for C1[0] */
  1841. ADD c05, c06, c05  /* merge the two partial sums for C2[0] */
  1842. ADD c01, t1, c01
  1843. ADD c05, t2, c05
  1844. MUL alpha, c01, c01
  1845. MUL alpha, c05, c05
  1846. #ifndef TRMMKERNEL
  1847. ADD c01, a5, c01
  1848. ADD c05, b5, c05
  1849. #endif
  /* C1 and C2 are not advanced after these stores. */
  1850. ST c01, 0 * SIZE(C1)
  1851. ST c05, 0 * SIZE(C2)
  /* TRMM variants of the first arm: step AO and BO over the k-steps this
     tile did not read. TMP1 = K - KK - 1 (LEFT) or K - KK - 2, then
     AO += TMP1 elements and BO += TMP1 * 2 elements. */
  1852. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1853. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1854. subq K, KK, TMP1
  1855. #ifdef LEFT
  1856. subq TMP1, 1, TMP1
  1857. #else
  1858. subq TMP1, 2, TMP1
  1859. #endif
  1860. sll TMP1, BASE_SHIFT + 0, TMP2
  1861. addq AO, TMP2, AO
  1862. sll TMP1, BASE_SHIFT + 1, TMP2
  1863. addq BO, TMP2, BO
  1864. #endif
  /* LEFT TRMM: KK moves on by the one row just processed. */
  1865. #if defined(TRMMKERNEL) && defined(LEFT)
  1866. addq KK, 1, KK
  1867. #endif
  1868. .align 4
  /*
   * $L79 - end of the two-column panel. B is set to the position BO
   * reached, and for the non-LEFT TRMM build KK grows by 2 (the number of
   * columns in this panel). The unop instructions are padding only.
   */
  1869. $L79:
  1870. mov BO, B
  1871. #if defined(TRMMKERNEL) && !defined(LEFT)
  1872. addq KK, 2, KK
  1873. #else
  1874. unop
  1875. #endif
  1876. unop
  1877. unop
  1878. .align 4
  /*
   * $L80 - single-column panel, executed only when bit 0 of N is set
   * (otherwise control jumps to the exit at $L999). Results are written
   * through C1 only. $L91 loops over the M >> 2 tiles of four rows.
   * c01..c04 accumulate the four results of a tile and t1..t4 carry the
   * products of the previous step, added one step late.
   * NOTE(review): LD, ST, ADD, MUL, SIZE, BASE_SHIFT, OFFSET and the
   * register aliases are defined outside this excerpt - presumably
   * precision-dependent wrappers and argument aliases. Confirm.
   */
  1879. $L80:
  1880. and N, 1, J
  1881. ble J, $L999
  1882. mov C, C1
  1883. mov A, AO
  1884. #if defined(TRMMKERNEL) && defined(LEFT)
  1885. mov OFFSET, KK
  1886. #endif
  1887. sra M, 2, I  /* I = number of 4-row tiles */
  1888. ble I, $L100
  1889. .align 4
  /* Per-tile setup. First arm: plain GEMM and the TRMM variants that
     read AO and B without an offset. TRMM k-count is KK + 4 (LEFT) or
     KK + 1. */
  1890. $L91:
  1891. #if !defined(TRMMKERNEL) || \
  1892. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1893. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1894. #ifdef TRMMKERNEL
  1895. #ifdef LEFT
  1896. addq KK, 4, TMP1
  1897. #else
  1898. addq KK, 1, TMP1
  1899. #endif
  1900. #endif
  /* Preload one k-step of A (a1..a4) and four k-steps of B (b1..b4),
     clearing t1..t4 and c01..c04 in between. */
  1901. LD a1, 0 * SIZE(AO)
  1902. fclr t1
  1903. LD a2, 1 * SIZE(AO)
  1904. fclr t2
  1905. LD a3, 2 * SIZE(AO)
  1906. fclr t3
  1907. LD a4, 3 * SIZE(AO)
  1908. fclr t4
  1909. LD b1, 0 * SIZE(B)
  1910. fclr c01
  1911. LD b2, 1 * SIZE(B)
  1912. fclr c02
  1913. LD b3, 2 * SIZE(B)
  1914. fclr c03
  1915. LD b4, 3 * SIZE(B)
  1916. fclr c04
  /* L = k-count / 4, the number of passes of the unrolled loop. */
  1917. #ifndef TRMMKERNEL
  1918. sra K, 2, L
  1919. #else
  1920. sra TMP1, 2, L
  1921. #endif
  1922. mov B, BO
  1923. unop
  1924. ble L, $L95
  /* Second arm (the remaining TRMM variants): skip the first KK k-steps.
     AO += KK * 4 elements, BO = B + KK elements (assuming BASE_SHIFT is
     log2 of the element size), and the k-count is TMP1 = K - KK. */
  1925. #else
  1926. sll KK, BASE_SHIFT + 2, TMP1
  1927. addq AO, TMP1, AO
  1928. sll KK, BASE_SHIFT + 0, TMP2
  1929. addq B, TMP2, BO
  1930. subq K, KK, TMP1
  1931. LD a1, 0 * SIZE(AO)
  1932. fclr t1
  1933. LD a2, 1 * SIZE(AO)
  1934. fclr t2
  1935. LD a3, 2 * SIZE(AO)
  1936. fclr t3
  1937. LD a4, 3 * SIZE(AO)
  1938. fclr t4
  1939. LD b1, 0 * SIZE(BO)
  1940. fclr c01
  1941. LD b2, 1 * SIZE(BO)
  1942. fclr c02
  1943. LD b3, 2 * SIZE(BO)
  1944. fclr c03
  1945. LD b4, 3 * SIZE(BO)
  1946. fclr c04
  /* NOTE(review): this arm is only compiled with TRMMKERNEL defined, so
     the K variant of the next conditional can never be selected. */
  1947. #ifndef TRMMKERNEL
  1948. sra K, 2, L
  1949. #else
  1950. sra TMP1, 2, L
  1951. #endif
  1952. unop
  1953. ble L, $L95
  1954. #endif
  1955. .align 5
  /* Main loop: four k-steps per pass. Each step multiplies four A values
     by one of b1..b4 and reloads them for the following step. One pass
     consumes 16 elements from AO and 4 from BO. */
  1956. $L92:
  1957. ADD c01, t1, c01
  1958. unop
  1959. MUL a1, b1, t1
  1960. LD a1, 4 * SIZE(AO)
  1961. ADD c02, t2, c02
  1962. lda L, -1(L)
  1963. MUL a2, b1, t2
  1964. LD a2, 5 * SIZE(AO)
  1965. ADD c03, t3, c03
  1966. unop
  1967. MUL a3, b1, t3
  1968. LD a3, 6 * SIZE(AO)
  1969. ADD c04, t4, c04
  1970. MUL a4, b1, t4
  1971. LD a4, 7 * SIZE(AO)
  1972. LD b1, 4 * SIZE(BO)
  1973. ADD c01, t1, c01
  1974. unop
  1975. MUL a1, b2, t1
  1976. LD a1, 8 * SIZE(AO)
  1977. ADD c02, t2, c02
  1978. unop
  1979. MUL a2, b2, t2
  1980. LD a2, 9 * SIZE(AO)
  1981. ADD c03, t3, c03
  1982. unop
  1983. MUL a3, b2, t3
  1984. LD a3, 10 * SIZE(AO)
  1985. ADD c04, t4, c04
  1986. MUL a4, b2, t4
  1987. LD a4, 11 * SIZE(AO)
  1988. LD b2, 5 * SIZE(BO)
  1989. ADD c01, t1, c01
  1990. unop
  1991. MUL a1, b3, t1
  1992. LD a1, 12 * SIZE(AO)
  1993. ADD c02, t2, c02
  1994. unop
  1995. MUL a2, b3, t2
  1996. LD a2, 13 * SIZE(AO)
  1997. ADD c03, t3, c03
  1998. unop
  1999. MUL a3, b3, t3
  2000. LD a3, 14 * SIZE(AO)
  2001. ADD c04, t4, c04
  2002. MUL a4, b3, t4
  2003. LD a5, 15 * SIZE(AO)  /* 4th A value of the last step goes to a5 so that a4 can be reloaded early */
  2004. LD b3, 6 * SIZE(BO)
  2005. ADD c01, t1, c01
  2006. MUL a1, b4, t1
  2007. LD a1, 16 * SIZE(AO)
  2008. lda AO, 16 * SIZE(AO)
  2009. ADD c02, t2, c02
  2010. lda BO, 4 * SIZE(BO)
  2011. MUL a2, b4, t2
  2012. LD a2, 1 * SIZE(AO)
  2013. ADD c03, t3, c03
  2014. LD a4, 3 * SIZE(AO)
  2015. MUL a3, b4, t3
  2016. LD a3, 2 * SIZE(AO)
  2017. ADD c04, t4, c04
  2018. MUL a5, b4, t4
  2019. LD b4, 3 * SIZE(BO)
  2020. bgt L, $L92
  2021. .align 4
  /* L = k-count mod 4, the steps left over by the unrolled loop. */
  2022. $L95:
  2023. #ifndef TRMMKERNEL
  2024. and K, 3, L
  2025. #else
  2026. and TMP1, 3, L
  2027. #endif
  2028. ldt alpha, ALPHA
  2029. unop
  2030. ble L, $L98
  2031. .align 4
  /* Remainder loop: one k-step per pass, using only b1. */
  2032. $L96:
  2033. ADD c01, t1, c01
  2034. lda L, -1(L)
  2035. MUL a1, b1, t1
  2036. LD a1, 4 * SIZE(AO)
  2037. ADD c02, t2, c02
  2038. lda BO, 1 * SIZE(BO)
  2039. MUL a2, b1, t2
  2040. LD a2, 5 * SIZE(AO)
  2041. ADD c03, t3, c03
  2042. unop
  2043. MUL a3, b1, t3
  2044. LD a3, 6 * SIZE(AO)
  2045. ADD c04, t4, c04
  2046. MUL a4, b1, t4
  2047. LD a4, 7 * SIZE(AO)
  2048. LD b1, 0 * SIZE(BO)
  2049. lda AO, 4 * SIZE(AO)
  2050. bgt L, $L96
  2051. .align 4
  /* Write-back: add the pending products, scale by alpha and, in the
     GEMM build, add the current C1[0..3] loaded into c05..c08. */
  2052. $L98:
  2053. #ifndef TRMMKERNEL
  2054. ADD c01, t1, c01
  2055. LD c05, 0 * SIZE(C1)
  2056. ADD c02, t2, c02
  2057. LD c06, 1 * SIZE(C1)
  2058. ADD c03, t3, c03
  2059. LD c07, 2 * SIZE(C1)
  2060. ADD c04, t4, c04
  2061. LD c08, 3 * SIZE(C1)
  2062. #else
  2063. ADD c01, t1, c01
  2064. ADD c02, t2, c02
  2065. ADD c03, t3, c03
  2066. ADD c04, t4, c04
  2067. #endif
  2068. MUL alpha, c01, c01
  2069. MUL alpha, c02, c02
  2070. MUL alpha, c03, c03
  2071. MUL alpha, c04, c04
  2072. #ifndef TRMMKERNEL
  2073. ADD c01, c05, c01
  2074. ADD c02, c06, c02
  2075. ADD c03, c07, c03
  2076. ADD c04, c08, c04
  2077. #endif
  2078. ST c01, 0 * SIZE(C1)
  2079. ST c02, 1 * SIZE(C1)
  2080. ST c03, 2 * SIZE(C1)
  2081. ST c04, 3 * SIZE(C1)
  2082. lda C1, 4 * SIZE(C1)
  /* TRMM variants of the first arm: step AO and BO over the k-steps this
     tile did not read. TMP1 = K - KK - 4 (LEFT) or K - KK - 1, then
     AO += TMP1 * 4 elements and BO += TMP1 elements. */
  2083. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2084. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2085. subq K, KK, TMP1
  2086. #ifdef LEFT
  2087. subq TMP1, 4, TMP1
  2088. #else
  2089. subq TMP1, 1, TMP1
  2090. #endif
  2091. sll TMP1, BASE_SHIFT + 2, TMP2
  2092. addq AO, TMP2, AO
  2093. sll TMP1, BASE_SHIFT + 0, TMP2
  2094. addq BO, TMP2, BO
  2095. #endif
  /* LEFT TRMM: KK moves on by the four rows just processed. */
  2096. #if defined(TRMMKERNEL) && defined(LEFT)
  2097. addq KK, 4, KK
  2098. #endif
  2099. lda I, -1(I)
  2100. bgt I, $L91
  2101. .align 4
  /*
   * $L100 - 2-row remainder tile of the single-column panel.
   * Runs only when bit 1 of M is set, otherwise control goes to $L110.
   * The unrolled loop alternates between two accumulator pairs: c01/c02
   * and c03/c04 both belong to the values stored at C1[0..1], and $L108
   * merges them. t1..t4 carry the pending products.
   * NOTE(review): LD, ST, ADD, MUL, SIZE and BASE_SHIFT are defined
   * outside this excerpt - presumably precision-dependent wrappers.
   */
  2102. $L100:
  2103. and M, 2, I
  2104. unop
  2105. unop
  2106. ble I, $L110
  2107. .align 4
  /* Setup. First arm: plain GEMM and the TRMM variants that read AO and
     B without an offset. TRMM k-count is KK + 2 (LEFT) or KK + 1. */
  2108. $L101:
  2109. #if !defined(TRMMKERNEL) || \
  2110. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2111. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2112. #ifdef TRMMKERNEL
  2113. #ifdef LEFT
  2114. addq KK, 2, TMP1
  2115. #else
  2116. addq KK, 1, TMP1
  2117. #endif
  2118. #endif
  /* Preload two k-steps of A (a1/a2 and a3/a4) and four of B (b1..b4),
     clearing t1..t4 and c01..c04 in between. */
  2119. LD a1, 0 * SIZE(AO)
  2120. fclr t1
  2121. LD a2, 1 * SIZE(AO)
  2122. fclr t2
  2123. LD a3, 2 * SIZE(AO)
  2124. fclr t3
  2125. LD a4, 3 * SIZE(AO)
  2126. fclr t4
  2127. LD b1, 0 * SIZE(B)
  2128. fclr c01
  2129. LD b2, 1 * SIZE(B)
  2130. fclr c02
  2131. LD b3, 2 * SIZE(B)
  2132. fclr c03
  2133. LD b4, 3 * SIZE(B)
  2134. fclr c04
  /* L = k-count / 4, the number of passes of the unrolled loop. */
  2135. #ifndef TRMMKERNEL
  2136. sra K, 2, L
  2137. #else
  2138. sra TMP1, 2, L
  2139. #endif
  2140. mov B, BO
  2141. unop
  2142. ble L, $L105
  /* Second arm (the remaining TRMM variants): skip the first KK k-steps.
     AO += KK * 2 elements, BO = B + KK elements (assuming BASE_SHIFT is
     log2 of the element size), and the k-count is TMP1 = K - KK. */
  2143. #else
  2144. sll KK, BASE_SHIFT + 1, TMP1
  2145. addq AO, TMP1, AO
  2146. sll KK, BASE_SHIFT + 0, TMP2
  2147. addq B, TMP2, BO
  2148. subq K, KK, TMP1
  2149. LD a1, 0 * SIZE(AO)
  2150. fclr t1
  2151. LD a2, 1 * SIZE(AO)
  2152. fclr t2
  2153. LD a3, 2 * SIZE(AO)
  2154. fclr t3
  2155. LD a4, 3 * SIZE(AO)
  2156. fclr t4
  2157. LD b1, 0 * SIZE(BO)
  2158. fclr c01
  2159. LD b2, 1 * SIZE(BO)
  2160. fclr c02
  2161. LD b3, 2 * SIZE(BO)
  2162. fclr c03
  2163. LD b4, 3 * SIZE(BO)
  2164. fclr c04
  /* NOTE(review): this arm is only compiled with TRMMKERNEL defined, so
     the K variant of the next conditional can never be selected. */
  2165. #ifndef TRMMKERNEL
  2166. sra K, 2, L
  2167. #else
  2168. sra TMP1, 2, L
  2169. #endif
  2170. unop
  2171. ble L, $L105
  2172. #endif
  2173. .align 5
  /* Main loop: four k-steps per pass. Steps one and three (b1, b3) feed
     c01/c02 through t1/t2, steps two and four (b2, b4) feed c03/c04
     through t3/t4. One pass consumes 8 elements from AO and 4 from BO. */
  2174. $L102:
  2175. ADD c01, t1, c01
  2176. lda L, -1(L)
  2177. MUL a1, b1, t1
  2178. LD a1, 4 * SIZE(AO)
  2179. ADD c02, t2, c02
  2180. MUL a2, b1, t2
  2181. LD a2, 5 * SIZE(AO)
  2182. LD b1, 4 * SIZE(BO)
  2183. ADD c03, t3, c03
  2184. lda BO, 4 * SIZE(BO)
  2185. MUL a3, b2, t3
  2186. LD a3, 6 * SIZE(AO)
  2187. ADD c04, t4, c04
  2188. MUL a4, b2, t4
  2189. LD a5, 7 * SIZE(AO)  /* held in a5 so that a4 can be reloaded for the next pass before this value is used */
  2190. LD b2, 1 * SIZE(BO)
  2191. ADD c01, t1, c01
  2192. MUL a1, b3, t1
  2193. LD a1, 8 * SIZE(AO)
  2194. lda AO, 8 * SIZE(AO)
  2195. ADD c02, t2, c02
  2196. MUL a2, b3, t2
  2197. LD b3, 2 * SIZE(BO)
  2198. LD a2, 1 * SIZE(AO)
  2199. ADD c03, t3, c03
  2200. LD a4, 3 * SIZE(AO)
  2201. MUL a3, b4, t3
  2202. LD a3, 2 * SIZE(AO)
  2203. ADD c04, t4, c04
  2204. MUL a5, b4, t4
  2205. LD b4, 3 * SIZE(BO)
  2206. bgt L, $L102
  2207. .align 4
  /* L = k-count mod 4. In the GEMM build the current C1[0..1] are loaded
     into a3/a4, which the remainder loop below does not use as operands. */
  2208. $L105:
  2209. #ifndef TRMMKERNEL
  2210. and K, 3, L
  2211. #else
  2212. and TMP1, 3, L
  2213. #endif
  2214. ldt alpha, ALPHA
  2215. #ifndef TRMMKERNEL
  2216. LD a3, 0 * SIZE(C1)
  2217. LD a4, 1 * SIZE(C1)
  2218. #endif
  2219. ble L, $L108
  2220. .align 4
  /* Remainder loop: one k-step per pass into c01/c02 only. */
  2221. $L106:
  2222. ADD c01, t1, c01
  2223. lda L, -1(L)
  2224. MUL a1, b1, t1
  2225. LD a1, 2 * SIZE(AO)
  2226. ADD c02, t2, c02
  2227. MUL a2, b1, t2
  2228. LD a2, 3 * SIZE(AO)
  2229. LD b1, 1 * SIZE(BO)
  2230. lda AO, 2 * SIZE(AO)
  2231. unop
  2232. lda BO, 1 * SIZE(BO)
  2233. bgt L, $L106
  2234. .align 4
  /* Write-back: add the pending products (clearing t1..t4 on the way),
     merge the two accumulator pairs, scale by alpha, add old C (GEMM
     build only) and store. */
  2235. $L108:
  2236. ADD c01, t1, c01
  2237. fclr t1
  2238. ADD c02, t2, c02
  2239. fclr t2
  2240. ADD c03, t3, c03
  2241. fclr t3
  2242. ADD c04, t4, c04
  2243. fclr t4
  2244. ADD c01, c03, c01
  2245. ADD c02, c04, c02
  2246. MUL alpha, c01, c01
  2247. MUL alpha, c02, c02
  2248. #ifndef TRMMKERNEL
  2249. ADD c01, a3, c01
  2250. ADD c02, a4, c02
  2251. #endif
  2252. ST c01, 0 * SIZE(C1)
  2253. ST c02, 1 * SIZE(C1)
  2254. lda C1, 2 * SIZE(C1)
  /* TRMM variants of the first arm: step AO and BO over the k-steps this
     tile did not read. TMP1 = K - KK - 2 (LEFT) or K - KK - 1, then
     AO += TMP1 * 2 elements and BO += TMP1 elements. */
  2255. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2256. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2257. subq K, KK, TMP1
  2258. #ifdef LEFT
  2259. subq TMP1, 2, TMP1
  2260. #else
  2261. subq TMP1, 1, TMP1
  2262. #endif
  2263. sll TMP1, BASE_SHIFT + 1, TMP2
  2264. addq AO, TMP2, AO
  2265. sll TMP1, BASE_SHIFT + 0, TMP2
  2266. addq BO, TMP2, BO
  2267. #endif
  /* LEFT TRMM: KK moves on by the two rows just processed. */
  2268. #if defined(TRMMKERNEL) && defined(LEFT)
  2269. addq KK, 2, KK
  2270. #endif
  2271. .align 4
  /*
   * $L110 - 1-row remainder tile of the single-column panel, i.e. a
   * single output value at C1[0]. Runs only when bit 0 of M is set,
   * otherwise control goes to the exit at $L999.
   * The unrolled loop keeps four independent partial sums c01..c04 (with
   * pending products in t1..t4) which $L118 adds together. No pointer or
   * KK update follows the store - control falls through to $L999.
   * NOTE(review): LD, ST, ADD, MUL, SIZE and BASE_SHIFT are defined
   * outside this excerpt - presumably precision-dependent wrappers.
   */
  2272. $L110:
  2273. and M, 1, I
  2274. ble I, $L999
  2275. .align 4
  /* Setup. First arm: plain GEMM and the TRMM variants that read AO and
     B without an offset. TRMM k-count is KK + 1 in both LEFT arms. */
  2276. $L111:
  2277. #if !defined(TRMMKERNEL) || \
  2278. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2279. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2280. #ifdef TRMMKERNEL
  2281. #ifdef LEFT
  2282. addq KK, 1, TMP1
  2283. #else
  2284. addq KK, 1, TMP1
  2285. #endif
  2286. #endif
  /* Preload four k-steps of A (a1..a4) and of B (b1..b4), clearing
     t1..t4 and c01..c04 in between. */
  2287. LD a1, 0 * SIZE(AO)
  2288. fclr t1
  2289. LD a2, 1 * SIZE(AO)
  2290. fclr t2
  2291. LD a3, 2 * SIZE(AO)
  2292. fclr t3
  2293. LD a4, 3 * SIZE(AO)
  2294. fclr t4
  2295. LD b1, 0 * SIZE(B)
  2296. fclr c01
  2297. LD b2, 1 * SIZE(B)
  2298. fclr c02
  2299. LD b3, 2 * SIZE(B)
  2300. fclr c03
  2301. LD b4, 3 * SIZE(B)
  2302. fclr c04
  /* L = k-count / 4, the number of passes of the unrolled loop. */
  2303. #ifndef TRMMKERNEL
  2304. sra K, 2, L
  2305. #else
  2306. sra TMP1, 2, L
  2307. #endif
  2308. mov B, BO
  2309. unop
  2310. ble L, $L115
  /* Second arm (the remaining TRMM variants): skip the first KK k-steps.
     AO += KK elements, BO = B + KK elements (assuming BASE_SHIFT is log2
     of the element size), and the k-count is TMP1 = K - KK. */
  2311. #else
  2312. sll KK, BASE_SHIFT + 0, TMP1
  2313. addq AO, TMP1, AO
  2314. sll KK, BASE_SHIFT + 0, TMP2
  2315. addq B, TMP2, BO
  2316. subq K, KK, TMP1
  2317. LD a1, 0 * SIZE(AO)
  2318. fclr t1
  2319. LD a2, 1 * SIZE(AO)
  2320. fclr t2
  2321. LD a3, 2 * SIZE(AO)
  2322. fclr t3
  2323. LD a4, 3 * SIZE(AO)
  2324. fclr t4
  2325. LD b1, 0 * SIZE(BO)
  2326. fclr c01
  2327. LD b2, 1 * SIZE(BO)
  2328. fclr c02
  2329. LD b3, 2 * SIZE(BO)
  2330. fclr c03
  2331. LD b4, 3 * SIZE(BO)
  2332. fclr c04
  /* NOTE(review): this arm is only compiled with TRMMKERNEL defined, so
     the K variant of the next conditional can never be selected. */
  2333. #ifndef TRMMKERNEL
  2334. sra K, 2, L
  2335. #else
  2336. sra TMP1, 2, L
  2337. #endif
  2338. unop
  2339. ble L, $L115
  2340. #endif
  2341. .align 4
  /* Main loop: four k-steps per pass, one per accumulator (a1*b1 to c01,
     a2*b2 to c02, a3*b3 to c03, a4*b4 to c04). AO and BO each advance by
     4 elements per pass. */
  2342. $L112:
  2343. ADD c01, t1, c01
  2344. MUL a1, b1, t1
  2345. LD a1, 4 * SIZE(AO)
  2346. LD b1, 4 * SIZE(BO)
  2347. ADD c02, t2, c02
  2348. MUL a2, b2, t2
  2349. LD a2, 5 * SIZE(AO)
  2350. LD b2, 5 * SIZE(BO)
  2351. ADD c03, t3, c03
  2352. MUL a3, b3, t3
  2353. LD a3, 6 * SIZE(AO)
  2354. LD b3, 6 * SIZE(BO)
  2355. ADD c04, t4, c04
  2356. MUL a4, b4, t4
  2357. LD a4, 7 * SIZE(AO)
  2358. LD b4, 7 * SIZE(BO)
  2359. lda L, -1(L)
  2360. lda AO, 4 * SIZE(AO)
  2361. lda BO, 4 * SIZE(BO)
  2362. bgt L, $L112
  2363. .align 4
  /* L = k-count mod 4. In the GEMM build the current C1[0] is loaded
     into a2, which the remainder loop below does not use as an operand. */
  2364. $L115:
  2365. #ifndef TRMMKERNEL
  2366. and K, 3, L
  2367. #else
  2368. and TMP1, 3, L
  2369. #endif
  2370. ldt alpha, ALPHA
  2371. #ifndef TRMMKERNEL
  2372. LD a2, 0 * SIZE(C1)
  2373. #endif
  2374. ble L, $L118
  2375. .align 4
  /* Remainder loop: one k-step per pass into c01 only. */
  2376. $L116:
  2377. ADD c01, t1, c01
  2378. MUL a1, b1, t1
  2379. LD a1, 1 * SIZE(AO)
  2380. LD b1, 1 * SIZE(BO)
  2381. lda L, -1(L)
  2382. lda AO, 1 * SIZE(AO)
  2383. lda BO, 1 * SIZE(BO)
  2384. bgt L, $L116
  2385. .align 4
  /* Write-back: add the pending products, sum the four partial results
     into c01, scale by alpha, add old C (GEMM build only) and store. */
  2386. $L118:
  2387. ADD c01, t1, c01
  2388. ADD c02, t2, c02
  2389. ADD c03, t3, c03
  2390. ADD c04, t4, c04
  2391. ADD c01, c02, c01
  2392. ADD c03, c04, c03
  2393. ADD c01, c03, c01
  2394. MUL alpha, c01, c01
  2395. #ifndef TRMMKERNEL
  2396. ADD c01, a2, c01
  2397. #endif
  2398. ST c01, 0 * SIZE(C1)
  2399. .align 4
  /*
   * $L999 - common exit. Reloads $f2..$f9 from the first 64 bytes of the
   * stack frame (presumably saved there by the prologue, which is outside
   * this excerpt - confirm), clears $0, releases STACKSIZE bytes of stack
   * and returns. EPILOGUE is a macro defined outside this excerpt.
   */
  2400. $L999:
  2401. ldt $f2, 0($sp)
  2402. ldt $f3, 8($sp)
  2403. ldt $f4, 16($sp)
  2404. ldt $f5, 24($sp)
  2405. ldt $f6, 32($sp)
  2406. ldt $f7, 40($sp)
  2407. ldt $f8, 48($sp)
  2408. ldt $f9, 56($sp)
  2409. clr $0  /* $0 = 0 - the integer return-value register under the standard Alpha calling convention */
  2410. lda $sp, STACKSIZE($sp)
  2411. ret
  2412. EPILOGUE