You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel_1x4.S 33 kB


  1. /*********************************************************************/
  2. /* Copyright 2005-2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Software-prefetch tuning for the A operand.  The unrolled loops issue  */
  /* "prefetch [AO + (APREFETCHSIZE + n) * SIZE], APREFETCH_CATEGORY", so   */
  /* APREFETCHSIZE is a look-ahead distance, counted in SIZE-sized elements */
  /* past AO, and APREFETCH_CATEGORY is the second operand handed to the    */
  /* prefetch instruction (its function code).                              */
  40. #define APREFETCHSIZE 24
  41. #define APREFETCH_CATEGORY 0
  /* Integer register roles used throughout this kernel.                    */
  /* M, N, K: problem dimensions.  M seeds the inner tile counter I,        */
  /* N >> 2 seeds the outer panel counter J, and K (shifted or masked)      */
  /* seeds the reduction counter L.                                         */
  42. #define M %i0
  43. #define N %i1
  44. #define K %i2
  /* Base pointers of the A, B and C operands.  The prologue reloads B and  */
  /* C from the stack (and A as well on 32-bit DOUBLE builds).              */
  45. #define A %i5
  46. #define B %i3
  47. #define C %i4
  /* LDC is loaded from the stack and shifted left by ZBASE_SHIFT before    */
  /* it is added to the C pointers.                                         */
  48. #define LDC %o0
  /* AO / BO: running pointers that walk A and B inside the loops.          */
  49. #define AO %o1
  50. #define BO %o2
  /* Loop counters: I counts down from M, J counts panels of N, L counts    */
  /* iterations over the K dimension.                                       */
  51. #define I %o3
  52. #define J %o4
  53. #define L %o5
  /* BB is used only as the address of "prefetch [BB + 0 * SIZE]".  It      */
  /* starts at B + (K << (ZBASE_SHIFT + 2)) and advances by 32 * SIZE on    */
  /* every pass through .LL12.                                              */
  54. #define BB %o7
  /* C1..C4: four output pointers spaced LDC apart (C, C + LDC, ...).       */
  55. #define C1 %l0
  56. #define C2 %l1
  57. #define C3 %l2
  58. #define C4 %l3
  /* OFFSET and KK are only read or written under TRMMKERNEL conditionals.  */
  59. #define OFFSET %l4
  60. #define KK %l5
  /* Scratch registers for address arithmetic.                              */
  61. #define TEMP1 %l6
  62. #define TEMP2 %l7
  /* Floating-point register map.  Every operand has two spellings:         */
  /*   - a register name (c01, a1, b1, ALPHA_R, ...) used by ordinary       */
  /*     instructions such as LDF / STF / FADD / FMUL / FMOV, and           */
  /*   - a bare number (cc01, aa1, bb1, alpha_r, ...) that is passed to the */
  /*     FMADD / FNMSUB / FCLR macros.                                      */
  /* The two spellings of one operand must always refer to the same         */
  /* register; keep both tables in sync when editing.                       */
  63. #ifdef DOUBLE
  /* Double precision: only even-numbered registers are used.               */
  /* c01..c16: accumulators (cleared by FCLR, updated by FMADD1..FMADD4).   */
  64. #define c01 %f0
  65. #define c02 %f2
  66. #define c03 %f4
  67. #define c04 %f6
  68. #define c05 %f8
  69. #define c06 %f10
  70. #define c07 %f12
  71. #define c08 %f14
  72. #define c09 %f16
  73. #define c10 %f18
  74. #define c11 %f20
  75. #define c12 %f22
  76. #define c13 %f24
  77. #define c14 %f26
  78. #define c15 %f28
  79. #define c16 %f30
  /* a1..a5: values loaded from AO (also reused as temporaries when the     */
  /* results are combined with C).                                          */
  80. #define a1 %f32
  81. #define a2 %f34
  82. #define a3 %f36
  83. #define a4 %f38
  84. #define a5 %f40
  /* b1..b9: values loaded from BO.                                         */
  85. #define b1 %f42
  86. #define b2 %f44
  87. #define b3 %f46
  88. #define b4 %f48
  89. #define b5 %f50
  90. #define b6 %f52
  91. #define b7 %f54
  92. #define b8 %f56
  93. #define b9 %f58
  /* Scaling factor alpha, split into two registers by the prologue.        */
  94. #define ALPHA_R %f60
  95. #define ALPHA_I %f62
  /* Numeric spellings of the accumulators: identical to the register       */
  /* numbers %f0..%f30 above.                                               */
  96. #define cc01 0
  97. #define cc02 2
  98. #define cc03 4
  99. #define cc04 6
  100. #define cc05 8
  101. #define cc06 10
  102. #define cc07 12
  103. #define cc08 14
  104. #define cc09 16
  105. #define cc10 18
  106. #define cc11 20
  107. #define cc12 22
  108. #define cc13 24
  109. #define cc14 26
  110. #define cc15 28
  111. #define cc16 30
  /* Numeric spellings of %f32..%f62: odd values 1, 3, ..., 31 rather than  */
  /* the register numbers.  NOTE(review): this matches the SPARC V9 5-bit   */
  /* encoding of the upper double registers, so the macros in common.h      */
  /* presumably take encoded register fields -- confirm against common.h.   */
  112. #define aa1 1
  113. #define aa2 3
  114. #define aa3 5
  115. #define aa4 7
  116. #define aa5 9
  117. #define bb1 11
  118. #define bb2 13
  119. #define bb3 15
  120. #define bb4 17
  121. #define bb5 19
  122. #define bb6 21
  123. #define bb7 23
  124. #define bb8 25
  125. #define bb9 27
  126. #define alpha_r 29
  127. #define alpha_i 31
  128. #else
  /* Single precision: consecutive registers %f0..%f31, same roles as in    */
  /* the DOUBLE branch.                                                     */
  /* c01..c16: accumulators.                                                */
  129. #define c01 %f0
  130. #define c02 %f1
  131. #define c03 %f2
  132. #define c04 %f3
  133. #define c05 %f4
  134. #define c06 %f5
  135. #define c07 %f6
  136. #define c08 %f7
  137. #define c09 %f8
  138. #define c10 %f9
  139. #define c11 %f10
  140. #define c12 %f11
  141. #define c13 %f12
  142. #define c14 %f13
  143. #define c15 %f14
  144. #define c16 %f15
  /* a1..a5: values loaded from AO.                                         */
  145. #define a1 %f16
  146. #define a2 %f17
  147. #define a3 %f18
  148. #define a4 %f19
  149. #define a5 %f20
  /* b1..b9: values loaded from BO.                                         */
  150. #define b1 %f21
  151. #define b2 %f22
  152. #define b3 %f23
  153. #define b4 %f24
  154. #define b5 %f25
  155. #define b6 %f26
  156. #define b7 %f27
  157. #define b8 %f28
  158. #define b9 %f29
  /* Scaling factor alpha.                                                  */
  159. #define ALPHA_R %f30
  160. #define ALPHA_I %f31
  /* Numeric spellings: here each value equals the register number of the   */
  /* matching name above (cc01 = 0 for %f0 ... alpha_i = 31 for %f31).      */
  161. #define cc01 0
  162. #define cc02 1
  163. #define cc03 2
  164. #define cc04 3
  165. #define cc05 4
  166. #define cc06 5
  167. #define cc07 6
  168. #define cc08 7
  169. #define cc09 8
  170. #define cc10 9
  171. #define cc11 10
  172. #define cc12 11
  173. #define cc13 12
  174. #define cc14 13
  175. #define cc15 14
  176. #define cc16 15
  177. #define aa1 16
  178. #define aa2 17
  179. #define aa3 18
  180. #define aa4 19
  181. #define aa5 20
  182. #define bb1 21
  183. #define bb2 22
  184. #define bb3 23
  185. #define bb4 24
  186. #define bb5 25
  187. #define bb6 26
  188. #define bb7 27
  189. #define bb8 28
  190. #define bb9 29
  191. #define alpha_r 30
  192. #define alpha_i 31
  193. #endif
  /* Select the multiply-accumulate flavour of the four partial products.   */
  /* In the loops the macros are always applied as                          */
  /*   FMADD1 (aa1, bb1, ...)   first A value  * first B value              */
  /*   FMADD2 (aa2, bb1, ...)   second A value * first B value              */
  /*   FMADD3 (aa1, bb2, ...)   first A value  * second B value             */
  /*   FMADD4 (aa2, bb2, ...)   second A value * second B value             */
  /* and afterwards the FMADD1 and FMADD4 accumulators are summed, as are   */
  /* the FMADD2 and FMADD3 accumulators.  Choosing FNMSUB for a slot flips  */
  /* the sign with which that product enters the sum.                       */
  /* NOTE(review): presumably the first/second value of each pair is the    */
  /* real/imaginary part of an interleaved complex element, and FNMSUB      */
  /* subtracts the product from the accumulator -- both come from           */
  /* conventions outside this file (common.h); confirm there.               */
  /* Variants NN/NT/TN/TT: only the FMADD4 product is negated.              */
  194. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  195. #define FMADD1 FMADD
  196. #define FMADD2 FMADD
  197. #define FMADD3 FMADD
  198. #define FMADD4 FNMSUB
  /* Variants NR/NC/TR/TC: only the FMADD3 product is negated.              */
  199. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  200. #define FMADD1 FMADD
  201. #define FMADD2 FMADD
  202. #define FMADD3 FNMSUB
  203. #define FMADD4 FMADD
  /* Variants RN/RT/CN/CT: only the FMADD2 product is negated.              */
  204. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  205. #define FMADD1 FMADD
  206. #define FMADD2 FNMSUB
  207. #define FMADD3 FMADD
  208. #define FMADD4 FMADD
  /* Any other variant: FMADD2, FMADD3 and FMADD4 products are all negated. */
  209. #else
  210. #define FMADD1 FMADD
  211. #define FMADD2 FNMSUB
  212. #define FMADD3 FNMSUB
  213. #define FMADD4 FNMSUB
  214. #endif
  /* Declare global registers %g2 and %g3 as scratch for the assembler.     */
  /* NOTE(review): no use of %g2/%g3 is visible in this part of the file;   */
  /* presumably they are used by macros from common.h or further down.      */
  215. .register %g2, #scratch
  216. .register %g3, #scratch
  217. PROLOGUE
  218. SAVESP
  219. #ifndef __64BIT__
  220. #ifdef DOUBLE
  221. st %i3, [%sp + STACK_START + 16]
  222. st %i4, [%sp + STACK_START + 20]
  223. st %i5, [%sp + STACK_START + 24]
  224. ld [%sp + STACK_START + 32], A
  225. ld [%sp + STACK_START + 36], B
  226. ld [%sp + STACK_START + 40], C
  227. ld [%sp + STACK_START + 44], LDC
  228. #ifdef TRMMKERNEL
  229. ld [%sp + STACK_START + 48], OFFSET
  230. #endif
  231. ldd [%sp + STACK_START + 16], ALPHA_R
  232. ldd [%sp + STACK_START + 24], ALPHA_I
  233. #else
  234. st %i3, [%sp + STACK_START + 16]
  235. st %i4, [%sp + STACK_START + 20]
  236. ld [%sp + STACK_START + 28], B
  237. ld [%sp + STACK_START + 32], C
  238. ld [%sp + STACK_START + 36], LDC
  239. #ifdef TRMMKERNEL
  240. ld [%sp + STACK_START + 40], OFFSET
  241. #endif
  242. ld [%sp + STACK_START + 16], ALPHA_R
  243. ld [%sp + STACK_START + 20], ALPHA_I
  244. #endif
  245. #else
  246. ldx [%sp + STACK_START + 56], B
  247. ldx [%sp + STACK_START + 64], C
  248. ldx [%sp + STACK_START + 72], LDC
  249. #ifdef TRMMKERNEL
  250. ldx [%sp + STACK_START + 80], OFFSET
  251. #endif
  252. #ifdef DOUBLE
  253. FMOV %f6, ALPHA_R
  254. FMOV %f8, ALPHA_I
  255. #else
  256. FMOV %f7, ALPHA_R
  257. FMOV %f9, ALPHA_I
  258. #endif
  259. #endif
  260. #if defined(TRMMKERNEL) && !defined(LEFT)
  261. neg OFFSET, KK
  262. #endif
  263. cmp M, 0
  264. ble,pn %icc, .LL999
  265. nop
  266. sra N, 2, J
  267. cmp J, 0
  268. ble,pn %icc, .LL20
  269. sll LDC, ZBASE_SHIFT, LDC
  270. .LL11:
  271. mov C, C1
  272. add C, LDC, C2
  273. add C2, LDC, C3
  274. add C3, LDC, C4
  275. add C4, LDC, C
  276. sll K, ZBASE_SHIFT + 2, BB
  277. #if defined(TRMMKERNEL) && defined(LEFT)
  278. mov OFFSET, KK
  279. #endif
  280. mov A, AO
  281. mov M, I
  282. add B, BB, BB
  283. .align 4
  284. .LL12:
  285. prefetch [BB + 0 * SIZE], 1
  286. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  287. mov B, BO
  288. #else
  289. sll KK, ZBASE_SHIFT + 0, TEMP1
  290. sll KK, ZBASE_SHIFT + 2, TEMP2
  291. add AO, TEMP1, AO
  292. add B, TEMP2, BO
  293. #endif
  294. LDF [AO + 0 * SIZE], a1
  295. FCLR (cc01)
  296. LDF [AO + 1 * SIZE], a2
  297. FCLR (cc05)
  298. LDF [AO + 8 * SIZE], a5
  299. FCLR (cc09)
  300. LDF [BO + 0 * SIZE], b1
  301. FCLR (cc13)
  302. LDF [BO + 1 * SIZE], b2
  303. FCLR (cc02)
  304. LDF [BO + 2 * SIZE], b3
  305. FCLR (cc06)
  306. LDF [BO + 3 * SIZE], b4
  307. FCLR (cc10)
  308. LDF [BO + 4 * SIZE], b5
  309. FCLR (cc14)
  310. LDF [BO + 5 * SIZE], b6
  311. FCLR (cc03)
  312. LDF [BO + 6 * SIZE], b7
  313. FCLR (cc07)
  314. LDF [BO + 7 * SIZE], b8
  315. FCLR (cc11)
  316. LDF [BO + 8 * SIZE], b9
  317. FCLR (cc15)
  318. prefetch [C1 + 1 * SIZE], 3
  319. FCLR (cc04)
  320. prefetch [C2 + 2 * SIZE], 3
  321. FCLR (cc08)
  322. prefetch [C3 + 1 * SIZE], 3
  323. FCLR (cc12)
  324. prefetch [C4 + 2 * SIZE], 3
  325. FCLR (cc16)
  326. #ifndef TRMMKERNEL
  327. sra K, 3, L
  328. #else
  329. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  330. sub K, KK, L
  331. #elif defined(LEFT)
  332. add KK, 1, L
  333. #else
  334. add KK, 4, L
  335. #endif
  336. sra L, 3, L
  337. #endif
  338. cmp L, 0
  339. ble,pn %icc, .LL15
  340. add BB, 32 * SIZE, BB
  341. .align 4
  342. .LL13:
  343. FMADD1 (aa1, bb1, cc01, cc01)
  344. FMADD2 (aa2, bb1, cc02, cc02)
  345. FMADD3 (aa1, bb2, cc03, cc03)
  346. FMADD4 (aa2, bb2, cc04, cc04)
  347. FMADD1 (aa1, bb3, cc05, cc05)
  348. LDF [BO + 16 * SIZE], b1
  349. FMADD2 (aa2, bb3, cc06, cc06)
  350. LDF [BO + 9 * SIZE], b2
  351. FMADD3 (aa1, bb4, cc07, cc07)
  352. LDF [BO + 10 * SIZE], b3
  353. FMADD4 (aa2, bb4, cc08, cc08)
  354. LDF [BO + 11 * SIZE], b4
  355. FMADD1 (aa1, bb5, cc09, cc09)
  356. LDF [AO + 2 * SIZE], a3
  357. FMADD2 (aa2, bb5, cc10, cc10)
  358. LDF [AO + 3 * SIZE], a4
  359. FMADD3 (aa1, bb6, cc11, cc11)
  360. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  361. FMADD4 (aa2, bb6, cc12, cc12)
  362. nop
  363. FMADD1 (aa1, bb7, cc13, cc13)
  364. LDF [BO + 12 * SIZE], b5
  365. FMADD2 (aa2, bb7, cc14, cc14)
  366. LDF [BO + 13 * SIZE], b6
  367. FMADD3 (aa1, bb8, cc15, cc15)
  368. LDF [BO + 14 * SIZE], b7
  369. FMADD4 (aa2, bb8, cc16, cc16)
  370. LDF [BO + 15 * SIZE], b8
  371. FMADD1 (aa3, bb9, cc01, cc01)
  372. FMADD2 (aa4, bb9, cc02, cc02)
  373. FMADD3 (aa3, bb2, cc03, cc03)
  374. FMADD4 (aa4, bb2, cc04, cc04)
  375. FMADD1 (aa3, bb3, cc05, cc05)
  376. LDF [BO + 24 * SIZE], b9
  377. FMADD2 (aa4, bb3, cc06, cc06)
  378. LDF [BO + 17 * SIZE], b2
  379. FMADD3 (aa3, bb4, cc07, cc07)
  380. LDF [BO + 18 * SIZE], b3
  381. FMADD4 (aa4, bb4, cc08, cc08)
  382. LDF [BO + 19 * SIZE], b4
  383. FMADD1 (aa3, bb5, cc09, cc09)
  384. LDF [AO + 4 * SIZE], a1
  385. FMADD2 (aa4, bb5, cc10, cc10)
  386. LDF [AO + 5 * SIZE], a2
  387. FMADD3 (aa3, bb6, cc11, cc11)
  388. add L, -1, L
  389. FMADD4 (aa4, bb6, cc12, cc12)
  390. nop
  391. FMADD1 (aa3, bb7, cc13, cc13)
  392. LDF [BO + 20 * SIZE], b5
  393. FMADD2 (aa4, bb7, cc14, cc14)
  394. LDF [BO + 21 * SIZE], b6
  395. FMADD3 (aa3, bb8, cc15, cc15)
  396. LDF [BO + 22 * SIZE], b7
  397. FMADD4 (aa4, bb8, cc16, cc16)
  398. LDF [BO + 23 * SIZE], b8
  399. FMADD1 (aa1, bb1, cc01, cc01)
  400. FMADD2 (aa2, bb1, cc02, cc02)
  401. FMADD3 (aa1, bb2, cc03, cc03)
  402. FMADD4 (aa2, bb2, cc04, cc04)
  403. FMADD1 (aa1, bb3, cc05, cc05)
  404. LDF [BO + 32 * SIZE], b1
  405. FMADD2 (aa2, bb3, cc06, cc06)
  406. LDF [BO + 25 * SIZE], b2
  407. FMADD3 (aa1, bb4, cc07, cc07)
  408. LDF [BO + 26 * SIZE], b3
  409. FMADD4 (aa2, bb4, cc08, cc08)
  410. LDF [BO + 27 * SIZE], b4
  411. FMADD1 (aa1, bb5, cc09, cc09)
  412. LDF [AO + 6 * SIZE], a3
  413. FMADD2 (aa2, bb5, cc10, cc10)
  414. LDF [AO + 7 * SIZE], a4
  415. FMADD3 (aa1, bb6, cc11, cc11)
  416. nop
  417. FMADD4 (aa2, bb6, cc12, cc12)
  418. nop
  419. FMADD1 (aa1, bb7, cc13, cc13)
  420. LDF [BO + 28 * SIZE], b5
  421. FMADD2 (aa2, bb7, cc14, cc14)
  422. LDF [BO + 29 * SIZE], b6
  423. FMADD3 (aa1, bb8, cc15, cc15)
  424. LDF [BO + 30 * SIZE], b7
  425. FMADD4 (aa2, bb8, cc16, cc16)
  426. LDF [BO + 31 * SIZE], b8
  427. FMADD1 (aa3, bb9, cc01, cc01)
  428. FMADD2 (aa4, bb9, cc02, cc02)
  429. FMADD3 (aa3, bb2, cc03, cc03)
  430. FMADD4 (aa4, bb2, cc04, cc04)
  431. FMADD1 (aa3, bb3, cc05, cc05)
  432. LDF [BO + 40 * SIZE], b9
  433. FMADD2 (aa4, bb3, cc06, cc06)
  434. LDF [BO + 33 * SIZE], b2
  435. FMADD3 (aa3, bb4, cc07, cc07)
  436. LDF [BO + 34 * SIZE], b3
  437. FMADD4 (aa4, bb4, cc08, cc08)
  438. LDF [BO + 35 * SIZE], b4
  439. FMADD1 (aa3, bb5, cc09, cc09)
  440. LDF [AO + 16 * SIZE], a1 /****/
  441. FMADD2 (aa4, bb5, cc10, cc10)
  442. LDF [AO + 9 * SIZE], a2
  443. FMADD3 (aa3, bb6, cc11, cc11)
  444. nop
  445. FMADD4 (aa4, bb6, cc12, cc12)
  446. nop
  447. FMADD1 (aa3, bb7, cc13, cc13)
  448. LDF [BO + 36 * SIZE], b5
  449. FMADD2 (aa4, bb7, cc14, cc14)
  450. LDF [BO + 37 * SIZE], b6
  451. FMADD3 (aa3, bb8, cc15, cc15)
  452. LDF [BO + 38 * SIZE], b7
  453. FMADD4 (aa4, bb8, cc16, cc16)
  454. LDF [BO + 39 * SIZE], b8
  455. FMADD1 (aa5, bb1, cc01, cc01)
  456. FMADD2 (aa2, bb1, cc02, cc02)
  457. FMADD3 (aa5, bb2, cc03, cc03)
  458. FMADD4 (aa2, bb2, cc04, cc04)
  459. FMADD1 (aa5, bb3, cc05, cc05)
  460. LDF [BO + 48 * SIZE], b1
  461. FMADD2 (aa2, bb3, cc06, cc06)
  462. LDF [BO + 41 * SIZE], b2
  463. FMADD3 (aa5, bb4, cc07, cc07)
  464. LDF [BO + 42 * SIZE], b3
  465. FMADD4 (aa2, bb4, cc08, cc08)
  466. LDF [BO + 43 * SIZE], b4
  467. FMADD1 (aa5, bb5, cc09, cc09)
  468. LDF [AO + 10 * SIZE], a3
  469. FMADD2 (aa2, bb5, cc10, cc10)
  470. LDF [AO + 11 * SIZE], a4
  471. FMADD3 (aa5, bb6, cc11, cc11)
  472. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  473. FMADD4 (aa2, bb6, cc12, cc12)
  474. nop
  475. FMADD1 (aa5, bb7, cc13, cc13)
  476. LDF [BO + 44 * SIZE], b5
  477. FMADD2 (aa2, bb7, cc14, cc14)
  478. LDF [BO + 45 * SIZE], b6
  479. FMADD3 (aa5, bb8, cc15, cc15)
  480. LDF [BO + 46 * SIZE], b7
  481. FMADD4 (aa2, bb8, cc16, cc16)
  482. LDF [BO + 47 * SIZE], b8
  483. FMADD1 (aa3, bb9, cc01, cc01)
  484. FMADD2 (aa4, bb9, cc02, cc02)
  485. FMADD3 (aa3, bb2, cc03, cc03)
  486. FMADD4 (aa4, bb2, cc04, cc04)
  487. FMADD1 (aa3, bb3, cc05, cc05)
  488. LDF [BO + 56 * SIZE], b9
  489. FMADD2 (aa4, bb3, cc06, cc06)
  490. LDF [BO + 49 * SIZE], b2
  491. FMADD3 (aa3, bb4, cc07, cc07)
  492. LDF [BO + 50 * SIZE], b3
  493. FMADD4 (aa4, bb4, cc08, cc08)
  494. LDF [BO + 51 * SIZE], b4
  495. FMADD1 (aa3, bb5, cc09, cc09)
  496. LDF [AO + 12 * SIZE], a5
  497. FMADD2 (aa4, bb5, cc10, cc10)
  498. LDF [AO + 13 * SIZE], a2
  499. FMADD3 (aa3, bb6, cc11, cc11)
  500. cmp L, 0
  501. FMADD4 (aa4, bb6, cc12, cc12)
  502. nop
  503. FMADD1 (aa3, bb7, cc13, cc13)
  504. LDF [BO + 52 * SIZE], b5
  505. FMADD2 (aa4, bb7, cc14, cc14)
  506. LDF [BO + 53 * SIZE], b6
  507. FMADD3 (aa3, bb8, cc15, cc15)
  508. LDF [BO + 54 * SIZE], b7
  509. FMADD4 (aa4, bb8, cc16, cc16)
  510. LDF [BO + 55 * SIZE], b8
  511. FMADD1 (aa5, bb1, cc01, cc01)
  512. FMADD2 (aa2, bb1, cc02, cc02)
  513. FMADD3 (aa5, bb2, cc03, cc03)
  514. FMADD4 (aa2, bb2, cc04, cc04)
  515. FMADD1 (aa5, bb3, cc05, cc05)
  516. LDF [BO + 64 * SIZE], b1
  517. FMADD2 (aa2, bb3, cc06, cc06)
  518. LDF [BO + 57 * SIZE], b2
  519. FMADD3 (aa5, bb4, cc07, cc07)
  520. LDF [BO + 58 * SIZE], b3
  521. FMADD4 (aa2, bb4, cc08, cc08)
  522. LDF [BO + 59 * SIZE], b4
  523. FMADD1 (aa5, bb5, cc09, cc09)
  524. LDF [AO + 14 * SIZE], a3
  525. FMADD2 (aa2, bb5, cc10, cc10)
  526. LDF [AO + 15 * SIZE], a4
  527. FMADD3 (aa5, bb6, cc11, cc11)
  528. add BO, 64 * SIZE, BO
  529. FMADD4 (aa2, bb6, cc12, cc12)
  530. add AO, 16 * SIZE, AO
  531. FMADD1 (aa5, bb7, cc13, cc13)
  532. LDF [BO - 4 * SIZE], b5
  533. FMADD2 (aa2, bb7, cc14, cc14)
  534. LDF [BO - 3 * SIZE], b6
  535. FMADD3 (aa5, bb8, cc15, cc15)
  536. LDF [BO - 2 * SIZE], b7
  537. FMADD4 (aa2, bb8, cc16, cc16)
  538. LDF [BO - 1 * SIZE], b8
  539. FMADD1 (aa3, bb9, cc01, cc01)
  540. FMADD2 (aa4, bb9, cc02, cc02)
  541. FMADD3 (aa3, bb2, cc03, cc03)
  542. FMADD4 (aa4, bb2, cc04, cc04)
  543. FMADD1 (aa3, bb3, cc05, cc05)
  544. LDF [BO + 8 * SIZE], b9
  545. FMADD2 (aa4, bb3, cc06, cc06)
  546. LDF [BO + 1 * SIZE], b2
  547. FMADD3 (aa3, bb4, cc07, cc07)
  548. LDF [BO + 2 * SIZE], b3
  549. FMADD4 (aa4, bb4, cc08, cc08)
  550. LDF [BO + 3 * SIZE], b4
  551. FMADD1 (aa3, bb5, cc09, cc09)
  552. LDF [AO + 8 * SIZE], a5 /****/
  553. FMADD2 (aa4, bb5, cc10, cc10)
  554. LDF [AO + 1 * SIZE], a2
  555. FMADD3 (aa3, bb6, cc11, cc11)
  556. FMADD4 (aa4, bb6, cc12, cc12)
  557. FMADD1 (aa3, bb7, cc13, cc13)
  558. LDF [BO + 4 * SIZE], b5
  559. FMADD2 (aa4, bb7, cc14, cc14)
  560. LDF [BO + 5 * SIZE], b6
  561. FMADD3 (aa3, bb8, cc15, cc15)
  562. LDF [BO + 6 * SIZE], b7
  563. FMADD4 (aa4, bb8, cc16, cc16)
  564. ble,pn %icc, .LL15
  565. LDF [BO + 7 * SIZE], b8
  566. FMADD1 (aa1, bb1, cc01, cc01)
  567. FMADD2 (aa2, bb1, cc02, cc02)
  568. FMADD3 (aa1, bb2, cc03, cc03)
  569. FMADD4 (aa2, bb2, cc04, cc04)
  570. FMADD1 (aa1, bb3, cc05, cc05)
  571. LDF [BO + 16 * SIZE], b1
  572. FMADD2 (aa2, bb3, cc06, cc06)
  573. LDF [BO + 9 * SIZE], b2
  574. FMADD3 (aa1, bb4, cc07, cc07)
  575. LDF [BO + 10 * SIZE], b3
  576. FMADD4 (aa2, bb4, cc08, cc08)
  577. LDF [BO + 11 * SIZE], b4
  578. FMADD1 (aa1, bb5, cc09, cc09)
  579. LDF [AO + 2 * SIZE], a3
  580. FMADD2 (aa2, bb5, cc10, cc10)
  581. LDF [AO + 3 * SIZE], a4
  582. FMADD3 (aa1, bb6, cc11, cc11)
  583. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  584. FMADD4 (aa2, bb6, cc12, cc12)
  585. nop
  586. FMADD1 (aa1, bb7, cc13, cc13)
  587. LDF [BO + 12 * SIZE], b5
  588. FMADD2 (aa2, bb7, cc14, cc14)
  589. LDF [BO + 13 * SIZE], b6
  590. FMADD3 (aa1, bb8, cc15, cc15)
  591. LDF [BO + 14 * SIZE], b7
  592. FMADD4 (aa2, bb8, cc16, cc16)
  593. LDF [BO + 15 * SIZE], b8
  594. FMADD1 (aa3, bb9, cc01, cc01)
  595. FMADD2 (aa4, bb9, cc02, cc02)
  596. FMADD3 (aa3, bb2, cc03, cc03)
  597. FMADD4 (aa4, bb2, cc04, cc04)
  598. FMADD1 (aa3, bb3, cc05, cc05)
  599. LDF [BO + 24 * SIZE], b9
  600. FMADD2 (aa4, bb3, cc06, cc06)
  601. LDF [BO + 17 * SIZE], b2
  602. FMADD3 (aa3, bb4, cc07, cc07)
  603. LDF [BO + 18 * SIZE], b3
  604. FMADD4 (aa4, bb4, cc08, cc08)
  605. LDF [BO + 19 * SIZE], b4
  606. FMADD1 (aa3, bb5, cc09, cc09)
  607. LDF [AO + 4 * SIZE], a1
  608. FMADD2 (aa4, bb5, cc10, cc10)
  609. LDF [AO + 5 * SIZE], a2
  610. FMADD3 (aa3, bb6, cc11, cc11)
  611. add L, -1, L
  612. FMADD4 (aa4, bb6, cc12, cc12)
  613. nop
  614. FMADD1 (aa3, bb7, cc13, cc13)
  615. LDF [BO + 20 * SIZE], b5
  616. FMADD2 (aa4, bb7, cc14, cc14)
  617. LDF [BO + 21 * SIZE], b6
  618. FMADD3 (aa3, bb8, cc15, cc15)
  619. LDF [BO + 22 * SIZE], b7
  620. FMADD4 (aa4, bb8, cc16, cc16)
  621. LDF [BO + 23 * SIZE], b8
  622. FMADD1 (aa1, bb1, cc01, cc01)
  623. FMADD2 (aa2, bb1, cc02, cc02)
  624. FMADD3 (aa1, bb2, cc03, cc03)
  625. FMADD4 (aa2, bb2, cc04, cc04)
  626. FMADD1 (aa1, bb3, cc05, cc05)
  627. LDF [BO + 32 * SIZE], b1
  628. FMADD2 (aa2, bb3, cc06, cc06)
  629. LDF [BO + 25 * SIZE], b2
  630. FMADD3 (aa1, bb4, cc07, cc07)
  631. LDF [BO + 26 * SIZE], b3
  632. FMADD4 (aa2, bb4, cc08, cc08)
  633. LDF [BO + 27 * SIZE], b4
  634. FMADD1 (aa1, bb5, cc09, cc09)
  635. LDF [AO + 6 * SIZE], a3
  636. FMADD2 (aa2, bb5, cc10, cc10)
  637. LDF [AO + 7 * SIZE], a4
  638. FMADD3 (aa1, bb6, cc11, cc11)
  639. nop
  640. FMADD4 (aa2, bb6, cc12, cc12)
  641. nop
  642. FMADD1 (aa1, bb7, cc13, cc13)
  643. LDF [BO + 28 * SIZE], b5
  644. FMADD2 (aa2, bb7, cc14, cc14)
  645. LDF [BO + 29 * SIZE], b6
  646. FMADD3 (aa1, bb8, cc15, cc15)
  647. LDF [BO + 30 * SIZE], b7
  648. FMADD4 (aa2, bb8, cc16, cc16)
  649. LDF [BO + 31 * SIZE], b8
  650. FMADD1 (aa3, bb9, cc01, cc01)
  651. FMADD2 (aa4, bb9, cc02, cc02)
  652. FMADD3 (aa3, bb2, cc03, cc03)
  653. FMADD4 (aa4, bb2, cc04, cc04)
  654. FMADD1 (aa3, bb3, cc05, cc05)
  655. LDF [BO + 40 * SIZE], b9
  656. FMADD2 (aa4, bb3, cc06, cc06)
  657. LDF [BO + 33 * SIZE], b2
  658. FMADD3 (aa3, bb4, cc07, cc07)
  659. LDF [BO + 34 * SIZE], b3
  660. FMADD4 (aa4, bb4, cc08, cc08)
  661. LDF [BO + 35 * SIZE], b4
  662. FMADD1 (aa3, bb5, cc09, cc09)
  663. LDF [AO + 16 * SIZE], a1 /****/
  664. FMADD2 (aa4, bb5, cc10, cc10)
  665. LDF [AO + 9 * SIZE], a2
  666. FMADD3 (aa3, bb6, cc11, cc11)
  667. nop
  668. FMADD4 (aa4, bb6, cc12, cc12)
  669. nop
  670. FMADD1 (aa3, bb7, cc13, cc13)
  671. LDF [BO + 36 * SIZE], b5
  672. FMADD2 (aa4, bb7, cc14, cc14)
  673. LDF [BO + 37 * SIZE], b6
  674. FMADD3 (aa3, bb8, cc15, cc15)
  675. LDF [BO + 38 * SIZE], b7
  676. FMADD4 (aa4, bb8, cc16, cc16)
  677. LDF [BO + 39 * SIZE], b8
  678. FMADD1 (aa5, bb1, cc01, cc01)
  679. FMADD2 (aa2, bb1, cc02, cc02)
  680. FMADD3 (aa5, bb2, cc03, cc03)
  681. FMADD4 (aa2, bb2, cc04, cc04)
  682. FMADD1 (aa5, bb3, cc05, cc05)
  683. LDF [BO + 48 * SIZE], b1
  684. FMADD2 (aa2, bb3, cc06, cc06)
  685. LDF [BO + 41 * SIZE], b2
  686. FMADD3 (aa5, bb4, cc07, cc07)
  687. LDF [BO + 42 * SIZE], b3
  688. FMADD4 (aa2, bb4, cc08, cc08)
  689. LDF [BO + 43 * SIZE], b4
  690. FMADD1 (aa5, bb5, cc09, cc09)
  691. LDF [AO + 10 * SIZE], a3
  692. FMADD2 (aa2, bb5, cc10, cc10)
  693. LDF [AO + 11 * SIZE], a4
  694. FMADD3 (aa5, bb6, cc11, cc11)
  695. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  696. FMADD4 (aa2, bb6, cc12, cc12)
  697. nop
  698. FMADD1 (aa5, bb7, cc13, cc13)
  699. LDF [BO + 44 * SIZE], b5
  700. FMADD2 (aa2, bb7, cc14, cc14)
  701. LDF [BO + 45 * SIZE], b6
  702. FMADD3 (aa5, bb8, cc15, cc15)
  703. LDF [BO + 46 * SIZE], b7
  704. FMADD4 (aa2, bb8, cc16, cc16)
  705. LDF [BO + 47 * SIZE], b8
  706. FMADD1 (aa3, bb9, cc01, cc01)
  707. FMADD2 (aa4, bb9, cc02, cc02)
  708. FMADD3 (aa3, bb2, cc03, cc03)
  709. FMADD4 (aa4, bb2, cc04, cc04)
  710. FMADD1 (aa3, bb3, cc05, cc05)
  711. LDF [BO + 56 * SIZE], b9
  712. FMADD2 (aa4, bb3, cc06, cc06)
  713. LDF [BO + 49 * SIZE], b2
  714. FMADD3 (aa3, bb4, cc07, cc07)
  715. LDF [BO + 50 * SIZE], b3
  716. FMADD4 (aa4, bb4, cc08, cc08)
  717. LDF [BO + 51 * SIZE], b4
  718. FMADD1 (aa3, bb5, cc09, cc09)
  719. LDF [AO + 12 * SIZE], a5
  720. FMADD2 (aa4, bb5, cc10, cc10)
  721. LDF [AO + 13 * SIZE], a2
  722. FMADD3 (aa3, bb6, cc11, cc11)
  723. cmp L, 0
  724. FMADD4 (aa4, bb6, cc12, cc12)
  725. nop
  726. FMADD1 (aa3, bb7, cc13, cc13)
  727. LDF [BO + 52 * SIZE], b5
  728. FMADD2 (aa4, bb7, cc14, cc14)
  729. LDF [BO + 53 * SIZE], b6
  730. FMADD3 (aa3, bb8, cc15, cc15)
  731. LDF [BO + 54 * SIZE], b7
  732. FMADD4 (aa4, bb8, cc16, cc16)
  733. LDF [BO + 55 * SIZE], b8
  734. FMADD1 (aa5, bb1, cc01, cc01)
  735. FMADD2 (aa2, bb1, cc02, cc02)
  736. FMADD3 (aa5, bb2, cc03, cc03)
  737. FMADD4 (aa2, bb2, cc04, cc04)
  738. FMADD1 (aa5, bb3, cc05, cc05)
  739. LDF [BO + 64 * SIZE], b1
  740. FMADD2 (aa2, bb3, cc06, cc06)
  741. LDF [BO + 57 * SIZE], b2
  742. FMADD3 (aa5, bb4, cc07, cc07)
  743. LDF [BO + 58 * SIZE], b3
  744. FMADD4 (aa2, bb4, cc08, cc08)
  745. LDF [BO + 59 * SIZE], b4
  746. FMADD1 (aa5, bb5, cc09, cc09)
  747. LDF [AO + 14 * SIZE], a3
  748. FMADD2 (aa2, bb5, cc10, cc10)
  749. LDF [AO + 15 * SIZE], a4
  750. FMADD3 (aa5, bb6, cc11, cc11)
  751. add BO, 64 * SIZE, BO
  752. FMADD4 (aa2, bb6, cc12, cc12)
  753. add AO, 16 * SIZE, AO
  754. FMADD1 (aa5, bb7, cc13, cc13)
  755. LDF [BO - 4 * SIZE], b5
  756. FMADD2 (aa2, bb7, cc14, cc14)
  757. LDF [BO - 3 * SIZE], b6
  758. FMADD3 (aa5, bb8, cc15, cc15)
  759. LDF [BO - 2 * SIZE], b7
  760. FMADD4 (aa2, bb8, cc16, cc16)
  761. LDF [BO - 1 * SIZE], b8
  762. FMADD1 (aa3, bb9, cc01, cc01)
  763. FMADD2 (aa4, bb9, cc02, cc02)
  764. FMADD3 (aa3, bb2, cc03, cc03)
  765. FMADD4 (aa4, bb2, cc04, cc04)
  766. FMADD1 (aa3, bb3, cc05, cc05)
  767. LDF [BO + 8 * SIZE], b9
  768. FMADD2 (aa4, bb3, cc06, cc06)
  769. LDF [BO + 1 * SIZE], b2
  770. FMADD3 (aa3, bb4, cc07, cc07)
  771. LDF [BO + 2 * SIZE], b3
  772. FMADD4 (aa4, bb4, cc08, cc08)
  773. LDF [BO + 3 * SIZE], b4
  774. FMADD1 (aa3, bb5, cc09, cc09)
  775. LDF [AO + 8 * SIZE], a5 /****/
  776. FMADD2 (aa4, bb5, cc10, cc10)
  777. LDF [AO + 1 * SIZE], a2
  778. FMADD3 (aa3, bb6, cc11, cc11)
  779. FMADD4 (aa4, bb6, cc12, cc12)
  780. FMADD1 (aa3, bb7, cc13, cc13)
  781. LDF [BO + 4 * SIZE], b5
  782. FMADD2 (aa4, bb7, cc14, cc14)
  783. LDF [BO + 5 * SIZE], b6
  784. FMADD3 (aa3, bb8, cc15, cc15)
  785. LDF [BO + 6 * SIZE], b7
  786. FMADD4 (aa4, bb8, cc16, cc16)
  787. bg,pt %icc, .LL13
  788. LDF [BO + 7 * SIZE], b8
  789. .align 4
  790. .LL15:
  791. #ifndef TRMMKERNEL
  792. and K, 7, L
  793. #else
  794. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  795. sub K, KK, L
  796. #elif defined(LEFT)
  797. add KK, 1, L
  798. #else
  799. add KK, 4, L
  800. #endif
  801. and L, 7, L
  802. #endif
  803. cmp L, 0
  804. ble,a,pn %icc, .LL18
  805. nop
  806. .align 4
  807. .LL17:
  808. FMADD1 (aa1, bb1, cc01, cc01)
  809. add L, -1, L
  810. FMADD2 (aa2, bb1, cc02, cc02)
  811. nop
  812. FMADD3 (aa1, bb2, cc03, cc03)
  813. LDF [BO + 8 * SIZE], b1
  814. FMADD4 (aa2, bb2, cc04, cc04)
  815. LDF [BO + 9 * SIZE], b2
  816. FMADD1 (aa1, bb3, cc05, cc05)
  817. cmp L, 0
  818. FMADD2 (aa2, bb3, cc06, cc06)
  819. nop
  820. FMADD3 (aa1, bb4, cc07, cc07)
  821. LDF [BO + 10 * SIZE], b3
  822. FMADD4 (aa2, bb4, cc08, cc08)
  823. LDF [BO + 11 * SIZE], b4
  824. FMADD1 (aa1, bb5, cc09, cc09)
  825. nop
  826. FMADD2 (aa2, bb5, cc10, cc10)
  827. nop
  828. FMADD3 (aa1, bb6, cc11, cc11)
  829. LDF [BO + 12 * SIZE], b5
  830. FMADD4 (aa2, bb6, cc12, cc12)
  831. LDF [BO + 13 * SIZE], b6
  832. FMADD1 (aa1, bb7, cc13, cc13)
  833. add AO, 2 * SIZE, AO
  834. FMADD2 (aa2, bb7, cc14, cc14)
  835. add BO, 8 * SIZE, BO
  836. FMADD3 (aa1, bb8, cc15, cc15)
  837. LDF [AO + 0 * SIZE], a1
  838. FMADD4 (aa2, bb8, cc16, cc16)
  839. LDF [AO + 1 * SIZE], a2
  840. LDF [BO + 6 * SIZE], b7
  841. bg,pt %icc, .LL17
  842. LDF [BO + 7 * SIZE], b8
  843. nop
  844. .align 4
  845. .LL18:
  846. #ifndef TRMMKERNEL
  847. LDF [C1 + 0 * SIZE], a1
  848. FADD c01, c04, c01
  849. LDF [C1 + 1 * SIZE], a2
  850. FADD c02, c03, c02
  851. LDF [C2 + 0 * SIZE], a3
  852. FADD c05, c08, c05
  853. LDF [C2 + 1 * SIZE], a4
  854. FADD c06, c07, c06
  855. LDF [C3 + 0 * SIZE], b1
  856. FADD c09, c12, c09
  857. LDF [C3 + 1 * SIZE], b2
  858. FADD c10, c11, c10
  859. LDF [C4 + 0 * SIZE], b3
  860. FADD c13, c16, c13
  861. LDF [C4 + 1 * SIZE], b4
  862. FADD c14, c15, c14
  863. FMADD (alpha_r, cc01, aa1, aa1)
  864. FMADD (alpha_r, cc02, aa2, aa2)
  865. FMADD (alpha_r, cc05, aa3, aa3)
  866. FMADD (alpha_r, cc06, aa4, aa4)
  867. FMADD (alpha_r, cc09, bb1, bb1)
  868. FMADD (alpha_r, cc10, bb2, bb2)
  869. FMADD (alpha_r, cc13, bb3, bb3)
  870. FMADD (alpha_r, cc14, bb4, bb4)
  871. #else
  872. FADD c01, c04, c01
  873. FADD c02, c03, c02
  874. FADD c05, c08, c05
  875. FADD c06, c07, c06
  876. FADD c09, c12, c09
  877. FADD c10, c11, c10
  878. FADD c13, c16, c13
  879. FADD c14, c15, c14
  880. FMUL ALPHA_R, c01, a1
  881. FMUL ALPHA_R, c02, a2
  882. FMUL ALPHA_R, c05, a3
  883. FMUL ALPHA_R, c06, a4
  884. FMUL ALPHA_R, c09, b1
  885. FMUL ALPHA_R, c10, b2
  886. FMUL ALPHA_R, c13, b3
  887. FMUL ALPHA_R, c14, b4
  888. #endif
  889. FNMSUB (alpha_i, cc02, aa1, aa1)
  890. FMADD (alpha_i, cc01, aa2, aa2)
  891. FNMSUB (alpha_i, cc06, aa3, aa3)
  892. FMADD (alpha_i, cc05, aa4, aa4)
  893. FNMSUB (alpha_i, cc10, bb1, bb1)
  894. STF a1, [C1 + 0 * SIZE]
  895. FMADD (alpha_i, cc09, bb2, bb2)
  896. STF a2, [C1 + 1 * SIZE]
  897. FNMSUB (alpha_i, cc14, bb3, bb3)
  898. STF a3, [C2 + 0 * SIZE]
  899. FMADD (alpha_i, cc13, bb4, bb4)
  900. STF a4, [C2 + 1 * SIZE]
  901. STF b1, [C3 + 0 * SIZE]
  902. add C1, 2 * SIZE, C1
  903. STF b2, [C3 + 1 * SIZE]
  904. add C2, 2 * SIZE, C2
  905. STF b3, [C4 + 0 * SIZE]
  906. add C3, 2 * SIZE, C3
  907. STF b4, [C4 + 1 * SIZE]
  908. add C4, 2 * SIZE, C4
  909. #ifdef TRMMKERNEL
  910. #if ( defined(LEFT) && defined(TRANSA)) || \
  911. (!defined(LEFT) && !defined(TRANSA))
  912. sub K, KK, TEMP1
  913. #ifdef LEFT
  914. add TEMP1, -1, TEMP1
  915. #else
  916. add TEMP1, -4, TEMP1
  917. #endif
  918. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  919. sll TEMP1, ZBASE_SHIFT + 2, TEMP1
  920. add AO, TEMP2, AO
  921. add BO, TEMP1, BO
  922. #endif
  923. #ifdef LEFT
  924. add KK, 1, KK
  925. #endif
  926. #endif
  927. add I, -1, I
  928. cmp I, 0
  929. bg,pt %icc, .LL12
  930. nop
  931. #if defined(TRMMKERNEL) && !defined(LEFT)
  932. add KK, 4, KK
  933. #endif
  934. add J, -1, J
  935. cmp J, 0
  936. bg,pt %icc, .LL11
  937. mov BO, B
  938. .align 4
  939. .LL20:
  940. and N, 2, J
  941. cmp J, 0
  942. ble,pn %icc, .LL30
  943. mov C, C1
  944. add C, LDC, C2
  945. add C2, LDC, C
  946. #if defined(TRMMKERNEL) && defined(LEFT)
  947. mov OFFSET, KK
  948. #endif
  949. mov M, I
  950. mov A, AO
  951. .align 4
  952. .LL22:
  953. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  954. mov B, BO
  955. #else
  956. sll KK, ZBASE_SHIFT + 0, TEMP1
  957. sll KK, ZBASE_SHIFT + 1, TEMP2
  958. add AO, TEMP1, AO
  959. add B, TEMP2, BO
  960. #endif
  961. LDF [AO + 0 * SIZE], a1
  962. LDF [AO + 1 * SIZE], a2
  963. LDF [BO + 0 * SIZE], b1
  964. LDF [BO + 1 * SIZE], b2
  965. LDF [BO + 2 * SIZE], b3
  966. LDF [BO + 3 * SIZE], b4
  967. LDF [BO + 4 * SIZE], b5
  968. FCLR (cc01)
  969. LDF [BO + 5 * SIZE], b6
  970. FCLR (cc02)
  971. LDF [BO + 6 * SIZE], b7
  972. FCLR (cc03)
  973. LDF [BO + 7 * SIZE], b8
  974. FCLR (cc04)
  975. LDF [BO + 8 * SIZE], b9
  976. FCLR (cc05)
  977. prefetch [C1 + 2 * SIZE], 3
  978. FCLR (cc06)
  979. prefetch [C2 + 2 * SIZE], 3
  980. FCLR (cc07)
  981. #ifndef TRMMKERNEL
  982. sra K, 2, L
  983. #else
  984. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  985. sub K, KK, L
  986. #elif defined(LEFT)
  987. add KK, 1, L
  988. #else
  989. add KK, 2, L
  990. #endif
  991. sra L, 2, L
  992. #endif
  993. cmp L, 0
  994. ble,pn %icc, .LL25
  995. FCLR (cc08)
  996. .align 4
  /* .LL23: main k-loop of the two-column tile, unrolled four steps per pass. */
  /* Each pass decrements L once, advances AO by 8 * SIZE and BO by 16 * SIZE, */
  /* and accumulates into cc01..cc08 while the LDF loads for later steps are */
  /* interleaved between the multiply-adds. FMADD1..FMADD4 and LDF are macros */
  /* defined outside this chunk; the sign each FMADDn applies is not visible here. */
  /* The aN/aaN, bN/bbN and cNN/ccNN spellings presumably name the same FP */
  /* registers in the two operand styles - confirm against the register defines. */
  997. .LL23:
  /* Step 1: a1/a2 against b1..b4. b1 is reloaded from BO + 16 for the next pass, */
  /* which is why step 3 below uses b9 in the place where b1 would appear. */
  998. FMADD1 (aa1, bb1, cc01, cc01)
  999. LDF [AO + 2 * SIZE], a3
  1000. FMADD2 (aa2, bb1, cc02, cc02)
  1001. LDF [AO + 3 * SIZE], a4
  1002. FMADD3 (aa1, bb2, cc03, cc03)
  1003. LDF [BO + 16 * SIZE], b1
  1004. FMADD4 (aa2, bb2, cc04, cc04)
  1005. LDF [BO + 9 * SIZE], b2
  1006. FMADD1 (aa1, bb3, cc05, cc05)
  1007. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1008. FMADD2 (aa2, bb3, cc06, cc06)
  1009. add L, -1, L
  1010. FMADD3 (aa1, bb4, cc07, cc07)
  1011. LDF [BO + 10 * SIZE], b3
  1012. FMADD4 (aa2, bb4, cc08, cc08)
  1013. LDF [BO + 11 * SIZE], b4
  /* Step 2: a3/a4 against b5..b8. */
  1014. FMADD1 (aa3, bb5, cc01, cc01)
  1015. LDF [AO + 4 * SIZE], a1
  1016. FMADD2 (aa4, bb5, cc02, cc02)
  1017. LDF [AO + 5 * SIZE], a2
  1018. FMADD3 (aa3, bb6, cc03, cc03)
  1019. LDF [BO + 12 * SIZE], b5
  1020. FMADD4 (aa4, bb6, cc04, cc04)
  1021. LDF [BO + 13 * SIZE], b6
  1022. FMADD1 (aa3, bb7, cc05, cc05)
  /* The loop condition is evaluated here, well before the branch. The integer */
  /* instructions that follow (add, LDF, nop) do not write the condition codes, */
  /* so the result is still valid at the bg below. */
  1023. cmp L, 0
  1024. FMADD2 (aa4, bb7, cc06, cc06)
  /* AO moves forward now; the a3/a4 loads in step 3 use negative offsets to */
  /* reach the last pair of this pass. */
  1025. add AO, 8 * SIZE, AO
  1026. FMADD3 (aa3, bb8, cc07, cc07)
  1027. LDF [BO + 14 * SIZE], b7
  1028. FMADD4 (aa4, bb8, cc08, cc08)
  1029. LDF [BO + 15 * SIZE], b8
  /* Step 3: a1/a2 against b9, b2, b3, b4. */
  1030. FMADD1 (aa1, bb9, cc01, cc01)
  1031. LDF [AO - 2 * SIZE], a3
  1032. FMADD2 (aa2, bb9, cc02, cc02)
  1033. LDF [AO - 1 * SIZE], a4
  1034. FMADD3 (aa1, bb2, cc03, cc03)
  1035. LDF [BO + 24 * SIZE], b9
  1036. FMADD4 (aa2, bb2, cc04, cc04)
  1037. LDF [BO + 17 * SIZE], b2
  1038. FMADD1 (aa1, bb3, cc05, cc05)
  /* BO moves forward now; the remaining loads use offsets relative to the */
  /* new BO and fetch operands for the next pass. */
  1039. add BO, 16 * SIZE, BO
  1040. FMADD2 (aa2, bb3, cc06, cc06)
  1041. nop
  1042. FMADD3 (aa1, bb4, cc07, cc07)
  1043. LDF [BO + 2 * SIZE], b3
  1044. FMADD4 (aa2, bb4, cc08, cc08)
  1045. LDF [BO + 3 * SIZE], b4
  /* Step 4: a3/a4 against b5..b8. */
  1046. FMADD1 (aa3, bb5, cc01, cc01)
  1047. LDF [AO + 0 * SIZE], a1
  1048. FMADD2 (aa4, bb5, cc02, cc02)
  1049. LDF [AO + 1 * SIZE], a2
  1050. FMADD3 (aa3, bb6, cc03, cc03)
  1051. LDF [BO + 4 * SIZE], b5
  1052. FMADD4 (aa4, bb6, cc04, cc04)
  1053. LDF [BO + 5 * SIZE], b6
  1054. FMADD1 (aa3, bb7, cc05, cc05)
  1055. nop
  1056. FMADD2 (aa4, bb7, cc06, cc06)
  1057. LDF [BO + 6 * SIZE], b7
  1058. FMADD3 (aa3, bb8, cc07, cc07)
  1059. FMADD4 (aa4, bb8, cc08, cc08)
  /* Loop while L > 0. The branch has no annul bit, so the load in its delay */
  /* slot runs on both the taken and the fall-through path. */
  1060. bg,pt %icc, .LL23
  1061. LDF [BO + 7 * SIZE], b8
  1062. .align 4
  /* .LL25: compute the leftover step count for the two-column tile. */
  /* L becomes the low two bits of the k extent that the unrolled loop used: */
  /* K itself without TRMMKERNEL, otherwise K - KK, KK + 1 or KK + 2 depending */
  /* on LEFT/TRANSA. If the result is not positive, jump straight to .LL28. */
  1063. .LL25:
  1064. #ifndef TRMMKERNEL
  1065. and K, 3, L
  1066. #else
  1067. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1068. sub K, KK, L
  1069. #elif defined(LEFT)
  1070. add KK, 1, L
  1071. #else
  1072. add KK, 2, L
  1073. #endif
  1074. and L, 3, L
  1075. #endif
  1076. cmp L, 0
  /* Annulled branch: the delay slot only executes when the branch is taken, */
  /* and it holds a nop either way. */
  1077. ble,a,pn %icc, .LL28
  1078. nop
  1079. .align 4
  /* .LL27: remainder loop of the two-column tile, one step per pass. */
  /* Uses a1/a2 against b1..b4, accumulating into cc01..cc08, and reloads the */
  /* same registers for the next pass. Per pass: L -= 1, AO += 2 * SIZE, */
  /* BO += 4 * SIZE. */
  1080. .LL27:
  1081. FMADD1 (aa1, bb1, cc01, cc01)
  1082. add L, -1, L
  1083. FMADD2 (aa2, bb1, cc02, cc02)
  1084. LDF [BO + 4 * SIZE], b1
  1085. FMADD3 (aa1, bb2, cc03, cc03)
  /* AO is bumped early, so the a1/a2 reloads below use offsets 0 and 1. */
  1086. add AO, 2 * SIZE, AO
  1087. FMADD4 (aa2, bb2, cc04, cc04)
  1088. LDF [BO + 5 * SIZE], b2
  1089. FMADD1 (aa1, bb3, cc05, cc05)
  /* Sets the condition codes consumed by the bg at the end of the loop. */
  1090. cmp L, 0
  1091. FMADD2 (aa2, bb3, cc06, cc06)
  1092. LDF [BO + 6 * SIZE], b3
  1093. FMADD3 (aa1, bb4, cc07, cc07)
  1094. LDF [AO + 0 * SIZE], a1
  1095. FMADD4 (aa2, bb4, cc08, cc08)
  1096. LDF [AO + 1 * SIZE], a2
  1097. LDF [BO + 7 * SIZE], b4
  /* BO is advanced in the delay slot, which executes whether or not the */
  /* branch is taken (no annul bit). */
  1098. bg,pt %icc, .LL27
  1099. add BO, 4 * SIZE, BO
  1100. .align 4
  /* .LL28: finish one two-column tile. Folds the eight accumulators into four */
  /* (c01 with c04, c02 with c03, c05 with c08, c06 with c07), applies alpha, */
  /* stores two values to each of C1 and C2, then advances to the next row. */
  /* FADD/FMUL/FMADD/FNMSUB are macros defined outside this chunk; the formulas */
  /* noted below are what their names suggest and should be confirmed there. */
  1101. .LL28:
  1102. #ifndef TRMMKERNEL
  /* GEMM case: the current C1/C2 contents are loaded into a1..a4 and the */
  /* scaled result is presumably accumulated on top (aN += alpha_r * cNN). */
  1103. LDF [C1 + 0 * SIZE], a1
  1104. FADD c01, c04, c01
  1105. LDF [C1 + 1 * SIZE], a2
  1106. FADD c02, c03, c02
  1107. LDF [C2 + 0 * SIZE], a3
  1108. FADD c05, c08, c05
  1109. LDF [C2 + 1 * SIZE], a4
  1110. FADD c06, c07, c06
  1111. FMADD (alpha_r, cc01, aa1, aa1)
  1112. FMADD (alpha_r, cc02, aa2, aa2)
  1113. FMADD (alpha_r, cc05, aa3, aa3)
  1114. FMADD (alpha_r, cc06, aa4, aa4)
  1115. #else
  /* TRMM case: C1/C2 are not read; a1..a4 start from ALPHA_R times the sums. */
  1116. FADD c01, c04, c01
  1117. FADD c02, c03, c02
  1118. FADD c05, c08, c05
  1119. FADD c06, c07, c06
  1120. FMUL ALPHA_R, c01, a1
  1121. FMUL ALPHA_R, c02, a2
  1122. FMUL ALPHA_R, c05, a3
  1123. FMUL ALPHA_R, c06, a4
  1124. #endif
  /* alpha_i cross terms: presumably a1 -= alpha_i * c02 and a2 += alpha_i * c01, */
  /* with the same pattern on a3/a4 using c06/c05. */
  1125. FNMSUB (alpha_i, cc02, aa1, aa1)
  1126. FMADD (alpha_i, cc01, aa2, aa2)
  1127. FNMSUB (alpha_i, cc06, aa3, aa3)
  1128. FMADD (alpha_i, cc05, aa4, aa4)
  1129. STF a1, [C1 + 0 * SIZE]
  1130. add I, -1, I
  1131. STF a2, [C1 + 1 * SIZE]
  /* Row-loop condition for the bg to .LL22 below. The instructions in between */
  /* (STF, add, sub, sll) do not write the condition codes. */
  1132. cmp I, 0
  1133. STF a3, [C2 + 0 * SIZE]
  1134. add C1, 2 * SIZE, C1
  1135. STF a4, [C2 + 1 * SIZE]
  1136. add C2, 2 * SIZE, C2
  1137. #ifdef TRMMKERNEL
  /* TRMM only: skip AO and BO past the part of the k range this tile did not */
  /* walk. TEMP1 = K - KK - 1 (LEFT) or K - KK - 2; AO advances by */
  /* TEMP1 << ZBASE_SHIFT and BO by TEMP1 << (ZBASE_SHIFT + 1). */
  1138. #if ( defined(LEFT) && defined(TRANSA)) || \
  1139. (!defined(LEFT) && !defined(TRANSA))
  1140. sub K, KK, TEMP1
  1141. #ifdef LEFT
  1142. add TEMP1, -1, TEMP1
  1143. #else
  1144. add TEMP1, -2, TEMP1
  1145. #endif
  1146. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  1147. sll TEMP1, ZBASE_SHIFT + 1, TEMP1
  1148. add AO, TEMP2, AO
  1149. add BO, TEMP1, BO
  1150. #endif
  1151. #ifdef LEFT
  1152. add KK, 1, KK
  1153. #endif
  1154. #endif
  /* Next row of this column pair while I > 0 (.LL22 is outside this chunk). */
  1155. bg,pt %icc, .LL22
  1156. nop
  1157. #if defined(TRMMKERNEL) && !defined(LEFT)
  1158. add KK, 2, KK
  1159. #endif
  /* All rows done: B takes the current BO value before the next section. */
  1160. mov BO, B
  1161. .align 4
  /* .LL30: entry to the single-column path. Runs only when bit 0 of N is set; */
  /* otherwise control goes to the return sequence at .LL999. */
  1162. .LL30:
  1163. and N, 1, J
  1164. cmp J, 0
  /* No annul bit: the mov in the delay slot executes on both paths, so C1 is */
  /* set to C even when the branch to .LL999 is taken. */
  1165. ble,pn %icc, .LL999
  1166. mov C, C1
  1167. #if defined(TRMMKERNEL) && defined(LEFT)
  1168. mov OFFSET, KK
  1169. #endif
  /* I = row counter for the loop at .LL32, AO = running pointer into A. */
  1170. mov M, I
  1171. mov A, AO
  1172. .align 4
  /* .LL32: per-row setup for the single-column path. Chooses the starting BO */
  /* (and, for some TRMM variants, offsets AO), preloads a1..a4 and b1..b8, */
  /* clears cc01..cc08 and computes L = k extent >> 2 for the unrolled loop. */
  1173. .LL32:
  1174. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  1175. mov B, BO
  1176. #else
  /* Remaining TRMM variants: both pointers are offset by KK << ZBASE_SHIFT. */
  1177. sll KK, ZBASE_SHIFT + 0, TEMP1
  1178. sll KK, ZBASE_SHIFT + 0, TEMP2
  1179. add AO, TEMP1, AO
  1180. add B, TEMP2, BO
  1181. #endif
  /* NOTE(review): these preloads read AO[0..3] and BO[0..7] before any check */
  /* on the k count - presumably the buffers are always large enough; confirm */
  /* against the caller. */
  1182. LDF [AO + 0 * SIZE], a1
  1183. LDF [AO + 1 * SIZE], a2
  1184. LDF [AO + 2 * SIZE], a3
  1185. LDF [AO + 3 * SIZE], a4
  1186. LDF [BO + 0 * SIZE], b1
  1187. LDF [BO + 1 * SIZE], b2
  1188. LDF [BO + 2 * SIZE], b3
  1189. FCLR (cc01)
  1190. LDF [BO + 3 * SIZE], b4
  1191. FCLR (cc02)
  1192. LDF [BO + 4 * SIZE], b5
  1193. FCLR (cc03)
  1194. LDF [BO + 5 * SIZE], b6
  1195. FCLR (cc04)
  1196. LDF [BO + 6 * SIZE], b7
  1197. FCLR (cc05)
  1198. LDF [BO + 7 * SIZE], b8
  1199. FCLR (cc06)
  1200. prefetch [C1 + 2 * SIZE], 3
  1201. FCLR (cc07)
  /* L = number of four-step passes: K >> 2 without TRMMKERNEL, otherwise */
  /* (K - KK) >> 2 or (KK + 1) >> 2 depending on LEFT/TRANSA. */
  1202. #ifndef TRMMKERNEL
  1203. sra K, 2, L
  1204. #else
  1205. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1206. sub K, KK, L
  1207. #elif defined(LEFT)
  1208. add KK, 1, L
  1209. #else
  1210. add KK, 1, L
  1211. #endif
  1212. sra L, 2, L
  1213. #endif
  1214. cmp L, 0
  /* Skip the unrolled loop when L <= 0. The FCLR sits in the delay slot of a */
  /* branch without the annul bit, so cc08 is cleared on both paths. */
  1215. ble,pn %icc, .LL35
  1216. FCLR (cc08)
  1217. .align 4
  /* .LL33: main k-loop of the single-column path, unrolled four steps per pass. */
  /* Steps alternate between the a1/a2 and a3/a4 pairs and walk b1..b8 two at a */
  /* time, accumulating into cc01..cc04 only. Every operand register is reloaded */
  /* for the next pass between the multiply-adds. Per pass: AO += 8 * SIZE, */
  /* BO += 8 * SIZE, L -= 1. FMADD1..FMADD4 are macros defined outside this chunk. */
  1218. .LL33:
  /* Step 1: a1/a2 with b1, b2. */
  1219. FMADD1 (aa1, bb1, cc01, cc01)
  1220. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1221. FMADD2 (aa2, bb1, cc02, cc02)
  1222. LDF [BO + 8 * SIZE], b1
  1223. FMADD3 (aa1, bb2, cc03, cc03)
  1224. LDF [AO + 4 * SIZE], a1
  1225. FMADD4 (aa2, bb2, cc04, cc04)
  1226. LDF [AO + 5 * SIZE], a2
  /* Step 2: a3/a4 with b3, b4. */
  1227. FMADD1 (aa3, bb3, cc01, cc01)
  1228. LDF [BO + 9 * SIZE], b2
  1229. FMADD2 (aa4, bb3, cc02, cc02)
  1230. LDF [BO + 10 * SIZE], b3
  1231. FMADD3 (aa3, bb4, cc03, cc03)
  1232. LDF [AO + 6 * SIZE], a3
  1233. FMADD4 (aa4, bb4, cc04, cc04)
  1234. LDF [AO + 7 * SIZE], a4
  /* Step 3: a1/a2 (reloaded from AO + 4, 5) with b5, b6. */
  1235. FMADD1 (aa1, bb5, cc01, cc01)
  1236. LDF [BO + 11 * SIZE], b4
  1237. FMADD2 (aa2, bb5, cc02, cc02)
  1238. LDF [BO + 12 * SIZE], b5
  1239. FMADD3 (aa1, bb6, cc03, cc03)
  1240. LDF [AO + 8 * SIZE], a1
  1241. FMADD4 (aa2, bb6, cc04, cc04)
  1242. LDF [AO + 9 * SIZE], a2
  /* Step 4: a3/a4 (reloaded from AO + 6, 7) with b7, b8. */
  1243. FMADD1 (aa3, bb7, cc01, cc01)
  1244. LDF [BO + 13 * SIZE], b6
  1245. FMADD2 (aa4, bb7, cc02, cc02)
  1246. LDF [BO + 14 * SIZE], b7
  1247. FMADD3 (aa3, bb8, cc03, cc03)
  1248. LDF [AO + 10 * SIZE], a3
  1249. FMADD4 (aa4, bb8, cc04, cc04)
  1250. LDF [AO + 11 * SIZE], a4
  1251. add AO, 8 * SIZE, AO
  1252. add L, -1, L
  1253. add BO, 8 * SIZE, BO
  1254. cmp L, 0
  /* Loop while L > 0. b8 is reloaded in the delay slot, relative to the BO */
  /* already advanced above; the slot executes on both paths (no annul bit). */
  1255. bg,pt %icc, .LL33
  1256. LDF [BO + 7 * SIZE], b8
  1257. .align 4
  /* .LL35: compute the leftover step count for the single-column path. */
  /* L = K & 3 without TRMMKERNEL, otherwise (K - KK) & 3 or (KK + 1) & 3 */
  /* depending on LEFT/TRANSA. If L is not positive, jump straight to .LL38. */
  1258. .LL35:
  1259. #ifndef TRMMKERNEL
  1260. and K, 3, L
  1261. #else
  1262. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1263. sub K, KK, L
  1264. #elif defined(LEFT)
  1265. add KK, 1, L
  1266. #else
  1267. add KK, 1, L
  1268. #endif
  1269. and L, 3, L
  1270. #endif
  1271. cmp L, 0
  /* Annulled branch with a nop in the delay slot. */
  1272. ble,a,pn %icc, .LL38
  1273. nop
  1274. .align 4
  /* .LL37: remainder loop of the single-column path, one step per pass. */
  /* Uses a1/a2 with b1/b2, accumulating into cc01..cc04, and reloads those */
  /* four operands for the next pass. Per pass: L -= 1, AO += 2 * SIZE, */
  /* BO += 2 * SIZE. */
  1275. .LL37:
  1276. FMADD1 (aa1, bb1, cc01, cc01)
  1277. add L, -1, L
  1278. FMADD2 (aa2, bb1, cc02, cc02)
  1279. LDF [BO + 2 * SIZE], b1
  1280. FMADD3 (aa1, bb2, cc03, cc03)
  1281. LDF [AO + 2 * SIZE], a1
  1282. FMADD4 (aa2, bb2, cc04, cc04)
  1283. LDF [AO + 3 * SIZE], a2
  1284. add AO, 2 * SIZE, AO
  1285. cmp L, 0
  1286. add BO, 2 * SIZE, BO
  /* b2 is reloaded in the delay slot, relative to the BO already advanced */
  /* above; the slot executes on both paths (no annul bit). */
  1287. bg,pt %icc, .LL37
  1288. LDF [BO + 1 * SIZE], b2
  1289. .align 4
  /* .LL38: finish one single-column tile. Folds the four accumulators into two */
  /* (c01 with c04, c02 with c03), applies alpha, stores two values to C1 and */
  /* advances to the next row. FADD/FMUL/FMADD/FNMSUB are macros defined outside */
  /* this chunk; the formulas noted below are what their names suggest and */
  /* should be confirmed there. */
  1290. .LL38:
  1291. #ifndef TRMMKERNEL
  /* GEMM case: the current C1 contents are loaded into a1/a2 and the scaled */
  /* result is presumably accumulated on top (aN += alpha_r * cNN). */
  1292. LDF [C1 + 0 * SIZE], a1
  1293. FADD c01, c04, c01
  1294. LDF [C1 + 1 * SIZE], a2
  1295. FADD c02, c03, c02
  1296. FMADD (alpha_r, cc01, aa1, aa1)
  1297. FMADD (alpha_r, cc02, aa2, aa2)
  1298. #else
  /* TRMM case: C1 is not read; a1/a2 start from ALPHA_R times the sums. */
  1299. FADD c01, c04, c01
  1300. FADD c02, c03, c02
  1301. FMUL ALPHA_R, c01, a1
  1302. FMUL ALPHA_R, c02, a2
  1303. #endif
  /* alpha_i cross terms: presumably a1 -= alpha_i * c02, a2 += alpha_i * c01. */
  1304. FNMSUB (alpha_i, cc02, aa1, aa1)
  1305. FMADD (alpha_i, cc01, aa2, aa2)
  1306. STF a1, [C1 + 0 * SIZE]
  1307. STF a2, [C1 + 1 * SIZE]
  1308. #ifdef TRMMKERNEL
  /* TRMM only: TEMP1 = K - KK - 1 in both LEFT cases here; AO and BO each */
  /* advance by TEMP1 << ZBASE_SHIFT. */
  1309. #if ( defined(LEFT) && defined(TRANSA)) || \
  1310. (!defined(LEFT) && !defined(TRANSA))
  1311. sub K, KK, TEMP1
  1312. #ifdef LEFT
  1313. add TEMP1, -1, TEMP1
  1314. #else
  1315. add TEMP1, -1, TEMP1
  1316. #endif
  1317. sll TEMP1, ZBASE_SHIFT + 0, TEMP2
  1318. sll TEMP1, ZBASE_SHIFT + 0, TEMP1
  1319. add AO, TEMP2, AO
  1320. add BO, TEMP1, BO
  1321. #endif
  1322. #ifdef LEFT
  1323. add KK, 1, KK
  1324. #endif
  1325. #endif
  1326. add I, -1, I
  1327. cmp I, 0
  /* Next row while I > 0. C1 is advanced in the delay slot, which executes on */
  /* both paths (no annul bit). */
  1328. bg,pt %icc, .LL32
  1329. add C1, 2 * SIZE, C1
  1330. .align 4
  /* .LL999: common exit. The SPARC V9 return instruction restores the caller's */
  /* register window and jumps to %i7 + 8. The clr in its delay slot therefore */
  /* writes the caller-visible %o0, so the routine returns 0. EPILOGUE is a */
  /* macro defined outside this chunk. */
  1331. .LL999:
  1332. return %i7 + 8
  1333. clr %o0
  1334. EPILOGUE