You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

cgemm_logic_power10.S 63 kB


  1. /***************************************************************************
  2. Copyright (c) 2013-2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define MY_ALIGN .align 3 /* alignment directive placed before labels below; on GAS/PowerPC the .align operand is a power-of-two exponent (8 bytes) - confirm for the assembler in use */
  28. b CGEMM_L4 /* jump over the helper subroutines that follow, straight to the main-loop entry CGEMM_L4 */
  29. /* MINI SUBROUTINES: entered with bl from the main loop further down, each returns with blr */
  30. /* 4x8 MAIN 128x+2 LOOP */
  31. CGEMM_L4x8_LMAIN_SUB:
  32. /* In: T8 = number of loop passes; T2..T5 = dcbt offsets; AO/BO = prefetch bases. PRE and the LOAD/KERNEL/END macros are defined outside this section. */
  33. mtctr T8 /* CTR = pass count consumed by the bdnz below */
  34. LOAD4x8_2
  35. MY_ALIGN
  36. CGEMM_L4x8_LOOP:
  37. /* One pass = 64 KERNEL4x8_L2 invocations (third argument 0..63); the fourth argument is 1 only on the last one. Presumably the arguments are (A stride, B stride, index, is-last) - confirm against the macro. */
  38. dcbt AO, PRE
  39. dcbt BO, PRE
  40. KERNEL4x8_L2 128,64,0,0
  41. CGEMM_L4x8_K128:
  42. /* Secondary entry point: CGEMM_L4x8_SUB0 does its own loads and then bl's here (after the index-0 step) when the K count is 128 or 129. */
  43. KERNEL4x8_L2 128,64,1,0
  44. dcbt AO, T2
  45. KERNEL4x8_L2 128,64,2,0
  46. KERNEL4x8_L2 128,64,3,0
  47. dcbt AO, T3
  48. dcbt BO, T2
  49. KERNEL4x8_L2 128,64,4,0
  50. KERNEL4x8_L2 128,64,5,0
  51. dcbt AO, T4
  52. KERNEL4x8_L2 128,64,6,0
  53. KERNEL4x8_L2 128,64,7,0
  54. dcbt AO, T5
  55. dcbt BO, T3
  56. KERNEL4x8_L2 128,64,8,0
  57. KERNEL4x8_L2 128,64,9,0
  58. KERNEL4x8_L2 128,64,10,0
  59. KERNEL4x8_L2 128,64,11,0
  60. dcbt BO, T4
  61. KERNEL4x8_L2 128,64,12,0
  62. KERNEL4x8_L2 128,64,13,0
  63. KERNEL4x8_L2 128,64,14,0
  64. KERNEL4x8_L2 128,64,15,0
  65. KERNEL4x8_L2 128,64,16,0
  66. KERNEL4x8_L2 128,64,17,0
  67. KERNEL4x8_L2 128,64,18,0
  68. KERNEL4x8_L2 128,64,19,0
  69. KERNEL4x8_L2 128,64,20,0
  70. KERNEL4x8_L2 128,64,21,0
  71. KERNEL4x8_L2 128,64,22,0
  72. KERNEL4x8_L2 128,64,23,0
  73. KERNEL4x8_L2 128,64,24,0
  74. KERNEL4x8_L2 128,64,25,0
  75. KERNEL4x8_L2 128,64,26,0
  76. KERNEL4x8_L2 128,64,27,0
  77. KERNEL4x8_L2 128,64,28,0
  78. KERNEL4x8_L2 128,64,29,0
  79. KERNEL4x8_L2 128,64,30,0
  80. KERNEL4x8_L2 128,64,31,0
  81. KERNEL4x8_L2 128,64,32,0
  82. KERNEL4x8_L2 128,64,33,0
  83. KERNEL4x8_L2 128,64,34,0
  84. KERNEL4x8_L2 128,64,35,0
  85. KERNEL4x8_L2 128,64,36,0
  86. KERNEL4x8_L2 128,64,37,0
  87. KERNEL4x8_L2 128,64,38,0
  88. KERNEL4x8_L2 128,64,39,0
  89. KERNEL4x8_L2 128,64,40,0
  90. KERNEL4x8_L2 128,64,41,0
  91. KERNEL4x8_L2 128,64,42,0
  92. KERNEL4x8_L2 128,64,43,0
  93. KERNEL4x8_L2 128,64,44,0
  94. KERNEL4x8_L2 128,64,45,0
  95. KERNEL4x8_L2 128,64,46,0
  96. KERNEL4x8_L2 128,64,47,0
  97. KERNEL4x8_L2 128,64,48,0
  98. KERNEL4x8_L2 128,64,49,0
  99. KERNEL4x8_L2 128,64,50,0
  100. KERNEL4x8_L2 128,64,51,0
  101. KERNEL4x8_L2 128,64,52,0
  102. KERNEL4x8_L2 128,64,53,0
  103. KERNEL4x8_L2 128,64,54,0
  104. KERNEL4x8_L2 128,64,55,0
  105. KERNEL4x8_L2 128,64,56,0
  106. KERNEL4x8_L2 128,64,57,0
  107. KERNEL4x8_L2 128,64,58,0
  108. KERNEL4x8_L2 128,64,59,0
  109. KERNEL4x8_L2 128,64,60,0
  110. KERNEL4x8_L2 128,64,61,0
  111. KERNEL4x8_L2 128,64,62,0
  112. KERNEL4x8_L2 128,64,63,1
  113. bdnz CGEMM_L4x8_LOOP /* decrement CTR and run another pass while it is non-zero */
  114. MY_ALIGN
  115. CGEMM_L4x8_LOOP_END:
  116. /* Loop exit: run the END4x8_2 macro (defined elsewhere), then return to the bl site. */
  117. END4x8_2
  118. blr
  119. MY_ALIGN
  120. CGEMM_4x8_L64_SUB:
  121. /* Straight-line 4x8 block: LOAD4x8_2, then 32 kernel invocations (index 0..31, the last one is the _E2 variant). Called from CGEMM_L4x8_SUB2 when bit 64 of L is set. */
  122. LOAD4x8_2
  123. dcbt AO, PRE
  124. dcbt BO, PRE
  125. KERNEL4x8_L2 128,64,0,0
  126. KERNEL4x8_L2 128,64,1,0
  127. dcbt AO, T2
  128. KERNEL4x8_L2 128,64,2,0
  129. KERNEL4x8_L2 128,64,3,0
  130. dcbt AO, T3
  131. dcbt BO, T2
  132. KERNEL4x8_L2 128,64,4,0
  133. KERNEL4x8_L2 128,64,5,0
  134. dcbt AO, T4
  135. KERNEL4x8_L2 128,64,6,0
  136. KERNEL4x8_L2 128,64,7,0
  137. dcbt AO, T5
  138. dcbt BO, T3
  139. KERNEL4x8_L2 128,64,8,0
  140. KERNEL4x8_L2 128,64,9,0
  141. KERNEL4x8_L2 128,64,10,0
  142. KERNEL4x8_L2 128,64,11,0
  143. dcbt BO, T4
  144. KERNEL4x8_L2 128,64,12,0
  145. KERNEL4x8_L2 128,64,13,0
  146. KERNEL4x8_L2 128,64,14,0
  147. KERNEL4x8_L2 128,64,15,0
  148. KERNEL4x8_L2 128,64,16,0
  149. KERNEL4x8_L2 128,64,17,0
  150. KERNEL4x8_L2 128,64,18,0
  151. KERNEL4x8_L2 128,64,19,0
  152. KERNEL4x8_L2 128,64,20,0
  153. KERNEL4x8_L2 128,64,21,0
  154. KERNEL4x8_L2 128,64,22,0
  155. KERNEL4x8_L2 128,64,23,0
  156. KERNEL4x8_L2 128,64,24,0
  157. KERNEL4x8_L2 128,64,25,0
  158. KERNEL4x8_L2 128,64,26,0
  159. KERNEL4x8_L2 128,64,27,0
  160. KERNEL4x8_L2 128,64,28,0
  161. KERNEL4x8_L2 128,64,29,0
  162. KERNEL4x8_L2 128,64,30,0
  163. KERNEL4x8_E2 128,64,31,1
  164. blr /* return to the bl site */
  165. MY_ALIGN
  166. CGEMM_4x8_L32_SUB:
  167. /* Straight-line 4x8 block: LOAD4x8_2, then 16 kernel invocations (index 0..15, the last one is the _E2 variant). Called from CGEMM_L4x8_SUB2_32 when bit 32 of L is set. */
  168. LOAD4x8_2
  169. dcbt AO, PRE
  170. dcbt BO, PRE
  171. KERNEL4x8_L2 128,64,0,0
  172. KERNEL4x8_L2 128,64,1,0
  173. dcbt AO, T2
  174. KERNEL4x8_L2 128,64,2,0
  175. KERNEL4x8_L2 128,64,3,0
  176. dcbt AO, T3
  177. dcbt BO, T2
  178. KERNEL4x8_L2 128,64,4,0
  179. KERNEL4x8_L2 128,64,5,0
  180. dcbt AO, T4
  181. KERNEL4x8_L2 128,64,6,0
  182. KERNEL4x8_L2 128,64,7,0
  183. dcbt AO, T5
  184. dcbt BO, T3
  185. KERNEL4x8_L2 128,64,8,0
  186. KERNEL4x8_L2 128,64,9,0
  187. KERNEL4x8_L2 128,64,10,0
  188. KERNEL4x8_L2 128,64,11,0
  189. dcbt BO, T4
  190. KERNEL4x8_L2 128,64,12,0
  191. KERNEL4x8_L2 128,64,13,0
  192. KERNEL4x8_L2 128,64,14,0
  193. KERNEL4x8_E2 128,64,15,1
  194. blr /* return to the bl site */
  195. MY_ALIGN
  196. CGEMM_4x8_L16_SUB:
  197. /* Straight-line 4x8 block: LOAD4x8_2, then 8 kernel invocations (index 0..7, the last one is the _E2 variant). Called from CGEMM_L4x8_SUB2_16 when bit 16 of L is set. */
  198. LOAD4x8_2
  199. dcbt AO, PRE
  200. dcbt BO, PRE
  201. KERNEL4x8_L2 128,64,0,0
  202. KERNEL4x8_L2 128,64,1,0
  203. dcbt AO, T2
  204. KERNEL4x8_L2 128,64,2,0
  205. KERNEL4x8_L2 128,64,3,0
  206. dcbt AO, T3
  207. dcbt BO, T2
  208. KERNEL4x8_L2 128,64,4,0
  209. KERNEL4x8_L2 128,64,5,0
  210. dcbt AO, T4
  211. KERNEL4x8_L2 128,64,6,0
  212. KERNEL4x8_E2 128,64,7,1
  213. blr /* return to the bl site */
  214. MY_ALIGN
  215. CGEMM_4x4_LMAIN_SUB:
  216. /* Main K loop for the 4x4 tile. In: T8 = number of passes (the caller computes it with a shift right by 5). Macros are defined outside this section. */
  217. mtctr T8 /* CTR = pass count consumed by the bdnz below */
  218. LOAD4x4_2
  219. MY_ALIGN
  220. CGEMM_L4x4_LOOP:
  221. /* One pass = 16 KERNEL4x4_L2 invocations (index 0..15); the fourth argument is 1 only on the last one. */
  222. KERNEL4x4_L2 64,64,0,0
  223. CGEMM_L4x4_K32:
  224. /* Secondary entry point: CGEMM_L4x4_SUB0 bl's here (after the index-0 step) when the K count is 32 or 33. */
  225. KERNEL4x4_L2 64,64,1,0
  226. KERNEL4x4_L2 64,64,2,0
  227. KERNEL4x4_L2 64,64,3,0
  228. KERNEL4x4_L2 64,64,4,0
  229. KERNEL4x4_L2 64,64,5,0
  230. KERNEL4x4_L2 64,64,6,0
  231. KERNEL4x4_L2 64,64,7,0
  232. KERNEL4x4_L2 64,64,8,0
  233. KERNEL4x4_L2 64,64,9,0
  234. KERNEL4x4_L2 64,64,10,0
  235. KERNEL4x4_L2 64,64,11,0
  236. KERNEL4x4_L2 64,64,12,0
  237. KERNEL4x4_L2 64,64,13,0
  238. KERNEL4x4_L2 64,64,14,0
  239. KERNEL4x4_L2 64,64,15,1
  240. bdnz CGEMM_L4x4_LOOP /* decrement CTR and run another pass while it is non-zero */
  241. MY_ALIGN
  242. CGEMM_L4x4_LOOP_END:
  243. /* Loop exit: run the END4x4_2 macro (defined elsewhere), then return to the bl site. */
  244. END4x4_2
  245. blr
  246. MY_ALIGN
  247. CGEMM_4x4_L16_SUB:
  248. /* Straight-line 4x4 block: LOAD4x4_2, then 8 kernel invocations (index 0..7, the last one is the _E2 variant). Called from CGEMM_L4x4_SUB2 when bit 16 of L is set. */
  249. LOAD4x4_2
  250. KERNEL4x4_L2 64,64,0,0
  251. KERNEL4x4_L2 64,64,1,0
  252. KERNEL4x4_L2 64,64,2,0
  253. KERNEL4x4_L2 64,64,3,0
  254. KERNEL4x4_L2 64,64,4,0
  255. KERNEL4x4_L2 64,64,5,0
  256. KERNEL4x4_L2 64,64,6,0
  257. KERNEL4x4_E2 64,64,7,1
  258. blr /* return to the bl site */
  259. MY_ALIGN
  260. CGEMM_4x4_L8_SUB:
  261. /* Straight-line 4x4 block: LOAD4x4_2, then 4 kernel invocations (index 0..3, the last one is the _E2 variant). Called from CGEMM_L4x4_SUB2_8 when bit 8 of L is set. */
  262. LOAD4x4_2
  263. KERNEL4x4_L2 64,64,0,0
  264. KERNEL4x4_L2 64,64,1,0
  265. KERNEL4x4_L2 64,64,2,0
  266. KERNEL4x4_E2 64,64,3,1
  267. blr /* return to the bl site */
  268. CGEMM_4x2_LMAIN_SUB:
  269. /* Main K loop for the 4x2 tile. In: T8 = number of passes (the caller computes it with a shift right by 5). Macros are defined outside this section. */
  270. mtctr T8 /* CTR = pass count consumed by the bdnz below */
  271. LOAD4x2_2
  272. MY_ALIGN
  273. CGEMM_L4x2_LOOP:
  274. /* One pass = 16 KERNEL4x2_L2 invocations (index 0..15); the fourth argument is 1 only on the last one. */
  275. KERNEL4x2_L2 32,64,0,0
  276. CGEMM_L4x2_K32:
  277. /* Secondary entry point: CGEMM_L4x2_SUB0 bl's here (after the index-0 step) when the K count is 32 or 33. */
  278. KERNEL4x2_L2 32,64,1,0
  279. KERNEL4x2_L2 32,64,2,0
  280. KERNEL4x2_L2 32,64,3,0
  281. KERNEL4x2_L2 32,64,4,0
  282. KERNEL4x2_L2 32,64,5,0
  283. KERNEL4x2_L2 32,64,6,0
  284. KERNEL4x2_L2 32,64,7,0
  285. KERNEL4x2_L2 32,64,8,0
  286. KERNEL4x2_L2 32,64,9,0
  287. KERNEL4x2_L2 32,64,10,0
  288. KERNEL4x2_L2 32,64,11,0
  289. KERNEL4x2_L2 32,64,12,0
  290. KERNEL4x2_L2 32,64,13,0
  291. KERNEL4x2_L2 32,64,14,0
  292. KERNEL4x2_L2 32,64,15,1
  293. bdnz CGEMM_L4x2_LOOP /* decrement CTR and run another pass while it is non-zero */
  294. MY_ALIGN
  295. CGEMM_L4x2_LOOP_END:
  296. /* Loop exit: run the END4x2_2 macro (defined elsewhere), then return to the bl site. */
  297. END4x2_2
  298. blr
  299. MY_ALIGN
  300. CGEMM_4x2_L16_SUB:
  301. /* Straight-line 4x2 block: LOAD4x2_2, then 8 kernel invocations (index 0..7, the last one is the _E2 variant). Called from CGEMM_L4x2_SUB2 when bit 16 of L is set. */
  302. LOAD4x2_2
  303. KERNEL4x2_L2 32,64,0,0
  304. KERNEL4x2_L2 32,64,1,0
  305. KERNEL4x2_L2 32,64,2,0
  306. KERNEL4x2_L2 32,64,3,0
  307. KERNEL4x2_L2 32,64,4,0
  308. KERNEL4x2_L2 32,64,5,0
  309. KERNEL4x2_L2 32,64,6,0
  310. KERNEL4x2_E2 32,64,7,1
  311. blr /* return to the bl site */
  312. MY_ALIGN
  313. CGEMM_4x2_L8_SUB:
  314. /* Straight-line 4x2 block: LOAD4x2_2, then 4 kernel invocations (index 0..3, the last one is the _E2 variant). Called from CGEMM_L4x2_SUB2_8 when bit 8 of L is set. */
  315. LOAD4x2_2
  316. KERNEL4x2_L2 32,64,0,0
  317. KERNEL4x2_L2 32,64,1,0
  318. KERNEL4x2_L2 32,64,2,0
  319. KERNEL4x2_E2 32,64,3,1
  320. blr /* return to the bl site */
  321. CGEMM_4x1_LMAIN_SUB:
  322. /* Main K loop for the 4x1 tile. In: T8 = number of passes (the caller computes it with a shift right by 5). Macros are defined outside this section. */
  323. mtctr T8 /* CTR = pass count consumed by the bdnz below */
  324. LOAD4x1_2
  325. MY_ALIGN
  326. CGEMM_L4x1_LOOP:
  327. /* One pass = 16 KERNEL4x1_L2 invocations (index 0..15); the fourth argument is 1 only on the last one. */
  328. KERNEL4x1_L2 16,64,0,0
  329. CGEMM_L4x1_K32:
  330. /* Secondary entry point: CGEMM_L4x1_SUB0 bl's here (after the index-0 step) when the K count is 32 or 33. */
  331. KERNEL4x1_L2 16,64,1,0
  332. KERNEL4x1_L2 16,64,2,0
  333. KERNEL4x1_L2 16,64,3,0
  334. KERNEL4x1_L2 16,64,4,0
  335. KERNEL4x1_L2 16,64,5,0
  336. KERNEL4x1_L2 16,64,6,0
  337. KERNEL4x1_L2 16,64,7,0
  338. KERNEL4x1_L2 16,64,8,0
  339. KERNEL4x1_L2 16,64,9,0
  340. KERNEL4x1_L2 16,64,10,0
  341. KERNEL4x1_L2 16,64,11,0
  342. KERNEL4x1_L2 16,64,12,0
  343. KERNEL4x1_L2 16,64,13,0
  344. KERNEL4x1_L2 16,64,14,0
  345. KERNEL4x1_L2 16,64,15,1
  346. bdnz CGEMM_L4x1_LOOP /* decrement CTR and run another pass while it is non-zero */
  347. MY_ALIGN
  348. CGEMM_L4x1_LOOP_END:
  349. /* Loop exit: run the END4x1_2 macro (defined elsewhere), then return to the bl site. */
  350. END4x1_2
  351. blr
  352. MY_ALIGN
  353. CGEMM_4x1_L16_SUB:
  354. /* Straight-line 4x1 block: LOAD4x1_2, then 8 kernel invocations (index 0..7, the last one is the _E2 variant). Called from CGEMM_L4x1_SUB2 when bit 16 of L is set. */
  355. LOAD4x1_2
  356. KERNEL4x1_L2 16,64,0,0
  357. KERNEL4x1_L2 16,64,1,0
  358. KERNEL4x1_L2 16,64,2,0
  359. KERNEL4x1_L2 16,64,3,0
  360. KERNEL4x1_L2 16,64,4,0
  361. KERNEL4x1_L2 16,64,5,0
  362. KERNEL4x1_L2 16,64,6,0
  363. KERNEL4x1_E2 16,64,7,1
  364. blr /* return to the bl site */
  365. MY_ALIGN
  366. CGEMM_4x1_L8_SUB:
  367. /* Straight-line 4x1 block: LOAD4x1_2, then 4 kernel invocations (index 0..3, the last one is the _E2 variant). Called from CGEMM_L4x1_SUB2_8 when bit 8 of L is set. */
  368. LOAD4x1_2
  369. KERNEL4x1_L2 16,64,0,0
  370. KERNEL4x1_L2 16,64,1,0
  371. KERNEL4x1_L2 16,64,2,0
  372. KERNEL4x1_E2 16,64,3,1
  373. blr /* return to the bl site */
  374. /* MAIN LOOP BEGINS */
  375. MY_ALIGN
  376. CGEMM_L4:
  377. /* Entry for the 4-wide column panels: builds the vs57 mask, sets J = N >> 2 and skips to CGEMM_L4_END when J <= 0. */
  378. #if defined(TRMMKERNEL) && !defined(LEFT)
  379. neg TEMP_REG, OFFSET /* TEMP_REG = -OFFSET (TRMM builds without LEFT only) */
  380. #endif
  381. /* Pre set value in vs57 for masking. The original note gave 0xffff0000ffff0000; NOTE(review): by the instruction semantics below each doubleword should end up 0xffffffff00000000 - confirm. */
  382. vspltisb v24, -1 /* v24 (the same register as vs56) = all-ones bytes */
  383. vspltisb v25, 0 /* v25 (the same register as vs57) = all-zero bytes */
  384. xxsldwi vs57, vs56, vs57, 1 /* vs57 = (vs56:vs57) shifted left by one word: three all-ones words followed by one zero word */
  385. xxpermdi vs57, vs57, vs57, 3 /* copy the low doubleword of vs57 into both doublewords */
  386. srawi. J, N, 2 /* J = N >> 2 (arithmetic shift), result recorded in CR0 */
  387. ble CGEMM_L4_END /* no full 4-column panel: skip this whole section */
  388. CGEMM_L4_BEGIN:
  389. /* Per-panel setup (top of the J loop): CO = C, AO = A, C advanced by 4*LDC, I = M >> 3. */
  390. mr CO, C /* CO = output pointer used for this panel */
  391. slwi T1, LDC , 2 /* T1 = LDC * 4 */
  392. add T2,C,LDC /* T2 = C + LDC, used only by the dcbt below before T2 is reloaded */
  393. mr AO, A /* restart the A pointer for this panel */
  394. add C, C, T1 /* C += 4 * LDC for the next panel */
  395. #if defined(TRMMKERNEL) && defined(LEFT)
  396. mr TEMP_REG, OFFSET /*off = offset;*/
  397. #endif
  398. srawi. I, M, 3 /* I = M >> 3: number of 8-row tiles, result recorded in CR0 */
  399. ble CGEMM_L4x8_END /* no 8-row tile: go straight to the smaller tiles */
  400. dcbt CO,r0 /*just prefetch*/
  401. dcbt T2,r0
  402. CGEMM_L4x8_BEGIN:
  403. /* 4x8 tile (top of the I loop): set BO, compute the pass count T8, zero the accumulators, then dispatch on the K count. */
  404. #if defined(TRMMKERNEL)
  405. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
  406. #else
  407. mr BO, B
  408. dcbt B, r0
  409. #endif
  410. dcbt AO, r0
  411. #if defined(TRMMKERNEL)
  412. REFRESH_TEMP_BK T6,K,TEMP_REG,8,4
  413. mr T1, T6
  414. /* TEMPS FOR PREFETCH */
  415. li T2, 1024
  416. li T3, 1024+512
  417. addi T1,T1, -2
  418. /* TEMPS FOR PREFETCH */
  419. li T4, 2048
  420. li T5, 2048+512
  421. srawi. T8, T1, 7 /* T8 = (T6-2) >> 7: number of full passes of the main loop; CR0 is tested after ZERO4x8 */
  422. #else
  423. mr T1, K
  424. /* TEMPS FOR PREFETCH */
  425. li T2, 1024
  426. li T3, 1024+512
  427. addi T1,T1, -2
  428. /* TEMPS FOR PREFETCH */
  429. li T4, 2048
  430. li T5, 2048+512
  431. srawi. T8, T1, 7 /* T8 = (K-2) >> 7: number of full passes of the main loop; CR0 is tested after ZERO4x8 */
  432. #endif
  433. ZERO4x8
  434. ble CGEMM_L4x8_SUB0 /* T8 <= 0 (uses CR0 from the srawi. above): no full pass, handle a short K */
  435. bl CGEMM_L4x8_LMAIN_SUB
  436. andi. L, T1, 127 /* L = T1 & 127: steps left after the main loop */
  437. ble CGEMM_L4x8_SAVE /* nothing left (andi. never yields a negative result, so ble acts as branch-if-zero) */
  438. b CGEMM_L4x8_SUB2
  439. CGEMM_L4x8_SUB0:
  440. /* Short-K path: L = K count & 255, with dedicated handling when the count is exactly 129 or 128. */
  441. #if defined(TRMMKERNEL)
  442. andi. L, T6, 255
  443. cmpwi T6,129
  444. #else
  445. andi. L, K, 255
  446. cmpwi K,129 /* this compare replaces the CR0 left by andi. above */
  447. #endif
  448. li T8,1 /* a single pass for the K128 tail entered below */
  449. bne CMP4x8_128K /* count != 129 */
  450. addi BO,BO,-32
  451. addi AO,AO,-64
  452. LOAD4x8O 64,32
  453. END4x8_WITHOUT_ADD
  454. LOAD4x8_2O 128, 64
  455. mtctr T8
  456. bl CGEMM_L4x8_K128
  457. b CGEMM_L4x8_SAVE
  458. CMP4x8_128K:
  459. /* Count == 128: enter the unrolled loop at CGEMM_L4x8_K128 for one pass; any other count goes to the bit-wise tail. */
  460. #if defined(TRMMKERNEL)
  461. cmpwi T6,128
  462. #else
  463. cmpwi K,128
  464. #endif
  465. bne CGEMM_L4x8_SUB2
  466. MY_ALIGN
  467. mtctr T8
  468. addi BO,BO,-64
  469. addi AO,AO,-128
  470. LOAD4x8_2O 128,64
  471. bl CGEMM_L4x8_K128
  472. b CGEMM_L4x8_SAVE
  473. MY_ALIGN
  474. CGEMM_L4x8_SUB2:
  475. /* Tail: test the bits of L from 64 down to 1 and run the matching fixed-size block for every set bit. */
  476. andi. T1,L, 64
  477. ble CGEMM_L4x8_SUB2_32
  478. bl CGEMM_4x8_L64_SUB
  479. MY_ALIGN
  480. CGEMM_L4x8_SUB2_32:
  481. /* bit 32 of L */
  482. andi. T1,L, 32
  483. ble CGEMM_L4x8_SUB2_16
  484. bl CGEMM_4x8_L32_SUB
  485. MY_ALIGN
  486. CGEMM_L4x8_SUB2_16:
  487. /* bit 16 of L */
  488. andi. T1,L, 16
  489. ble CGEMM_L4x8_SUB2_8
  490. bl CGEMM_4x8_L16_SUB
  491. MY_ALIGN
  492. CGEMM_L4x8_SUB2_8:
  493. /* bit 8 of L: handled inline with 4 kernel invocations */
  494. andi. T1,L, 8
  495. ble CGEMM_L4x8_SUB2_4
  496. LOAD4x8_2
  497. KERNEL4x8_L2 128,64, 0,0
  498. KERNEL4x8_L2 128,64, 1,0
  499. KERNEL4x8_L2 128,64, 2,0
  500. KERNEL4x8_E2 128,64, 3,1
  501. MY_ALIGN
  502. CGEMM_L4x8_SUB2_4:
  503. /* bit 4 of L: handled inline with 2 kernel invocations */
  504. andi. T1,L, 4
  505. ble CGEMM_L4x8_SUB2_2
  506. LOAD4x8_2
  507. KERNEL4x8_L2 128,64, 0,0
  508. KERNEL4x8_E2 128,64, 1,1
  509. MY_ALIGN
  510. CGEMM_L4x8_SUB2_2:
  511. /* bit 2 of L: handled inline with 1 kernel invocation */
  512. andi. T1,L, 2
  513. ble CGEMM_L4x8_SUB2_1
  514. LOAD4x8_2
  515. KERNEL4x8_E2 128,64, 0,1
  516. MY_ALIGN
  517. CGEMM_L4x8_SUB2_1:
  518. /* bit 1 of L: the KERNEL4x8 macro (defined elsewhere) */
  519. andi. T1,L, 1
  520. ble CGEMM_L4x8_SAVE
  521. KERNEL4x8
  522. MY_ALIGN
  523. CGEMM_L4x8_SAVE:
  524. /* Store the tile and loop over I. CR0 from the addic. is consumed by the bgt after the macros, so they are expected not to modify CR0. */
  525. addic. I, I, -1 /* I-- ; result recorded in CR0 */
  526. MY_ALIGN
  527. SAVE4x8
  528. #if defined(TRMMKERNEL)
  529. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4
  530. #endif
  531. bgt CGEMM_L4x8_BEGIN /* more 8-row tiles remain */
  532. andi. T2, M, 7 /* any rows left over (M & 7)? */
  533. ble CGEMM_L4x1_END /* none: this panel is done */
  534. andi. T1, M, 4
  535. ble CGEMM_L4x4_END /* no 4-row tile: continue with the 2-row check */
  536. b CGEMM_L4x4_BEGIN /* CGEMM_L4x4_BEGIN repeats the two tests above */
  537. MY_ALIGN
  538. CGEMM_L4x8_END:
  539. /* Reached directly from CGEMM_L4_BEGIN when there is no 8-row tile; falls into the 4-row tile code. */
  540. CGEMM_L4x4_BEGIN:
  541. /* 4x4 tile: runs once per panel when bit 4 of M is set. */
  542. andi. T2, M, 7 /* any rows left over (M & 7)? */
  543. ble CGEMM_L4x1_END /* none: this panel is done */
  544. andi. T1, M, 4
  545. ble CGEMM_L4x4_END /* bit 4 clear: skip the 4-row tile */
  546. #if defined(TRMMKERNEL)
  547. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
  548. #else
  549. mr BO, B
  550. #endif
  551. #if defined(TRMMKERNEL)
  552. REFRESH_TEMP_BK T6,K,TEMP_REG,4,4
  553. mr T1, T6
  554. addi T1,T1, -2
  555. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: number of full passes of the main loop; CR0 is tested after ZERO4x4 */
  556. #else
  557. mr T1, K
  558. addi T1,T1, -2
  559. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: number of full passes of the main loop; CR0 is tested after ZERO4x4 */
  560. #endif
  561. ZERO4x4
  562. ble CGEMM_L4x4_SUB0 /* T8 <= 0 (uses CR0 from the srawi. above): no full pass, handle a short K */
  563. bl CGEMM_4x4_LMAIN_SUB
  564. andi. L, T1, 31 /* L = T1 & 31: steps left after the main loop */
  565. ble CGEMM_L4x4_SAVE /* nothing left (andi. never yields a negative result, so ble acts as branch-if-zero) */
  566. b CGEMM_L4x4_SUB2
  567. CGEMM_L4x4_SUB0:
  568. /* Short-K path: L = K count & 63, with dedicated handling when the count is exactly 33 or 32. */
  569. #if defined(TRMMKERNEL)
  570. andi. L, T6, 63
  571. cmpwi T6,33
  572. #else
  573. andi. L, K, 63
  574. cmpwi K,33 /* this compare replaces the CR0 left by andi. above */
  575. #endif
  576. li T8,1 /* a single pass for the K32 tail entered below */
  577. bne CMP4x4_32K /* count != 33 */
  578. addi BO,BO,-32
  579. addi AO,AO,-32
  580. LOAD4x4O 32,32
  581. END4x4_WITHOUT_ADD
  582. LOAD4x4_2O 64, 64
  583. mtctr T8
  584. bl CGEMM_L4x4_K32
  585. b CGEMM_L4x4_SAVE
  586. CMP4x4_32K:
  587. /* Count == 32: enter the unrolled loop at CGEMM_L4x4_K32 for one pass; any other count goes to the bit-wise tail. */
  588. #if defined(TRMMKERNEL)
  589. cmpwi T6,32
  590. #else
  591. cmpwi K,32
  592. #endif
  593. bne CGEMM_L4x4_SUB2
  594. MY_ALIGN
  595. mtctr T8
  596. addi BO,BO,-64
  597. addi AO,AO,-64
  598. LOAD4x4_2O 64,64
  599. bl CGEMM_L4x4_K32
  600. b CGEMM_L4x4_SAVE
  601. MY_ALIGN
  602. MY_ALIGN
  603. CGEMM_L4x4_SUB2:
  604. /* Tail: test the bits of L from 16 down to 1 and run the matching fixed-size block for every set bit. */
  605. andi. T1,L, 16
  606. ble CGEMM_L4x4_SUB2_8
  607. bl CGEMM_4x4_L16_SUB
  608. MY_ALIGN
  609. CGEMM_L4x4_SUB2_8:
  610. /* bit 8 of L */
  611. andi. T1,L, 8
  612. ble CGEMM_L4x4_SUB2_4
  613. bl CGEMM_4x4_L8_SUB
  614. MY_ALIGN
  615. CGEMM_L4x4_SUB2_4:
  616. /* bit 4 of L: handled inline with 2 kernel invocations */
  617. andi. T1,L, 4
  618. ble CGEMM_L4x4_SUB2_2
  619. LOAD4x4_2
  620. KERNEL4x4_L2 64,64, 0,0
  621. KERNEL4x4_E2 64,64, 1,1
  622. MY_ALIGN
  623. CGEMM_L4x4_SUB2_2:
  624. /* bit 2 of L: handled inline with 1 kernel invocation */
  625. andi. T1,L, 2
  626. ble CGEMM_L4x4_SUB2_1
  627. LOAD4x4_2
  628. KERNEL4x4_E2 64,64, 0,1
  629. MY_ALIGN
  630. CGEMM_L4x4_SUB2_1:
  631. /* bit 1 of L: the KERNEL4x4 macro (defined elsewhere) */
  632. andi. T1,L, 1
  633. ble CGEMM_L4x4_SAVE
  634. KERNEL4x4
  635. CGEMM_L4x4_SAVE:
  636. /* Store the 4x4 tile; there is no loop here, execution falls through to the 2-row tile. */
  637. SAVE4x4
  638. #if defined(TRMMKERNEL)
  639. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4
  640. #endif
  641. CGEMM_L4x4_END:
  642. /* Target used when the 4-row tile is skipped; falls into the 2-row tile code. */
  643. CGEMM_L4x2_BEGIN:
  644. /* 4x2 tile: runs once per panel when bit 2 of M is set. */
  645. andi. T1, M, 2
  646. ble CGEMM_L4x2_END /* bit 2 clear: skip the 2-row tile */
  647. #if defined(TRMMKERNEL)
  648. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
  649. #else
  650. mr BO, B
  651. #endif
  652. #if defined(TRMMKERNEL)
  653. REFRESH_TEMP_BK T6,K,TEMP_REG,2,4
  654. mr T1, T6
  655. addi T1,T1, -2
  656. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: number of full passes of the main loop; CR0 is tested after ZERO4x2 */
  657. #else
  658. mr T1, K
  659. addi T1,T1, -2
  660. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: number of full passes of the main loop; CR0 is tested after ZERO4x2 */
  661. #endif
  662. ZERO4x2
  663. ble CGEMM_L4x2_SUB0 /* T8 <= 0 (uses CR0 from the srawi. above): no full pass, handle a short K */
  664. bl CGEMM_4x2_LMAIN_SUB
  665. andi. L, T1, 31 /* L = T1 & 31: steps left after the main loop */
  666. ble CGEMM_L4x2_SAVE /* nothing left (andi. never yields a negative result, so ble acts as branch-if-zero) */
  667. b CGEMM_L4x2_SUB2
  668. CGEMM_L4x2_SUB0:
  669. /* Short-K path: L = K count & 63, with dedicated handling when the count is exactly 33 or 32. */
  670. #if defined(TRMMKERNEL)
  671. andi. L, T6, 63
  672. cmpwi T6,33
  673. #else
  674. andi. L, K, 63
  675. cmpwi K,33 /* this compare replaces the CR0 left by andi. above */
  676. #endif
  677. li T8,1 /* a single pass for the K32 tail entered below */
  678. bne CMP4x2_32K /* count != 33 */
  679. addi BO,BO,-32
  680. addi AO,AO,-16
  681. LOAD4x2O 16,32
  682. END4x2_WITHOUT_ADD
  683. LOAD4x2_2O 32, 64
  684. mtctr T8
  685. bl CGEMM_L4x2_K32
  686. b CGEMM_L4x2_SAVE
  687. CMP4x2_32K:
  688. /* Count == 32: enter the unrolled loop at CGEMM_L4x2_K32 for one pass; any other count goes to the bit-wise tail. */
  689. #if defined(TRMMKERNEL)
  690. cmpwi T6,32
  691. #else
  692. cmpwi K,32
  693. #endif
  694. bne CGEMM_L4x2_SUB2
  695. MY_ALIGN
  696. mtctr T8
  697. addi BO,BO,-64
  698. addi AO,AO,-32
  699. LOAD4x2_2O 32,64
  700. bl CGEMM_L4x2_K32
  701. b CGEMM_L4x2_SAVE
  702. MY_ALIGN
  703. MY_ALIGN
  704. CGEMM_L4x2_SUB2:
  705. /* Tail: test the bits of L from 16 down to 1 and run the matching fixed-size block for every set bit. */
  706. andi. T1,L, 16
  707. ble CGEMM_L4x2_SUB2_8
  708. bl CGEMM_4x2_L16_SUB
  709. MY_ALIGN
  710. CGEMM_L4x2_SUB2_8:
  711. /* bit 8 of L */
  712. andi. T1,L, 8
  713. ble CGEMM_L4x2_SUB2_4
  714. bl CGEMM_4x2_L8_SUB
  715. MY_ALIGN
  716. CGEMM_L4x2_SUB2_4:
  717. /* bit 4 of L: handled inline with 2 kernel invocations */
  718. andi. T1,L, 4
  719. ble CGEMM_L4x2_SUB2_2
  720. LOAD4x2_2
  721. KERNEL4x2_L2 32,64, 0,0
  722. KERNEL4x2_E2 32,64, 1,1
  723. MY_ALIGN
  724. CGEMM_L4x2_SUB2_2:
  725. /* bit 2 of L: handled inline with 1 kernel invocation */
  726. andi. T1,L, 2
  727. ble CGEMM_L4x2_SUB2_1
  728. LOAD4x2_2
  729. KERNEL4x2_E2 32,64, 0,1
  730. MY_ALIGN
  731. CGEMM_L4x2_SUB2_1:
  732. /* bit 1 of L: the KERNEL4x2 macro (defined elsewhere) */
  733. andi. T1,L, 1
  734. ble CGEMM_L4x2_SAVE
  735. KERNEL4x2
  736. MY_ALIGN
  737. CGEMM_L4x2_SAVE:
  738. /* Store the 4x2 tile; there is no loop here, execution falls through to the 1-row tile. */
  739. SAVE4x2
  740. #if defined(TRMMKERNEL)
  741. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4
  742. #endif
  743. CGEMM_L4x2_END:
  744. /* Target used when the 2-row tile is skipped; falls into the 1-row tile code. */
  745. CGEMM_L4x1_BEGIN:
  746. /* 4x1 tile: runs once per panel when bit 1 of M is set. */
  747. andi. T1, M, 1
  748. ble CGEMM_L4x1_END /* bit 1 clear: skip the 1-row tile */
  749. #if defined(TRMMKERNEL)
  750. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
  751. #else
  752. mr BO, B
  753. #endif
  754. #if defined(TRMMKERNEL)
  755. REFRESH_TEMP_BK T6,K,TEMP_REG,1,4
  756. mr T1, T6
  757. addi T1,T1, -2
  758. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: number of full passes of the main loop; CR0 is tested after ZERO4x1 */
  759. #else
  760. mr T1, K
  761. addi T1,T1, -2
  762. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: number of full passes of the main loop; CR0 is tested after ZERO4x1 */
  763. #endif
  764. ZERO4x1
  765. ble CGEMM_L4x1_SUB0 /* T8 <= 0 (uses CR0 from the srawi. above): no full pass, handle a short K */
  766. bl CGEMM_4x1_LMAIN_SUB
  767. andi. L, T1, 31 /* L = T1 & 31: steps left after the main loop */
  768. ble CGEMM_L4x1_SAVE /* nothing left (andi. never yields a negative result, so ble acts as branch-if-zero) */
  769. b CGEMM_L4x1_SUB2
  770. CGEMM_L4x1_SUB0:
  771. /* Short-K path: L = K count & 63, with dedicated handling when the count is exactly 33 or 32. */
  772. #if defined(TRMMKERNEL)
  773. andi. L, T6, 63
  774. cmpwi T6,33
  775. #else
  776. andi. L, K, 63
  777. cmpwi K,33 /* this compare replaces the CR0 left by andi. above */
  778. #endif
  779. li T8,1 /* a single pass for the K32 tail entered below */
  780. bne CMP4x1_32K /* count != 33 */
  781. addi BO,BO,-32
  782. addi AO,AO,-8
  783. LOAD4x1O 8,32
  784. END4x1_WITHOUT_ADD
  785. LOAD4x1_2O 16, 64
  786. mtctr T8
  787. bl CGEMM_L4x1_K32
  788. b CGEMM_L4x1_SAVE
  789. CMP4x1_32K:
  790. /* Count == 32: enter the unrolled loop at CGEMM_L4x1_K32 for one pass; any other count goes to the bit-wise tail. */
  791. #if defined(TRMMKERNEL)
  792. cmpwi T6,32
  793. #else
  794. cmpwi K,32
  795. #endif
  796. bne CGEMM_L4x1_SUB2
  797. MY_ALIGN
  798. mtctr T8
  799. addi BO,BO,-64
  800. addi AO,AO,-16
  801. LOAD4x1_2O 16,64
  802. bl CGEMM_L4x1_K32
  803. b CGEMM_L4x1_SAVE
  804. MY_ALIGN
  805. MY_ALIGN
  806. CGEMM_L4x1_SUB2:
  807. /* Tail: test the bits of L from 16 down to 1 and run the matching fixed-size block for every set bit. */
  808. andi. T1,L, 16
  809. ble CGEMM_L4x1_SUB2_8
  810. bl CGEMM_4x1_L16_SUB
  811. MY_ALIGN
  812. CGEMM_L4x1_SUB2_8:
  813. /* bit 8 of L */
  814. andi. T1,L, 8
  815. ble CGEMM_L4x1_SUB2_4
  816. bl CGEMM_4x1_L8_SUB
  817. MY_ALIGN
  818. CGEMM_L4x1_SUB2_4:
  819. /* bit 4 of L: handled inline with 2 kernel invocations */
  820. andi. T1,L, 4
  821. ble CGEMM_L4x1_SUB2_2
  822. LOAD4x1_2
  823. KERNEL4x1_L2 16,64, 0,0
  824. KERNEL4x1_E2 16,64, 1,1
  825. MY_ALIGN
  826. CGEMM_L4x1_SUB2_2:
  827. /* bit 2 of L: handled inline with 1 kernel invocation */
  828. andi. T1,L, 2
  829. ble CGEMM_L4x1_SUB2_1
  830. LOAD4x1_2
  831. KERNEL4x1_E2 16,64, 0,1
  832. MY_ALIGN
  833. CGEMM_L4x1_SUB2_1:
  834. /* bit 1 of L: the KERNEL4x1 macro (defined elsewhere) */
  835. andi. T1,L, 1
  836. ble CGEMM_L4x1_SAVE
  837. KERNEL4x1
  838. MY_ALIGN
  839. CGEMM_L4x1_SAVE:
  840. /* Store the 4x1 tile; execution falls through to the end of the panel. */
  841. SAVE4x1
  842. #if defined(TRMMKERNEL)
  843. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4
  844. #endif
  845. CGEMM_L4x1_END:
  846. /* End of one 4-column panel: advance B by K*32, decrement J and loop back while J > 0. */
  847. slwi T1, K, 5 /* T1 = K * 32 */
  848. addic. J, J, -1 /* J-- ; result recorded in CR0 and tested by the bgt below */
  849. add B, B, T1 /* B += K * 32 */
  850. #if defined(TRMMKERNEL) && !defined(LEFT)
  851. addi TEMP_REG, TEMP_REG, 4
  852. #endif
  853. bgt CGEMM_L4_BEGIN /* more 4-column panels remain */
  854. CGEMM_L4_END:
  855. b CGEMM_L2 /* jump over the 2-column helper subroutines; CGEMM_L2 is defined beyond this section */
  856. /* MINI SUBROUTINES (2-column variants): each ends with blr; their callers are not in this section. */
  857. /* 2x8 MAIN 128x+2 LOOP */
  858. CGEMM_L2x8_LMAIN_SUB:
  859. /* In: T8 = number of loop passes; T2..T5 = dcbt offsets; AO/BO = prefetch bases. PRE and the LOAD/KERNEL/END macros are defined outside this section. */
  860. mtctr T8 /* CTR = pass count consumed by the bdnz below */
  861. LOAD2x8_2
  862. MY_ALIGN
  863. CGEMM_L2x8_LOOP:
  864. /* One pass = 64 KERNEL2x8_L2 invocations (third argument 0..63); the fourth argument is 1 only on the last one. */
  865. dcbt AO, PRE
  866. dcbt BO, PRE
  867. KERNEL2x8_L2 128,32,0,0
  868. CGEMM_L2x8_K128:
  869. /* Secondary entry point placed after the index-0 step; presumably used like CGEMM_L4x8_K128 for a K count of 128/129 - the callers are outside this view. */
  870. KERNEL2x8_L2 128,32,1,0
  871. dcbt AO, T2
  872. KERNEL2x8_L2 128,32,2,0
  873. KERNEL2x8_L2 128,32,3,0
  874. dcbt AO, T3
  875. dcbt BO, T2
  876. KERNEL2x8_L2 128,32,4,0
  877. KERNEL2x8_L2 128,32,5,0
  878. dcbt AO, T4
  879. KERNEL2x8_L2 128,32,6,0
  880. KERNEL2x8_L2 128,32,7,0
  881. dcbt AO, T5
  882. dcbt BO, T3
  883. KERNEL2x8_L2 128,32,8,0
  884. KERNEL2x8_L2 128,32,9,0
  885. KERNEL2x8_L2 128,32,10,0
  886. KERNEL2x8_L2 128,32,11,0
  887. dcbt BO, T4
  888. KERNEL2x8_L2 128,32,12,0
  889. KERNEL2x8_L2 128,32,13,0
  890. KERNEL2x8_L2 128,32,14,0
  891. KERNEL2x8_L2 128,32,15,0
  892. KERNEL2x8_L2 128,32,16,0
  893. KERNEL2x8_L2 128,32,17,0
  894. KERNEL2x8_L2 128,32,18,0
  895. KERNEL2x8_L2 128,32,19,0
  896. KERNEL2x8_L2 128,32,20,0
  897. KERNEL2x8_L2 128,32,21,0
  898. KERNEL2x8_L2 128,32,22,0
  899. KERNEL2x8_L2 128,32,23,0
  900. KERNEL2x8_L2 128,32,24,0
  901. KERNEL2x8_L2 128,32,25,0
  902. KERNEL2x8_L2 128,32,26,0
  903. KERNEL2x8_L2 128,32,27,0
  904. KERNEL2x8_L2 128,32,28,0
  905. KERNEL2x8_L2 128,32,29,0
  906. KERNEL2x8_L2 128,32,30,0
  907. KERNEL2x8_L2 128,32,31,0
  908. KERNEL2x8_L2 128,32,32,0
  909. KERNEL2x8_L2 128,32,33,0
  910. KERNEL2x8_L2 128,32,34,0
  911. KERNEL2x8_L2 128,32,35,0
  912. KERNEL2x8_L2 128,32,36,0
  913. KERNEL2x8_L2 128,32,37,0
  914. KERNEL2x8_L2 128,32,38,0
  915. KERNEL2x8_L2 128,32,39,0
  916. KERNEL2x8_L2 128,32,40,0
  917. KERNEL2x8_L2 128,32,41,0
  918. KERNEL2x8_L2 128,32,42,0
  919. KERNEL2x8_L2 128,32,43,0
  920. KERNEL2x8_L2 128,32,44,0
  921. KERNEL2x8_L2 128,32,45,0
  922. KERNEL2x8_L2 128,32,46,0
  923. KERNEL2x8_L2 128,32,47,0
  924. KERNEL2x8_L2 128,32,48,0
  925. KERNEL2x8_L2 128,32,49,0
  926. KERNEL2x8_L2 128,32,50,0
  927. KERNEL2x8_L2 128,32,51,0
  928. KERNEL2x8_L2 128,32,52,0
  929. KERNEL2x8_L2 128,32,53,0
  930. KERNEL2x8_L2 128,32,54,0
  931. KERNEL2x8_L2 128,32,55,0
  932. KERNEL2x8_L2 128,32,56,0
  933. KERNEL2x8_L2 128,32,57,0
  934. KERNEL2x8_L2 128,32,58,0
  935. KERNEL2x8_L2 128,32,59,0
  936. KERNEL2x8_L2 128,32,60,0
  937. KERNEL2x8_L2 128,32,61,0
  938. KERNEL2x8_L2 128,32,62,0
  939. KERNEL2x8_L2 128,32,63,1
  940. bdnz CGEMM_L2x8_LOOP /* decrement CTR and run another pass while it is non-zero */
  941. MY_ALIGN
  942. CGEMM_L2x8_LOOP_END:
  943. /* Loop exit: run the END2x8_2 macro (defined elsewhere), then return to the caller. */
  944. END2x8_2
  945. blr
  946. MY_ALIGN
  947. CGEMM_2x8_L64_SUB:
  948. /* Straight-line 2x8 block: LOAD2x8_2, then 32 kernel invocations (index 0..31, the last one is the _E2 variant), with dcbt prefetches interleaved. */
  949. LOAD2x8_2
  950. dcbt AO, PRE
  951. dcbt BO, PRE
  952. KERNEL2x8_L2 128,32,0,0
  953. KERNEL2x8_L2 128,32,1,0
  954. dcbt AO, T2
  955. KERNEL2x8_L2 128,32,2,0
  956. KERNEL2x8_L2 128,32,3,0
  957. dcbt AO, T3
  958. dcbt BO, T2
  959. KERNEL2x8_L2 128,32,4,0
  960. KERNEL2x8_L2 128,32,5,0
  961. dcbt AO, T4
  962. KERNEL2x8_L2 128,32,6,0
  963. KERNEL2x8_L2 128,32,7,0
  964. dcbt AO, T5
  965. dcbt BO, T3
  966. KERNEL2x8_L2 128,32,8,0
  967. KERNEL2x8_L2 128,32,9,0
  968. KERNEL2x8_L2 128,32,10,0
  969. KERNEL2x8_L2 128,32,11,0
  970. dcbt BO, T4
  971. KERNEL2x8_L2 128,32,12,0
  972. KERNEL2x8_L2 128,32,13,0
  973. KERNEL2x8_L2 128,32,14,0
  974. KERNEL2x8_L2 128,32,15,0
  975. KERNEL2x8_L2 128,32,16,0
  976. KERNEL2x8_L2 128,32,17,0
  977. KERNEL2x8_L2 128,32,18,0
  978. KERNEL2x8_L2 128,32,19,0
  979. KERNEL2x8_L2 128,32,20,0
  980. KERNEL2x8_L2 128,32,21,0
  981. KERNEL2x8_L2 128,32,22,0
  982. KERNEL2x8_L2 128,32,23,0
  983. KERNEL2x8_L2 128,32,24,0
  984. KERNEL2x8_L2 128,32,25,0
  985. KERNEL2x8_L2 128,32,26,0
  986. KERNEL2x8_L2 128,32,27,0
  987. KERNEL2x8_L2 128,32,28,0
  988. KERNEL2x8_L2 128,32,29,0
  989. KERNEL2x8_L2 128,32,30,0
  990. KERNEL2x8_E2 128,32,31,1
  991. blr /* return to the caller */
  992. MY_ALIGN
  993. CGEMM_2x8_L32_SUB:
  994. /* Straight-line 2x8 block: LOAD2x8_2, then 16 kernel invocations (index 0..15, the last one is the _E2 variant), with dcbt prefetches interleaved. */
  995. LOAD2x8_2
  996. dcbt AO, PRE
  997. dcbt BO, PRE
  998. KERNEL2x8_L2 128,32,0,0
  999. KERNEL2x8_L2 128,32,1,0
  1000. dcbt AO, T2
  1001. KERNEL2x8_L2 128,32,2,0
  1002. KERNEL2x8_L2 128,32,3,0
  1003. dcbt AO, T3
  1004. dcbt BO, T2
  1005. KERNEL2x8_L2 128,32,4,0
  1006. KERNEL2x8_L2 128,32,5,0
  1007. dcbt AO, T4
  1008. KERNEL2x8_L2 128,32,6,0
  1009. KERNEL2x8_L2 128,32,7,0
  1010. dcbt AO, T5
  1011. dcbt BO, T3
  1012. KERNEL2x8_L2 128,32,8,0
  1013. KERNEL2x8_L2 128,32,9,0
  1014. KERNEL2x8_L2 128,32,10,0
  1015. KERNEL2x8_L2 128,32,11,0
  1016. dcbt BO, T4
  1017. KERNEL2x8_L2 128,32,12,0
  1018. KERNEL2x8_L2 128,32,13,0
  1019. KERNEL2x8_L2 128,32,14,0
  1020. KERNEL2x8_E2 128,32,15,1
  1021. blr /* return to the caller */
  1022. MY_ALIGN
  1023. CGEMM_2x8_L16_SUB:
  1024. /* Straight-line 2x8 block: LOAD2x8_2, then 8 kernel invocations (index 0..7, the last one is the _E2 variant), with dcbt prefetches interleaved. */
  1025. LOAD2x8_2
  1026. dcbt AO, PRE
  1027. dcbt BO, PRE
  1028. KERNEL2x8_L2 128,32,0,0
  1029. KERNEL2x8_L2 128,32,1,0
  1030. dcbt AO, T2
  1031. KERNEL2x8_L2 128,32,2,0
  1032. KERNEL2x8_L2 128,32,3,0
  1033. dcbt AO, T3
  1034. dcbt BO, T2
  1035. KERNEL2x8_L2 128,32,4,0
  1036. KERNEL2x8_L2 128,32,5,0
  1037. dcbt AO, T4
  1038. KERNEL2x8_L2 128,32,6,0
  1039. KERNEL2x8_E2 128,32,7,1
  1040. blr /* return to the caller */
  1041. MY_ALIGN
  1042. CGEMM_2x4_LMAIN_SUB:
  1043. /* Main K loop for the 2x4 tile. In: T8 = number of passes. Macros are defined outside this section. */
  1044. mtctr T8 /* CTR = pass count consumed by the bdnz below */
  1045. LOAD2x4_2
  1046. MY_ALIGN
  1047. CGEMM_L2x4_LOOP:
  1048. /* One pass = 16 KERNEL2x4_L2 invocations (index 0..15); the fourth argument is 1 only on the last one. */
  1049. KERNEL2x4_L2 64,32,0,0
  1050. CGEMM_L2x4_K32:
  1051. /* Secondary entry point placed after the index-0 step; its callers are outside this view. */
  1052. KERNEL2x4_L2 64,32,1,0
  1053. KERNEL2x4_L2 64,32,2,0
  1054. KERNEL2x4_L2 64,32,3,0
  1055. KERNEL2x4_L2 64,32,4,0
  1056. KERNEL2x4_L2 64,32,5,0
  1057. KERNEL2x4_L2 64,32,6,0
  1058. KERNEL2x4_L2 64,32,7,0
  1059. KERNEL2x4_L2 64,32,8,0
  1060. KERNEL2x4_L2 64,32,9,0
  1061. KERNEL2x4_L2 64,32,10,0
  1062. KERNEL2x4_L2 64,32,11,0
  1063. KERNEL2x4_L2 64,32,12,0
  1064. KERNEL2x4_L2 64,32,13,0
  1065. KERNEL2x4_L2 64,32,14,0
  1066. KERNEL2x4_L2 64,32,15,1
  1067. bdnz CGEMM_L2x4_LOOP /* decrement CTR and run another pass while it is non-zero */
  1068. MY_ALIGN
  1069. CGEMM_L2x4_LOOP_END:
  1070. /* Loop exit: run the END2x4_2 macro (defined elsewhere), then return to the caller. */
  1071. END2x4_2
  1072. blr
  1073. MY_ALIGN
  1074. CGEMM_2x4_L16_SUB:
  1075. /* Straight-line 2x4 block: LOAD2x4_2, then 8 kernel invocations (index 0..7, the last one is the _E2 variant). */
  1076. LOAD2x4_2
  1077. KERNEL2x4_L2 64,32,0,0
  1078. KERNEL2x4_L2 64,32,1,0
  1079. KERNEL2x4_L2 64,32,2,0
  1080. KERNEL2x4_L2 64,32,3,0
  1081. KERNEL2x4_L2 64,32,4,0
  1082. KERNEL2x4_L2 64,32,5,0
  1083. KERNEL2x4_L2 64,32,6,0
  1084. KERNEL2x4_E2 64,32,7,1
  1085. blr /* return to the caller */
  1086. MY_ALIGN
  1087. CGEMM_2x4_L8_SUB:
  1088. /* Straight-line 2x4 block: LOAD2x4_2, then 4 kernel invocations (index 0..3, the last one is the _E2 variant). */
  1089. LOAD2x4_2
  1090. KERNEL2x4_L2 64,32,0,0
  1091. KERNEL2x4_L2 64,32,1,0
  1092. KERNEL2x4_L2 64,32,2,0
  1093. KERNEL2x4_E2 64,32,3,1
  1094. blr /* return to the caller */
  1095. CGEMM_2x2_LMAIN_SUB:
  1096. /* Main K loop for the 2x2 tile. In: T8 = number of passes. Macros are defined outside this section. */
  1097. mtctr T8 /* CTR = pass count consumed by the bdnz below */
  1098. LOAD2x2_2
  1099. MY_ALIGN
  1100. CGEMM_L2x2_LOOP:
  1101. /* One pass = 16 KERNEL2x2_L2 invocations (index 0..15); the fourth argument is 1 only on the last one. */
  1102. KERNEL2x2_L2 32,32,0,0
  1103. CGEMM_L2x2_K32:
  1104. /* Secondary entry point placed after the index-0 step; its callers are outside this view. */
  1105. KERNEL2x2_L2 32,32,1,0
  1106. KERNEL2x2_L2 32,32,2,0
  1107. KERNEL2x2_L2 32,32,3,0
  1108. KERNEL2x2_L2 32,32,4,0
  1109. KERNEL2x2_L2 32,32,5,0
  1110. KERNEL2x2_L2 32,32,6,0
  1111. KERNEL2x2_L2 32,32,7,0
  1112. KERNEL2x2_L2 32,32,8,0
  1113. KERNEL2x2_L2 32,32,9,0
  1114. KERNEL2x2_L2 32,32,10,0
  1115. KERNEL2x2_L2 32,32,11,0
  1116. KERNEL2x2_L2 32,32,12,0
  1117. KERNEL2x2_L2 32,32,13,0
  1118. KERNEL2x2_L2 32,32,14,0
  1119. KERNEL2x2_L2 32,32,15,1
  1120. bdnz CGEMM_L2x2_LOOP /* decrement CTR and run another pass while it is non-zero */
  1121. MY_ALIGN
  1122. CGEMM_L2x2_LOOP_END:
  1123. /* Loop exit: run the END2x2_2 macro (defined elsewhere), then return to the caller. */
  1124. END2x2_2
  1125. blr
  1126. MY_ALIGN
  1127. CGEMM_2x2_L16_SUB:
  1128. /* 2x2 tail helper, reached with bl when (L & 16) is set. One LOAD2x2_2, seven KERNEL2x2_L2 steps and a closing KERNEL2x2_E2, then blr. Presumably 16 K iterations - confirm against the macro definitions. */
  1129. LOAD2x2_2
  1130. KERNEL2x2_L2 32,32,0,0
  1131. KERNEL2x2_L2 32,32,1,0
  1132. KERNEL2x2_L2 32,32,2,0
  1133. KERNEL2x2_L2 32,32,3,0
  1134. KERNEL2x2_L2 32,32,4,0
  1135. KERNEL2x2_L2 32,32,5,0
  1136. KERNEL2x2_L2 32,32,6,0
  1137. KERNEL2x2_E2 32,32,7,1
  1138. blr
  1139. MY_ALIGN
  1140. CGEMM_2x2_L8_SUB:
  1141. /* 2x2 tail helper, reached with bl when (L & 8) is set. One LOAD2x2_2, three KERNEL2x2_L2 steps and a closing KERNEL2x2_E2, then blr. Presumably 8 K iterations - confirm against the macro definitions. */
  1142. LOAD2x2_2
  1143. KERNEL2x2_L2 32,32,0,0
  1144. KERNEL2x2_L2 32,32,1,0
  1145. KERNEL2x2_L2 32,32,2,0
  1146. KERNEL2x2_E2 32,32,3,1
  1147. blr
  1148. CGEMM_2x1_LMAIN_SUB:
  1149. /* Main K loop for the 2x1 tile. Entry: T8 = number of passes (copied into CTR). Exit: blr after END2x1_2. */
  1150. mtctr T8
  1151. LOAD2x1_2
  1152. MY_ALIGN
  1153. CGEMM_L2x1_LOOP:
  1154. /* One pass is 16 KERNEL2x1_L2 invocations (index 0..15). Presumably two K iterations each, i.e. 32 per pass, which matches the caller's shift by 5 - confirm in the macro file. */
  1155. KERNEL2x1_L2 16,32,0,0
  1156. CGEMM_L2x1_K32:
  1157. /* Secondary entry: the short-depth caller issues its own loads, sets CTR to 1 and does bl here, skipping the index-0 step above. */
  1158. KERNEL2x1_L2 16,32,1,0
  1159. KERNEL2x1_L2 16,32,2,0
  1160. KERNEL2x1_L2 16,32,3,0
  1161. KERNEL2x1_L2 16,32,4,0
  1162. KERNEL2x1_L2 16,32,5,0
  1163. KERNEL2x1_L2 16,32,6,0
  1164. KERNEL2x1_L2 16,32,7,0
  1165. KERNEL2x1_L2 16,32,8,0
  1166. KERNEL2x1_L2 16,32,9,0
  1167. KERNEL2x1_L2 16,32,10,0
  1168. KERNEL2x1_L2 16,32,11,0
  1169. KERNEL2x1_L2 16,32,12,0
  1170. KERNEL2x1_L2 16,32,13,0
  1171. KERNEL2x1_L2 16,32,14,0
  1172. KERNEL2x1_L2 16,32,15,1
  1173. bdnz CGEMM_L2x1_LOOP /* decrement CTR, repeat while it is non-zero */
  1174. MY_ALIGN
  1175. CGEMM_L2x1_LOOP_END:
  1176. /* Loop drained: END2x1_2 closes the sequence (presumably consumes the operands still preloaded - confirm), then return to the caller. */
  1177. END2x1_2
  1178. blr
  1179. MY_ALIGN
  1180. CGEMM_2x1_L16_SUB:
  1181. /* 2x1 tail helper, reached with bl when (L & 16) is set. One LOAD2x1_2, seven KERNEL2x1_L2 steps and a closing KERNEL2x1_E2, then blr. Presumably 16 K iterations - confirm against the macro definitions. */
  1182. LOAD2x1_2
  1183. KERNEL2x1_L2 16,32,0,0
  1184. KERNEL2x1_L2 16,32,1,0
  1185. KERNEL2x1_L2 16,32,2,0
  1186. KERNEL2x1_L2 16,32,3,0
  1187. KERNEL2x1_L2 16,32,4,0
  1188. KERNEL2x1_L2 16,32,5,0
  1189. KERNEL2x1_L2 16,32,6,0
  1190. KERNEL2x1_E2 16,32,7,1
  1191. blr
  1192. MY_ALIGN
  1193. CGEMM_2x1_L8_SUB:
  1194. /* 2x1 tail helper, reached with bl when (L & 8) is set. One LOAD2x1_2, three KERNEL2x1_L2 steps and a closing KERNEL2x1_E2, then blr. Presumably 8 K iterations - confirm against the macro definitions. */
  1195. LOAD2x1_2
  1196. KERNEL2x1_L2 16,32,0,0
  1197. KERNEL2x1_L2 16,32,1,0
  1198. KERNEL2x1_L2 16,32,2,0
  1199. KERNEL2x1_E2 16,32,3,1
  1200. blr
  1201. /* MAIN LOOP BEGINS */
  1202. MY_ALIGN
  1203. CGEMM_L2:
  1204. /* Two-column pass, executed only when bit 1 of N is set. andi. never produces a negative result, so the ble below acts as branch-if-zero. */
  1205. andi. J, N, 2
  1206. ble CGEMM_L2_END
  1207. CGEMM_L2_BEGIN:
  1208. /* CO = current C, T2 = C + LDC (used only for the prefetch below), AO = A, then C is advanced by 2*LDC for whatever pass follows. */
  1209. mr CO, C
  1210. slwi T1, LDC , 1
  1211. add T2,C,LDC
  1212. mr AO, A
  1213. add C, C, T1
  1214. #if defined(TRMMKERNEL) && defined(LEFT)
  1215. mr TEMP_REG, OFFSET /*off = offset;*/
  1216. #endif
  1217. srawi. I, M, 3 /* I = M >> 3, the number of 8-row tiles */
  1218. ble CGEMM_L2x8_END
  1219. dcbt CO,r0 /*just prefetch*/
  1220. dcbt T2,r0
  1221. CGEMM_L2x8_BEGIN:
  1222. /* Start of one 2x8 tile: reset BO, compute the main-loop pass count in T8, clear the accumulators, then dispatch to the main loop or to the short-depth path. */
  1223. #if defined(TRMMKERNEL)
  1224. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
  1225. #else
  1226. mr BO, B
  1227. dcbt B, r0
  1228. #endif
  1229. dcbt AO, r0
  1230. #if defined(TRMMKERNEL)
  1231. REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
  1232. mr T1, T6
  1233. /* T2..T5 are byte offsets used as dcbt index registers inside the unrolled loop */
  1234. li T2, 1024
  1235. li T3, 1024+512
  1236. addi T1,T1, -2
  1237. /* second pair of prefetch offsets */
  1238. li T4, 2048
  1239. li T5, 2048+512
  1240. srawi. T8, T1, 7 /* T8 = (T6-2) >> 7, the count of full 128-step passes (a quotient, not a remainder) */
  1241. #else
  1242. mr T1, K
  1243. /* T2..T5 are byte offsets used as dcbt index registers inside the unrolled loop */
  1244. li T2, 1024
  1245. li T3, 1024+512
  1246. addi T1,T1, -2
  1247. /* second pair of prefetch offsets */
  1248. li T4, 2048
  1249. li T5, 2048+512
  1250. srawi. T8, T1, 7 /* T8 = (K-2) >> 7, the count of full 128-step passes (a quotient, not a remainder) */
  1251. #endif
  1252. ZERO2x8
  1253. ble CGEMM_L2x8_SUB0 /* tests CR0 from the srawi. above. NOTE(review): relies on ZERO2x8 leaving CR0 untouched - confirm in the macro */
  1254. bl CGEMM_L2x8_LMAIN_SUB
  1255. andi. L, T1, 127 /* L = T1 & 127, the depth left over after the main loop */
  1256. ble CGEMM_L2x8_SAVE
  1257. b CGEMM_L2x8_SUB2
  1258. CGEMM_L2x8_SUB0:
  1259. /* Short-depth path, entered when the pass count computed by srawi. was <= 0. L = depth & 255. Depth 129 is special-cased here and depth 128 just below. Both reuse the unrolled loop body through CGEMM_L2x8_K128 with CTR = 1. */
  1260. #if defined(TRMMKERNEL)
  1261. andi. L, T6, 255
  1262. cmpwi T6,129
  1263. #else
  1264. andi. L, K, 255
  1265. cmpwi K,129
  1266. #endif
  1267. li T8,1 /* CTR value for a single pass */
  1268. bne CMP2x8_128K /* tests the cmpwi result, which replaced the andi. result in CR0 */
  1269. addi BO,BO,-16 /* NOTE(review): pointer bias presumably pairs with the offsets passed to LOAD2x8O below - confirm in the macro file */
  1270. addi AO,AO,-64
  1271. LOAD2x8O 64,16
  1272. END2x8_WITHOUT_ADD
  1273. LOAD2x8_2O 128, 32
  1274. mtctr T8
  1275. bl CGEMM_L2x8_K128
  1276. b CGEMM_L2x8_SAVE
  1277. CMP2x8_128K:
  1278. /* Depth exactly 128: preload with LOAD2x8_2O and run one pass from the K128 entry. Any other depth goes to the bitwise tail at CGEMM_L2x8_SUB2. */
  1279. #if defined(TRMMKERNEL)
  1280. cmpwi T6,128
  1281. #else
  1282. cmpwi K,128
  1283. #endif
  1284. bne CGEMM_L2x8_SUB2
  1285. MY_ALIGN
  1286. mtctr T8
  1287. addi BO,BO,-32
  1288. addi AO,AO,-128
  1289. LOAD2x8_2O 128,32
  1290. bl CGEMM_L2x8_K128
  1291. b CGEMM_L2x8_SAVE
  1292. MY_ALIGN
  1293. CGEMM_L2x8_SUB2:
  1294. /* Bitwise tail for the 2x8 tile: each set bit of L, from 64 down to 1, runs one fixed-size chunk. T1 is only a scratch destination for andi. here. Bit 6 (64): */
  1295. andi. T1,L, 64
  1296. ble CGEMM_L2x8_SUB2_32
  1297. bl CGEMM_2x8_L64_SUB
  1298. MY_ALIGN
  1299. CGEMM_L2x8_SUB2_32:
  1300. /* bit 5 (32) */
  1301. andi. T1,L, 32
  1302. ble CGEMM_L2x8_SUB2_16
  1303. bl CGEMM_2x8_L32_SUB
  1304. MY_ALIGN
  1305. CGEMM_L2x8_SUB2_16:
  1306. /* bit 4 (16) */
  1307. andi. T1,L, 16
  1308. ble CGEMM_L2x8_SUB2_8
  1309. bl CGEMM_2x8_L16_SUB
  1310. MY_ALIGN
  1311. CGEMM_L2x8_SUB2_8:
  1312. /* bit 3 (8): expanded inline rather than through a helper */
  1313. andi. T1,L, 8
  1314. ble CGEMM_L2x8_SUB2_4
  1315. LOAD2x8_2
  1316. KERNEL2x8_L2 128,32, 0,0
  1317. KERNEL2x8_L2 128,32, 1,0
  1318. KERNEL2x8_L2 128,32, 2,0
  1319. KERNEL2x8_E2 128,32, 3,1
  1320. MY_ALIGN
  1321. CGEMM_L2x8_SUB2_4:
  1322. /* bit 2 (4) */
  1323. andi. T1,L, 4
  1324. ble CGEMM_L2x8_SUB2_2
  1325. LOAD2x8_2
  1326. KERNEL2x8_L2 128,32, 0,0
  1327. KERNEL2x8_E2 128,32, 1,1
  1328. MY_ALIGN
  1329. CGEMM_L2x8_SUB2_2:
  1330. /* bit 1 (2) */
  1331. andi. T1,L, 2
  1332. ble CGEMM_L2x8_SUB2_1
  1333. LOAD2x8_2
  1334. KERNEL2x8_E2 128,32, 0,1
  1335. MY_ALIGN
  1336. CGEMM_L2x8_SUB2_1:
  1337. /* bit 0 (1): single-step KERNEL2x8, then fall through to the save code */
  1338. andi. T1,L, 1
  1339. ble CGEMM_L2x8_SAVE
  1340. KERNEL2x8
  1341. MY_ALIGN
  1342. CGEMM_L2x8_SAVE:
  1343. /* Store the finished 2x8 tile and loop over the remaining 8-row tiles. The tile counter is decremented before the store. NOTE(review): the bgt below relies on SAVE2x8 and REFRESH_AFTER_SAVE preserving CR0 - confirm in the macro file. */
  1344. addic. I, I, -1
  1345. MY_ALIGN
  1346. SAVE2x8
  1347. #if defined(TRMMKERNEL)
  1348. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
  1349. #endif
  1350. bgt CGEMM_L2x8_BEGIN
  1351. andi. T2, M, 7 /* no leftover rows: skip every smaller tile */
  1352. ble CGEMM_L2x1_END
  1353. andi. T1, M, 4 /* no 4-row tile: continue with the 2-row check */
  1354. ble CGEMM_L2x4_END
  1355. b CGEMM_L2x4_BEGIN
  1356. MY_ALIGN
  1357. CGEMM_L2x8_END:
  1358. /* Reached directly when M has no 8-row tiles. */
  1359. CGEMM_L2x4_BEGIN:
  1360. /* 2x4 tile: run only if bit 2 of M is set. The M & 7 and M & 4 tests repeat the ones at the end of the 2x8 save code because this label is also reached by fall-through. */
  1361. andi. T2, M, 7
  1362. ble CGEMM_L2x1_END
  1363. andi. T1, M, 4
  1364. ble CGEMM_L2x4_END
  1365. #if defined(TRMMKERNEL)
  1366. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
  1367. #else
  1368. mr BO, B
  1369. #endif
  1370. #if defined(TRMMKERNEL)
  1371. REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
  1372. mr T1, T6
  1373. addi T1,T1, -2
  1374. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5, the count of full 32-step passes (a quotient, not a remainder) */
  1375. #else
  1376. mr T1, K
  1377. addi T1,T1, -2
  1378. srawi. T8, T1, 5 /* T8 = (K-2) >> 5, the count of full 32-step passes (a quotient, not a remainder) */
  1379. #endif
  1380. ZERO2x4
  1381. ble CGEMM_L2x4_SUB0 /* tests CR0 from the srawi. above. NOTE(review): relies on ZERO2x4 leaving CR0 untouched - confirm in the macro */
  1382. bl CGEMM_2x4_LMAIN_SUB
  1383. andi. L, T1, 31 /* L = T1 & 31, the depth left over after the main loop */
  1384. ble CGEMM_L2x4_SAVE
  1385. b CGEMM_L2x4_SUB2
  1386. CGEMM_L2x4_SUB0:
  1387. /* Short-depth path, entered when the pass count computed by srawi. was <= 0. L = depth & 63. Depth 33 is special-cased here and depth 32 just below. Both reuse the unrolled loop body through CGEMM_L2x4_K32 with CTR = 1. */
  1388. #if defined(TRMMKERNEL)
  1389. andi. L, T6, 63
  1390. cmpwi T6,33
  1391. #else
  1392. andi. L, K, 63
  1393. cmpwi K,33
  1394. #endif
  1395. li T8,1 /* CTR value for a single pass */
  1396. bne CMP2x4_32K /* tests the cmpwi result, which replaced the andi. result in CR0 */
  1397. addi BO,BO,-16 /* NOTE(review): pointer bias presumably pairs with the offsets passed to LOAD2x4O below - confirm in the macro file */
  1398. addi AO,AO,-32
  1399. LOAD2x4O 32,16
  1400. END2x4_WITHOUT_ADD
  1401. LOAD2x4_2O 64, 32
  1402. mtctr T8
  1403. bl CGEMM_L2x4_K32
  1404. b CGEMM_L2x4_SAVE
  1405. CMP2x4_32K:
  1406. /* Depth exactly 32: preload with LOAD2x4_2O and run one pass from the K32 entry. Any other depth goes to the bitwise tail at CGEMM_L2x4_SUB2. */
  1407. #if defined(TRMMKERNEL)
  1408. cmpwi T6,32
  1409. #else
  1410. cmpwi K,32
  1411. #endif
  1412. bne CGEMM_L2x4_SUB2
  1413. MY_ALIGN
  1414. mtctr T8
  1415. addi BO,BO,-32
  1416. addi AO,AO,-64
  1417. LOAD2x4_2O 64,32
  1418. bl CGEMM_L2x4_K32
  1419. b CGEMM_L2x4_SAVE
  1420. MY_ALIGN
  1421. MY_ALIGN
  1422. CGEMM_L2x4_SUB2:
  1423. /* Bitwise tail for the 2x4 tile: each set bit of L, from 16 down to 1, runs one fixed-size chunk. T1 is only a scratch destination for andi. here. Bit 4 (16): */
  1424. andi. T1,L, 16
  1425. ble CGEMM_L2x4_SUB2_8
  1426. bl CGEMM_2x4_L16_SUB
  1427. MY_ALIGN
  1428. CGEMM_L2x4_SUB2_8:
  1429. /* bit 3 (8) */
  1430. andi. T1,L, 8
  1431. ble CGEMM_L2x4_SUB2_4
  1432. bl CGEMM_2x4_L8_SUB
  1433. MY_ALIGN
  1434. CGEMM_L2x4_SUB2_4:
  1435. /* bit 2 (4): expanded inline rather than through a helper */
  1436. andi. T1,L, 4
  1437. ble CGEMM_L2x4_SUB2_2
  1438. LOAD2x4_2
  1439. KERNEL2x4_L2 64,32, 0,0
  1440. KERNEL2x4_E2 64,32, 1,1
  1441. MY_ALIGN
  1442. CGEMM_L2x4_SUB2_2:
  1443. /* bit 1 (2) */
  1444. andi. T1,L, 2
  1445. ble CGEMM_L2x4_SUB2_1
  1446. LOAD2x4_2
  1447. KERNEL2x4_E2 64,32, 0,1
  1448. MY_ALIGN
  1449. CGEMM_L2x4_SUB2_1:
  1450. /* bit 0 (1): single-step KERNEL2x4, then fall through to the save code */
  1451. andi. T1,L, 1
  1452. ble CGEMM_L2x4_SAVE
  1453. KERNEL2x4
  1454. CGEMM_L2x4_SAVE:
  1455. /* Store the 2x4 tile. There is at most one per column pair, so no loop back. TRMM builds then refresh their offset bookkeeping. */
  1456. SAVE2x4
  1457. #if defined(TRMMKERNEL)
  1458. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
  1459. #endif
  1460. CGEMM_L2x4_END:
  1461. /* Reached by fall-through after the 2x4 save, or by branch when bit 2 of M is clear. */
  1462. CGEMM_L2x2_BEGIN:
  1463. /* 2x2 tile: run only if bit 1 of M is set. */
  1464. andi. T1, M, 2
  1465. ble CGEMM_L2x2_END
  1466. #if defined(TRMMKERNEL)
  1467. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
  1468. #else
  1469. mr BO, B
  1470. #endif
  1471. #if defined(TRMMKERNEL)
  1472. REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
  1473. mr T1, T6
  1474. addi T1,T1, -2
  1475. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5, the count of full 32-step passes (a quotient, not a remainder) */
  1476. #else
  1477. mr T1, K
  1478. addi T1,T1, -2
  1479. srawi. T8, T1, 5 /* T8 = (K-2) >> 5, the count of full 32-step passes (a quotient, not a remainder) */
  1480. #endif
  1481. ZERO2x2
  1482. ble CGEMM_L2x2_SUB0 /* tests CR0 from the srawi. above. NOTE(review): relies on ZERO2x2 leaving CR0 untouched - confirm in the macro */
  1483. bl CGEMM_2x2_LMAIN_SUB
  1484. andi. L, T1, 31 /* L = T1 & 31, the depth left over after the main loop */
  1485. ble CGEMM_L2x2_SAVE
  1486. b CGEMM_L2x2_SUB2
  1487. CGEMM_L2x2_SUB0:
  1488. /* Short-depth path, entered when the pass count computed by srawi. was <= 0. L = depth & 63. Depth 33 is special-cased here and depth 32 just below. Both reuse the unrolled loop body through CGEMM_L2x2_K32 with CTR = 1. */
  1489. #if defined(TRMMKERNEL)
  1490. andi. L, T6, 63
  1491. cmpwi T6,33
  1492. #else
  1493. andi. L, K, 63
  1494. cmpwi K,33
  1495. #endif
  1496. li T8,1 /* CTR value for a single pass */
  1497. bne CMP2x2_32K /* tests the cmpwi result, which replaced the andi. result in CR0 */
  1498. addi BO,BO,-16 /* NOTE(review): pointer bias presumably pairs with the offsets passed to LOAD2x2O below - confirm in the macro file */
  1499. addi AO,AO,-16
  1500. LOAD2x2O 16,16
  1501. END2x2_WITHOUT_ADD
  1502. LOAD2x2_2O 32, 32
  1503. mtctr T8
  1504. bl CGEMM_L2x2_K32
  1505. b CGEMM_L2x2_SAVE
  1506. CMP2x2_32K:
  1507. /* Depth exactly 32: preload with LOAD2x2_2O and run one pass from the K32 entry. Any other depth goes to the bitwise tail at CGEMM_L2x2_SUB2. */
  1508. #if defined(TRMMKERNEL)
  1509. cmpwi T6,32
  1510. #else
  1511. cmpwi K,32
  1512. #endif
  1513. bne CGEMM_L2x2_SUB2
  1514. MY_ALIGN
  1515. mtctr T8
  1516. addi BO,BO,-32
  1517. addi AO,AO,-32
  1518. LOAD2x2_2O 32,32
  1519. bl CGEMM_L2x2_K32
  1520. b CGEMM_L2x2_SAVE
  1521. MY_ALIGN
  1522. MY_ALIGN
  1523. CGEMM_L2x2_SUB2:
  1524. /* Bitwise tail for the 2x2 tile: each set bit of L, from 16 down to 1, runs one fixed-size chunk. T1 is only a scratch destination for andi. here. Bit 4 (16): */
  1525. andi. T1,L, 16
  1526. ble CGEMM_L2x2_SUB2_8
  1527. bl CGEMM_2x2_L16_SUB
  1528. MY_ALIGN
  1529. CGEMM_L2x2_SUB2_8:
  1530. /* bit 3 (8) */
  1531. andi. T1,L, 8
  1532. ble CGEMM_L2x2_SUB2_4
  1533. bl CGEMM_2x2_L8_SUB
  1534. MY_ALIGN
  1535. CGEMM_L2x2_SUB2_4:
  1536. /* bit 2 (4): expanded inline rather than through a helper */
  1537. andi. T1,L, 4
  1538. ble CGEMM_L2x2_SUB2_2
  1539. LOAD2x2_2
  1540. KERNEL2x2_L2 32,32, 0,0
  1541. KERNEL2x2_E2 32,32, 1,1
  1542. MY_ALIGN
  1543. CGEMM_L2x2_SUB2_2:
  1544. /* bit 1 (2) */
  1545. andi. T1,L, 2
  1546. ble CGEMM_L2x2_SUB2_1
  1547. LOAD2x2_2
  1548. KERNEL2x2_E2 32,32, 0,1
  1549. MY_ALIGN
  1550. CGEMM_L2x2_SUB2_1:
  1551. /* bit 0 (1): single-step KERNEL2x2, then continue to the save code */
  1552. andi. T1,L, 1
  1553. ble CGEMM_L2x2_SAVE
  1554. KERNEL2x2
  1555. MY_ALIGN
  1556. CGEMM_L2x2_SAVE:
  1557. /* Store the 2x2 tile. There is at most one per column pair, so no loop back. TRMM builds then refresh their offset bookkeeping. */
  1558. SAVE2x2
  1559. #if defined(TRMMKERNEL)
  1560. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
  1561. #endif
  1562. CGEMM_L2x2_END:
  1563. /* Reached by fall-through after the 2x2 save, or by branch when bit 1 of M is clear. */
  1564. CGEMM_L2x1_BEGIN:
  1565. /* 2x1 tile: run only if bit 0 of M is set. */
  1566. andi. T1, M, 1
  1567. ble CGEMM_L2x1_END
  1568. #if defined(TRMMKERNEL)
  1569. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
  1570. #else
  1571. mr BO, B
  1572. #endif
  1573. #if defined(TRMMKERNEL)
  1574. REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
  1575. mr T1, T6
  1576. addi T1,T1, -2
  1577. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5, the count of full 32-step passes (a quotient, not a remainder) */
  1578. #else
  1579. mr T1, K
  1580. addi T1,T1, -2
  1581. srawi. T8, T1, 5 /* T8 = (K-2) >> 5, the count of full 32-step passes (a quotient, not a remainder) */
  1582. #endif
  1583. ZERO2x1
  1584. ble CGEMM_L2x1_SUB0 /* tests CR0 from the srawi. above. NOTE(review): relies on ZERO2x1 leaving CR0 untouched - confirm in the macro */
  1585. bl CGEMM_2x1_LMAIN_SUB
  1586. andi. L, T1, 31 /* L = T1 & 31, the depth left over after the main loop */
  1587. ble CGEMM_L2x1_SAVE
  1588. b CGEMM_L2x1_SUB2
  1589. CGEMM_L2x1_SUB0:
  1590. /* Short-depth path, entered when the pass count computed by srawi. was <= 0. L = depth & 63. Depth 33 is special-cased here and depth 32 just below. Both reuse the unrolled loop body through CGEMM_L2x1_K32 with CTR = 1. */
  1591. #if defined(TRMMKERNEL)
  1592. andi. L, T6, 63
  1593. cmpwi T6,33
  1594. #else
  1595. andi. L, K, 63
  1596. cmpwi K,33
  1597. #endif
  1598. li T8,1 /* CTR value for a single pass */
  1599. bne CMP2x1_32K /* tests the cmpwi result, which replaced the andi. result in CR0 */
  1600. addi BO,BO,-16 /* NOTE(review): pointer bias presumably pairs with the offsets passed to LOAD2x1O below - confirm in the macro file */
  1601. addi AO,AO,-8
  1602. LOAD2x1O 8,16
  1603. END2x1_WITHOUT_ADD
  1604. LOAD2x1_2O 16, 32
  1605. mtctr T8
  1606. bl CGEMM_L2x1_K32
  1607. b CGEMM_L2x1_SAVE
  1608. CMP2x1_32K:
  1609. /* Depth exactly 32: preload with LOAD2x1_2O and run one pass from the K32 entry. Any other depth goes to the bitwise tail at CGEMM_L2x1_SUB2. */
  1610. #if defined(TRMMKERNEL)
  1611. cmpwi T6,32
  1612. #else
  1613. cmpwi K,32
  1614. #endif
  1615. bne CGEMM_L2x1_SUB2
  1616. MY_ALIGN
  1617. mtctr T8
  1618. addi BO,BO,-32
  1619. addi AO,AO,-16
  1620. LOAD2x1_2O 16,32
  1621. bl CGEMM_L2x1_K32
  1622. b CGEMM_L2x1_SAVE
  1623. MY_ALIGN
  1624. MY_ALIGN
  1625. CGEMM_L2x1_SUB2:
  1626. /* Bitwise tail for the 2x1 tile: each set bit of L, from 16 down to 1, runs one fixed-size chunk. T1 is only a scratch destination for andi. here. Bit 4 (16): */
  1627. andi. T1,L, 16
  1628. ble CGEMM_L2x1_SUB2_8
  1629. bl CGEMM_2x1_L16_SUB
  1630. MY_ALIGN
  1631. CGEMM_L2x1_SUB2_8:
  1632. /* bit 3 (8) */
  1633. andi. T1,L, 8
  1634. ble CGEMM_L2x1_SUB2_4
  1635. bl CGEMM_2x1_L8_SUB
  1636. MY_ALIGN
  1637. CGEMM_L2x1_SUB2_4:
  1638. /* bit 2 (4): expanded inline rather than through a helper */
  1639. andi. T1,L, 4
  1640. ble CGEMM_L2x1_SUB2_2
  1641. LOAD2x1_2
  1642. KERNEL2x1_L2 16,32, 0,0
  1643. KERNEL2x1_E2 16,32, 1,1
  1644. MY_ALIGN
  1645. CGEMM_L2x1_SUB2_2:
  1646. /* bit 1 (2) */
  1647. andi. T1,L, 2
  1648. ble CGEMM_L2x1_SUB2_1
  1649. LOAD2x1_2
  1650. KERNEL2x1_E2 16,32, 0,1
  1651. MY_ALIGN
  1652. CGEMM_L2x1_SUB2_1:
  1653. /* bit 0 (1): single-step KERNEL2x1, then continue to the save code */
  1654. andi. T1,L, 1
  1655. ble CGEMM_L2x1_SAVE
  1656. KERNEL2x1
  1657. MY_ALIGN
  1658. CGEMM_L2x1_SAVE:
  1659. /* Store the 2x1 tile. There is at most one per column pair, so no loop back. TRMM builds then refresh their offset bookkeeping. */
  1660. SAVE2x1
  1661. #if defined(TRMMKERNEL)
  1662. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
  1663. #endif
  1664. CGEMM_L2x1_END:
  1665. /* End of the two-column pass: advance B by K << 4 bytes (presumably K steps * 2 columns * 8 bytes per single-precision complex value - confirm). Right-side TRMM builds also bump the offset by 2. */
  1666. slwi T1, K, 4
  1667. add B, B, T1
  1668. #if defined(TRMMKERNEL) && !defined(LEFT)
  1669. addi TEMP_REG, TEMP_REG, 2
  1670. #endif
  1671. CGEMM_L2_END:
  1672. b CGEMM_L1 /* jump over the 1-column helper routines that are laid out next */
  1673. /* MINI SUBROUTINES */
  1674. /* 1x8 MAIN 128x+2 LOOP */
  1675. CGEMM_L1x8_LMAIN_SUB:
  1676. /* Main K loop for the 1x8 tile. Entry: T8 = number of passes (copied into CTR), T2..T5 = prefetch byte offsets set by the caller. Exit: blr after END1x8_2. */
  1677. mtctr T8
  1678. LOAD1x8_2
  1679. MY_ALIGN
  1680. CGEMM_L1x8_LOOP:
  1681. /* One pass is 64 KERNEL1x8_L2 invocations (index 0..63) with dcbt prefetch hints interleaved. Presumably two K iterations each, i.e. 128 per pass, which matches the caller's shift by 7 - confirm in the macro file. */
  1682. dcbt AO, PRE
  1683. dcbt BO, PRE
  1684. KERNEL1x8_L2 128,16,0,0
  1685. CGEMM_L1x8_K128:
  1686. /* Secondary entry: the short-depth caller issues its own loads, sets CTR to 1 and does bl here, skipping the index-0 step above. */
  1687. KERNEL1x8_L2 128,16,1,0
  1688. dcbt AO, T2
  1689. KERNEL1x8_L2 128,16,2,0
  1690. KERNEL1x8_L2 128,16,3,0
  1691. dcbt AO, T3
  1692. dcbt BO, T2
  1693. KERNEL1x8_L2 128,16,4,0
  1694. KERNEL1x8_L2 128,16,5,0
  1695. dcbt AO, T4
  1696. KERNEL1x8_L2 128,16,6,0
  1697. KERNEL1x8_L2 128,16,7,0
  1698. dcbt AO, T5
  1699. dcbt BO, T3
  1700. KERNEL1x8_L2 128,16,8,0
  1701. KERNEL1x8_L2 128,16,9,0
  1702. KERNEL1x8_L2 128,16,10,0
  1703. KERNEL1x8_L2 128,16,11,0
  1704. dcbt BO, T4
  1705. KERNEL1x8_L2 128,16,12,0
  1706. KERNEL1x8_L2 128,16,13,0
  1707. KERNEL1x8_L2 128,16,14,0
  1708. KERNEL1x8_L2 128,16,15,0
  1709. KERNEL1x8_L2 128,16,16,0
  1710. KERNEL1x8_L2 128,16,17,0
  1711. KERNEL1x8_L2 128,16,18,0
  1712. KERNEL1x8_L2 128,16,19,0
  1713. KERNEL1x8_L2 128,16,20,0
  1714. KERNEL1x8_L2 128,16,21,0
  1715. KERNEL1x8_L2 128,16,22,0
  1716. KERNEL1x8_L2 128,16,23,0
  1717. KERNEL1x8_L2 128,16,24,0
  1718. KERNEL1x8_L2 128,16,25,0
  1719. KERNEL1x8_L2 128,16,26,0
  1720. KERNEL1x8_L2 128,16,27,0
  1721. KERNEL1x8_L2 128,16,28,0
  1722. KERNEL1x8_L2 128,16,29,0
  1723. KERNEL1x8_L2 128,16,30,0
  1724. KERNEL1x8_L2 128,16,31,0
  1725. KERNEL1x8_L2 128,16,32,0
  1726. KERNEL1x8_L2 128,16,33,0
  1727. KERNEL1x8_L2 128,16,34,0
  1728. KERNEL1x8_L2 128,16,35,0
  1729. KERNEL1x8_L2 128,16,36,0
  1730. KERNEL1x8_L2 128,16,37,0
  1731. KERNEL1x8_L2 128,16,38,0
  1732. KERNEL1x8_L2 128,16,39,0
  1733. KERNEL1x8_L2 128,16,40,0
  1734. KERNEL1x8_L2 128,16,41,0
  1735. KERNEL1x8_L2 128,16,42,0
  1736. KERNEL1x8_L2 128,16,43,0
  1737. KERNEL1x8_L2 128,16,44,0
  1738. KERNEL1x8_L2 128,16,45,0
  1739. KERNEL1x8_L2 128,16,46,0
  1740. KERNEL1x8_L2 128,16,47,0
  1741. KERNEL1x8_L2 128,16,48,0
  1742. KERNEL1x8_L2 128,16,49,0
  1743. KERNEL1x8_L2 128,16,50,0
  1744. KERNEL1x8_L2 128,16,51,0
  1745. KERNEL1x8_L2 128,16,52,0
  1746. KERNEL1x8_L2 128,16,53,0
  1747. KERNEL1x8_L2 128,16,54,0
  1748. KERNEL1x8_L2 128,16,55,0
  1749. KERNEL1x8_L2 128,16,56,0
  1750. KERNEL1x8_L2 128,16,57,0
  1751. KERNEL1x8_L2 128,16,58,0
  1752. KERNEL1x8_L2 128,16,59,0
  1753. KERNEL1x8_L2 128,16,60,0
  1754. KERNEL1x8_L2 128,16,61,0
  1755. KERNEL1x8_L2 128,16,62,0
  1756. KERNEL1x8_L2 128,16,63,1
  1757. bdnz CGEMM_L1x8_LOOP /* decrement CTR, repeat while it is non-zero */
  1758. MY_ALIGN
  1759. CGEMM_L1x8_LOOP_END:
  1760. /* Loop drained: END1x8_2 closes the sequence (presumably consumes the operands still preloaded - confirm), then return to the caller. */
  1761. END1x8_2
  1762. blr
  1763. MY_ALIGN
  1764. CGEMM_1x8_L64_SUB:
  1765. /* 1x8 tail helper, reached with bl when (L & 64) is set. One LOAD1x8_2, 31 KERNEL1x8_L2 steps and a closing KERNEL1x8_E2, with dcbt hints interleaved, then blr. Presumably 64 K iterations - confirm against the macro definitions. */
  1766. LOAD1x8_2
  1767. dcbt AO, PRE
  1768. dcbt BO, PRE
  1769. KERNEL1x8_L2 128,16,0,0
  1770. KERNEL1x8_L2 128,16,1,0
  1771. dcbt AO, T2
  1772. KERNEL1x8_L2 128,16,2,0
  1773. KERNEL1x8_L2 128,16,3,0
  1774. dcbt AO, T3
  1775. dcbt BO, T2
  1776. KERNEL1x8_L2 128,16,4,0
  1777. KERNEL1x8_L2 128,16,5,0
  1778. dcbt AO, T4
  1779. KERNEL1x8_L2 128,16,6,0
  1780. KERNEL1x8_L2 128,16,7,0
  1781. dcbt AO, T5
  1782. dcbt BO, T3
  1783. KERNEL1x8_L2 128,16,8,0
  1784. KERNEL1x8_L2 128,16,9,0
  1785. KERNEL1x8_L2 128,16,10,0
  1786. KERNEL1x8_L2 128,16,11,0
  1787. dcbt BO, T4
  1788. KERNEL1x8_L2 128,16,12,0
  1789. KERNEL1x8_L2 128,16,13,0
  1790. KERNEL1x8_L2 128,16,14,0
  1791. KERNEL1x8_L2 128,16,15,0
  1792. KERNEL1x8_L2 128,16,16,0
  1793. KERNEL1x8_L2 128,16,17,0
  1794. KERNEL1x8_L2 128,16,18,0
  1795. KERNEL1x8_L2 128,16,19,0
  1796. KERNEL1x8_L2 128,16,20,0
  1797. KERNEL1x8_L2 128,16,21,0
  1798. KERNEL1x8_L2 128,16,22,0
  1799. KERNEL1x8_L2 128,16,23,0
  1800. KERNEL1x8_L2 128,16,24,0
  1801. KERNEL1x8_L2 128,16,25,0
  1802. KERNEL1x8_L2 128,16,26,0
  1803. KERNEL1x8_L2 128,16,27,0
  1804. KERNEL1x8_L2 128,16,28,0
  1805. KERNEL1x8_L2 128,16,29,0
  1806. KERNEL1x8_L2 128,16,30,0
  1807. KERNEL1x8_E2 128,16,31,1
  1808. blr
  1809. MY_ALIGN
  1810. CGEMM_1x8_L32_SUB:
  1811. /* 1x8 tail helper, reached with bl when (L & 32) is set. One LOAD1x8_2, 15 KERNEL1x8_L2 steps and a closing KERNEL1x8_E2, with dcbt hints interleaved, then blr. Presumably 32 K iterations - confirm against the macro definitions. */
  1812. LOAD1x8_2
  1813. dcbt AO, PRE
  1814. dcbt BO, PRE
  1815. KERNEL1x8_L2 128,16,0,0
  1816. KERNEL1x8_L2 128,16,1,0
  1817. dcbt AO, T2
  1818. KERNEL1x8_L2 128,16,2,0
  1819. KERNEL1x8_L2 128,16,3,0
  1820. dcbt AO, T3
  1821. dcbt BO, T2
  1822. KERNEL1x8_L2 128,16,4,0
  1823. KERNEL1x8_L2 128,16,5,0
  1824. dcbt AO, T4
  1825. KERNEL1x8_L2 128,16,6,0
  1826. KERNEL1x8_L2 128,16,7,0
  1827. dcbt AO, T5
  1828. dcbt BO, T3
  1829. KERNEL1x8_L2 128,16,8,0
  1830. KERNEL1x8_L2 128,16,9,0
  1831. KERNEL1x8_L2 128,16,10,0
  1832. KERNEL1x8_L2 128,16,11,0
  1833. dcbt BO, T4
  1834. KERNEL1x8_L2 128,16,12,0
  1835. KERNEL1x8_L2 128,16,13,0
  1836. KERNEL1x8_L2 128,16,14,0
  1837. KERNEL1x8_E2 128,16,15,1
  1838. blr
  1839. MY_ALIGN
  1840. CGEMM_1x8_L16_SUB:
  1841. /* 1x8 tail helper, reached with bl when (L & 16) is set. One LOAD1x8_2, seven KERNEL1x8_L2 steps and a closing KERNEL1x8_E2, with dcbt hints interleaved, then blr. Presumably 16 K iterations - confirm against the macro definitions. */
  1842. LOAD1x8_2
  1843. dcbt AO, PRE
  1844. dcbt BO, PRE
  1845. KERNEL1x8_L2 128,16,0,0
  1846. KERNEL1x8_L2 128,16,1,0
  1847. dcbt AO, T2
  1848. KERNEL1x8_L2 128,16,2,0
  1849. KERNEL1x8_L2 128,16,3,0
  1850. dcbt AO, T3
  1851. dcbt BO, T2
  1852. KERNEL1x8_L2 128,16,4,0
  1853. KERNEL1x8_L2 128,16,5,0
  1854. dcbt AO, T4
  1855. KERNEL1x8_L2 128,16,6,0
  1856. KERNEL1x8_E2 128,16,7,1
  1857. blr
  1858. MY_ALIGN
  1859. CGEMM_1x4_LMAIN_SUB:
  1860. /* Main K loop for the 1x4 tile. Entry: T8 = number of passes (copied into CTR). Exit: blr after END1x4_2. */
  1861. mtctr T8
  1862. LOAD1x4_2
  1863. MY_ALIGN
  1864. CGEMM_L1x4_LOOP:
  1865. /* One pass is 16 KERNEL1x4_L2 invocations (index 0..15). Presumably two K iterations each, i.e. 32 per pass, which matches the caller's shift by 5 - confirm in the macro file. */
  1866. KERNEL1x4_L2 64,16,0,0
  1867. CGEMM_L1x4_K32:
  1868. /* Secondary entry: the short-depth caller issues its own loads, sets CTR to 1 and does bl here, skipping the index-0 step above. */
  1869. KERNEL1x4_L2 64,16,1,0
  1870. KERNEL1x4_L2 64,16,2,0
  1871. KERNEL1x4_L2 64,16,3,0
  1872. KERNEL1x4_L2 64,16,4,0
  1873. KERNEL1x4_L2 64,16,5,0
  1874. KERNEL1x4_L2 64,16,6,0
  1875. KERNEL1x4_L2 64,16,7,0
  1876. KERNEL1x4_L2 64,16,8,0
  1877. KERNEL1x4_L2 64,16,9,0
  1878. KERNEL1x4_L2 64,16,10,0
  1879. KERNEL1x4_L2 64,16,11,0
  1880. KERNEL1x4_L2 64,16,12,0
  1881. KERNEL1x4_L2 64,16,13,0
  1882. KERNEL1x4_L2 64,16,14,0
  1883. KERNEL1x4_L2 64,16,15,1
  1884. bdnz CGEMM_L1x4_LOOP /* decrement CTR, repeat while it is non-zero */
  1885. MY_ALIGN
  1886. CGEMM_L1x4_LOOP_END:
  1887. /* Loop drained: END1x4_2 closes the sequence (presumably consumes the operands still preloaded - confirm), then return to the caller. */
  1888. END1x4_2
  1889. blr
  1890. MY_ALIGN
  1891. CGEMM_1x4_L16_SUB:
  1892. /* 1x4 tail helper, reached with bl when (L & 16) is set. One LOAD1x4_2, seven KERNEL1x4_L2 steps and a closing KERNEL1x4_E2, then blr. Presumably 16 K iterations - confirm against the macro definitions. */
  1893. LOAD1x4_2
  1894. KERNEL1x4_L2 64,16,0,0
  1895. KERNEL1x4_L2 64,16,1,0
  1896. KERNEL1x4_L2 64,16,2,0
  1897. KERNEL1x4_L2 64,16,3,0
  1898. KERNEL1x4_L2 64,16,4,0
  1899. KERNEL1x4_L2 64,16,5,0
  1900. KERNEL1x4_L2 64,16,6,0
  1901. KERNEL1x4_E2 64,16,7,1
  1902. blr
  1903. MY_ALIGN
  1904. CGEMM_1x4_L8_SUB:
  1905. /* 1x4 tail helper, reached with bl when (L & 8) is set. One LOAD1x4_2, three KERNEL1x4_L2 steps and a closing KERNEL1x4_E2, then blr. Presumably 8 K iterations - confirm against the macro definitions. */
  1906. LOAD1x4_2
  1907. KERNEL1x4_L2 64,16,0,0
  1908. KERNEL1x4_L2 64,16,1,0
  1909. KERNEL1x4_L2 64,16,2,0
  1910. KERNEL1x4_E2 64,16,3,1
  1911. blr
  1912. CGEMM_1x2_LMAIN_SUB:
  1913. /* Main K loop for the 1x2 tile. Entry: T8 = number of passes (copied into CTR). Exit: blr after END1x2_2. */
  1914. mtctr T8
  1915. LOAD1x2_2
  1916. MY_ALIGN
  1917. CGEMM_L1x2_LOOP:
  1918. /* One pass is 16 KERNEL1x2_L2 invocations (index 0..15). Presumably two K iterations each, i.e. 32 per pass, which matches the caller's shift by 5 - confirm in the macro file. */
  1919. KERNEL1x2_L2 32,16,0,0
  1920. CGEMM_L1x2_K32:
  1921. /* Secondary entry: the short-depth caller issues its own loads, sets CTR to 1 and does bl here, skipping the index-0 step above. */
  1922. KERNEL1x2_L2 32,16,1,0
  1923. KERNEL1x2_L2 32,16,2,0
  1924. KERNEL1x2_L2 32,16,3,0
  1925. KERNEL1x2_L2 32,16,4,0
  1926. KERNEL1x2_L2 32,16,5,0
  1927. KERNEL1x2_L2 32,16,6,0
  1928. KERNEL1x2_L2 32,16,7,0
  1929. KERNEL1x2_L2 32,16,8,0
  1930. KERNEL1x2_L2 32,16,9,0
  1931. KERNEL1x2_L2 32,16,10,0
  1932. KERNEL1x2_L2 32,16,11,0
  1933. KERNEL1x2_L2 32,16,12,0
  1934. KERNEL1x2_L2 32,16,13,0
  1935. KERNEL1x2_L2 32,16,14,0
  1936. KERNEL1x2_L2 32,16,15,1
  1937. bdnz CGEMM_L1x2_LOOP /* decrement CTR, repeat while it is non-zero */
  1938. MY_ALIGN
  1939. CGEMM_L1x2_LOOP_END:
  1940. /* Loop drained: END1x2_2 closes the sequence (presumably consumes the operands still preloaded - confirm), then return to the caller. */
  1941. END1x2_2
  1942. blr
  1943. MY_ALIGN
  1944. CGEMM_1x2_L16_SUB:
  1945. /* 1x2 tail helper: one LOAD1x2_2, seven KERNEL1x2_L2 steps and a closing KERNEL1x2_E2, then blr. Presumably 16 K iterations, called when (L & 16) is set - confirm against the caller and macro definitions. */
  1946. LOAD1x2_2
  1947. KERNEL1x2_L2 32,16,0,0
  1948. KERNEL1x2_L2 32,16,1,0
  1949. KERNEL1x2_L2 32,16,2,0
  1950. KERNEL1x2_L2 32,16,3,0
  1951. KERNEL1x2_L2 32,16,4,0
  1952. KERNEL1x2_L2 32,16,5,0
  1953. KERNEL1x2_L2 32,16,6,0
  1954. KERNEL1x2_E2 32,16,7,1
  1955. blr
  1956. MY_ALIGN
  1957. CGEMM_1x2_L8_SUB:
  1958. /* 1x2 tail helper: one LOAD1x2_2, three KERNEL1x2_L2 steps and a closing KERNEL1x2_E2, then blr. Presumably 8 K iterations, called when (L & 8) is set - confirm against the caller and macro definitions. */
  1959. LOAD1x2_2
  1960. KERNEL1x2_L2 32,16,0,0
  1961. KERNEL1x2_L2 32,16,1,0
  1962. KERNEL1x2_L2 32,16,2,0
  1963. KERNEL1x2_E2 32,16,3,1
  1964. blr
  1965. CGEMM_1x1_LMAIN_SUB:
  1966. /* Main K loop for the 1x1 tile. Entry: T8 = number of passes (copied into CTR). Exit: blr after END1x1_2. */
  1967. mtctr T8
  1968. LOAD1x1_2
  1969. MY_ALIGN
  1970. CGEMM_L1x1_LOOP:
  1971. /* One pass is 16 KERNEL1x1_L2 invocations (index 0..15). Presumably two K iterations each, i.e. 32 per pass - confirm in the macro file. */
  1972. KERNEL1x1_L2 16,16,0,0
  1973. CGEMM_L1x1_K32:
  1974. /* Secondary entry point that skips the index-0 step above. Presumably used by a short-depth caller with CTR = 1, as for the other tile sizes - confirm against the caller. */
  1975. KERNEL1x1_L2 16,16,1,0
  1976. KERNEL1x1_L2 16,16,2,0
  1977. KERNEL1x1_L2 16,16,3,0
  1978. KERNEL1x1_L2 16,16,4,0
  1979. KERNEL1x1_L2 16,16,5,0
  1980. KERNEL1x1_L2 16,16,6,0
  1981. KERNEL1x1_L2 16,16,7,0
  1982. KERNEL1x1_L2 16,16,8,0
  1983. KERNEL1x1_L2 16,16,9,0
  1984. KERNEL1x1_L2 16,16,10,0
  1985. KERNEL1x1_L2 16,16,11,0
  1986. KERNEL1x1_L2 16,16,12,0
  1987. KERNEL1x1_L2 16,16,13,0
  1988. KERNEL1x1_L2 16,16,14,0
  1989. KERNEL1x1_L2 16,16,15,1
  1990. bdnz CGEMM_L1x1_LOOP /* decrement CTR, repeat while it is non-zero */
  1991. MY_ALIGN
  1992. CGEMM_L1x1_LOOP_END:
  1993. /* Loop drained: END1x1_2 closes the sequence (presumably consumes the operands still preloaded - confirm), then return to the caller. */
  1994. END1x1_2
  1995. blr
  1996. MY_ALIGN
  1997. CGEMM_1x1_L16_SUB:
  1998. /* 1x1 tail helper: one LOAD1x1_2, seven KERNEL1x1_L2 steps and a closing KERNEL1x1_E2, then blr. Presumably 16 K iterations, called when (L & 16) is set - confirm against the caller and macro definitions. */
  1999. LOAD1x1_2
  2000. KERNEL1x1_L2 16,16,0,0
  2001. KERNEL1x1_L2 16,16,1,0
  2002. KERNEL1x1_L2 16,16,2,0
  2003. KERNEL1x1_L2 16,16,3,0
  2004. KERNEL1x1_L2 16,16,4,0
  2005. KERNEL1x1_L2 16,16,5,0
  2006. KERNEL1x1_L2 16,16,6,0
  2007. KERNEL1x1_E2 16,16,7,1
  2008. blr
  2009. MY_ALIGN
  2010. CGEMM_1x1_L8_SUB:
  2011. /* 1x1 tail helper: one LOAD1x1_2, three KERNEL1x1_L2 steps and a closing KERNEL1x1_E2, then blr. Presumably 8 K iterations, called when (L & 8) is set - confirm against the caller and macro definitions. */
  2012. LOAD1x1_2
  2013. KERNEL1x1_L2 16,16,0,0
  2014. KERNEL1x1_L2 16,16,1,0
  2015. KERNEL1x1_L2 16,16,2,0
  2016. KERNEL1x1_E2 16,16,3,1
  2017. blr
  2018. /* MAIN LOOP BEGINS */
  2019. MY_ALIGN
  2020. CGEMM_L1:
  2021. /* Single-column pass, executed only when bit 0 of N is set. andi. never produces a negative result, so the ble below acts as branch-if-zero. */
  2022. andi. J, N, 1
  2023. ble CGEMM_L1_END
  2024. CGEMM_L1_BEGIN:
  2025. /* Set up the single-column pass: CO = current C, T2 = C + LDC (used only as a prefetch address below), AO = A, then step C forward by one column. */
  2026. mr CO, C
  2027. add T2,C,LDC
  2028. mr AO, A
  2029. add C, C, LDC /* was "add C, C, T1", but T1 is never set in this block (the two-column block computes 2*LDC into it first), so C was advanced by a stale value */
  2030. #if defined(TRMMKERNEL) && defined(LEFT)
  2031. mr TEMP_REG, OFFSET /*off = offset;*/
  2032. #endif
  2033. srawi. I, M, 3 /* I = M >> 3, the number of 8-row tiles */
  2034. ble CGEMM_L1x8_END
  2035. dcbt CO,r0 /*just prefetch*/
  2036. dcbt T2,r0
  2037. CGEMM_L1x8_BEGIN:
  2038. /* Start of one 1x8 tile: reset BO, compute the main-loop pass count in T8, clear the accumulators, then dispatch to the main loop or to the short-depth path. */
  2039. #if defined(TRMMKERNEL)
  2040. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
  2041. #else
  2042. mr BO, B
  2043. dcbt B, r0
  2044. #endif
  2045. dcbt AO, r0
  2046. #if defined(TRMMKERNEL)
  2047. REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
  2048. mr T1, T6
  2049. /* T2..T5 are byte offsets used as dcbt index registers inside the unrolled loop */
  2050. li T2, 1024
  2051. li T3, 1024+512
  2052. addi T1,T1, -2
  2053. /* second pair of prefetch offsets */
  2054. li T4, 2048
  2055. li T5, 2048+512
  2056. srawi. T8, T1, 7 /* T8 = (T6-2) >> 7, the count of full 128-step passes (a quotient, not a remainder) */
  2057. #else
  2058. mr T1, K
  2059. /* T2..T5 are byte offsets used as dcbt index registers inside the unrolled loop */
  2060. li T2, 1024
  2061. li T3, 1024+512
  2062. addi T1,T1, -2
  2063. /* second pair of prefetch offsets */
  2064. li T4, 2048
  2065. li T5, 2048+512
  2066. srawi. T8, T1, 7 /* T8 = (K-2) >> 7, the count of full 128-step passes (a quotient, not a remainder) */
  2067. #endif
  2068. ZERO1x8
  2069. ble CGEMM_L1x8_SUB0 /* tests CR0 from the srawi. above. NOTE(review): relies on ZERO1x8 leaving CR0 untouched - confirm in the macro */
  2070. bl CGEMM_L1x8_LMAIN_SUB
  2071. andi. L, T1, 127 /* L = T1 & 127, the depth left over after the main loop */
  2072. ble CGEMM_L1x8_SAVE
  2073. b CGEMM_L1x8_SUB2
  2074. CGEMM_L1x8_SUB0:
  2075. /* Short-depth path, entered when the pass count computed by srawi. was <= 0. L = depth & 255. Depth 129 is special-cased here and depth 128 just below. Both reuse the unrolled loop body through CGEMM_L1x8_K128 with CTR = 1. */
  2076. #if defined(TRMMKERNEL)
  2077. andi. L, T6, 255
  2078. cmpwi T6,129
  2079. #else
  2080. andi. L, K, 255
  2081. cmpwi K,129
  2082. #endif
  2083. li T8,1 /* CTR value for a single pass */
  2084. bne CMP1x8_128K /* tests the cmpwi result, which replaced the andi. result in CR0 */
  2085. addi BO,BO,-8 /* NOTE(review): pointer bias presumably pairs with the offsets passed to LOAD1x8O below - confirm in the macro file */
  2086. addi AO,AO,-64
  2087. LOAD1x8O 64,8
  2088. END1x8_WITHOUT_ADD
  2089. LOAD1x8_2O 128, 16
  2090. mtctr T8
  2091. bl CGEMM_L1x8_K128
  2092. b CGEMM_L1x8_SAVE
  2093. CMP1x8_128K:
  2094. /* Depth exactly 128: preload with LOAD1x8_2O and run one pass from the K128 entry. Any other depth goes to the bitwise tail at CGEMM_L1x8_SUB2. */
  2095. #if defined(TRMMKERNEL)
  2096. cmpwi T6,128
  2097. #else
  2098. cmpwi K,128
  2099. #endif
  2100. bne CGEMM_L1x8_SUB2
  2101. MY_ALIGN
  2102. mtctr T8
  2103. addi BO,BO,-16
  2104. addi AO,AO,-128
  2105. LOAD1x8_2O 128,16
  2106. bl CGEMM_L1x8_K128
  2107. b CGEMM_L1x8_SAVE
  2108. MY_ALIGN
  2109. CGEMM_L1x8_SUB2:
  2110. /* Bitwise tail for the 1x8 tile: each set bit of L, from 64 down to 1, runs one fixed-size chunk. T1 is only a scratch destination for andi. here. Bit 6 (64): */
  2111. andi. T1,L, 64
  2112. ble CGEMM_L1x8_SUB2_32
  2113. bl CGEMM_1x8_L64_SUB
  2114. MY_ALIGN
  2115. CGEMM_L1x8_SUB2_32:
  2116. /* bit 5 (32) */
  2117. andi. T1,L, 32
  2118. ble CGEMM_L1x8_SUB2_16
  2119. bl CGEMM_1x8_L32_SUB
  2120. MY_ALIGN
  2121. CGEMM_L1x8_SUB2_16:
  2122. /* bit 4 (16) */
  2123. andi. T1,L, 16
  2124. ble CGEMM_L1x8_SUB2_8
  2125. bl CGEMM_1x8_L16_SUB
  2126. MY_ALIGN
  2127. CGEMM_L1x8_SUB2_8:
  2128. /* bit 3 (8): expanded inline rather than through a helper */
  2129. andi. T1,L, 8
  2130. ble CGEMM_L1x8_SUB2_4
  2131. LOAD1x8_2
  2132. KERNEL1x8_L2 128,16, 0,0
  2133. KERNEL1x8_L2 128,16, 1,0
  2134. KERNEL1x8_L2 128,16, 2,0
  2135. KERNEL1x8_E2 128,16, 3,1
  2136. MY_ALIGN
  2137. CGEMM_L1x8_SUB2_4:
  2138. /* bit 2 (4) */
  2139. andi. T1,L, 4
  2140. ble CGEMM_L1x8_SUB2_2
  2141. LOAD1x8_2
  2142. KERNEL1x8_L2 128,16, 0,0
  2143. KERNEL1x8_E2 128,16, 1,1
  2144. MY_ALIGN
  2145. CGEMM_L1x8_SUB2_2:
  2146. /* bit 1 (2) */
  2147. andi. T1,L, 2
  2148. ble CGEMM_L1x8_SUB2_1
  2149. LOAD1x8_2
  2150. KERNEL1x8_E2 128,16, 0,1
  2151. MY_ALIGN
  2152. CGEMM_L1x8_SUB2_1:
  2153. /* bit 0 (1): single-step KERNEL1x8, then continue to the save code */
  2154. andi. T1,L, 1
  2155. ble CGEMM_L1x8_SAVE
  2156. KERNEL1x8
  2157. MY_ALIGN
  2158. CGEMM_L1x8_SAVE:
  2159. /* Store the finished 1x8 tile and loop over the remaining 8-row tiles. The tile counter is decremented before the store. NOTE(review): the bgt below relies on SAVE1x8 and REFRESH_AFTER_SAVE preserving CR0 - confirm in the macro file. */
  2160. addic. I, I, -1
  2161. MY_ALIGN
  2162. SAVE1x8
  2163. #if defined(TRMMKERNEL)
  2164. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
  2165. #endif
  2166. bgt CGEMM_L1x8_BEGIN
  2167. andi. T2, M, 7 /* no leftover rows: skip every smaller tile */
  2168. ble CGEMM_L1x1_END
  2169. andi. T1, M, 4 /* no 4-row tile: continue with the 2-row check */
  2170. ble CGEMM_L1x4_END
  2171. b CGEMM_L1x4_BEGIN
  2172. MY_ALIGN
  2173. CGEMM_L1x8_END:
  2174. /* Reached directly when M has no 8-row tiles. */
  2175. CGEMM_L1x4_BEGIN:
  2176. /* 1x4 tile: run only if bit 2 of M is set. The M & 7 and M & 4 tests repeat the ones at the end of the 1x8 save code because this label is also reached by fall-through. */
  2177. andi. T2, M, 7
  2178. ble CGEMM_L1x1_END
  2179. andi. T1, M, 4
  2180. ble CGEMM_L1x4_END
  2181. #if defined(TRMMKERNEL)
  2182. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
  2183. #else
  2184. mr BO, B
  2185. #endif
  2186. #if defined(TRMMKERNEL)
  2187. REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
  2188. mr T1, T6
  2189. addi T1,T1, -2
  2190. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5, the count of full 32-step passes (a quotient, not a remainder) */
  2191. #else
  2192. mr T1, K
  2193. addi T1,T1, -2
  2194. srawi. T8, T1, 5 /* T8 = (K-2) >> 5, the count of full 32-step passes (a quotient, not a remainder) */
  2195. #endif
  2196. ZERO1x4
  2197. ble CGEMM_L1x4_SUB0 /* tests CR0 from the srawi. above. NOTE(review): relies on ZERO1x4 leaving CR0 untouched - confirm in the macro */
  2198. bl CGEMM_1x4_LMAIN_SUB
  2199. andi. L, T1, 31 /* L = T1 & 31, the depth left over after the main loop */
  2200. ble CGEMM_L1x4_SAVE
  2201. b CGEMM_L1x4_SUB2
  2202. CGEMM_L1x4_SUB0:
  2203. /* Short-depth path, entered when the pass count computed by srawi. was <= 0. L = depth & 63. Depth 33 is special-cased here and depth 32 just below. Both reuse the unrolled loop body through CGEMM_L1x4_K32 with CTR = 1. */
  2204. #if defined(TRMMKERNEL)
  2205. andi. L, T6, 63
  2206. cmpwi T6,33
  2207. #else
  2208. andi. L, K, 63
  2209. cmpwi K,33
  2210. #endif
  2211. li T8,1 /* CTR value for a single pass */
  2212. bne CMP1x4_32K /* tests the cmpwi result, which replaced the andi. result in CR0 */
  2213. addi BO,BO,-8 /* NOTE(review): pointer bias presumably pairs with the offsets passed to LOAD1x4O below - confirm in the macro file */
  2214. addi AO,AO,-32
  2215. LOAD1x4O 32,8
  2216. END1x4_WITHOUT_ADD
  2217. LOAD1x4_2O 64, 16
  2218. mtctr T8
  2219. bl CGEMM_L1x4_K32
  2220. b CGEMM_L1x4_SAVE
  2221. CMP1x4_32K:
  2222. /* Depth exactly 32: preload with LOAD1x4_2O and run one pass from the K32 entry. Any other depth goes to the bitwise tail at CGEMM_L1x4_SUB2. */
  2223. #if defined(TRMMKERNEL)
  2224. cmpwi T6,32
  2225. #else
  2226. cmpwi K,32
  2227. #endif
  2228. bne CGEMM_L1x4_SUB2
  2229. MY_ALIGN
  2230. mtctr T8
  2231. addi BO,BO,-16
  2232. addi AO,AO,-64
  2233. LOAD1x4_2O 64,16
  2234. bl CGEMM_L1x4_K32
  2235. b CGEMM_L1x4_SAVE
  2236. MY_ALIGN
  2237. MY_ALIGN
  2238. CGEMM_L1x4_SUB2:
  2239. /* Bitwise tail for the 1x4 tile: each set bit of L, from 16 down to 1, runs one fixed-size chunk. T1 is only a scratch destination for andi. here. Bit 4 (16): */
  2240. andi. T1,L, 16
  2241. ble CGEMM_L1x4_SUB2_8
  2242. bl CGEMM_1x4_L16_SUB
  2243. MY_ALIGN
  2244. CGEMM_L1x4_SUB2_8:
  2245. /* bit 3 (8) */
  2246. andi. T1,L, 8
  2247. ble CGEMM_L1x4_SUB2_4
  2248. bl CGEMM_1x4_L8_SUB
  2249. MY_ALIGN
  2250. CGEMM_L1x4_SUB2_4:
  2251. /* bit 2 (4): expanded inline rather than through a helper */
  2252. andi. T1,L, 4
  2253. ble CGEMM_L1x4_SUB2_2
  2254. LOAD1x4_2
  2255. KERNEL1x4_L2 64,16, 0,0
  2256. KERNEL1x4_E2 64,16, 1,1
  2257. MY_ALIGN
  2258. CGEMM_L1x4_SUB2_2:
  2259. /* bit 1 (2) */
  2260. andi. T1,L, 2
  2261. ble CGEMM_L1x4_SUB2_1
  2262. LOAD1x4_2
  2263. KERNEL1x4_E2 64,16, 0,1
  2264. MY_ALIGN
  2265. CGEMM_L1x4_SUB2_1:
  2266. /* bit 0 (1): single-step KERNEL1x4, then fall through to the save code */
  2267. andi. T1,L, 1
  2268. ble CGEMM_L1x4_SAVE
  2269. KERNEL1x4
  2270. CGEMM_L1x4_SAVE:
  2271. /*----------------------------------------*/
  2272. SAVE1x4
  2273. #if defined(TRMMKERNEL)
  2274. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
  2275. #endif
  2276. CGEMM_L1x4_END:
  2277. /*----------------------------------------*/
  2278. CGEMM_L1x2_BEGIN: /* 1x2 tile: executed only when bit 1 (value 2) of M is set */
  2279. /*----------------------------------------*/
  2280. andi. T1, M, 2 /* T1 = M & 2; andi. never yields a negative value, so ble means "bit clear" */
  2281. ble CGEMM_L1x2_END /* bit clear: skip the whole 1x2 tile */
  2282. #if defined(TRMMKERNEL)
  2283. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 /* TRMM-only macro, defined outside this chunk */
  2284. #else
  2285. mr BO, B /* BO = B */
  2286. #endif
  2287. #if defined(TRMMKERNEL)
  2288. REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 /* macro defined outside this chunk; T6 is used below in place of K */
  2289. mr T1, T6 /* T1 = T6 */
  2290. addi T1,T1, -2 /* T1 = T6 - 2 */
  2291. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (arithmetic shift, i.e. groups of 32); record form sets CR0 */
  2292. #else
  2293. mr T1, K /* T1 = K */
  2294. addi T1,T1, -2 /* T1 = K - 2 */
  2295. srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (arithmetic shift, i.e. groups of 32); record form sets CR0 */
  2296. #endif
  2297. ZERO1x2 /* macro defined outside this chunk; the ble below still relies on CR0 from srawi., so it presumably leaves CR0 alone - confirm */
  2298. ble CGEMM_L1x2_SUB0 /* T8 <= 0: take the short-count path */
  2299. bl CGEMM_1x2_LMAIN_SUB /* call main-loop routine defined outside this chunk */
  2300. andi. L, T1, 31 /* L = T1 & 31 (the remainder count if T1 still holds count-2 after the call - confirm) */
  2301. ble CGEMM_L1x2_SAVE /* no remainder: store the tile */
  2302. b CGEMM_L1x2_SUB2 /* otherwise process the remainder bit by bit */
  2303. CGEMM_L1x2_SUB0: /* short-count path: block count T8 was <= 0 */
  2304. /*----------------------------------------*/
  2305. #if defined(TRMMKERNEL)
  2306. andi. L, T6, 63 /* L = T6 & 63; the CR0 it records is replaced by the cmpwi below */
  2307. cmpwi T6,33 /* CR0 = T6 compared with 33 */
  2308. #else
  2309. andi. L, K, 63 /* L = K & 63; the CR0 it records is replaced by the cmpwi below */
  2310. cmpwi K,33 /* CR0 = K compared with 33 */
  2311. #endif
  2312. li T8,1 /* T8 = 1 (moved to CTR below); li does not change CR0 */
  2313. bne CMP1x2_32K /* count != 33: go test for 32 */
  2314. addi BO,BO,-8 /* count == 33: step BO back 8 bytes ... */
  2315. addi AO,AO,-16 /* ... and AO back 16 bytes; presumably offset by the 16,8 displacements passed to LOAD1x2O - confirm in macro */
  2316. LOAD1x2O 16,8 /* macro defined outside this chunk */
  2317. END1x2_WITHOUT_ADD /* macro defined outside this chunk */
  2318. LOAD1x2_2O 32, 16 /* macro defined outside this chunk */
  2319. mtctr T8 /* CTR = 1 */
  2320. bl CGEMM_L1x2_K32 /* call routine defined outside this chunk; presumably loops on CTR - confirm */
  2321. b CGEMM_L1x2_SAVE /* count == 33 handled: store the tile */
  2322. CMP1x2_32K: /* count was not 33: special-case exactly 32 */
  2323. /*----------------------------------------*/
  2324. #if defined(TRMMKERNEL)
  2325. cmpwi T6,32 /* CR0 = T6 compared with 32 */
  2326. #else
  2327. cmpwi K,32 /* CR0 = K compared with 32 */
  2328. #endif
  2329. bne CGEMM_L1x2_SUB2 /* neither 33 nor 32: decompose L bit by bit */
  2330. MY_ALIGN
  2331. mtctr T8 /* CTR = 1 (T8 was set by the li above) */
  2332. addi BO,BO,-16 /* step BO back 16 bytes ... */
  2333. addi AO,AO,-32 /* ... and AO back 32 bytes; presumably offset by the 32,16 displacements below - confirm in macro */
  2334. LOAD1x2_2O 32,16 /* macro defined outside this chunk */
  2335. bl CGEMM_L1x2_K32 /* call routine defined outside this chunk */
  2336. b CGEMM_L1x2_SAVE /* count == 32 handled: store the tile */
  2337. MY_ALIGN
  2338. MY_ALIGN
  2339. CGEMM_L1x2_SUB2: /* remainder handling: one test per bit of L (16, 8, 4, 2, 1) */
  2340. /*----------------------------------------*/
  2341. andi. T1,L, 16 /* T1 = L & 16 */
  2342. ble CGEMM_L1x2_SUB2_8 /* bit 16 clear: skip */
  2343. bl CGEMM_1x2_L16_SUB /* call routine defined outside this chunk (presumably 16 k-steps - confirm) */
  2344. MY_ALIGN
  2345. CGEMM_L1x2_SUB2_8:
  2346. /*----------------------------------------*/
  2347. andi. T1,L, 8 /* T1 = L & 8 */
  2348. ble CGEMM_L1x2_SUB2_4 /* bit 8 clear: skip */
  2349. bl CGEMM_1x2_L8_SUB /* call routine defined outside this chunk (presumably 8 k-steps - confirm) */
  2350. MY_ALIGN
  2351. CGEMM_L1x2_SUB2_4:
  2352. /*----------------------------------------*/
  2353. andi. T1,L, 4 /* T1 = L & 4 */
  2354. ble CGEMM_L1x2_SUB2_2 /* bit 4 clear: skip */
  2355. LOAD1x2_2 /* inline macros, defined outside this chunk */
  2356. KERNEL1x2_L2 32,16, 0,0
  2357. KERNEL1x2_E2 32,16, 1,1
  2358. MY_ALIGN
  2359. CGEMM_L1x2_SUB2_2:
  2360. /*----------------------------------------*/
  2361. andi. T1,L, 2 /* T1 = L & 2 */
  2362. ble CGEMM_L1x2_SUB2_1 /* bit 2 clear: skip */
  2363. LOAD1x2_2 /* inline macros, defined outside this chunk */
  2364. KERNEL1x2_E2 32,16, 0,1
  2365. MY_ALIGN
  2366. CGEMM_L1x2_SUB2_1:
  2367. /*----------------------------------------*/
  2368. andi. T1,L, 1 /* T1 = L & 1 */
  2369. ble CGEMM_L1x2_SAVE /* bit 1 clear: nothing left to accumulate */
  2370. KERNEL1x2 /* inline macro, defined outside this chunk */
  2371. MY_ALIGN
  2372. CGEMM_L1x2_SAVE: /* every path of the 1x2 tile converges here */
  2373. /*----------------------------------------*/
  2374. SAVE1x2 /* macro defined outside this chunk; presumably writes the accumulated tile to C - confirm */
  2375. #if defined(TRMMKERNEL)
  2376. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 /* TRMM-only macro, defined outside this chunk */
  2377. #endif
  2378. CGEMM_L1x2_END:
  2379. /*----------------------------------------*/
  2380. CGEMM_L1x1_BEGIN: /* 1x1 tile: executed only when bit 0 (value 1) of M is set */
  2381. /*----------------------------------------*/
  2382. andi. T1, M, 1 /* T1 = M & 1; andi. never yields a negative value, so ble means "bit clear" */
  2383. ble CGEMM_L1x1_END /* bit clear: skip the whole 1x1 tile */
  2384. #if defined(TRMMKERNEL)
  2385. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 /* TRMM-only macro, defined outside this chunk */
  2386. #else
  2387. mr BO, B /* BO = B */
  2388. #endif
  2389. #if defined(TRMMKERNEL)
  2390. REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 /* macro defined outside this chunk; T6 is used below in place of K */
  2391. mr T1, T6 /* T1 = T6 */
  2392. addi T1,T1, -2 /* T1 = T6 - 2 */
  2393. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (arithmetic shift, i.e. groups of 32); record form sets CR0 */
  2394. #else
  2395. mr T1, K /* T1 = K */
  2396. addi T1,T1, -2 /* T1 = K - 2 */
  2397. srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (arithmetic shift, i.e. groups of 32); record form sets CR0 */
  2398. #endif
  2399. ZERO1x1 /* macro defined outside this chunk; the ble below still relies on CR0 from srawi., so it presumably leaves CR0 alone - confirm */
  2400. ble CGEMM_L1x1_SUB0 /* T8 <= 0: take the short-count path */
  2401. bl CGEMM_1x1_LMAIN_SUB /* call main-loop routine defined outside this chunk */
  2402. andi. L, T1, 31 /* L = T1 & 31 (the remainder count if T1 still holds count-2 after the call - confirm) */
  2403. ble CGEMM_L1x1_SAVE /* no remainder: store the tile */
  2404. b CGEMM_L1x1_SUB2 /* otherwise process the remainder bit by bit */
  2405. CGEMM_L1x1_SUB0: /* short-count path: block count T8 was <= 0 */
  2406. /*----------------------------------------*/
  2407. #if defined(TRMMKERNEL)
  2408. andi. L, T6, 63 /* L = T6 & 63; the CR0 it records is replaced by the cmpwi below */
  2409. cmpwi T6,33 /* CR0 = T6 compared with 33 */
  2410. #else
  2411. andi. L, K, 63 /* L = K & 63; the CR0 it records is replaced by the cmpwi below */
  2412. cmpwi K,33 /* CR0 = K compared with 33 */
  2413. #endif
  2414. li T8,1 /* T8 = 1 (moved to CTR below); li does not change CR0 */
  2415. bne CMP1x1_32K /* count != 33: go test for 32 */
  2416. addi BO,BO,-8 /* count == 33: step BO back 8 bytes ... */
  2417. addi AO,AO,-8 /* ... and AO back 8 bytes; presumably offset by the 8,8 displacements passed to LOAD1x1O - confirm in macro */
  2418. LOAD1x1O 8,8 /* macro defined outside this chunk */
  2419. END1x1_WITHOUT_ADD /* macro defined outside this chunk */
  2420. LOAD1x1_2O 16, 16 /* macro defined outside this chunk */
  2421. mtctr T8 /* CTR = 1 */
  2422. bl CGEMM_L1x1_K32 /* call routine defined outside this chunk; presumably loops on CTR - confirm */
  2423. b CGEMM_L1x1_SAVE /* count == 33 handled: store the tile */
  2424. CMP1x1_32K: /* count was not 33: special-case exactly 32 */
  2425. /*----------------------------------------*/
  2426. #if defined(TRMMKERNEL)
  2427. cmpwi T6,32 /* CR0 = T6 compared with 32 */
  2428. #else
  2429. cmpwi K,32 /* CR0 = K compared with 32 */
  2430. #endif
  2431. bne CGEMM_L1x1_SUB2 /* neither 33 nor 32: decompose L bit by bit */
  2432. MY_ALIGN
  2433. mtctr T8 /* CTR = 1 (T8 was set by the li above) */
  2434. addi BO,BO,-16 /* step BO back 16 bytes ... */
  2435. addi AO,AO,-16 /* ... and AO back 16 bytes; presumably offset by the 16,16 displacements below - confirm in macro */
  2436. LOAD1x1_2O 16,16 /* macro defined outside this chunk */
  2437. bl CGEMM_L1x1_K32 /* call routine defined outside this chunk */
  2438. b CGEMM_L1x1_SAVE /* count == 32 handled: store the tile */
  2439. MY_ALIGN
  2440. MY_ALIGN
  2441. CGEMM_L1x1_SUB2: /* remainder handling: one test per bit of L (16, 8, 4, 2, 1) */
  2442. /*----------------------------------------*/
  2443. andi. T1,L, 16 /* T1 = L & 16 */
  2444. ble CGEMM_L1x1_SUB2_8 /* bit 16 clear: skip */
  2445. bl CGEMM_1x1_L16_SUB /* call routine defined outside this chunk (presumably 16 k-steps - confirm) */
  2446. MY_ALIGN
  2447. CGEMM_L1x1_SUB2_8:
  2448. /*----------------------------------------*/
  2449. andi. T1,L, 8 /* T1 = L & 8 */
  2450. ble CGEMM_L1x1_SUB2_4 /* bit 8 clear: skip */
  2451. bl CGEMM_1x1_L8_SUB /* call routine defined outside this chunk (presumably 8 k-steps - confirm) */
  2452. MY_ALIGN
  2453. CGEMM_L1x1_SUB2_4:
  2454. /*----------------------------------------*/
  2455. andi. T1,L, 4 /* T1 = L & 4 */
  2456. ble CGEMM_L1x1_SUB2_2 /* bit 4 clear: skip */
  2457. LOAD1x1_2 /* inline macros, defined outside this chunk */
  2458. KERNEL1x1_L2 16,16, 0,0
  2459. KERNEL1x1_E2 16,16, 1,1
  2460. MY_ALIGN
  2461. CGEMM_L1x1_SUB2_2:
  2462. /*----------------------------------------*/
  2463. andi. T1,L, 2 /* T1 = L & 2 */
  2464. ble CGEMM_L1x1_SUB2_1 /* bit 2 clear: skip */
  2465. LOAD1x1_2 /* inline macros, defined outside this chunk */
  2466. KERNEL1x1_E2 16,16, 0,1
  2467. MY_ALIGN
  2468. CGEMM_L1x1_SUB2_1:
  2469. /*----------------------------------------*/
  2470. andi. T1,L, 1 /* T1 = L & 1 */
  2471. ble CGEMM_L1x1_SAVE /* bit 1 clear: nothing left to accumulate */
  2472. KERNEL1x1 /* inline macro, defined outside this chunk */
  2473. MY_ALIGN
  2474. CGEMM_L1x1_SAVE: /* every path of the 1x1 tile converges here */
  2475. /*----------------------------------------*/
  2476. SAVE1x1 /* macro defined outside this chunk; presumably writes the accumulated tile to C - confirm */
  2477. #if defined(TRMMKERNEL)
  2478. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 /* TRMM-only macro, defined outside this chunk */
  2479. #endif
  2480. CGEMM_L1x1_END: /* all M-remainder tiles of this pass are done; advance B and the TRMM offset */
  2481. /*----------------------------------------*/
  2482. slwi T1, K, 3 /* T1 = K << 3, i.e. K * 8 */
  2483. add B, B, T1 /* B += K*8 bytes (presumably K single-precision complex values, one packed column - confirm) */
  2484. #if defined(TRMMKERNEL) && !defined(LEFT)
  2485. addi TEMP_REG, TEMP_REG, 1 /* TEMP_REG += 1; presumably the TRMM offset moving past the one column just processed - confirm */
  2486. #endif
  2487. CGEMM_L1_END: