You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

cgemm_logic_power9.S 66 kB


  1. /***************************************************************************
  2. Copyright (c) 2013-2019, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * Abdelrauf(quickwritereader@gmail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  34. #define MY_ALIGN .align 3
  35. b CGEMM_L4 /* skip the mini subroutines below; execution starts at the CGEMM_L4 main loop */
  36. /* MINI SUBROUTINES: k-loop bodies that the main loop enters with bl and that return with blr */
  37. /* 4x8 MAIN 128x+2 LOOP */
  38. CGEMM_L4x8_LMAIN_SUB:
  39. /* Main 4x8 k-loop. In: T8 = pass count (the caller computes it as (K-2)>>7). LOAD4x8_2/KERNEL4x8_L2/END4x8_2 are macros defined elsewhere. */
  40. mtctr T8 /* CTR = number of loop passes */
  41. LOAD4x8_2
  42. MY_ALIGN
  43. CGEMM_L4x8_LOOP:
  44. /* One pass = 64 KERNEL4x8_L2 invocations (index 0..63), presumably 2 k-steps each (128 per pass) - confirm against the macro. dcbt lines are prefetch hints. */
  45. dcbt AO, PRE
  46. dcbt BO, PRE
  47. KERNEL4x8_L2 128,64,0,0
  48. CGEMM_L4x8_K128:
  49. /* Secondary entry point: the K==128 and K==129 paths bl here with CTR=1, skipping the index-0 kernel above. */
  50. KERNEL4x8_L2 128,64,1,0
  51. dcbt AO, T2
  52. KERNEL4x8_L2 128,64,2,0
  53. KERNEL4x8_L2 128,64,3,0
  54. dcbt AO, T3
  55. dcbt BO, T2
  56. KERNEL4x8_L2 128,64,4,0
  57. KERNEL4x8_L2 128,64,5,0
  58. dcbt AO, T4
  59. KERNEL4x8_L2 128,64,6,0
  60. KERNEL4x8_L2 128,64,7,0
  61. dcbt AO, T5
  62. dcbt BO, T3
  63. KERNEL4x8_L2 128,64,8,0
  64. KERNEL4x8_L2 128,64,9,0
  65. KERNEL4x8_L2 128,64,10,0
  66. KERNEL4x8_L2 128,64,11,0
  67. dcbt BO, T4
  68. KERNEL4x8_L2 128,64,12,0
  69. KERNEL4x8_L2 128,64,13,0
  70. KERNEL4x8_L2 128,64,14,0
  71. KERNEL4x8_L2 128,64,15,0
  72. KERNEL4x8_L2 128,64,16,0
  73. KERNEL4x8_L2 128,64,17,0
  74. KERNEL4x8_L2 128,64,18,0
  75. KERNEL4x8_L2 128,64,19,0
  76. KERNEL4x8_L2 128,64,20,0
  77. KERNEL4x8_L2 128,64,21,0
  78. KERNEL4x8_L2 128,64,22,0
  79. KERNEL4x8_L2 128,64,23,0
  80. KERNEL4x8_L2 128,64,24,0
  81. KERNEL4x8_L2 128,64,25,0
  82. KERNEL4x8_L2 128,64,26,0
  83. KERNEL4x8_L2 128,64,27,0
  84. KERNEL4x8_L2 128,64,28,0
  85. KERNEL4x8_L2 128,64,29,0
  86. KERNEL4x8_L2 128,64,30,0
  87. KERNEL4x8_L2 128,64,31,0
  88. KERNEL4x8_L2 128,64,32,0
  89. KERNEL4x8_L2 128,64,33,0
  90. KERNEL4x8_L2 128,64,34,0
  91. KERNEL4x8_L2 128,64,35,0
  92. KERNEL4x8_L2 128,64,36,0
  93. KERNEL4x8_L2 128,64,37,0
  94. KERNEL4x8_L2 128,64,38,0
  95. KERNEL4x8_L2 128,64,39,0
  96. KERNEL4x8_L2 128,64,40,0
  97. KERNEL4x8_L2 128,64,41,0
  98. KERNEL4x8_L2 128,64,42,0
  99. KERNEL4x8_L2 128,64,43,0
  100. KERNEL4x8_L2 128,64,44,0
  101. KERNEL4x8_L2 128,64,45,0
  102. KERNEL4x8_L2 128,64,46,0
  103. KERNEL4x8_L2 128,64,47,0
  104. KERNEL4x8_L2 128,64,48,0
  105. KERNEL4x8_L2 128,64,49,0
  106. KERNEL4x8_L2 128,64,50,0
  107. KERNEL4x8_L2 128,64,51,0
  108. KERNEL4x8_L2 128,64,52,0
  109. KERNEL4x8_L2 128,64,53,0
  110. KERNEL4x8_L2 128,64,54,0
  111. KERNEL4x8_L2 128,64,55,0
  112. KERNEL4x8_L2 128,64,56,0
  113. KERNEL4x8_L2 128,64,57,0
  114. KERNEL4x8_L2 128,64,58,0
  115. KERNEL4x8_L2 128,64,59,0
  116. KERNEL4x8_L2 128,64,60,0
  117. KERNEL4x8_L2 128,64,61,0
  118. KERNEL4x8_L2 128,64,62,0
  119. KERNEL4x8_L2 128,64,63,1
  120. bdnz CGEMM_L4x8_LOOP /* CTR--, run another pass while CTR != 0 */
  121. MY_ALIGN
  122. CGEMM_L4x8_LOOP_END:
  123. /* All passes done. END4x8_2 presumably finishes the two k-steps loaded ahead (the "+2" of "128x+2") - confirm against the macro. */
  124. END4x8_2
  125. blr /* return to the bl call site */
  126. MY_ALIGN
  127. CGEMM_4x8_L64_SUB:
  128. /* Straight-line 4x8 remainder block, called when bit 64 of L is set: 31 KERNEL4x8_L2 + 1 KERNEL4x8_E2 (presumably 64 k-steps - confirm). */
  129. LOAD4x8_2
  130. dcbt AO, PRE
  131. dcbt BO, PRE
  132. KERNEL4x8_L2 128,64,0,0
  133. KERNEL4x8_L2 128,64,1,0
  134. dcbt AO, T2
  135. KERNEL4x8_L2 128,64,2,0
  136. KERNEL4x8_L2 128,64,3,0
  137. dcbt AO, T3
  138. dcbt BO, T2
  139. KERNEL4x8_L2 128,64,4,0
  140. KERNEL4x8_L2 128,64,5,0
  141. dcbt AO, T4
  142. KERNEL4x8_L2 128,64,6,0
  143. KERNEL4x8_L2 128,64,7,0
  144. dcbt AO, T5
  145. dcbt BO, T3
  146. KERNEL4x8_L2 128,64,8,0
  147. KERNEL4x8_L2 128,64,9,0
  148. KERNEL4x8_L2 128,64,10,0
  149. KERNEL4x8_L2 128,64,11,0
  150. dcbt BO, T4
  151. KERNEL4x8_L2 128,64,12,0
  152. KERNEL4x8_L2 128,64,13,0
  153. KERNEL4x8_L2 128,64,14,0
  154. KERNEL4x8_L2 128,64,15,0
  155. KERNEL4x8_L2 128,64,16,0
  156. KERNEL4x8_L2 128,64,17,0
  157. KERNEL4x8_L2 128,64,18,0
  158. KERNEL4x8_L2 128,64,19,0
  159. KERNEL4x8_L2 128,64,20,0
  160. KERNEL4x8_L2 128,64,21,0
  161. KERNEL4x8_L2 128,64,22,0
  162. KERNEL4x8_L2 128,64,23,0
  163. KERNEL4x8_L2 128,64,24,0
  164. KERNEL4x8_L2 128,64,25,0
  165. KERNEL4x8_L2 128,64,26,0
  166. KERNEL4x8_L2 128,64,27,0
  167. KERNEL4x8_L2 128,64,28,0
  168. KERNEL4x8_L2 128,64,29,0
  169. KERNEL4x8_L2 128,64,30,0
  170. KERNEL4x8_E2 128,64,31,1
  171. blr /* return to the bl call site */
  172. MY_ALIGN
  173. CGEMM_4x8_L32_SUB:
  174. /* Straight-line 4x8 remainder block, called when bit 32 of L is set: 15 KERNEL4x8_L2 + 1 KERNEL4x8_E2 (presumably 32 k-steps - confirm). */
  175. LOAD4x8_2
  176. dcbt AO, PRE
  177. dcbt BO, PRE
  178. KERNEL4x8_L2 128,64,0,0
  179. KERNEL4x8_L2 128,64,1,0
  180. dcbt AO, T2
  181. KERNEL4x8_L2 128,64,2,0
  182. KERNEL4x8_L2 128,64,3,0
  183. dcbt AO, T3
  184. dcbt BO, T2
  185. KERNEL4x8_L2 128,64,4,0
  186. KERNEL4x8_L2 128,64,5,0
  187. dcbt AO, T4
  188. KERNEL4x8_L2 128,64,6,0
  189. KERNEL4x8_L2 128,64,7,0
  190. dcbt AO, T5
  191. dcbt BO, T3
  192. KERNEL4x8_L2 128,64,8,0
  193. KERNEL4x8_L2 128,64,9,0
  194. KERNEL4x8_L2 128,64,10,0
  195. KERNEL4x8_L2 128,64,11,0
  196. dcbt BO, T4
  197. KERNEL4x8_L2 128,64,12,0
  198. KERNEL4x8_L2 128,64,13,0
  199. KERNEL4x8_L2 128,64,14,0
  200. KERNEL4x8_E2 128,64,15,1
  201. blr /* return to the bl call site */
  202. MY_ALIGN
  203. CGEMM_4x8_L16_SUB:
  204. /* Straight-line 4x8 remainder block, called when bit 16 of L is set: 7 KERNEL4x8_L2 + 1 KERNEL4x8_E2 (presumably 16 k-steps - confirm). */
  205. LOAD4x8_2
  206. dcbt AO, PRE
  207. dcbt BO, PRE
  208. KERNEL4x8_L2 128,64,0,0
  209. KERNEL4x8_L2 128,64,1,0
  210. dcbt AO, T2
  211. KERNEL4x8_L2 128,64,2,0
  212. KERNEL4x8_L2 128,64,3,0
  213. dcbt AO, T3
  214. dcbt BO, T2
  215. KERNEL4x8_L2 128,64,4,0
  216. KERNEL4x8_L2 128,64,5,0
  217. dcbt AO, T4
  218. KERNEL4x8_L2 128,64,6,0
  219. KERNEL4x8_E2 128,64,7,1
  220. blr /* return to the bl call site */
  221. MY_ALIGN
  222. CGEMM_4x4_LMAIN_SUB:
  223. /* Main 4x4 k-loop. In: T8 = pass count (the caller computes it as (K-2)>>5). LOAD4x4_2/KERNEL4x4_L2/END4x4_2 are macros defined elsewhere. */
  224. mtctr T8 /* CTR = number of loop passes */
  225. LOAD4x4_2
  226. MY_ALIGN
  227. CGEMM_L4x4_LOOP:
  228. /* One pass = 16 KERNEL4x4_L2 invocations (index 0..15), presumably 2 k-steps each (32 per pass) - confirm against the macro. */
  229. KERNEL4x4_L2 64,64,0,0
  230. CGEMM_L4x4_K32:
  231. /* Secondary entry point: the K==32 and K==33 paths bl here with CTR=1, skipping the index-0 kernel above. */
  232. KERNEL4x4_L2 64,64,1,0
  233. KERNEL4x4_L2 64,64,2,0
  234. KERNEL4x4_L2 64,64,3,0
  235. KERNEL4x4_L2 64,64,4,0
  236. KERNEL4x4_L2 64,64,5,0
  237. KERNEL4x4_L2 64,64,6,0
  238. KERNEL4x4_L2 64,64,7,0
  239. KERNEL4x4_L2 64,64,8,0
  240. KERNEL4x4_L2 64,64,9,0
  241. KERNEL4x4_L2 64,64,10,0
  242. KERNEL4x4_L2 64,64,11,0
  243. KERNEL4x4_L2 64,64,12,0
  244. KERNEL4x4_L2 64,64,13,0
  245. KERNEL4x4_L2 64,64,14,0
  246. KERNEL4x4_L2 64,64,15,1
  247. bdnz CGEMM_L4x4_LOOP /* CTR--, run another pass while CTR != 0 */
  248. MY_ALIGN
  249. CGEMM_L4x4_LOOP_END:
  250. /* All passes done. END4x4_2 presumably finishes the two k-steps loaded ahead - confirm against the macro. */
  251. END4x4_2
  252. blr /* return to the bl call site */
  253. MY_ALIGN
  254. CGEMM_4x4_L16_SUB:
  255. /* Straight-line 4x4 remainder block, called when bit 16 of L is set: 7 KERNEL4x4_L2 + 1 KERNEL4x4_E2 (presumably 16 k-steps - confirm). */
  256. LOAD4x4_2
  257. KERNEL4x4_L2 64,64,0,0
  258. KERNEL4x4_L2 64,64,1,0
  259. KERNEL4x4_L2 64,64,2,0
  260. KERNEL4x4_L2 64,64,3,0
  261. KERNEL4x4_L2 64,64,4,0
  262. KERNEL4x4_L2 64,64,5,0
  263. KERNEL4x4_L2 64,64,6,0
  264. KERNEL4x4_E2 64,64,7,1
  265. blr /* return to the bl call site */
  266. MY_ALIGN
  267. CGEMM_4x4_L8_SUB:
  268. /* Straight-line 4x4 remainder block, called when bit 8 of L is set: 3 KERNEL4x4_L2 + 1 KERNEL4x4_E2 (presumably 8 k-steps - confirm). */
  269. LOAD4x4_2
  270. KERNEL4x4_L2 64,64,0,0
  271. KERNEL4x4_L2 64,64,1,0
  272. KERNEL4x4_L2 64,64,2,0
  273. KERNEL4x4_E2 64,64,3,1
  274. blr /* return to the bl call site */
  275. CGEMM_4x2_LMAIN_SUB:
  276. /* Main 4x2 k-loop. In: T8 = pass count (the caller computes it as (K-2)>>5). LOAD4x2_2/KERNEL4x2_L2/END4x2_2 are macros defined elsewhere. */
  277. mtctr T8 /* CTR = number of loop passes */
  278. LOAD4x2_2
  279. MY_ALIGN
  280. CGEMM_L4x2_LOOP:
  281. /* One pass = 16 KERNEL4x2_L2 invocations (index 0..15), presumably 2 k-steps each (32 per pass) - confirm against the macro. */
  282. KERNEL4x2_L2 32,64,0,0
  283. CGEMM_L4x2_K32:
  284. /* Secondary entry point: the K==32 and K==33 paths bl here with CTR=1, skipping the index-0 kernel above. */
  285. KERNEL4x2_L2 32,64,1,0
  286. KERNEL4x2_L2 32,64,2,0
  287. KERNEL4x2_L2 32,64,3,0
  288. KERNEL4x2_L2 32,64,4,0
  289. KERNEL4x2_L2 32,64,5,0
  290. KERNEL4x2_L2 32,64,6,0
  291. KERNEL4x2_L2 32,64,7,0
  292. KERNEL4x2_L2 32,64,8,0
  293. KERNEL4x2_L2 32,64,9,0
  294. KERNEL4x2_L2 32,64,10,0
  295. KERNEL4x2_L2 32,64,11,0
  296. KERNEL4x2_L2 32,64,12,0
  297. KERNEL4x2_L2 32,64,13,0
  298. KERNEL4x2_L2 32,64,14,0
  299. KERNEL4x2_L2 32,64,15,1
  300. bdnz CGEMM_L4x2_LOOP /* CTR--, run another pass while CTR != 0 */
  301. MY_ALIGN
  302. CGEMM_L4x2_LOOP_END:
  303. /* All passes done. END4x2_2 presumably finishes the two k-steps loaded ahead - confirm against the macro. */
  304. END4x2_2
  305. blr /* return to the bl call site */
  306. MY_ALIGN
  307. CGEMM_4x2_L16_SUB:
  308. /* Straight-line 4x2 remainder block, called when bit 16 of L is set: 7 KERNEL4x2_L2 + 1 KERNEL4x2_E2 (presumably 16 k-steps - confirm). */
  309. LOAD4x2_2
  310. KERNEL4x2_L2 32,64,0,0
  311. KERNEL4x2_L2 32,64,1,0
  312. KERNEL4x2_L2 32,64,2,0
  313. KERNEL4x2_L2 32,64,3,0
  314. KERNEL4x2_L2 32,64,4,0
  315. KERNEL4x2_L2 32,64,5,0
  316. KERNEL4x2_L2 32,64,6,0
  317. KERNEL4x2_E2 32,64,7,1
  318. blr /* return to the bl call site */
  319. MY_ALIGN
  320. CGEMM_4x2_L8_SUB:
  321. /* Straight-line 4x2 remainder block, called when bit 8 of L is set: 3 KERNEL4x2_L2 + 1 KERNEL4x2_E2 (presumably 8 k-steps - confirm). */
  322. LOAD4x2_2
  323. KERNEL4x2_L2 32,64,0,0
  324. KERNEL4x2_L2 32,64,1,0
  325. KERNEL4x2_L2 32,64,2,0
  326. KERNEL4x2_E2 32,64,3,1
  327. blr /* return to the bl call site */
  328. CGEMM_4x1_LMAIN_SUB:
  329. /* Main 4x1 k-loop. In: T8 = pass count (the caller computes it as (K-2)>>5). LOAD4x1_2/KERNEL4x1_L2/END4x1_2 are macros defined elsewhere. */
  330. mtctr T8 /* CTR = number of loop passes */
  331. LOAD4x1_2
  332. MY_ALIGN
  333. CGEMM_L4x1_LOOP:
  334. /* One pass = 16 KERNEL4x1_L2 invocations (index 0..15), presumably 2 k-steps each (32 per pass) - confirm against the macro. */
  335. KERNEL4x1_L2 16,64,0,0
  336. CGEMM_L4x1_K32:
  337. /* Secondary entry point: the K==32 and K==33 paths bl here with CTR=1, skipping the index-0 kernel above. */
  338. KERNEL4x1_L2 16,64,1,0
  339. KERNEL4x1_L2 16,64,2,0
  340. KERNEL4x1_L2 16,64,3,0
  341. KERNEL4x1_L2 16,64,4,0
  342. KERNEL4x1_L2 16,64,5,0
  343. KERNEL4x1_L2 16,64,6,0
  344. KERNEL4x1_L2 16,64,7,0
  345. KERNEL4x1_L2 16,64,8,0
  346. KERNEL4x1_L2 16,64,9,0
  347. KERNEL4x1_L2 16,64,10,0
  348. KERNEL4x1_L2 16,64,11,0
  349. KERNEL4x1_L2 16,64,12,0
  350. KERNEL4x1_L2 16,64,13,0
  351. KERNEL4x1_L2 16,64,14,0
  352. KERNEL4x1_L2 16,64,15,1
  353. bdnz CGEMM_L4x1_LOOP /* CTR--, run another pass while CTR != 0 */
  354. MY_ALIGN
  355. CGEMM_L4x1_LOOP_END:
  356. /* All passes done. END4x1_2 presumably finishes the two k-steps loaded ahead - confirm against the macro. */
  357. END4x1_2
  358. blr /* return to the bl call site */
  359. MY_ALIGN
  360. CGEMM_4x1_L16_SUB:
  361. /* Straight-line 4x1 remainder block, called when bit 16 of L is set: 7 KERNEL4x1_L2 + 1 KERNEL4x1_E2 (presumably 16 k-steps - confirm). */
  362. LOAD4x1_2
  363. KERNEL4x1_L2 16,64,0,0
  364. KERNEL4x1_L2 16,64,1,0
  365. KERNEL4x1_L2 16,64,2,0
  366. KERNEL4x1_L2 16,64,3,0
  367. KERNEL4x1_L2 16,64,4,0
  368. KERNEL4x1_L2 16,64,5,0
  369. KERNEL4x1_L2 16,64,6,0
  370. KERNEL4x1_E2 16,64,7,1
  371. blr /* return to the bl call site */
  372. MY_ALIGN
  373. CGEMM_4x1_L8_SUB:
  374. /* Straight-line 4x1 remainder block, called when bit 8 of L is set: 3 KERNEL4x1_L2 + 1 KERNEL4x1_E2 (presumably 8 k-steps - confirm). */
  375. LOAD4x1_2
  376. KERNEL4x1_L2 16,64,0,0
  377. KERNEL4x1_L2 16,64,1,0
  378. KERNEL4x1_L2 16,64,2,0
  379. KERNEL4x1_E2 16,64,3,1
  380. blr /* return to the bl call site */
  381. /* MAIN LOOP BEGINS: the N/4 panel loop, reached from the "b CGEMM_L4" at the top of the file */
  382. MY_ALIGN
  383. CGEMM_L4:
  384. /* J = N >> 2 is the panel counter; when it is <= 0 the whole L4 section is skipped. */
  385. #if defined(TRMMKERNEL) && !defined(LEFT)
  386. neg TEMP_REG, OFFSET
  387. #endif
  388. srawi. J, N, 2 /* J = N / 4; sets CR0 for the ble below */
  389. ble CGEMM_L4_END
  390. CGEMM_L4_BEGIN:
  391. /* Per-panel setup: CO = current C, C advances by 4*LDC, AO restarts at A, I = M >> 3 is the 8-wide tile counter. */
  392. mr CO, C
  393. slwi T1, LDC , 2 /* T1 = LDC * 4 */
  394. add T2,C,LDC /* T2 = C + LDC, used only as a prefetch address below */
  395. mr AO, A
  396. add C, C, T1 /* C now points at the next panel */
  397. #if defined(TRMMKERNEL) && defined(LEFT)
  398. mr TEMP_REG, OFFSET /*off = offset;*/
  399. #endif
  400. srawi. I, M, 3 /* I = M / 8; sets CR0 for the ble below */
  401. ble CGEMM_L4x8_END
  402. dcbt CO,r0 /*just prefetch*/
  403. dcbt T2,r0
  404. CGEMM_L4x8_BEGIN:
  405. /* 4x8 tile, executed I times per panel. Picks the k-loop strategy from K (T6 when TRMMKERNEL), accumulates, then SAVE4x8. */
  406. #if defined(TRMMKERNEL)
  407. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
  408. #else
  409. mr BO, B
  410. dcbt B, r0
  411. #endif
  412. dcbt AO, r0
  413. #if defined(TRMMKERNEL)
  414. REFRESH_TEMP_BK T6,K,TEMP_REG,8,4
  415. mr T1, T6
  416. /* T2/T3: dcbt offsets used inside the 4x8 k-loops */
  417. li T2, 1024
  418. li T3, 1024+512
  419. addi T1,T1, -2 /* T1 = T6 - 2 */
  420. /* T4/T5: further dcbt offsets used inside the 4x8 k-loops */
  421. li T4, 2048
  422. li T5, 2048+512
  423. srawi. T8, T1, 7 /* T8 = (T6-2) >> 7: quotient by 128, i.e. main-loop pass count; sets CR0 */
  424. #else
  425. mr T1, K
  426. /* T2/T3: dcbt offsets used inside the 4x8 k-loops */
  427. li T2, 1024
  428. li T3, 1024+512
  429. addi T1,T1, -2 /* T1 = K - 2 */
  430. /* T4/T5: further dcbt offsets used inside the 4x8 k-loops */
  431. li T4, 2048
  432. li T5, 2048+512
  433. srawi. T8, T1, 7 /* T8 = (K-2) >> 7: quotient by 128, i.e. main-loop pass count; sets CR0 */
  434. #endif
  435. ZERO4x8
  436. ble CGEMM_L4x8_SUB0 /* tests CR0 from the srawi. above, so ZERO4x8 is expected to leave CR0 alone */
  437. bl CGEMM_L4x8_LMAIN_SUB
  438. andi. L, T1, 127 /* L = (K-2) & 127: steps left over after the main loop */
  439. ble CGEMM_L4x8_SAVE /* nothing left */
  440. b CGEMM_L4x8_SUB2
  441. CGEMM_L4x8_SUB0:
  442. /* Reached when T8 <= 0 (fewer than 130 steps): no main-loop pass. 129 and 128 get dedicated paths, the rest goes to the SUB2 chain. */
  443. #if defined(TRMMKERNEL)
  444. andi. L, T6, 255
  445. cmpwi T6,129
  446. #else
  447. andi. L, K, 255
  448. cmpwi K,129
  449. #endif
  450. li T8,1 /* one pass through the loop body for the 128/129 paths */
  451. bne CMP4x8_128K
  452. addi BO,BO,-32 /* NOTE(review): pointers are rewound before the offset-taking LOAD4x8O/LOAD4x8_2O macros - confirm against the macros */
  453. addi AO,AO,-64
  454. LOAD4x8O 64,32
  455. END4x8_WITHOUT_ADD
  456. LOAD4x8_2O 128, 64
  457. mtctr T8
  458. bl CGEMM_L4x8_K128
  459. b CGEMM_L4x8_SAVE
  460. CMP4x8_128K:
  461. /* Exactly 128 steps: enter the loop body at CGEMM_L4x8_K128 with CTR=1; otherwise fall to the SUB2 chain. */
  462. #if defined(TRMMKERNEL)
  463. cmpwi T6,128
  464. #else
  465. cmpwi K,128
  466. #endif
  467. bne CGEMM_L4x8_SUB2
  468. MY_ALIGN
  469. mtctr T8
  470. addi BO,BO,-64
  471. addi AO,AO,-128
  472. LOAD4x8_2O 128,64
  473. bl CGEMM_L4x8_K128
  474. b CGEMM_L4x8_SAVE
  475. MY_ALIGN
  476. CGEMM_L4x8_SUB2:
  477. /* Remainder chain: tests bits 64,32,16,8,4,2,1 of L in turn and runs the matching fixed-size block for each set bit. */
  478. andi. T1,L, 64
  479. ble CGEMM_L4x8_SUB2_32
  480. bl CGEMM_4x8_L64_SUB
  481. MY_ALIGN
  482. CGEMM_L4x8_SUB2_32:
  483. /* bit 32 of L */
  484. andi. T1,L, 32
  485. ble CGEMM_L4x8_SUB2_16
  486. bl CGEMM_4x8_L32_SUB
  487. MY_ALIGN
  488. CGEMM_L4x8_SUB2_16:
  489. /* bit 16 of L */
  490. andi. T1,L, 16
  491. ble CGEMM_L4x8_SUB2_8
  492. bl CGEMM_4x8_L16_SUB
  493. MY_ALIGN
  494. CGEMM_L4x8_SUB2_8:
  495. /* bit 8 of L: handled inline with 4 kernel invocations */
  496. andi. T1,L, 8
  497. ble CGEMM_L4x8_SUB2_4
  498. LOAD4x8_2
  499. KERNEL4x8_L2 128,64, 0,0
  500. KERNEL4x8_L2 128,64, 1,0
  501. KERNEL4x8_L2 128,64, 2,0
  502. KERNEL4x8_E2 128,64, 3,1
  503. MY_ALIGN
  504. CGEMM_L4x8_SUB2_4:
  505. /* bit 4 of L: handled inline with 2 kernel invocations */
  506. andi. T1,L, 4
  507. ble CGEMM_L4x8_SUB2_2
  508. LOAD4x8_2
  509. KERNEL4x8_L2 128,64, 0,0
  510. KERNEL4x8_E2 128,64, 1,1
  511. MY_ALIGN
  512. CGEMM_L4x8_SUB2_2:
  513. /* bit 2 of L: handled inline with 1 kernel invocation */
  514. andi. T1,L, 2
  515. ble CGEMM_L4x8_SUB2_1
  516. LOAD4x8_2
  517. KERNEL4x8_E2 128,64, 0,1
  518. MY_ALIGN
  519. CGEMM_L4x8_SUB2_1:
  520. /* bit 1 of L: single KERNEL4x8 */
  521. andi. T1,L, 1
  522. ble CGEMM_L4x8_SAVE
  523. KERNEL4x8
  524. MY_ALIGN
  525. CGEMM_L4x8_SAVE:
  526. /* Store the tile and loop. I is decremented before the save; the bgt below still tests that CR0, so SAVE4x8/REFRESH_AFTER_SAVE are expected to leave CR0 alone. */
  527. addic. I, I, -1
  528. MY_ALIGN
  529. SAVE4x8
  530. #if defined(TRMMKERNEL)
  531. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4
  532. #endif
  533. bgt CGEMM_L4x8_BEGIN
  534. andi. T2, M, 7 /* M & 7 == 0: no narrower tiles, go straight to the end of the panel */
  535. ble CGEMM_L4x1_END
  536. andi. T1, M, 4 /* M & 4 == 0: skip the 4-wide tile */
  537. ble CGEMM_L4x4_END
  538. b CGEMM_L4x4_BEGIN
  539. MY_ALIGN
  540. CGEMM_L4x8_END:
  541. /* Reached directly when M >> 3 <= 0 (no 8-wide tiles in this panel). */
  542. CGEMM_L4x4_BEGIN:
  543. /* 4x4 tile, executed at most once per panel (when bit 4 of M is set). Same structure as the 4x8 tile with 32-step passes. */
  544. andi. T2, M, 7 /* M & 7 == 0: nothing narrower than 8 remains */
  545. ble CGEMM_L4x1_END
  546. andi. T1, M, 4
  547. ble CGEMM_L4x4_END
  548. #if defined(TRMMKERNEL)
  549. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
  550. #else
  551. mr BO, B
  552. #endif
  553. #if defined(TRMMKERNEL)
  554. REFRESH_TEMP_BK T6,K,TEMP_REG,4,4
  555. mr T1, T6
  556. addi T1,T1, -2 /* T1 = T6 - 2 */
  557. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: quotient by 32, i.e. main-loop pass count; sets CR0 */
  558. #else
  559. mr T1, K
  560. addi T1,T1, -2 /* T1 = K - 2 */
  561. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: quotient by 32, i.e. main-loop pass count; sets CR0 */
  562. #endif
  563. ZERO4x4
  564. ble CGEMM_L4x4_SUB0 /* tests CR0 from the srawi. above, so ZERO4x4 is expected to leave CR0 alone */
  565. bl CGEMM_4x4_LMAIN_SUB
  566. andi. L, T1, 31 /* L = (K-2) & 31: steps left over after the main loop */
  567. ble CGEMM_L4x4_SAVE /* nothing left */
  568. b CGEMM_L4x4_SUB2
  569. CGEMM_L4x4_SUB0:
  570. /* Reached when T8 <= 0 (fewer than 34 steps): no main-loop pass. 33 and 32 get dedicated paths, the rest goes to the SUB2 chain. */
  571. #if defined(TRMMKERNEL)
  572. andi. L, T6, 63
  573. cmpwi T6,33
  574. #else
  575. andi. L, K, 63
  576. cmpwi K,33
  577. #endif
  578. li T8,1 /* one pass through the loop body for the 32/33 paths */
  579. bne CMP4x4_32K
  580. addi BO,BO,-32 /* NOTE(review): pointers are rewound before the offset-taking LOAD4x4O/LOAD4x4_2O macros - confirm against the macros */
  581. addi AO,AO,-32
  582. LOAD4x4O 32,32
  583. END4x4_WITHOUT_ADD
  584. LOAD4x4_2O 64, 64
  585. mtctr T8
  586. bl CGEMM_L4x4_K32
  587. b CGEMM_L4x4_SAVE
  588. CMP4x4_32K:
  589. /* Exactly 32 steps: enter the loop body at CGEMM_L4x4_K32 with CTR=1; otherwise fall to the SUB2 chain. */
  590. #if defined(TRMMKERNEL)
  591. cmpwi T6,32
  592. #else
  593. cmpwi K,32
  594. #endif
  595. bne CGEMM_L4x4_SUB2
  596. MY_ALIGN
  597. mtctr T8
  598. addi BO,BO,-64
  599. addi AO,AO,-64
  600. LOAD4x4_2O 64,64
  601. bl CGEMM_L4x4_K32
  602. b CGEMM_L4x4_SAVE
  603. MY_ALIGN
  604. MY_ALIGN
  605. CGEMM_L4x4_SUB2:
  606. /* Remainder chain: tests bits 16,8,4,2,1 of L in turn and runs the matching fixed-size block for each set bit. */
  607. andi. T1,L, 16
  608. ble CGEMM_L4x4_SUB2_8
  609. bl CGEMM_4x4_L16_SUB
  610. MY_ALIGN
  611. CGEMM_L4x4_SUB2_8:
  612. /* bit 8 of L */
  613. andi. T1,L, 8
  614. ble CGEMM_L4x4_SUB2_4
  615. bl CGEMM_4x4_L8_SUB
  616. MY_ALIGN
  617. CGEMM_L4x4_SUB2_4:
  618. /* bit 4 of L: handled inline with 2 kernel invocations */
  619. andi. T1,L, 4
  620. ble CGEMM_L4x4_SUB2_2
  621. LOAD4x4_2
  622. KERNEL4x4_L2 64,64, 0,0
  623. KERNEL4x4_E2 64,64, 1,1
  624. MY_ALIGN
  625. CGEMM_L4x4_SUB2_2:
  626. /* bit 2 of L: handled inline with 1 kernel invocation */
  627. andi. T1,L, 2
  628. ble CGEMM_L4x4_SUB2_1
  629. LOAD4x4_2
  630. KERNEL4x4_E2 64,64, 0,1
  631. MY_ALIGN
  632. CGEMM_L4x4_SUB2_1:
  633. /* bit 1 of L: single KERNEL4x4 */
  634. andi. T1,L, 1
  635. ble CGEMM_L4x4_SAVE
  636. KERNEL4x4
  637. CGEMM_L4x4_SAVE:
  638. /* Store the 4x4 tile; there is no loop back, execution falls through to the 4x2 tile. */
  639. SAVE4x4
  640. #if defined(TRMMKERNEL)
  641. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4
  642. #endif
  643. CGEMM_L4x4_END:
  644. /* Target used when the 4-wide tile is skipped; falls into the 2-wide tile. */
  645. CGEMM_L4x2_BEGIN:
  646. /* 4x2 tile, executed at most once per panel (when bit 2 of M is set). Same structure as the 4x4 tile. */
  647. andi. T1, M, 2
  648. ble CGEMM_L4x2_END
  649. #if defined(TRMMKERNEL)
  650. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
  651. #else
  652. mr BO, B
  653. #endif
  654. #if defined(TRMMKERNEL)
  655. REFRESH_TEMP_BK T6,K,TEMP_REG,2,4
  656. mr T1, T6
  657. addi T1,T1, -2 /* T1 = T6 - 2 */
  658. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: quotient by 32, i.e. main-loop pass count; sets CR0 */
  659. #else
  660. mr T1, K
  661. addi T1,T1, -2 /* T1 = K - 2 */
  662. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: quotient by 32, i.e. main-loop pass count; sets CR0 */
  663. #endif
  664. ZERO4x2
  665. ble CGEMM_L4x2_SUB0 /* tests CR0 from the srawi. above, so ZERO4x2 is expected to leave CR0 alone */
  666. bl CGEMM_4x2_LMAIN_SUB
  667. andi. L, T1, 31 /* L = (K-2) & 31: steps left over after the main loop */
  668. ble CGEMM_L4x2_SAVE /* nothing left */
  669. b CGEMM_L4x2_SUB2
  670. CGEMM_L4x2_SUB0:
  671. /* Reached when T8 <= 0 (fewer than 34 steps): no main-loop pass. 33 and 32 get dedicated paths, the rest goes to the SUB2 chain. */
  672. #if defined(TRMMKERNEL)
  673. andi. L, T6, 63
  674. cmpwi T6,33
  675. #else
  676. andi. L, K, 63
  677. cmpwi K,33
  678. #endif
  679. li T8,1 /* one pass through the loop body for the 32/33 paths */
  680. bne CMP4x2_32K
  681. addi BO,BO,-32 /* NOTE(review): pointers are rewound before the offset-taking LOAD4x2O/LOAD4x2_2O macros - confirm against the macros */
  682. addi AO,AO,-16
  683. LOAD4x2O 16,32
  684. END4x2_WITHOUT_ADD
  685. LOAD4x2_2O 32, 64
  686. mtctr T8
  687. bl CGEMM_L4x2_K32
  688. b CGEMM_L4x2_SAVE
  689. CMP4x2_32K:
  690. /* Exactly 32 steps: enter the loop body at CGEMM_L4x2_K32 with CTR=1; otherwise fall to the SUB2 chain. */
  691. #if defined(TRMMKERNEL)
  692. cmpwi T6,32
  693. #else
  694. cmpwi K,32
  695. #endif
  696. bne CGEMM_L4x2_SUB2
  697. MY_ALIGN
  698. mtctr T8
  699. addi BO,BO,-64
  700. addi AO,AO,-32
  701. LOAD4x2_2O 32,64
  702. bl CGEMM_L4x2_K32
  703. b CGEMM_L4x2_SAVE
  704. MY_ALIGN
  705. MY_ALIGN
  706. CGEMM_L4x2_SUB2:
  707. /* Remainder chain: tests bits 16,8,4,2,1 of L in turn and runs the matching fixed-size block for each set bit. */
  708. andi. T1,L, 16
  709. ble CGEMM_L4x2_SUB2_8
  710. bl CGEMM_4x2_L16_SUB
  711. MY_ALIGN
  712. CGEMM_L4x2_SUB2_8:
  713. /* bit 8 of L */
  714. andi. T1,L, 8
  715. ble CGEMM_L4x2_SUB2_4
  716. bl CGEMM_4x2_L8_SUB
  717. MY_ALIGN
  718. CGEMM_L4x2_SUB2_4:
  719. /* bit 4 of L: handled inline with 2 kernel invocations */
  720. andi. T1,L, 4
  721. ble CGEMM_L4x2_SUB2_2
  722. LOAD4x2_2
  723. KERNEL4x2_L2 32,64, 0,0
  724. KERNEL4x2_E2 32,64, 1,1
  725. MY_ALIGN
  726. CGEMM_L4x2_SUB2_2:
  727. /* bit 2 of L: handled inline with 1 kernel invocation */
  728. andi. T1,L, 2
  729. ble CGEMM_L4x2_SUB2_1
  730. LOAD4x2_2
  731. KERNEL4x2_E2 32,64, 0,1
  732. MY_ALIGN
  733. CGEMM_L4x2_SUB2_1:
  734. /* bit 1 of L: single KERNEL4x2 */
  735. andi. T1,L, 1
  736. ble CGEMM_L4x2_SAVE
  737. KERNEL4x2
  738. MY_ALIGN
  739. CGEMM_L4x2_SAVE:
  740. /* Store the 4x2 tile; there is no loop back, execution falls through to the 4x1 tile. */
  741. SAVE4x2
  742. #if defined(TRMMKERNEL)
  743. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4
  744. #endif
  745. CGEMM_L4x2_END:
  746. /* Target used when the 2-wide tile is skipped; falls into the 1-wide tile. */
  747. CGEMM_L4x1_BEGIN:
  748. /* 4x1 tile, executed at most once per panel (when bit 1 of M is set). Same structure as the 4x4 tile. */
  749. andi. T1, M, 1
  750. ble CGEMM_L4x1_END
  751. #if defined(TRMMKERNEL)
  752. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
  753. #else
  754. mr BO, B
  755. #endif
  756. #if defined(TRMMKERNEL)
  757. REFRESH_TEMP_BK T6,K,TEMP_REG,1,4
  758. mr T1, T6
  759. addi T1,T1, -2 /* T1 = T6 - 2 */
  760. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: quotient by 32, i.e. main-loop pass count; sets CR0 */
  761. #else
  762. mr T1, K
  763. addi T1,T1, -2 /* T1 = K - 2 */
  764. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: quotient by 32, i.e. main-loop pass count; sets CR0 */
  765. #endif
  766. ZERO4x1
  767. ble CGEMM_L4x1_SUB0 /* tests CR0 from the srawi. above, so ZERO4x1 is expected to leave CR0 alone */
  768. bl CGEMM_4x1_LMAIN_SUB
  769. andi. L, T1, 31 /* L = (K-2) & 31: steps left over after the main loop */
  770. ble CGEMM_L4x1_SAVE /* nothing left */
  771. b CGEMM_L4x1_SUB2
  772. CGEMM_L4x1_SUB0:
  773. /* Reached when T8 <= 0 (fewer than 34 steps): no main-loop pass. 33 and 32 get dedicated paths, the rest goes to the SUB2 chain. */
  774. #if defined(TRMMKERNEL)
  775. andi. L, T6, 63
  776. cmpwi T6,33
  777. #else
  778. andi. L, K, 63
  779. cmpwi K,33
  780. #endif
  781. li T8,1 /* one pass through the loop body for the 32/33 paths */
  782. bne CMP4x1_32K
  783. addi BO,BO,-32 /* NOTE(review): pointers are rewound before the offset-taking LOAD4x1O/LOAD4x1_2O macros - confirm against the macros */
  784. addi AO,AO,-8
  785. LOAD4x1O 8,32
  786. END4x1_WITHOUT_ADD
  787. LOAD4x1_2O 16, 64
  788. mtctr T8
  789. bl CGEMM_L4x1_K32
  790. b CGEMM_L4x1_SAVE
  791. CMP4x1_32K:
  792. /* Exactly 32 steps: enter the loop body at CGEMM_L4x1_K32 with CTR=1; otherwise fall to the SUB2 chain. */
  793. #if defined(TRMMKERNEL)
  794. cmpwi T6,32
  795. #else
  796. cmpwi K,32
  797. #endif
  798. bne CGEMM_L4x1_SUB2
  799. MY_ALIGN
  800. mtctr T8
  801. addi BO,BO,-64
  802. addi AO,AO,-16
  803. LOAD4x1_2O 16,64
  804. bl CGEMM_L4x1_K32
  805. b CGEMM_L4x1_SAVE
  806. MY_ALIGN
  807. MY_ALIGN
  808. CGEMM_L4x1_SUB2:
  809. /* Remainder chain: tests bits 16,8,4,2,1 of L in turn and runs the matching fixed-size block for each set bit. */
  810. andi. T1,L, 16
  811. ble CGEMM_L4x1_SUB2_8
  812. bl CGEMM_4x1_L16_SUB
  813. MY_ALIGN
  814. CGEMM_L4x1_SUB2_8:
  815. /* bit 8 of L */
  816. andi. T1,L, 8
  817. ble CGEMM_L4x1_SUB2_4
  818. bl CGEMM_4x1_L8_SUB
  819. MY_ALIGN
  820. CGEMM_L4x1_SUB2_4:
  821. /* bit 4 of L: handled inline with 2 kernel invocations */
  822. andi. T1,L, 4
  823. ble CGEMM_L4x1_SUB2_2
  824. LOAD4x1_2
  825. KERNEL4x1_L2 16,64, 0,0
  826. KERNEL4x1_E2 16,64, 1,1
  827. MY_ALIGN
  828. CGEMM_L4x1_SUB2_2:
  829. /* bit 2 of L: handled inline with 1 kernel invocation */
  830. andi. T1,L, 2
  831. ble CGEMM_L4x1_SUB2_1
  832. LOAD4x1_2
  833. KERNEL4x1_E2 16,64, 0,1
  834. MY_ALIGN
  835. CGEMM_L4x1_SUB2_1:
  836. /* bit 1 of L: single KERNEL4x1 */
  837. andi. T1,L, 1
  838. ble CGEMM_L4x1_SAVE
  839. KERNEL4x1
  840. MY_ALIGN
  841. CGEMM_L4x1_SAVE:
  842. /* Store the 4x1 tile; execution falls through to the end-of-panel bookkeeping. */
  843. SAVE4x1
  844. #if defined(TRMMKERNEL)
  845. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4
  846. #endif
  847. CGEMM_L4x1_END:
  848. /* End of one panel: advance B by K*32 and repeat while J > 0, then continue with the CGEMM_L2 section. */
  849. slwi T1, K, 5 /* T1 = K << 5 (K * 32) */
  850. addic. J, J, -1 /* J--, sets CR0 for the bgt below (add/addi in between do not touch CR0) */
  851. add B, B, T1 /* B += K * 32 */
  852. #if defined(TRMMKERNEL) && !defined(LEFT)
  853. addi TEMP_REG, TEMP_REG, 4
  854. #endif
  855. bgt CGEMM_L4_BEGIN
  856. CGEMM_L4_END:
  857. b CGEMM_L2
  858. /* MINI SUBROUTINES */
  859. /* 2x8 MAIN 128x+2 LOOP */
  860. CGEMM_L2x8_LMAIN_SUB:
  861. /* Main 2x8 k-loop, same shape as the 4x8 one. In: T8 = pass count (caller not visible in this part of the file). Macros are defined elsewhere. */
  862. mtctr T8 /* CTR = number of loop passes */
  863. LOAD2x8_2
  864. MY_ALIGN
  865. CGEMM_L2x8_LOOP:
  866. /* One pass = 64 KERNEL2x8_L2 invocations (index 0..63), presumably 2 k-steps each (128 per pass) - confirm against the macro. dcbt lines are prefetch hints. */
  867. dcbt AO, PRE
  868. dcbt BO, PRE
  869. KERNEL2x8_L2 128,32,0,0
  870. CGEMM_L2x8_K128:
  871. /* Secondary entry label that skips the index-0 kernel above; presumably used like CGEMM_L4x8_K128 (callers not visible here). */
  872. KERNEL2x8_L2 128,32,1,0
  873. dcbt AO, T2
  874. KERNEL2x8_L2 128,32,2,0
  875. KERNEL2x8_L2 128,32,3,0
  876. dcbt AO, T3
  877. dcbt BO, T2
  878. KERNEL2x8_L2 128,32,4,0
  879. KERNEL2x8_L2 128,32,5,0
  880. dcbt AO, T4
  881. KERNEL2x8_L2 128,32,6,0
  882. KERNEL2x8_L2 128,32,7,0
  883. dcbt AO, T5
  884. dcbt BO, T3
  885. KERNEL2x8_L2 128,32,8,0
  886. KERNEL2x8_L2 128,32,9,0
  887. KERNEL2x8_L2 128,32,10,0
  888. KERNEL2x8_L2 128,32,11,0
  889. dcbt BO, T4
  890. KERNEL2x8_L2 128,32,12,0
  891. KERNEL2x8_L2 128,32,13,0
  892. KERNEL2x8_L2 128,32,14,0
  893. KERNEL2x8_L2 128,32,15,0
  894. KERNEL2x8_L2 128,32,16,0
  895. KERNEL2x8_L2 128,32,17,0
  896. KERNEL2x8_L2 128,32,18,0
  897. KERNEL2x8_L2 128,32,19,0
  898. KERNEL2x8_L2 128,32,20,0
  899. KERNEL2x8_L2 128,32,21,0
  900. KERNEL2x8_L2 128,32,22,0
  901. KERNEL2x8_L2 128,32,23,0
  902. KERNEL2x8_L2 128,32,24,0
  903. KERNEL2x8_L2 128,32,25,0
  904. KERNEL2x8_L2 128,32,26,0
  905. KERNEL2x8_L2 128,32,27,0
  906. KERNEL2x8_L2 128,32,28,0
  907. KERNEL2x8_L2 128,32,29,0
  908. KERNEL2x8_L2 128,32,30,0
  909. KERNEL2x8_L2 128,32,31,0
  910. KERNEL2x8_L2 128,32,32,0
  911. KERNEL2x8_L2 128,32,33,0
  912. KERNEL2x8_L2 128,32,34,0
  913. KERNEL2x8_L2 128,32,35,0
  914. KERNEL2x8_L2 128,32,36,0
  915. KERNEL2x8_L2 128,32,37,0
  916. KERNEL2x8_L2 128,32,38,0
  917. KERNEL2x8_L2 128,32,39,0
  918. KERNEL2x8_L2 128,32,40,0
  919. KERNEL2x8_L2 128,32,41,0
  920. KERNEL2x8_L2 128,32,42,0
  921. KERNEL2x8_L2 128,32,43,0
  922. KERNEL2x8_L2 128,32,44,0
  923. KERNEL2x8_L2 128,32,45,0
  924. KERNEL2x8_L2 128,32,46,0
  925. KERNEL2x8_L2 128,32,47,0
  926. KERNEL2x8_L2 128,32,48,0
  927. KERNEL2x8_L2 128,32,49,0
  928. KERNEL2x8_L2 128,32,50,0
  929. KERNEL2x8_L2 128,32,51,0
  930. KERNEL2x8_L2 128,32,52,0
  931. KERNEL2x8_L2 128,32,53,0
  932. KERNEL2x8_L2 128,32,54,0
  933. KERNEL2x8_L2 128,32,55,0
  934. KERNEL2x8_L2 128,32,56,0
  935. KERNEL2x8_L2 128,32,57,0
  936. KERNEL2x8_L2 128,32,58,0
  937. KERNEL2x8_L2 128,32,59,0
  938. KERNEL2x8_L2 128,32,60,0
  939. KERNEL2x8_L2 128,32,61,0
  940. KERNEL2x8_L2 128,32,62,0
  941. KERNEL2x8_L2 128,32,63,1
  942. bdnz CGEMM_L2x8_LOOP /* CTR--, run another pass while CTR != 0 */
  943. MY_ALIGN
  944. CGEMM_L2x8_LOOP_END:
  945. /* All passes done. END2x8_2 presumably finishes the two k-steps loaded ahead - confirm against the macro. */
  946. END2x8_2
  947. blr /* return via the link register */
  948. MY_ALIGN
  949. CGEMM_2x8_L64_SUB:
  950. /* Straight-line 2x8 block: 31 KERNEL2x8_L2 + 1 KERNEL2x8_E2 with prefetch hints (presumably 64 k-steps - confirm). Returns with blr. */
  951. LOAD2x8_2
  952. dcbt AO, PRE
  953. dcbt BO, PRE
  954. KERNEL2x8_L2 128,32,0,0
  955. KERNEL2x8_L2 128,32,1,0
  956. dcbt AO, T2
  957. KERNEL2x8_L2 128,32,2,0
  958. KERNEL2x8_L2 128,32,3,0
  959. dcbt AO, T3
  960. dcbt BO, T2
  961. KERNEL2x8_L2 128,32,4,0
  962. KERNEL2x8_L2 128,32,5,0
  963. dcbt AO, T4
  964. KERNEL2x8_L2 128,32,6,0
  965. KERNEL2x8_L2 128,32,7,0
  966. dcbt AO, T5
  967. dcbt BO, T3
  968. KERNEL2x8_L2 128,32,8,0
  969. KERNEL2x8_L2 128,32,9,0
  970. KERNEL2x8_L2 128,32,10,0
  971. KERNEL2x8_L2 128,32,11,0
  972. dcbt BO, T4
  973. KERNEL2x8_L2 128,32,12,0
  974. KERNEL2x8_L2 128,32,13,0
  975. KERNEL2x8_L2 128,32,14,0
  976. KERNEL2x8_L2 128,32,15,0
  977. KERNEL2x8_L2 128,32,16,0
  978. KERNEL2x8_L2 128,32,17,0
  979. KERNEL2x8_L2 128,32,18,0
  980. KERNEL2x8_L2 128,32,19,0
  981. KERNEL2x8_L2 128,32,20,0
  982. KERNEL2x8_L2 128,32,21,0
  983. KERNEL2x8_L2 128,32,22,0
  984. KERNEL2x8_L2 128,32,23,0
  985. KERNEL2x8_L2 128,32,24,0
  986. KERNEL2x8_L2 128,32,25,0
  987. KERNEL2x8_L2 128,32,26,0
  988. KERNEL2x8_L2 128,32,27,0
  989. KERNEL2x8_L2 128,32,28,0
  990. KERNEL2x8_L2 128,32,29,0
  991. KERNEL2x8_L2 128,32,30,0
  992. KERNEL2x8_E2 128,32,31,1
  993. blr /* return via the link register */
  994. MY_ALIGN
  995. CGEMM_2x8_L32_SUB:
  996. /* Straight-line 2x8 block: 15 KERNEL2x8_L2 + 1 KERNEL2x8_E2 with prefetch hints (presumably 32 k-steps - confirm). Returns with blr. */
  997. LOAD2x8_2
  998. dcbt AO, PRE
  999. dcbt BO, PRE
  1000. KERNEL2x8_L2 128,32,0,0
  1001. KERNEL2x8_L2 128,32,1,0
  1002. dcbt AO, T2
  1003. KERNEL2x8_L2 128,32,2,0
  1004. KERNEL2x8_L2 128,32,3,0
  1005. dcbt AO, T3
  1006. dcbt BO, T2
  1007. KERNEL2x8_L2 128,32,4,0
  1008. KERNEL2x8_L2 128,32,5,0
  1009. dcbt AO, T4
  1010. KERNEL2x8_L2 128,32,6,0
  1011. KERNEL2x8_L2 128,32,7,0
  1012. dcbt AO, T5
  1013. dcbt BO, T3
  1014. KERNEL2x8_L2 128,32,8,0
  1015. KERNEL2x8_L2 128,32,9,0
  1016. KERNEL2x8_L2 128,32,10,0
  1017. KERNEL2x8_L2 128,32,11,0
  1018. dcbt BO, T4
  1019. KERNEL2x8_L2 128,32,12,0
  1020. KERNEL2x8_L2 128,32,13,0
  1021. KERNEL2x8_L2 128,32,14,0
  1022. KERNEL2x8_E2 128,32,15,1
  1023. blr /* return via the link register */
  1024. MY_ALIGN
  1025. CGEMM_2x8_L16_SUB:
  1026. /* Straight-line 2x8 block: 7 KERNEL2x8_L2 + 1 KERNEL2x8_E2 with prefetch hints (presumably 16 k-steps - confirm). Returns with blr. */
  1027. LOAD2x8_2
  1028. dcbt AO, PRE
  1029. dcbt BO, PRE
  1030. KERNEL2x8_L2 128,32,0,0
  1031. KERNEL2x8_L2 128,32,1,0
  1032. dcbt AO, T2
  1033. KERNEL2x8_L2 128,32,2,0
  1034. KERNEL2x8_L2 128,32,3,0
  1035. dcbt AO, T3
  1036. dcbt BO, T2
  1037. KERNEL2x8_L2 128,32,4,0
  1038. KERNEL2x8_L2 128,32,5,0
  1039. dcbt AO, T4
  1040. KERNEL2x8_L2 128,32,6,0
  1041. KERNEL2x8_E2 128,32,7,1
  1042. blr /* return via the link register */
  1043. MY_ALIGN
  1044. CGEMM_2x4_LMAIN_SUB:
  1045. /* Main 2x4 k-loop. In: T8 = pass count (caller not visible in this part of the file). LOAD2x4_2/KERNEL2x4_L2/END2x4_2 are macros defined elsewhere. */
  1046. mtctr T8 /* CTR = number of loop passes */
  1047. LOAD2x4_2
  1048. MY_ALIGN
  1049. CGEMM_L2x4_LOOP:
  1050. /* One pass = 16 KERNEL2x4_L2 invocations (index 0..15), presumably 2 k-steps each (32 per pass) - confirm against the macro. */
  1051. KERNEL2x4_L2 64,32,0,0
  1052. CGEMM_L2x4_K32:
  1053. /* Secondary entry label that skips the index-0 kernel above; presumably used like CGEMM_L4x4_K32 (callers not visible here). */
  1054. KERNEL2x4_L2 64,32,1,0
  1055. KERNEL2x4_L2 64,32,2,0
  1056. KERNEL2x4_L2 64,32,3,0
  1057. KERNEL2x4_L2 64,32,4,0
  1058. KERNEL2x4_L2 64,32,5,0
  1059. KERNEL2x4_L2 64,32,6,0
  1060. KERNEL2x4_L2 64,32,7,0
  1061. KERNEL2x4_L2 64,32,8,0
  1062. KERNEL2x4_L2 64,32,9,0
  1063. KERNEL2x4_L2 64,32,10,0
  1064. KERNEL2x4_L2 64,32,11,0
  1065. KERNEL2x4_L2 64,32,12,0
  1066. KERNEL2x4_L2 64,32,13,0
  1067. KERNEL2x4_L2 64,32,14,0
  1068. KERNEL2x4_L2 64,32,15,1
  1069. bdnz CGEMM_L2x4_LOOP /* CTR--, run another pass while CTR != 0 */
  1070. MY_ALIGN
  1071. CGEMM_L2x4_LOOP_END:
  1072. /* All passes done. END2x4_2 presumably finishes the two k-steps loaded ahead - confirm against the macro. */
  1073. END2x4_2
  1074. blr /* return via the link register */
  1075. MY_ALIGN
  1076. CGEMM_2x4_L16_SUB:
  1077. /* Straight-line 2x4 block: 7 KERNEL2x4_L2 + 1 KERNEL2x4_E2 (presumably 16 k-steps - confirm). Returns with blr. */
  1078. LOAD2x4_2
  1079. KERNEL2x4_L2 64,32,0,0
  1080. KERNEL2x4_L2 64,32,1,0
  1081. KERNEL2x4_L2 64,32,2,0
  1082. KERNEL2x4_L2 64,32,3,0
  1083. KERNEL2x4_L2 64,32,4,0
  1084. KERNEL2x4_L2 64,32,5,0
  1085. KERNEL2x4_L2 64,32,6,0
  1086. KERNEL2x4_E2 64,32,7,1
  1087. blr /* return via the link register */
  1088. MY_ALIGN
  1089. CGEMM_2x4_L8_SUB:
  1090. /* Straight-line 2x4 block: 3 KERNEL2x4_L2 + 1 KERNEL2x4_E2 (presumably 8 k-steps - confirm). Returns with blr. */
  1091. LOAD2x4_2
  1092. KERNEL2x4_L2 64,32,0,0
  1093. KERNEL2x4_L2 64,32,1,0
  1094. KERNEL2x4_L2 64,32,2,0
  1095. KERNEL2x4_E2 64,32,3,1
  1096. blr /* return via the link register */
  1097. CGEMM_2x2_LMAIN_SUB:
  1098. /* Main 2x2 k-loop. In: T8 = pass count (caller not visible in this part of the file). LOAD2x2_2/KERNEL2x2_L2/END2x2_2 are macros defined elsewhere. */
  1099. mtctr T8 /* CTR = number of loop passes */
  1100. LOAD2x2_2
  1101. MY_ALIGN
  1102. CGEMM_L2x2_LOOP:
  1103. /* One pass = 16 KERNEL2x2_L2 invocations (index 0..15), presumably 2 k-steps each (32 per pass) - confirm against the macro. */
  1104. KERNEL2x2_L2 32,32,0,0
  1105. CGEMM_L2x2_K32:
  1106. /* Secondary entry label that skips the index-0 kernel above; presumably used like CGEMM_L4x2_K32 (callers not visible here). */
  1107. KERNEL2x2_L2 32,32,1,0
  1108. KERNEL2x2_L2 32,32,2,0
  1109. KERNEL2x2_L2 32,32,3,0
  1110. KERNEL2x2_L2 32,32,4,0
  1111. KERNEL2x2_L2 32,32,5,0
  1112. KERNEL2x2_L2 32,32,6,0
  1113. KERNEL2x2_L2 32,32,7,0
  1114. KERNEL2x2_L2 32,32,8,0
  1115. KERNEL2x2_L2 32,32,9,0
  1116. KERNEL2x2_L2 32,32,10,0
  1117. KERNEL2x2_L2 32,32,11,0
  1118. KERNEL2x2_L2 32,32,12,0
  1119. KERNEL2x2_L2 32,32,13,0
  1120. KERNEL2x2_L2 32,32,14,0
  1121. KERNEL2x2_L2 32,32,15,1
  1122. bdnz CGEMM_L2x2_LOOP /* CTR--, run another pass while CTR != 0 */
  1123. MY_ALIGN
  1124. CGEMM_L2x2_LOOP_END:
  1125. /* All passes done. END2x2_2 presumably finishes the two k-steps loaded ahead - confirm against the macro. */
  1126. END2x2_2
  1127. blr /* return via the link register */
  1128. MY_ALIGN
  1129. CGEMM_2x2_L16_SUB:
  1130. /* 2x2 tail helper, reached with bl when bit 16 of the remaining count L is set. One LOAD2x2_2, then seven KERNEL2x2_L2 and a closing KERNEL2x2_E2 (last argument 1). Returns through LR. */
  1131. LOAD2x2_2
  1132. KERNEL2x2_L2 32,32,0,0
  1133. KERNEL2x2_L2 32,32,1,0
  1134. KERNEL2x2_L2 32,32,2,0
  1135. KERNEL2x2_L2 32,32,3,0
  1136. KERNEL2x2_L2 32,32,4,0
  1137. KERNEL2x2_L2 32,32,5,0
  1138. KERNEL2x2_L2 32,32,6,0
  1139. KERNEL2x2_E2 32,32,7,1
  1140. blr
  1141. MY_ALIGN
  1142. CGEMM_2x2_L8_SUB:
  1143. /* 2x2 tail helper, reached with bl when bit 8 of the remaining count L is set. One LOAD2x2_2, then three KERNEL2x2_L2 and a closing KERNEL2x2_E2 (last argument 1). Returns through LR. */
  1144. LOAD2x2_2
  1145. KERNEL2x2_L2 32,32,0,0
  1146. KERNEL2x2_L2 32,32,1,0
  1147. KERNEL2x2_L2 32,32,2,0
  1148. KERNEL2x2_E2 32,32,3,1
  1149. blr
  1150. CGEMM_2x1_LMAIN_SUB:
  1151. /* 2x1 main-loop subroutine. In: T8 = trip count (the caller computes (K-2) >> 5). CTR is loaded from T8, LOAD2x1_2 primes the pipeline, each trip issues 16 KERNEL2x1_L2 invocations, and END2x1_2 closes after the loop. Returns through LR. */
  1152. mtctr T8
  1153. LOAD2x1_2
  1154. MY_ALIGN
  1155. CGEMM_L2x1_LOOP:
  1156. /*----------------------------------------*/
  1157. KERNEL2x1_L2 16,32,0,0
  1158. CGEMM_L2x1_K32:
  1159. /* Secondary entry: callers that did their own LOAD2x1_2O and set CTR enter here with bl, skipping the index-0 invocation. */
  1160. KERNEL2x1_L2 16,32,1,0
  1161. KERNEL2x1_L2 16,32,2,0
  1162. KERNEL2x1_L2 16,32,3,0
  1163. KERNEL2x1_L2 16,32,4,0
  1164. KERNEL2x1_L2 16,32,5,0
  1165. KERNEL2x1_L2 16,32,6,0
  1166. KERNEL2x1_L2 16,32,7,0
  1167. KERNEL2x1_L2 16,32,8,0
  1168. KERNEL2x1_L2 16,32,9,0
  1169. KERNEL2x1_L2 16,32,10,0
  1170. KERNEL2x1_L2 16,32,11,0
  1171. KERNEL2x1_L2 16,32,12,0
  1172. KERNEL2x1_L2 16,32,13,0
  1173. KERNEL2x1_L2 16,32,14,0
  1174. KERNEL2x1_L2 16,32,15,1
  1175. bdnz CGEMM_L2x1_LOOP /* decrement CTR, loop while it is non-zero */
  1176. MY_ALIGN
  1177. CGEMM_L2x1_LOOP_END:
  1178. /*----------------------------------------*/
  1179. END2x1_2
  1180. blr
  1181. MY_ALIGN
  1182. CGEMM_2x1_L16_SUB:
  1183. /* 2x1 tail helper, reached with bl when bit 16 of the remaining count L is set. One LOAD2x1_2, then seven KERNEL2x1_L2 and a closing KERNEL2x1_E2 (last argument 1). Returns through LR. */
  1184. LOAD2x1_2
  1185. KERNEL2x1_L2 16,32,0,0
  1186. KERNEL2x1_L2 16,32,1,0
  1187. KERNEL2x1_L2 16,32,2,0
  1188. KERNEL2x1_L2 16,32,3,0
  1189. KERNEL2x1_L2 16,32,4,0
  1190. KERNEL2x1_L2 16,32,5,0
  1191. KERNEL2x1_L2 16,32,6,0
  1192. KERNEL2x1_E2 16,32,7,1
  1193. blr
  1194. MY_ALIGN
  1195. CGEMM_2x1_L8_SUB:
  1196. /* 2x1 tail helper, reached with bl when bit 8 of the remaining count L is set. One LOAD2x1_2, then three KERNEL2x1_L2 and a closing KERNEL2x1_E2 (last argument 1). Returns through LR. */
  1197. LOAD2x1_2
  1198. KERNEL2x1_L2 16,32,0,0
  1199. KERNEL2x1_L2 16,32,1,0
  1200. KERNEL2x1_L2 16,32,2,0
  1201. KERNEL2x1_E2 16,32,3,1
  1202. blr
  1203. /* MAIN LOOP BEGINS: two-column pass, executed only when bit 1 of N is set */
  1204. MY_ALIGN
  1205. CGEMM_L2:
  1206. /* andi. can only yield zero or a positive value, so the ble below acts as branch-if-zero. */
  1207. andi. J, N, 2
  1208. ble CGEMM_L2_END
  1209. CGEMM_L2_BEGIN:
  1210. /* Set up pointers for this pass: CO = C, T2 = C + LDC, AO = A, then advance C by 2*LDC for whatever pass follows. */
  1211. mr CO, C
  1212. slwi T1, LDC , 1
  1213. add T2,C,LDC
  1214. mr AO, A
  1215. add C, C, T1
  1216. #if defined(TRMMKERNEL) && defined(LEFT)
  1217. mr TEMP_REG, OFFSET /*off = offset;*/
  1218. #endif
  1219. srawi. I, M, 3 /* I = M >> 3: number of 8-row tiles */
  1220. ble CGEMM_L2x8_END
  1221. dcbt CO,r0 /* prefetch hint only; the index register r0 is presumably zero here - confirm */
  1222. dcbt T2,r0
  1223. CGEMM_L2x8_BEGIN:
  1224. /* 2x8 tile loop. One pass per group of 8 rows; I (M >> 3, set just before this label) counts the remaining passes. */
  1225. #if defined(TRMMKERNEL)
  1226. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
  1227. #else
  1228. mr BO, B
  1229. dcbt B, r0
  1230. #endif
  1231. dcbt AO, r0
  1232. #if defined(TRMMKERNEL)
  1233. REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
  1234. mr T1, T6
  1235. /* TEMPS FOR PREFETCH */
  1236. li T2, 1024
  1237. li T3, 1024+512
  1238. addi T1,T1, -2
  1239. /* TEMPS FOR PREFETCH */
  1240. li T4, 2048
  1241. li T5, 2048+512
  1242. srawi. T8, T1, 7 /* T8 = (T6-2) >> 7 (arithmetic shift, not a modulo): main-loop trip count */
  1243. #else
  1244. mr T1, K
  1245. /* TEMPS FOR PREFETCH */
  1246. li T2, 1024
  1247. li T3, 1024+512
  1248. addi T1,T1, -2
  1249. /* TEMPS FOR PREFETCH */
  1250. li T4, 2048
  1251. li T5, 2048+512
  1252. srawi. T8, T1, 7 /* T8 = (K-2) >> 7 (arithmetic shift, not a modulo): main-loop trip count */
  1253. #endif
  1254. ZERO2x8
  1255. ble CGEMM_L2x8_SUB0 /* tests CR0 from the srawi. above; assumes ZERO2x8 leaves CR0 untouched - confirm */
  1256. bl CGEMM_L2x8_LMAIN_SUB
  1257. andi. L, T1, 127 /* L = leftover count after the main loop */
  1258. ble CGEMM_L2x8_SAVE
  1259. b CGEMM_L2x8_SUB2
  1260. CGEMM_L2x8_SUB0:
  1261. /* Short path, taken when T8 <= 0, i.e. the count is at most 129. Counts of exactly 129 and 128 get dedicated entries into the unrolled body; every other count goes to the bitwise tail with L = count. */
  1262. #if defined(TRMMKERNEL)
  1263. andi. L, T6, 255
  1264. cmpwi T6,129
  1265. #else
  1266. andi. L, K, 255
  1267. cmpwi K,129
  1268. #endif
  1269. li T8,1
  1270. bne CMP2x8_128K
  1271. addi BO,BO,-16
  1272. addi AO,AO,-64
  1273. LOAD2x8O 64,16
  1274. END2x8_WITHOUT_ADD
  1275. LOAD2x8_2O 128, 32
  1276. mtctr T8
  1277. bl CGEMM_L2x8_K128
  1278. b CGEMM_L2x8_SAVE
  1279. CMP2x8_128K:
  1280. /* Count == 128: one trip through the unrolled body, entered at CGEMM_L2x8_K128 with CTR = 1. */
  1281. #if defined(TRMMKERNEL)
  1282. cmpwi T6,128
  1283. #else
  1284. cmpwi K,128
  1285. #endif
  1286. bne CGEMM_L2x8_SUB2
  1287. MY_ALIGN
  1288. mtctr T8
  1289. addi BO,BO,-32
  1290. addi AO,AO,-128
  1291. LOAD2x8_2O 128,32
  1292. bl CGEMM_L2x8_K128
  1293. b CGEMM_L2x8_SAVE
  1294. MY_ALIGN
  1295. CGEMM_L2x8_SUB2:
  1296. /* Tail: consume L one bit at a time (64, 32, 16 through helper calls; 8, 4, 2, 1 inline). T1 is only a scratch destination for andi. here. */
  1297. andi. T1,L, 64
  1298. ble CGEMM_L2x8_SUB2_32
  1299. bl CGEMM_2x8_L64_SUB
  1300. MY_ALIGN
  1301. CGEMM_L2x8_SUB2_32:
  1302. /*----------------------------------------*/
  1303. andi. T1,L, 32
  1304. ble CGEMM_L2x8_SUB2_16
  1305. bl CGEMM_2x8_L32_SUB
  1306. MY_ALIGN
  1307. CGEMM_L2x8_SUB2_16:
  1308. /*----------------------------------------*/
  1309. andi. T1,L, 16
  1310. ble CGEMM_L2x8_SUB2_8
  1311. bl CGEMM_2x8_L16_SUB
  1312. MY_ALIGN
  1313. CGEMM_L2x8_SUB2_8:
  1314. /*----------------------------------------*/
  1315. andi. T1,L, 8
  1316. ble CGEMM_L2x8_SUB2_4
  1317. LOAD2x8_2
  1318. KERNEL2x8_L2 128,32, 0,0
  1319. KERNEL2x8_L2 128,32, 1,0
  1320. KERNEL2x8_L2 128,32, 2,0
  1321. KERNEL2x8_E2 128,32, 3,1
  1322. MY_ALIGN
  1323. CGEMM_L2x8_SUB2_4:
  1324. /*----------------------------------------*/
  1325. andi. T1,L, 4
  1326. ble CGEMM_L2x8_SUB2_2
  1327. LOAD2x8_2
  1328. KERNEL2x8_L2 128,32, 0,0
  1329. KERNEL2x8_E2 128,32, 1,1
  1330. MY_ALIGN
  1331. CGEMM_L2x8_SUB2_2:
  1332. /*----------------------------------------*/
  1333. andi. T1,L, 2
  1334. ble CGEMM_L2x8_SUB2_1
  1335. LOAD2x8_2
  1336. KERNEL2x8_E2 128,32, 0,1
  1337. MY_ALIGN
  1338. CGEMM_L2x8_SUB2_1:
  1339. /*----------------------------------------*/
  1340. andi. T1,L, 1
  1341. ble CGEMM_L2x8_SAVE
  1342. KERNEL2x8
  1343. MY_ALIGN
  1344. CGEMM_L2x8_SAVE:
  1345. /* Decrement the tile counter first; the bgt below tests that CR0 result, so SAVE2x8 and REFRESH_AFTER_SAVE are assumed not to modify CR0 - confirm. */
  1346. addic. I, I, -1
  1347. MY_ALIGN
  1348. SAVE2x8
  1349. #if defined(TRMMKERNEL)
  1350. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
  1351. #endif
  1352. bgt CGEMM_L2x8_BEGIN
  1353. andi. T2, M, 7 /* any rows left over after the 8-row tiles? */
  1354. ble CGEMM_L2x1_END
  1355. andi. T1, M, 4
  1356. ble CGEMM_L2x4_END
  1357. b CGEMM_L2x4_BEGIN /* NOTE(review): CGEMM_L2x4_BEGIN repeats the same two tests */
  1358. MY_ALIGN
  1359. CGEMM_L2x8_END:
  1360. /*----------------------------------------*/
  1361. CGEMM_L2x4_BEGIN:
  1362. /* 2x4 tile, executed at most once: skipped entirely when M & 7 == 0, and skipped to the 2x2 stage when bit 2 of M is clear. */
  1363. andi. T2, M, 7
  1364. ble CGEMM_L2x1_END
  1365. andi. T1, M, 4
  1366. ble CGEMM_L2x4_END
  1367. #if defined(TRMMKERNEL)
  1368. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
  1369. #else
  1370. mr BO, B
  1371. #endif
  1372. #if defined(TRMMKERNEL)
  1373. REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
  1374. mr T1, T6
  1375. addi T1,T1, -2
  1376. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (arithmetic shift, not a modulo): main-loop trip count */
  1377. #else
  1378. mr T1, K
  1379. addi T1,T1, -2
  1380. srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (arithmetic shift, not a modulo): main-loop trip count */
  1381. #endif
  1382. ZERO2x4
  1383. ble CGEMM_L2x4_SUB0 /* tests CR0 from the srawi. above; assumes ZERO2x4 leaves CR0 untouched - confirm */
  1384. bl CGEMM_2x4_LMAIN_SUB
  1385. andi. L, T1, 31 /* L = leftover count after the main loop */
  1386. ble CGEMM_L2x4_SAVE
  1387. b CGEMM_L2x4_SUB2
  1388. CGEMM_L2x4_SUB0:
  1389. /* Short path, taken when T8 <= 0, i.e. the count is at most 33. Counts of exactly 33 and 32 enter the unrolled body at CGEMM_L2x4_K32 with CTR = 1; every other count goes to the bitwise tail with L = count. */
  1390. #if defined(TRMMKERNEL)
  1391. andi. L, T6, 63
  1392. cmpwi T6,33
  1393. #else
  1394. andi. L, K, 63
  1395. cmpwi K,33
  1396. #endif
  1397. li T8,1
  1398. bne CMP2x4_32K
  1399. addi BO,BO,-16
  1400. addi AO,AO,-32
  1401. LOAD2x4O 32,16
  1402. END2x4_WITHOUT_ADD
  1403. LOAD2x4_2O 64, 32
  1404. mtctr T8
  1405. bl CGEMM_L2x4_K32
  1406. b CGEMM_L2x4_SAVE
  1407. CMP2x4_32K:
  1408. /*----------------------------------------*/
  1409. #if defined(TRMMKERNEL)
  1410. cmpwi T6,32
  1411. #else
  1412. cmpwi K,32
  1413. #endif
  1414. bne CGEMM_L2x4_SUB2
  1415. MY_ALIGN
  1416. mtctr T8
  1417. addi BO,BO,-32
  1418. addi AO,AO,-64
  1419. LOAD2x4_2O 64,32
  1420. bl CGEMM_L2x4_K32
  1421. b CGEMM_L2x4_SAVE
  1422. MY_ALIGN
  1423. MY_ALIGN
  1424. CGEMM_L2x4_SUB2:
  1425. /* Tail: consume L one bit at a time (16 and 8 through helper calls; 4, 2, 1 inline). */
  1426. andi. T1,L, 16
  1427. ble CGEMM_L2x4_SUB2_8
  1428. bl CGEMM_2x4_L16_SUB
  1429. MY_ALIGN
  1430. CGEMM_L2x4_SUB2_8:
  1431. /*----------------------------------------*/
  1432. andi. T1,L, 8
  1433. ble CGEMM_L2x4_SUB2_4
  1434. bl CGEMM_2x4_L8_SUB
  1435. MY_ALIGN
  1436. CGEMM_L2x4_SUB2_4:
  1437. /*----------------------------------------*/
  1438. andi. T1,L, 4
  1439. ble CGEMM_L2x4_SUB2_2
  1440. LOAD2x4_2
  1441. KERNEL2x4_L2 64,32, 0,0
  1442. KERNEL2x4_E2 64,32, 1,1
  1443. MY_ALIGN
  1444. CGEMM_L2x4_SUB2_2:
  1445. /*----------------------------------------*/
  1446. andi. T1,L, 2
  1447. ble CGEMM_L2x4_SUB2_1
  1448. LOAD2x4_2
  1449. KERNEL2x4_E2 64,32, 0,1
  1450. MY_ALIGN
  1451. CGEMM_L2x4_SUB2_1:
  1452. /*----------------------------------------*/
  1453. andi. T1,L, 1
  1454. ble CGEMM_L2x4_SAVE
  1455. KERNEL2x4
  1456. CGEMM_L2x4_SAVE:
  1457. /* Write the tile back through SAVE2x4, then fall through to the 2x2 stage. */
  1458. SAVE2x4
  1459. #if defined(TRMMKERNEL)
  1460. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
  1461. #endif
  1462. CGEMM_L2x4_END:
  1463. /*----------------------------------------*/
  1464. CGEMM_L2x2_BEGIN:
  1465. /* 2x2 tile, executed at most once: only when bit 1 of M is set. */
  1466. andi. T1, M, 2
  1467. ble CGEMM_L2x2_END
  1468. #if defined(TRMMKERNEL)
  1469. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
  1470. #else
  1471. mr BO, B
  1472. #endif
  1473. #if defined(TRMMKERNEL)
  1474. REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
  1475. mr T1, T6
  1476. addi T1,T1, -2
  1477. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (arithmetic shift, not a modulo): main-loop trip count */
  1478. #else
  1479. mr T1, K
  1480. addi T1,T1, -2
  1481. srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (arithmetic shift, not a modulo): main-loop trip count */
  1482. #endif
  1483. ZERO2x2
  1484. ble CGEMM_L2x2_SUB0 /* tests CR0 from the srawi. above; assumes ZERO2x2 leaves CR0 untouched - confirm */
  1485. bl CGEMM_2x2_LMAIN_SUB
  1486. andi. L, T1, 31 /* L = leftover count after the main loop */
  1487. ble CGEMM_L2x2_SAVE
  1488. b CGEMM_L2x2_SUB2
  1489. CGEMM_L2x2_SUB0:
  1490. /* Short path, taken when T8 <= 0, i.e. the count is at most 33. Counts of exactly 33 and 32 enter the unrolled body at CGEMM_L2x2_K32 with CTR = 1; every other count goes to the bitwise tail with L = count. */
  1491. #if defined(TRMMKERNEL)
  1492. andi. L, T6, 63
  1493. cmpwi T6,33
  1494. #else
  1495. andi. L, K, 63
  1496. cmpwi K,33
  1497. #endif
  1498. li T8,1
  1499. bne CMP2x2_32K
  1500. addi BO,BO,-16
  1501. addi AO,AO,-16
  1502. LOAD2x2O 16,16
  1503. END2x2_WITHOUT_ADD
  1504. LOAD2x2_2O 32, 32
  1505. mtctr T8
  1506. bl CGEMM_L2x2_K32
  1507. b CGEMM_L2x2_SAVE
  1508. CMP2x2_32K:
  1509. /*----------------------------------------*/
  1510. #if defined(TRMMKERNEL)
  1511. cmpwi T6,32
  1512. #else
  1513. cmpwi K,32
  1514. #endif
  1515. bne CGEMM_L2x2_SUB2
  1516. MY_ALIGN
  1517. mtctr T8
  1518. addi BO,BO,-32
  1519. addi AO,AO,-32
  1520. LOAD2x2_2O 32,32
  1521. bl CGEMM_L2x2_K32
  1522. b CGEMM_L2x2_SAVE
  1523. MY_ALIGN
  1524. MY_ALIGN
  1525. CGEMM_L2x2_SUB2:
  1526. /* Tail: consume L one bit at a time (16 and 8 through helper calls; 4, 2, 1 inline). */
  1527. andi. T1,L, 16
  1528. ble CGEMM_L2x2_SUB2_8
  1529. bl CGEMM_2x2_L16_SUB
  1530. MY_ALIGN
  1531. CGEMM_L2x2_SUB2_8:
  1532. /*----------------------------------------*/
  1533. andi. T1,L, 8
  1534. ble CGEMM_L2x2_SUB2_4
  1535. bl CGEMM_2x2_L8_SUB
  1536. MY_ALIGN
  1537. CGEMM_L2x2_SUB2_4:
  1538. /*----------------------------------------*/
  1539. andi. T1,L, 4
  1540. ble CGEMM_L2x2_SUB2_2
  1541. LOAD2x2_2
  1542. KERNEL2x2_L2 32,32, 0,0
  1543. KERNEL2x2_E2 32,32, 1,1
  1544. MY_ALIGN
  1545. CGEMM_L2x2_SUB2_2:
  1546. /*----------------------------------------*/
  1547. andi. T1,L, 2
  1548. ble CGEMM_L2x2_SUB2_1
  1549. LOAD2x2_2
  1550. KERNEL2x2_E2 32,32, 0,1
  1551. MY_ALIGN
  1552. CGEMM_L2x2_SUB2_1:
  1553. /*----------------------------------------*/
  1554. andi. T1,L, 1
  1555. ble CGEMM_L2x2_SAVE
  1556. KERNEL2x2
  1557. MY_ALIGN
  1558. CGEMM_L2x2_SAVE:
  1559. /* Write the tile back through SAVE2x2, then fall through to the 2x1 stage. */
  1560. SAVE2x2
  1561. #if defined(TRMMKERNEL)
  1562. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
  1563. #endif
  1564. CGEMM_L2x2_END:
  1565. /*----------------------------------------*/
  1566. CGEMM_L2x1_BEGIN:
  1567. /* 2x1 tile, executed at most once: only when bit 0 of M is set. */
  1568. andi. T1, M, 1
  1569. ble CGEMM_L2x1_END
  1570. #if defined(TRMMKERNEL)
  1571. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
  1572. #else
  1573. mr BO, B
  1574. #endif
  1575. #if defined(TRMMKERNEL)
  1576. REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
  1577. mr T1, T6
  1578. addi T1,T1, -2
  1579. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (arithmetic shift, not a modulo): main-loop trip count */
  1580. #else
  1581. mr T1, K
  1582. addi T1,T1, -2
  1583. srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (arithmetic shift, not a modulo): main-loop trip count */
  1584. #endif
  1585. ZERO2x1
  1586. ble CGEMM_L2x1_SUB0 /* tests CR0 from the srawi. above; assumes ZERO2x1 leaves CR0 untouched - confirm */
  1587. bl CGEMM_2x1_LMAIN_SUB
  1588. andi. L, T1, 31 /* L = leftover count after the main loop */
  1589. ble CGEMM_L2x1_SAVE
  1590. b CGEMM_L2x1_SUB2
  1591. CGEMM_L2x1_SUB0:
  1592. /* Short path, taken when T8 <= 0, i.e. the count is at most 33. Counts of exactly 33 and 32 enter the unrolled body at CGEMM_L2x1_K32 with CTR = 1; every other count goes to the bitwise tail with L = count. */
  1593. #if defined(TRMMKERNEL)
  1594. andi. L, T6, 63
  1595. cmpwi T6,33
  1596. #else
  1597. andi. L, K, 63
  1598. cmpwi K,33
  1599. #endif
  1600. li T8,1
  1601. bne CMP2x1_32K
  1602. addi BO,BO,-16
  1603. addi AO,AO,-8
  1604. LOAD2x1O 8,16
  1605. END2x1_WITHOUT_ADD
  1606. LOAD2x1_2O 16, 32
  1607. mtctr T8
  1608. bl CGEMM_L2x1_K32
  1609. b CGEMM_L2x1_SAVE
  1610. CMP2x1_32K:
  1611. /*----------------------------------------*/
  1612. #if defined(TRMMKERNEL)
  1613. cmpwi T6,32
  1614. #else
  1615. cmpwi K,32
  1616. #endif
  1617. bne CGEMM_L2x1_SUB2
  1618. MY_ALIGN
  1619. mtctr T8
  1620. addi BO,BO,-32
  1621. addi AO,AO,-16
  1622. LOAD2x1_2O 16,32
  1623. bl CGEMM_L2x1_K32
  1624. b CGEMM_L2x1_SAVE
  1625. MY_ALIGN
  1626. MY_ALIGN
  1627. CGEMM_L2x1_SUB2:
  1628. /* Tail: consume L one bit at a time (16 and 8 through helper calls; 4, 2, 1 inline). */
  1629. andi. T1,L, 16
  1630. ble CGEMM_L2x1_SUB2_8
  1631. bl CGEMM_2x1_L16_SUB
  1632. MY_ALIGN
  1633. CGEMM_L2x1_SUB2_8:
  1634. /*----------------------------------------*/
  1635. andi. T1,L, 8
  1636. ble CGEMM_L2x1_SUB2_4
  1637. bl CGEMM_2x1_L8_SUB
  1638. MY_ALIGN
  1639. CGEMM_L2x1_SUB2_4:
  1640. /*----------------------------------------*/
  1641. andi. T1,L, 4
  1642. ble CGEMM_L2x1_SUB2_2
  1643. LOAD2x1_2
  1644. KERNEL2x1_L2 16,32, 0,0
  1645. KERNEL2x1_E2 16,32, 1,1
  1646. MY_ALIGN
  1647. CGEMM_L2x1_SUB2_2:
  1648. /*----------------------------------------*/
  1649. andi. T1,L, 2
  1650. ble CGEMM_L2x1_SUB2_1
  1651. LOAD2x1_2
  1652. KERNEL2x1_E2 16,32, 0,1
  1653. MY_ALIGN
  1654. CGEMM_L2x1_SUB2_1:
  1655. /*----------------------------------------*/
  1656. andi. T1,L, 1
  1657. ble CGEMM_L2x1_SAVE
  1658. KERNEL2x1
  1659. MY_ALIGN
  1660. CGEMM_L2x1_SAVE:
  1661. /* Write the tile back through SAVE2x1; the two-column pass epilogue follows. */
  1662. SAVE2x1
  1663. #if defined(TRMMKERNEL)
  1664. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
  1665. #endif
  1666. CGEMM_L2x1_END:
  1667. /* Epilogue of the two-column pass: advance B by K << 4 bytes (presumably 2 columns of K single-precision complex values - confirm), and for right-side TRMM bump the offset register by 2. */
  1668. slwi T1, K, 4
  1669. add B, B, T1
  1670. #if defined(TRMMKERNEL) && !defined(LEFT)
  1671. addi TEMP_REG, TEMP_REG, 2
  1672. #endif
  1673. CGEMM_L2_END:
  1674. b CGEMM_L1 /* jump over the one-column helper routines placed between here and CGEMM_L1 */
  1675. /* MINI SUBROUTINES for the one-column pass; all are entered with bl and return with blr */
  1676. /* 1x8 MAIN 128x+2 LOOP */
  1677. CGEMM_L1x8_LMAIN_SUB:
  1678. /* In: T8 = trip count (the caller computes (K-2) >> 7); T2..T5 = prefetch byte offsets set by the caller. CTR is loaded from T8, LOAD1x8_2 primes the pipeline, each trip issues 64 KERNEL1x8_L2 invocations, and END1x8_2 closes after the loop. */
  1679. mtctr T8
  1680. LOAD1x8_2
  1681. MY_ALIGN
  1682. CGEMM_L1x8_LOOP:
  1683. /* dcbt lines are cache-touch hints at AO/BO plus an offset register (PRE is defined outside this section). */
  1684. dcbt AO, PRE
  1685. dcbt BO, PRE
  1686. KERNEL1x8_L2 128,16,0,0
  1687. CGEMM_L1x8_K128:
  1688. /* Secondary entry: callers that did their own LOAD1x8_2O and set CTR enter here with bl, skipping the index-0 invocation. */
  1689. KERNEL1x8_L2 128,16,1,0
  1690. dcbt AO, T2
  1691. KERNEL1x8_L2 128,16,2,0
  1692. KERNEL1x8_L2 128,16,3,0
  1693. dcbt AO, T3
  1694. dcbt BO, T2
  1695. KERNEL1x8_L2 128,16,4,0
  1696. KERNEL1x8_L2 128,16,5,0
  1697. dcbt AO, T4
  1698. KERNEL1x8_L2 128,16,6,0
  1699. KERNEL1x8_L2 128,16,7,0
  1700. dcbt AO, T5
  1701. dcbt BO, T3
  1702. KERNEL1x8_L2 128,16,8,0
  1703. KERNEL1x8_L2 128,16,9,0
  1704. KERNEL1x8_L2 128,16,10,0
  1705. KERNEL1x8_L2 128,16,11,0
  1706. dcbt BO, T4
  1707. KERNEL1x8_L2 128,16,12,0
  1708. KERNEL1x8_L2 128,16,13,0
  1709. KERNEL1x8_L2 128,16,14,0
  1710. KERNEL1x8_L2 128,16,15,0
  1711. KERNEL1x8_L2 128,16,16,0
  1712. KERNEL1x8_L2 128,16,17,0
  1713. KERNEL1x8_L2 128,16,18,0
  1714. KERNEL1x8_L2 128,16,19,0
  1715. KERNEL1x8_L2 128,16,20,0
  1716. KERNEL1x8_L2 128,16,21,0
  1717. KERNEL1x8_L2 128,16,22,0
  1718. KERNEL1x8_L2 128,16,23,0
  1719. KERNEL1x8_L2 128,16,24,0
  1720. KERNEL1x8_L2 128,16,25,0
  1721. KERNEL1x8_L2 128,16,26,0
  1722. KERNEL1x8_L2 128,16,27,0
  1723. KERNEL1x8_L2 128,16,28,0
  1724. KERNEL1x8_L2 128,16,29,0
  1725. KERNEL1x8_L2 128,16,30,0
  1726. KERNEL1x8_L2 128,16,31,0
  1727. KERNEL1x8_L2 128,16,32,0
  1728. KERNEL1x8_L2 128,16,33,0
  1729. KERNEL1x8_L2 128,16,34,0
  1730. KERNEL1x8_L2 128,16,35,0
  1731. KERNEL1x8_L2 128,16,36,0
  1732. KERNEL1x8_L2 128,16,37,0
  1733. KERNEL1x8_L2 128,16,38,0
  1734. KERNEL1x8_L2 128,16,39,0
  1735. KERNEL1x8_L2 128,16,40,0
  1736. KERNEL1x8_L2 128,16,41,0
  1737. KERNEL1x8_L2 128,16,42,0
  1738. KERNEL1x8_L2 128,16,43,0
  1739. KERNEL1x8_L2 128,16,44,0
  1740. KERNEL1x8_L2 128,16,45,0
  1741. KERNEL1x8_L2 128,16,46,0
  1742. KERNEL1x8_L2 128,16,47,0
  1743. KERNEL1x8_L2 128,16,48,0
  1744. KERNEL1x8_L2 128,16,49,0
  1745. KERNEL1x8_L2 128,16,50,0
  1746. KERNEL1x8_L2 128,16,51,0
  1747. KERNEL1x8_L2 128,16,52,0
  1748. KERNEL1x8_L2 128,16,53,0
  1749. KERNEL1x8_L2 128,16,54,0
  1750. KERNEL1x8_L2 128,16,55,0
  1751. KERNEL1x8_L2 128,16,56,0
  1752. KERNEL1x8_L2 128,16,57,0
  1753. KERNEL1x8_L2 128,16,58,0
  1754. KERNEL1x8_L2 128,16,59,0
  1755. KERNEL1x8_L2 128,16,60,0
  1756. KERNEL1x8_L2 128,16,61,0
  1757. KERNEL1x8_L2 128,16,62,0
  1758. KERNEL1x8_L2 128,16,63,1
  1759. bdnz CGEMM_L1x8_LOOP /* decrement CTR, loop while it is non-zero */
  1760. MY_ALIGN
  1761. CGEMM_L1x8_LOOP_END:
  1762. /*----------------------------------------*/
  1763. END1x8_2
  1764. blr
  1765. MY_ALIGN
  1766. CGEMM_1x8_L64_SUB:
  1767. /* 1x8 tail helper, reached with bl when bit 64 of the remaining count L is set. One LOAD1x8_2, then 31 KERNEL1x8_L2 and a closing KERNEL1x8_E2 (last argument 1), with the same dcbt hints as the main loop. Returns through LR. */
  1768. LOAD1x8_2
  1769. dcbt AO, PRE
  1770. dcbt BO, PRE
  1771. KERNEL1x8_L2 128,16,0,0
  1772. KERNEL1x8_L2 128,16,1,0
  1773. dcbt AO, T2
  1774. KERNEL1x8_L2 128,16,2,0
  1775. KERNEL1x8_L2 128,16,3,0
  1776. dcbt AO, T3
  1777. dcbt BO, T2
  1778. KERNEL1x8_L2 128,16,4,0
  1779. KERNEL1x8_L2 128,16,5,0
  1780. dcbt AO, T4
  1781. KERNEL1x8_L2 128,16,6,0
  1782. KERNEL1x8_L2 128,16,7,0
  1783. dcbt AO, T5
  1784. dcbt BO, T3
  1785. KERNEL1x8_L2 128,16,8,0
  1786. KERNEL1x8_L2 128,16,9,0
  1787. KERNEL1x8_L2 128,16,10,0
  1788. KERNEL1x8_L2 128,16,11,0
  1789. dcbt BO, T4
  1790. KERNEL1x8_L2 128,16,12,0
  1791. KERNEL1x8_L2 128,16,13,0
  1792. KERNEL1x8_L2 128,16,14,0
  1793. KERNEL1x8_L2 128,16,15,0
  1794. KERNEL1x8_L2 128,16,16,0
  1795. KERNEL1x8_L2 128,16,17,0
  1796. KERNEL1x8_L2 128,16,18,0
  1797. KERNEL1x8_L2 128,16,19,0
  1798. KERNEL1x8_L2 128,16,20,0
  1799. KERNEL1x8_L2 128,16,21,0
  1800. KERNEL1x8_L2 128,16,22,0
  1801. KERNEL1x8_L2 128,16,23,0
  1802. KERNEL1x8_L2 128,16,24,0
  1803. KERNEL1x8_L2 128,16,25,0
  1804. KERNEL1x8_L2 128,16,26,0
  1805. KERNEL1x8_L2 128,16,27,0
  1806. KERNEL1x8_L2 128,16,28,0
  1807. KERNEL1x8_L2 128,16,29,0
  1808. KERNEL1x8_L2 128,16,30,0
  1809. KERNEL1x8_E2 128,16,31,1
  1810. blr
  1811. MY_ALIGN
  1812. CGEMM_1x8_L32_SUB:
  1813. /* 1x8 tail helper, reached with bl when bit 32 of the remaining count L is set. One LOAD1x8_2, then 15 KERNEL1x8_L2 and a closing KERNEL1x8_E2 (last argument 1), interleaved with dcbt hints. Returns through LR. */
  1814. LOAD1x8_2
  1815. dcbt AO, PRE
  1816. dcbt BO, PRE
  1817. KERNEL1x8_L2 128,16,0,0
  1818. KERNEL1x8_L2 128,16,1,0
  1819. dcbt AO, T2
  1820. KERNEL1x8_L2 128,16,2,0
  1821. KERNEL1x8_L2 128,16,3,0
  1822. dcbt AO, T3
  1823. dcbt BO, T2
  1824. KERNEL1x8_L2 128,16,4,0
  1825. KERNEL1x8_L2 128,16,5,0
  1826. dcbt AO, T4
  1827. KERNEL1x8_L2 128,16,6,0
  1828. KERNEL1x8_L2 128,16,7,0
  1829. dcbt AO, T5
  1830. dcbt BO, T3
  1831. KERNEL1x8_L2 128,16,8,0
  1832. KERNEL1x8_L2 128,16,9,0
  1833. KERNEL1x8_L2 128,16,10,0
  1834. KERNEL1x8_L2 128,16,11,0
  1835. dcbt BO, T4
  1836. KERNEL1x8_L2 128,16,12,0
  1837. KERNEL1x8_L2 128,16,13,0
  1838. KERNEL1x8_L2 128,16,14,0
  1839. KERNEL1x8_E2 128,16,15,1
  1840. blr
  1841. MY_ALIGN
  1842. CGEMM_1x8_L16_SUB:
  1843. /* 1x8 tail helper, reached with bl when bit 16 of the remaining count L is set. One LOAD1x8_2, then seven KERNEL1x8_L2 and a closing KERNEL1x8_E2 (last argument 1), interleaved with dcbt hints. Returns through LR. */
  1844. LOAD1x8_2
  1845. dcbt AO, PRE
  1846. dcbt BO, PRE
  1847. KERNEL1x8_L2 128,16,0,0
  1848. KERNEL1x8_L2 128,16,1,0
  1849. dcbt AO, T2
  1850. KERNEL1x8_L2 128,16,2,0
  1851. KERNEL1x8_L2 128,16,3,0
  1852. dcbt AO, T3
  1853. dcbt BO, T2
  1854. KERNEL1x8_L2 128,16,4,0
  1855. KERNEL1x8_L2 128,16,5,0
  1856. dcbt AO, T4
  1857. KERNEL1x8_L2 128,16,6,0
  1858. KERNEL1x8_E2 128,16,7,1
  1859. blr
  1860. MY_ALIGN
  1861. CGEMM_1x4_LMAIN_SUB:
  1862. /* 1x4 main-loop subroutine. In: T8 = trip count (the caller computes (K-2) >> 5). CTR is loaded from T8, LOAD1x4_2 primes the pipeline, each trip issues 16 KERNEL1x4_L2 invocations, and END1x4_2 closes after the loop. Returns through LR. */
  1863. mtctr T8
  1864. LOAD1x4_2
  1865. MY_ALIGN
  1866. CGEMM_L1x4_LOOP:
  1867. /*----------------------------------------*/
  1868. KERNEL1x4_L2 64,16,0,0
  1869. CGEMM_L1x4_K32:
  1870. /* Secondary entry: callers that did their own LOAD1x4_2O and set CTR enter here with bl, skipping the index-0 invocation. */
  1871. KERNEL1x4_L2 64,16,1,0
  1872. KERNEL1x4_L2 64,16,2,0
  1873. KERNEL1x4_L2 64,16,3,0
  1874. KERNEL1x4_L2 64,16,4,0
  1875. KERNEL1x4_L2 64,16,5,0
  1876. KERNEL1x4_L2 64,16,6,0
  1877. KERNEL1x4_L2 64,16,7,0
  1878. KERNEL1x4_L2 64,16,8,0
  1879. KERNEL1x4_L2 64,16,9,0
  1880. KERNEL1x4_L2 64,16,10,0
  1881. KERNEL1x4_L2 64,16,11,0
  1882. KERNEL1x4_L2 64,16,12,0
  1883. KERNEL1x4_L2 64,16,13,0
  1884. KERNEL1x4_L2 64,16,14,0
  1885. KERNEL1x4_L2 64,16,15,1
  1886. bdnz CGEMM_L1x4_LOOP /* decrement CTR, loop while it is non-zero */
  1887. MY_ALIGN
  1888. CGEMM_L1x4_LOOP_END:
  1889. /*----------------------------------------*/
  1890. END1x4_2
  1891. blr
  1892. MY_ALIGN
  1893. CGEMM_1x4_L16_SUB:
  1894. /* 1x4 tail helper, reached with bl when bit 16 of the remaining count L is set. One LOAD1x4_2, then seven KERNEL1x4_L2 and a closing KERNEL1x4_E2 (last argument 1). Returns through LR. */
  1895. LOAD1x4_2
  1896. KERNEL1x4_L2 64,16,0,0
  1897. KERNEL1x4_L2 64,16,1,0
  1898. KERNEL1x4_L2 64,16,2,0
  1899. KERNEL1x4_L2 64,16,3,0
  1900. KERNEL1x4_L2 64,16,4,0
  1901. KERNEL1x4_L2 64,16,5,0
  1902. KERNEL1x4_L2 64,16,6,0
  1903. KERNEL1x4_E2 64,16,7,1
  1904. blr
  1905. MY_ALIGN
  1906. CGEMM_1x4_L8_SUB:
  1907. /* 1x4 tail helper, reached with bl when bit 8 of the remaining count L is set. One LOAD1x4_2, then three KERNEL1x4_L2 and a closing KERNEL1x4_E2 (last argument 1). Returns through LR. */
  1908. LOAD1x4_2
  1909. KERNEL1x4_L2 64,16,0,0
  1910. KERNEL1x4_L2 64,16,1,0
  1911. KERNEL1x4_L2 64,16,2,0
  1912. KERNEL1x4_E2 64,16,3,1
  1913. blr
  1914. CGEMM_1x2_LMAIN_SUB:
  1915. /* 1x2 main-loop subroutine. In: T8 = trip count (the caller computes (K-2) >> 5). CTR is loaded from T8, LOAD1x2_2 primes the pipeline, each trip issues 16 KERNEL1x2_L2 invocations, and END1x2_2 closes after the loop. Returns through LR. */
  1916. mtctr T8
  1917. LOAD1x2_2
  1918. MY_ALIGN
  1919. CGEMM_L1x2_LOOP:
  1920. /*----------------------------------------*/
  1921. KERNEL1x2_L2 32,16,0,0
  1922. CGEMM_L1x2_K32:
  1923. /* Secondary entry: callers that did their own LOAD1x2_2O and set CTR enter here with bl, skipping the index-0 invocation. */
  1924. KERNEL1x2_L2 32,16,1,0
  1925. KERNEL1x2_L2 32,16,2,0
  1926. KERNEL1x2_L2 32,16,3,0
  1927. KERNEL1x2_L2 32,16,4,0
  1928. KERNEL1x2_L2 32,16,5,0
  1929. KERNEL1x2_L2 32,16,6,0
  1930. KERNEL1x2_L2 32,16,7,0
  1931. KERNEL1x2_L2 32,16,8,0
  1932. KERNEL1x2_L2 32,16,9,0
  1933. KERNEL1x2_L2 32,16,10,0
  1934. KERNEL1x2_L2 32,16,11,0
  1935. KERNEL1x2_L2 32,16,12,0
  1936. KERNEL1x2_L2 32,16,13,0
  1937. KERNEL1x2_L2 32,16,14,0
  1938. KERNEL1x2_L2 32,16,15,1
  1939. bdnz CGEMM_L1x2_LOOP /* decrement CTR, loop while it is non-zero */
  1940. MY_ALIGN
  1941. CGEMM_L1x2_LOOP_END:
  1942. /*----------------------------------------*/
  1943. END1x2_2
  1944. blr
  1945. MY_ALIGN
  1946. CGEMM_1x2_L16_SUB:
  1947. /* 1x2 straight-line helper: one LOAD1x2_2, then seven KERNEL1x2_L2 and a closing KERNEL1x2_E2 (last argument 1); returns through LR. Its call site is outside this section; presumably the bit-16 step of the 1x2 tail - confirm. */
  1948. LOAD1x2_2
  1949. KERNEL1x2_L2 32,16,0,0
  1950. KERNEL1x2_L2 32,16,1,0
  1951. KERNEL1x2_L2 32,16,2,0
  1952. KERNEL1x2_L2 32,16,3,0
  1953. KERNEL1x2_L2 32,16,4,0
  1954. KERNEL1x2_L2 32,16,5,0
  1955. KERNEL1x2_L2 32,16,6,0
  1956. KERNEL1x2_E2 32,16,7,1
  1957. blr
  1958. MY_ALIGN
  1959. CGEMM_1x2_L8_SUB:
  1960. /* 1x2 straight-line helper: one LOAD1x2_2, then three KERNEL1x2_L2 and a closing KERNEL1x2_E2 (last argument 1); returns through LR. Its call site is outside this section; presumably the bit-8 step of the 1x2 tail - confirm. */
  1961. LOAD1x2_2
  1962. KERNEL1x2_L2 32,16,0,0
  1963. KERNEL1x2_L2 32,16,1,0
  1964. KERNEL1x2_L2 32,16,2,0
  1965. KERNEL1x2_E2 32,16,3,1
  1966. blr
  1967. CGEMM_1x1_LMAIN_SUB:
  1968. /* 1x1 main-loop subroutine. In: T8 = trip count (set by a caller outside this section). CTR is loaded from T8, LOAD1x1_2 primes the pipeline, each trip issues 16 KERNEL1x1_L2 invocations, and END1x1_2 closes after the loop. Returns through LR. */
  1969. mtctr T8
  1970. LOAD1x1_2
  1971. MY_ALIGN
  1972. CGEMM_L1x1_LOOP:
  1973. /*----------------------------------------*/
  1974. KERNEL1x1_L2 16,16,0,0
  1975. CGEMM_L1x1_K32:
  1976. /* Secondary entry point placed after the index-0 invocation; presumably used like the other _K32 entries (caller preloads and sets CTR) - confirm at the call site. */
  1977. KERNEL1x1_L2 16,16,1,0
  1978. KERNEL1x1_L2 16,16,2,0
  1979. KERNEL1x1_L2 16,16,3,0
  1980. KERNEL1x1_L2 16,16,4,0
  1981. KERNEL1x1_L2 16,16,5,0
  1982. KERNEL1x1_L2 16,16,6,0
  1983. KERNEL1x1_L2 16,16,7,0
  1984. KERNEL1x1_L2 16,16,8,0
  1985. KERNEL1x1_L2 16,16,9,0
  1986. KERNEL1x1_L2 16,16,10,0
  1987. KERNEL1x1_L2 16,16,11,0
  1988. KERNEL1x1_L2 16,16,12,0
  1989. KERNEL1x1_L2 16,16,13,0
  1990. KERNEL1x1_L2 16,16,14,0
  1991. KERNEL1x1_L2 16,16,15,1
  1992. bdnz CGEMM_L1x1_LOOP /* decrement CTR, loop while it is non-zero */
  1993. MY_ALIGN
  1994. CGEMM_L1x1_LOOP_END:
  1995. /*----------------------------------------*/
  1996. END1x1_2
  1997. blr
  1998. MY_ALIGN
  1999. CGEMM_1x1_L16_SUB:
  2000. /* 1x1 straight-line helper: one LOAD1x1_2, then seven KERNEL1x1_L2 and a closing KERNEL1x1_E2 (last argument 1); returns through LR. Its call site is outside this section; presumably the bit-16 step of the 1x1 tail - confirm. */
  2001. LOAD1x1_2
  2002. KERNEL1x1_L2 16,16,0,0
  2003. KERNEL1x1_L2 16,16,1,0
  2004. KERNEL1x1_L2 16,16,2,0
  2005. KERNEL1x1_L2 16,16,3,0
  2006. KERNEL1x1_L2 16,16,4,0
  2007. KERNEL1x1_L2 16,16,5,0
  2008. KERNEL1x1_L2 16,16,6,0
  2009. KERNEL1x1_E2 16,16,7,1
  2010. blr
  2011. MY_ALIGN
  2012. CGEMM_1x1_L8_SUB:
  2013. /* 1x1 straight-line helper: one LOAD1x1_2, then three KERNEL1x1_L2 and a closing KERNEL1x1_E2 (last argument 1); returns through LR. Its call site is outside this section; presumably the bit-8 step of the 1x1 tail - confirm. */
  2014. LOAD1x1_2
  2015. KERNEL1x1_L2 16,16,0,0
  2016. KERNEL1x1_L2 16,16,1,0
  2017. KERNEL1x1_L2 16,16,2,0
  2018. KERNEL1x1_E2 16,16,3,1
  2019. blr
  2020. /* MAIN LOOP BEGINS: one-column pass, executed only when bit 0 of N is set */
  2021. MY_ALIGN
  2022. CGEMM_L1:
  2023. /* andi. can only yield zero or a positive value, so the ble below acts as branch-if-zero. */
  2024. andi. J, N, 1
  2025. ble CGEMM_L1_END
  2026. CGEMM_L1_BEGIN:
  2027. /* Set up pointers for this pass: CO = C, T2 = C + LDC (only used as a prefetch address below), AO = A, then advance C by one column. */
  2028. mr CO, C
  2029. add T2,C,LDC
  2030. mr AO, A
  2031. add C, C, LDC /* was "add C, C, T1": T1 is never initialised on this path (stale from earlier code), one column is LDC bytes */
  2032. #if defined(TRMMKERNEL) && defined(LEFT)
  2033. mr TEMP_REG, OFFSET /*off = offset;*/
  2034. #endif
  2035. srawi. I, M, 3 /* I = M >> 3: number of 8-row tiles */
  2036. ble CGEMM_L1x8_END
  2037. dcbt CO,r0 /* prefetch hint only; the index register r0 is presumably zero here - confirm */
  2038. dcbt T2,r0
  2039. CGEMM_L1x8_BEGIN:
  2040. /* 1x8 tile loop. One pass per group of 8 rows; I (M >> 3, set just before this label) counts the remaining passes. */
  2041. #if defined(TRMMKERNEL)
  2042. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
  2043. #else
  2044. mr BO, B
  2045. dcbt B, r0
  2046. #endif
  2047. dcbt AO, r0
  2048. #if defined(TRMMKERNEL)
  2049. REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
  2050. mr T1, T6
  2051. /* T2, T3: byte offsets used by the dcbt hints in the 1x8 helper routines */
  2052. li T2, 1024
  2053. li T3, 1024+512
  2054. addi T1,T1, -2
  2055. /* T4, T5: further dcbt byte offsets for the 1x8 helper routines */
  2056. li T4, 2048
  2057. li T5, 2048+512
  2058. srawi. T8, T1, 7 /* T8 = (T6-2) >> 7 (arithmetic shift, not a modulo): main-loop trip count */
  2059. #else
  2060. mr T1, K
  2061. /* T2, T3: byte offsets used by the dcbt hints in the 1x8 helper routines */
  2062. li T2, 1024
  2063. li T3, 1024+512
  2064. addi T1,T1, -2
  2065. /* T4, T5: further dcbt byte offsets for the 1x8 helper routines */
  2066. li T4, 2048
  2067. li T5, 2048+512
  2068. srawi. T8, T1, 7 /* T8 = (K-2) >> 7 (arithmetic shift, not a modulo): main-loop trip count */
  2069. #endif
  2070. ZERO1x8
  2071. ble CGEMM_L1x8_SUB0 /* tests CR0 from the srawi. above; assumes ZERO1x8 leaves CR0 untouched - confirm */
  2072. bl CGEMM_L1x8_LMAIN_SUB
  2073. andi. L, T1, 127 /* L = leftover count after the main loop */
  2074. ble CGEMM_L1x8_SAVE
  2075. b CGEMM_L1x8_SUB2
  2076. CGEMM_L1x8_SUB0:
  2077. /* Short path, taken when T8 <= 0, i.e. the count is at most 129. Counts of exactly 129 and 128 enter the unrolled body at CGEMM_L1x8_K128 with CTR = 1; every other count goes to the bitwise tail with L = count. */
  2078. #if defined(TRMMKERNEL)
  2079. andi. L, T6, 255
  2080. cmpwi T6,129
  2081. #else
  2082. andi. L, K, 255
  2083. cmpwi K,129
  2084. #endif
  2085. li T8,1
  2086. bne CMP1x8_128K
  2087. addi BO,BO,-8
  2088. addi AO,AO,-64
  2089. LOAD1x8O 64,8
  2090. END1x8_WITHOUT_ADD
  2091. LOAD1x8_2O 128, 16
  2092. mtctr T8
  2093. bl CGEMM_L1x8_K128
  2094. b CGEMM_L1x8_SAVE
  2095. CMP1x8_128K:
  2096. /*----------------------------------------*/
  2097. #if defined(TRMMKERNEL)
  2098. cmpwi T6,128
  2099. #else
  2100. cmpwi K,128
  2101. #endif
  2102. bne CGEMM_L1x8_SUB2
  2103. MY_ALIGN
  2104. mtctr T8
  2105. addi BO,BO,-16
  2106. addi AO,AO,-128
  2107. LOAD1x8_2O 128,16
  2108. bl CGEMM_L1x8_K128
  2109. b CGEMM_L1x8_SAVE
  2110. MY_ALIGN
  2111. CGEMM_L1x8_SUB2:
  2112. /* Tail: consume L one bit at a time (64, 32, 16 through helper calls; 8, 4, 2, 1 inline). T1 is only a scratch destination for andi. here. */
  2113. andi. T1,L, 64
  2114. ble CGEMM_L1x8_SUB2_32
  2115. bl CGEMM_1x8_L64_SUB
  2116. MY_ALIGN
  2117. CGEMM_L1x8_SUB2_32:
  2118. /*----------------------------------------*/
  2119. andi. T1,L, 32
  2120. ble CGEMM_L1x8_SUB2_16
  2121. bl CGEMM_1x8_L32_SUB
  2122. MY_ALIGN
  2123. CGEMM_L1x8_SUB2_16:
  2124. /*----------------------------------------*/
  2125. andi. T1,L, 16
  2126. ble CGEMM_L1x8_SUB2_8
  2127. bl CGEMM_1x8_L16_SUB
  2128. MY_ALIGN
  2129. CGEMM_L1x8_SUB2_8:
  2130. /*----------------------------------------*/
  2131. andi. T1,L, 8
  2132. ble CGEMM_L1x8_SUB2_4
  2133. LOAD1x8_2
  2134. KERNEL1x8_L2 128,16, 0,0
  2135. KERNEL1x8_L2 128,16, 1,0
  2136. KERNEL1x8_L2 128,16, 2,0
  2137. KERNEL1x8_E2 128,16, 3,1
  2138. MY_ALIGN
  2139. CGEMM_L1x8_SUB2_4:
  2140. /*----------------------------------------*/
  2141. andi. T1,L, 4
  2142. ble CGEMM_L1x8_SUB2_2
  2143. LOAD1x8_2
  2144. KERNEL1x8_L2 128,16, 0,0
  2145. KERNEL1x8_E2 128,16, 1,1
  2146. MY_ALIGN
  2147. CGEMM_L1x8_SUB2_2:
  2148. /*----------------------------------------*/
  2149. andi. T1,L, 2
  2150. ble CGEMM_L1x8_SUB2_1
  2151. LOAD1x8_2
  2152. KERNEL1x8_E2 128,16, 0,1
  2153. MY_ALIGN
  2154. CGEMM_L1x8_SUB2_1:
  2155. /*----------------------------------------*/
  2156. andi. T1,L, 1
  2157. ble CGEMM_L1x8_SAVE
  2158. KERNEL1x8
  2159. MY_ALIGN
  2160. CGEMM_L1x8_SAVE:
  2161. /* Decrement the tile counter first; the bgt below tests that CR0 result, so SAVE1x8 and REFRESH_AFTER_SAVE are assumed not to modify CR0 - confirm. */
  2162. addic. I, I, -1
  2163. MY_ALIGN
  2164. SAVE1x8
  2165. #if defined(TRMMKERNEL)
  2166. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
  2167. #endif
  2168. bgt CGEMM_L1x8_BEGIN
  2169. andi. T2, M, 7 /* any rows left over after the 8-row tiles? */
  2170. ble CGEMM_L1x1_END
  2171. andi. T1, M, 4
  2172. ble CGEMM_L1x4_END
  2173. b CGEMM_L1x4_BEGIN /* NOTE(review): CGEMM_L1x4_BEGIN repeats the same two tests */
  2174. MY_ALIGN
  2175. CGEMM_L1x8_END:
  2176. /*----------------------------------------*/
  2177. CGEMM_L1x4_BEGIN:
  2178. /* 1x4 tile, executed at most once: skipped entirely when M & 7 == 0, and skipped to the 1x2 stage when bit 2 of M is clear. */
  2179. andi. T2, M, 7
  2180. ble CGEMM_L1x1_END
  2181. andi. T1, M, 4
  2182. ble CGEMM_L1x4_END
  2183. #if defined(TRMMKERNEL)
  2184. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
  2185. #else
  2186. mr BO, B
  2187. #endif
  2188. #if defined(TRMMKERNEL)
  2189. REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
  2190. mr T1, T6
  2191. addi T1,T1, -2
  2192. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5, i.e. divided by 32 (arithmetic shift): main-loop trip count */
  2193. #else
  2194. mr T1, K
  2195. addi T1,T1, -2
  2196. srawi. T8, T1, 5 /* T8 = (K-2) >> 5, i.e. divided by 32 (arithmetic shift): main-loop trip count */
  2197. #endif
  2198. ZERO1x4
  2199. ble CGEMM_L1x4_SUB0 /* tests CR0 from the srawi. above; assumes ZERO1x4 leaves CR0 untouched - confirm */
  2200. bl CGEMM_1x4_LMAIN_SUB
  2201. andi. L, T1, 31 /* L = leftover count after the main loop */
  2202. ble CGEMM_L1x4_SAVE
  2203. b CGEMM_L1x4_SUB2
  2204. CGEMM_L1x4_SUB0:
  2205. /* Short path, taken when T8 <= 0, i.e. the count is at most 33. Counts of exactly 33 and 32 enter the unrolled body at CGEMM_L1x4_K32 with CTR = 1; every other count goes to the bitwise tail with L = count. */
  2206. #if defined(TRMMKERNEL)
  2207. andi. L, T6, 63
  2208. cmpwi T6,33
  2209. #else
  2210. andi. L, K, 63
  2211. cmpwi K,33
  2212. #endif
  2213. li T8,1
  2214. bne CMP1x4_32K
  2215. addi BO,BO,-8
  2216. addi AO,AO,-32
  2217. LOAD1x4O 32,8
  2218. END1x4_WITHOUT_ADD
  2219. LOAD1x4_2O 64, 16
  2220. mtctr T8
  2221. bl CGEMM_L1x4_K32
  2222. b CGEMM_L1x4_SAVE
  2223. CMP1x4_32K:
  2224. /*----------------------------------------*/
  2225. #if defined(TRMMKERNEL)
  2226. cmpwi T6,32
  2227. #else
  2228. cmpwi K,32
  2229. #endif
  2230. bne CGEMM_L1x4_SUB2
  2231. MY_ALIGN
  2232. mtctr T8
  2233. addi BO,BO,-16
  2234. addi AO,AO,-64
  2235. LOAD1x4_2O 64,16
  2236. bl CGEMM_L1x4_K32
  2237. b CGEMM_L1x4_SAVE
  2238. MY_ALIGN
  2239. MY_ALIGN
  2240. CGEMM_L1x4_SUB2:
  2241. /* Tail: consume L one bit at a time (16 and 8 through helper calls; 4, 2, 1 inline). */
  2242. andi. T1,L, 16
  2243. ble CGEMM_L1x4_SUB2_8
  2244. bl CGEMM_1x4_L16_SUB
  2245. MY_ALIGN
  2246. CGEMM_L1x4_SUB2_8:
  2247. /*----------------------------------------*/
  2248. andi. T1,L, 8
  2249. ble CGEMM_L1x4_SUB2_4
  2250. bl CGEMM_1x4_L8_SUB
  2251. MY_ALIGN
  2252. CGEMM_L1x4_SUB2_4:
  2253. /*----------------------------------------*/
  2254. andi. T1,L, 4
  2255. ble CGEMM_L1x4_SUB2_2
  2256. LOAD1x4_2
  2257. KERNEL1x4_L2 64,16, 0,0
  2258. KERNEL1x4_E2 64,16, 1,1
  2259. MY_ALIGN
  2260. CGEMM_L1x4_SUB2_2:
  2261. /*----------------------------------------*/
  2262. andi. T1,L, 2
  2263. ble CGEMM_L1x4_SUB2_1
  2264. LOAD1x4_2
  2265. KERNEL1x4_E2 64,16, 0,1
  2266. MY_ALIGN
  2267. CGEMM_L1x4_SUB2_1:
  2268. /*----------------------------------------*/
  2269. andi. T1,L, 1
  2270. ble CGEMM_L1x4_SAVE
  2271. KERNEL1x4
  2272. CGEMM_L1x4_SAVE:
  2273. /* Write the tile back through SAVE1x4, then fall through to the 1x2 stage. */
  2274. SAVE1x4
  2275. #if defined(TRMMKERNEL)
  2276. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
  2277. #endif
  2278. CGEMM_L1x4_END:
  2279. /*----------------------------------------*/
  2280. CGEMM_L1x2_BEGIN:
  2281. /* Section executed only when (M & 2) != 0; presumably the 2-wide remainder of M (from label naming) - confirm. */
  2282. andi. T1, M, 2 /* T1 = M & 2, sets CR0 */
  2283. ble CGEMM_L1x2_END /* result is 0 or 2, so this skips the section when the bit is clear */
  2284. #if defined(TRMMKERNEL)
  2285. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 /* TRMM build: macro defined outside this chunk */
  2286. #else
  2287. mr BO, B /* BO = B */
  2288. #endif
  2289. #if defined(TRMMKERNEL)
  2290. REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 /* macro defined outside this chunk; T6 is used as the count below */
  2291. mr T1, T6
  2292. addi T1,T1, -2 /* T1 = T6 - 2 */
  2293. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (arithmetic shift, not a modulo); sets CR0 */
  2294. #else
  2295. mr T1, K
  2296. addi T1,T1, -2 /* T1 = K - 2 */
  2297. srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (arithmetic shift, not a modulo); sets CR0 */
  2298. #endif
  2299. ZERO1x2 /* macro defined outside this chunk; NOTE(review): the ble below relies on CR0 from srawi. being intact - confirm */
  2300. ble CGEMM_L1x2_SUB0 /* T8 <= 0: take the short-count path */
  2301. bl CGEMM_1x2_LMAIN_SUB /* call; routine is defined outside this chunk */
  2302. andi. L, T1, 31 /* L = T1 & 31 (T1 held count-2 before the call) */
  2303. ble CGEMM_L1x2_SAVE /* L == 0: nothing left, go save */
  2304. b CGEMM_L1x2_SUB2 /* otherwise consume L bit by bit */
  2305. CGEMM_L1x2_SUB0:
  2306. /* Short-count path: entered when the srawi. result in T8 was <= 0. */
  2307. #if defined(TRMMKERNEL)
  2308. andi. L, T6, 63 /* L = T6 & 63 */
  2309. cmpwi T6,33 /* CR0 = compare(T6, 33); replaces the CR0 written by andi. */
  2310. #else
  2311. andi. L, K, 63 /* L = K & 63 */
  2312. cmpwi K,33 /* CR0 = compare(K, 33); replaces the CR0 written by andi. */
  2313. #endif
  2314. li T8,1 /* T8 = 1; moved into CTR on both special-case paths below */
  2315. bne CMP1x2_32K /* count != 33: go check for 32 */
  2316. addi BO,BO,-8 /* count == 33: BO -= 8 */
  2317. addi AO,AO,-16 /* AO -= 16 */
  2318. LOAD1x2O 16,8 /* macro defined outside this chunk; args 16,8 equal the amounts just subtracted from AO,BO */
  2319. END1x2_WITHOUT_ADD /* macro defined outside this chunk */
  2320. LOAD1x2_2O 32, 16 /* macro defined outside this chunk */
  2321. mtctr T8 /* CTR = 1; presumably the iteration count used by CGEMM_L1x2_K32 - confirm */
  2322. bl CGEMM_L1x2_K32 /* call; routine is defined outside this chunk */
  2323. b CGEMM_L1x2_SAVE
  2324. CMP1x2_32K:
  2325. /* count != 33: special-case count == 32, otherwise fall back to the bitwise tail on L. */
  2326. #if defined(TRMMKERNEL)
  2327. cmpwi T6,32
  2328. #else
  2329. cmpwi K,32
  2330. #endif
  2331. bne CGEMM_L1x2_SUB2 /* count != 32: consume L bit by bit */
  2332. MY_ALIGN
  2333. mtctr T8 /* CTR = 1 (T8 was set by li in CGEMM_L1x2_SUB0) */
  2334. addi BO,BO,-16 /* BO -= 16 */
  2335. addi AO,AO,-32 /* AO -= 32 */
  2336. LOAD1x2_2O 32,16 /* macro defined outside this chunk; args 32,16 equal the amounts just subtracted from AO,BO */
  2337. bl CGEMM_L1x2_K32 /* call; routine is defined outside this chunk */
  2338. b CGEMM_L1x2_SAVE
  2339. MY_ALIGN
  2340. MY_ALIGN
  2341. CGEMM_L1x2_SUB2:
  2342. /* Tail: test bits 16, 8, 4, 2, 1 of L in turn; T1 is only a scratch target for each andi. */
  2343. andi. T1,L, 16
  2344. ble CGEMM_L1x2_SUB2_8 /* andi. result is 0 when the bit is clear: skip */
  2345. bl CGEMM_1x2_L16_SUB /* call; routine is defined outside this chunk */
  2346. MY_ALIGN
  2347. CGEMM_L1x2_SUB2_8:
  2348. /* Bit with value 8 of L. */
  2349. andi. T1,L, 8
  2350. ble CGEMM_L1x2_SUB2_4 /* bit clear: skip */
  2351. bl CGEMM_1x2_L8_SUB /* call; routine is defined outside this chunk */
  2352. MY_ALIGN
  2353. CGEMM_L1x2_SUB2_4:
  2354. /* Bit with value 4 of L: handled inline with one load and two kernel macros. */
  2355. andi. T1,L, 4
  2356. ble CGEMM_L1x2_SUB2_2 /* bit clear: skip */
  2357. LOAD1x2_2
  2358. KERNEL1x2_L2 32,16, 0,0
  2359. KERNEL1x2_E2 32,16, 1,1
  2360. MY_ALIGN
  2361. CGEMM_L1x2_SUB2_2:
  2362. /* Bit with value 2 of L: handled inline with one load and one kernel macro. */
  2363. andi. T1,L, 2
  2364. ble CGEMM_L1x2_SUB2_1 /* bit clear: skip */
  2365. LOAD1x2_2
  2366. KERNEL1x2_E2 32,16, 0,1
  2367. MY_ALIGN
  2368. CGEMM_L1x2_SUB2_1:
  2369. /* Bit with value 1 of L: a single KERNEL1x2 invocation. */
  2370. andi. T1,L, 1
  2371. ble CGEMM_L1x2_SAVE /* bit clear: go straight to the save */
  2372. KERNEL1x2
  2373. MY_ALIGN
  2374. CGEMM_L1x2_SAVE:
  2375. /* Common exit of every path above: SAVE1x2 (macro defined outside this chunk). */
  2376. SAVE1x2
  2377. #if defined(TRMMKERNEL)
  2378. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 /* TRMM-only macro; defined outside this chunk */
  2379. #endif
  2380. CGEMM_L1x2_END:
  2381. /* End of the section guarded by (M & 2); the (M & 1) section follows. */
  2382. CGEMM_L1x1_BEGIN:
  2383. /* Section executed only when (M & 1) != 0; presumably the final single-element remainder of M (from label naming) - confirm. */
  2384. andi. T1, M, 1 /* T1 = M & 1, sets CR0 */
  2385. ble CGEMM_L1x1_END /* result is 0 or 1, so this skips the section when the bit is clear */
  2386. #if defined(TRMMKERNEL)
  2387. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 /* TRMM build: macro defined outside this chunk */
  2388. #else
  2389. mr BO, B /* BO = B */
  2390. #endif
  2391. #if defined(TRMMKERNEL)
  2392. REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 /* macro defined outside this chunk; T6 is used as the count below */
  2393. mr T1, T6
  2394. addi T1,T1, -2 /* T1 = T6 - 2 */
  2395. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (arithmetic shift, not a modulo); sets CR0 */
  2396. #else
  2397. mr T1, K
  2398. addi T1,T1, -2 /* T1 = K - 2 */
  2399. srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (arithmetic shift, not a modulo); sets CR0 */
  2400. #endif
  2401. ZERO1x1 /* macro defined outside this chunk; NOTE(review): the ble below relies on CR0 from srawi. being intact - confirm */
  2402. ble CGEMM_L1x1_SUB0 /* T8 <= 0: take the short-count path */
  2403. bl CGEMM_1x1_LMAIN_SUB /* call; routine is defined outside this chunk */
  2404. andi. L, T1, 31 /* L = T1 & 31 (T1 held count-2 before the call) */
  2405. ble CGEMM_L1x1_SAVE /* L == 0: nothing left, go save */
  2406. b CGEMM_L1x1_SUB2 /* otherwise consume L bit by bit */
  2407. CGEMM_L1x1_SUB0:
  2408. /* Short-count path: entered when the srawi. result in T8 was <= 0. */
  2409. #if defined(TRMMKERNEL)
  2410. andi. L, T6, 63 /* L = T6 & 63 */
  2411. cmpwi T6,33 /* CR0 = compare(T6, 33); replaces the CR0 written by andi. */
  2412. #else
  2413. andi. L, K, 63 /* L = K & 63 */
  2414. cmpwi K,33 /* CR0 = compare(K, 33); replaces the CR0 written by andi. */
  2415. #endif
  2416. li T8,1 /* T8 = 1; moved into CTR on both special-case paths below */
  2417. bne CMP1x1_32K /* count != 33: go check for 32 */
  2418. addi BO,BO,-8 /* count == 33: BO -= 8 */
  2419. addi AO,AO,-8 /* AO -= 8 */
  2420. LOAD1x1O 8,8 /* macro defined outside this chunk; args 8,8 equal the amounts just subtracted from AO,BO */
  2421. END1x1_WITHOUT_ADD /* macro defined outside this chunk */
  2422. LOAD1x1_2O 16, 16 /* macro defined outside this chunk */
  2423. mtctr T8 /* CTR = 1; presumably the iteration count used by CGEMM_L1x1_K32 - confirm */
  2424. bl CGEMM_L1x1_K32 /* call; routine is defined outside this chunk */
  2425. b CGEMM_L1x1_SAVE
  2426. CMP1x1_32K:
  2427. /* count != 33: special-case count == 32, otherwise fall back to the bitwise tail on L. */
  2428. #if defined(TRMMKERNEL)
  2429. cmpwi T6,32
  2430. #else
  2431. cmpwi K,32
  2432. #endif
  2433. bne CGEMM_L1x1_SUB2 /* count != 32: consume L bit by bit */
  2434. MY_ALIGN
  2435. mtctr T8 /* CTR = 1 (T8 was set by li in CGEMM_L1x1_SUB0) */
  2436. addi BO,BO,-16 /* BO -= 16 */
  2437. addi AO,AO,-16 /* AO -= 16 */
  2438. LOAD1x1_2O 16,16 /* macro defined outside this chunk; args 16,16 equal the amounts just subtracted from AO,BO */
  2439. bl CGEMM_L1x1_K32 /* call; routine is defined outside this chunk */
  2440. b CGEMM_L1x1_SAVE
  2441. MY_ALIGN
  2442. MY_ALIGN
  2443. CGEMM_L1x1_SUB2:
  2444. /* Tail: test bits 16, 8, 4, 2, 1 of L in turn; T1 is only a scratch target for each andi. */
  2445. andi. T1,L, 16
  2446. ble CGEMM_L1x1_SUB2_8 /* andi. result is 0 when the bit is clear: skip */
  2447. bl CGEMM_1x1_L16_SUB /* call; routine is defined outside this chunk */
  2448. MY_ALIGN
  2449. CGEMM_L1x1_SUB2_8:
  2450. /* Bit with value 8 of L. */
  2451. andi. T1,L, 8
  2452. ble CGEMM_L1x1_SUB2_4 /* bit clear: skip */
  2453. bl CGEMM_1x1_L8_SUB /* call; routine is defined outside this chunk */
  2454. MY_ALIGN
  2455. CGEMM_L1x1_SUB2_4:
  2456. /* Bit with value 4 of L: handled inline with one load and two kernel macros. */
  2457. andi. T1,L, 4
  2458. ble CGEMM_L1x1_SUB2_2 /* bit clear: skip */
  2459. LOAD1x1_2
  2460. KERNEL1x1_L2 16,16, 0,0
  2461. KERNEL1x1_E2 16,16, 1,1
  2462. MY_ALIGN
  2463. CGEMM_L1x1_SUB2_2:
  2464. /* Bit with value 2 of L: handled inline with one load and one kernel macro. */
  2465. andi. T1,L, 2
  2466. ble CGEMM_L1x1_SUB2_1 /* bit clear: skip */
  2467. LOAD1x1_2
  2468. KERNEL1x1_E2 16,16, 0,1
  2469. MY_ALIGN
  2470. CGEMM_L1x1_SUB2_1:
  2471. /* Bit with value 1 of L: a single KERNEL1x1 invocation. */
  2472. andi. T1,L, 1
  2473. ble CGEMM_L1x1_SAVE /* bit clear: go straight to the save */
  2474. KERNEL1x1
  2475. MY_ALIGN
  2476. CGEMM_L1x1_SAVE:
  2477. /* Common exit of every path above: SAVE1x1 (macro defined outside this chunk). */
  2478. SAVE1x1
  2479. #if defined(TRMMKERNEL)
  2480. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 /* TRMM-only macro; defined outside this chunk */
  2481. #endif
  2482. CGEMM_L1x1_END:
  2483. /* Reached by fall-through after the save, or directly when (M & 1) == 0. Advances B by K*8. */
  2484. slwi T1, K, 3 /* T1 = K << 3 */
  2485. add B, B, T1 /* B += K * 8; presumably 8 bytes per single-precision complex element - confirm */
  2486. #if defined(TRMMKERNEL) && !defined(LEFT)
  2487. addi TEMP_REG, TEMP_REG, 1 /* TEMP_REG += 1 (only in TRMM builds without LEFT) */
  2488. #endif
  2489. CGEMM_L1_END: