You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

cgemm_logic_8x4_power8.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459
  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2016/04/04 Werner Saar (wernsaar@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  /*
   * Outer loop over groups of four (J = N >> 2); skipped entirely when
   * J <= 0.
   *
   * For every group:
   *   1. CGEMM_L4_COPYB expands the current B panel into BBUFFER: each
   *      32-bit word loaded from BO is splatted across a full 16-byte
   *      vector, so 32 input bytes become 128 output bytes per pass.
   *   2. CO/AO are reset from C/A, C is advanced by LDC << 2, and
   *      I = M >> 3 selects how many 4x8 tiles follow.
   *
   * Loop bound: T1 starts at K * 8 and drops by 8 per pass, so `bgt`
   * gives exactly K passes (K >= 1), i.e. K * 32 bytes - the same amount
   * by which B is advanced per group at CGEMM_L4x1_END (K << 5).  The
   * previous `bge` executed one additional pass, reading 32 bytes past
   * the panel and storing 128 unused bytes into BBUFFER.
   *
   * o0/o16/o32/o48 are assumed to hold byte offsets 0/16/32/48; they are
   * defined outside this section.
   */
          srawi.  J, N, 2
          ble     CGEMM_L4_END

  CGEMM_L4_BEGIN:
          mr      BO, B
          mr      BBO, BBUFFER
          slwi    T1, K, 3                /* T1 = K * 8 words to expand */

  CGEMM_L4_COPYB:
          dcbtst  BBO, PRE

          lxvw4x  vs3, o0, BO             /* words 0..3 */
          lxvw4x  vs11, o16, BO           /* words 4..7 */

          xxspltw vs4, vs3, 0
          xxspltw vs5, vs3, 1
          xxspltw vs6, vs3, 2
          xxspltw vs7, vs3, 3
          xxspltw vs12, vs11, 0
          xxspltw vs13, vs11, 1
          xxspltw vs14, vs11, 2
          xxspltw vs15, vs11, 3

          stxvw4x vs4, o0, BBO
          stxvw4x vs5, o16, BBO
          stxvw4x vs6, o32, BBO
          stxvw4x vs7, o48, BBO

          addi    BO, BO, 32
          addi    BBO, BBO, 64

          stxvw4x vs12, o0, BBO
          stxvw4x vs13, o16, BBO
          stxvw4x vs14, o32, BBO
          stxvw4x vs15, o48, BBO

          addic.  T1, T1, -8              /* sets CR0 for the branch below */
          addi    BBO, BBO, 64            /* addi leaves CR0 untouched */
          bgt     CGEMM_L4_COPYB          /* was bge: one pass too many */

          mr      CO, C
          mr      AO, A
          slwi    T1, LDC, 2
          add     C, C, T1                /* C += LDC << 2 */

          srawi.  I, M, 3                 /* I = M >> 3 */
          ble     CGEMM_L4x8_END
  /*
   * 4x8 tile loop: repeated while I > 0 (I is decremented after SAVE4x8).
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : CGEMM_L4x8_SUB0 - one KERNEL4x8_SUBI1 (unconditional on
   *            this path), then (K & 7) - 1 passes of KERNEL4x8_SUB1.
   *   L == 1 : CGEMM_L4x8_SUB4 - SUBI1 followed by seven SUB1, then the
   *            (K & 7) remainder loop.
   *   L >= 2 : LOOP_START (eight kernel macros), LOOP executed L - 2
   *            times, LOOP_END (eight kernel macros ending in _E2), then
   *            the (K & 7) remainder loop.
   *
   * The LOAD/KERNEL/SAVE macros are defined outside this section; each
   * kernel macro presumably handles one k-step - confirm in the macro
   * file.  `.rept` expands at assembly time into the same straight-line
   * sequence that was previously written out by hand.
   */
  CGEMM_L4x8_BEGIN:
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L4x8_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L4x8_SUB4         /* L >= 1 here, so taken iff L == 1 */

  CGEMM_L4x8_LOOP_START:
          dcbt    AO, PRE
          dcbt    BO, PRE
          LOAD4x8_1
          dcbt    BO, PRE
          KERNEL4x8_I1
          dcbt    BO, PRE
          dcbt    AO, PRE
          KERNEL4x8_2
          .rept   3
              dcbt    BO, PRE
              KERNEL4x8_1
              dcbt    BO, PRE
              dcbt    AO, PRE
              KERNEL4x8_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L4x8_LOOP_END

          .align  5
  CGEMM_L4x8_LOOP:
          .rept   4
              dcbt    BO, PRE
              KERNEL4x8_1
              dcbt    BO, PRE
              dcbt    AO, PRE
              KERNEL4x8_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L4x8_LOOP

  CGEMM_L4x8_LOOP_END:
          dcbt    BO, PRE
          KERNEL4x8_1
          dcbt    BO, PRE
          dcbt    AO, PRE
          KERNEL4x8_2
          .rept   2
              KERNEL4x8_1
              dcbt    AO, PRE
              KERNEL4x8_2
          .endr
          KERNEL4x8_1
          KERNEL4x8_E2
          b       CGEMM_L4x8_SUB1

  CGEMM_L4x8_SUB4:
          KERNEL4x8_SUBI1
          .rept   7
              KERNEL4x8_SUB1
          .endr
          b       CGEMM_L4x8_SUB1

  CGEMM_L4x8_SUB0:
          andi.   L, K, 7
          KERNEL4x8_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L4x8_SAVE
          b       CGEMM_L4x8_SUB2

  CGEMM_L4x8_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L4x8_SAVE

  CGEMM_L4x8_SUB2:
          KERNEL4x8_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L4x8_SUB2

  CGEMM_L4x8_SAVE:
          SAVE4x8
          addic.  I, I, -1
          bgt     CGEMM_L4x8_BEGIN

  CGEMM_L4x8_END:
  /*
   * 4x4 tile, executed at most once per group.
   *   (M & 7) == 0 : nothing left for this group -> CGEMM_L4x1_END.
   *   (M & 4) == 0 : skip to CGEMM_L4x4_END.
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL4x4_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL4x4_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * Macro bodies are defined outside this section.  `.rept` expands to
   * the same straight-line macro sequence at assembly time.
   */
  CGEMM_L4x4_BEGIN:
          andi.   T2, M, 7
          ble     CGEMM_L4x1_END
          andi.   T1, M, 4
          ble     CGEMM_L4x4_END

          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L4x4_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L4x4_SUB4         /* taken iff L == 1 */

  CGEMM_L4x4_LOOP_START:
          LOAD4x4_1
          KERNEL4x4_I1
          KERNEL4x4_2
          .rept   3
              KERNEL4x4_1
              KERNEL4x4_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L4x4_LOOP_END

          .align  5
  CGEMM_L4x4_LOOP:
          .rept   4
              KERNEL4x4_1
              KERNEL4x4_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L4x4_LOOP

  CGEMM_L4x4_LOOP_END:
          .rept   3
              KERNEL4x4_1
              KERNEL4x4_2
          .endr
          KERNEL4x4_1
          KERNEL4x4_E2
          b       CGEMM_L4x4_SUB1

  CGEMM_L4x4_SUB4:
          KERNEL4x4_SUBI1
          .rept   7
              KERNEL4x4_SUB1
          .endr
          b       CGEMM_L4x4_SUB1

  CGEMM_L4x4_SUB0:
          andi.   L, K, 7
          KERNEL4x4_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L4x4_SAVE
          b       CGEMM_L4x4_SUB2

  CGEMM_L4x4_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L4x4_SAVE

  CGEMM_L4x4_SUB2:
          KERNEL4x4_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L4x4_SUB2

  CGEMM_L4x4_SAVE:
          SAVE4x4

  CGEMM_L4x4_END:
  /*
   * 4x2 tile, executed only when (M & 2) != 0.
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL4x2_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL4x2_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * Macro bodies are defined outside this section.  `.rept` expands to
   * the same straight-line macro sequence at assembly time.
   */
  CGEMM_L4x2_BEGIN:
          andi.   T1, M, 2
          ble     CGEMM_L4x2_END

          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L4x2_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L4x2_SUB4         /* taken iff L == 1 */

  CGEMM_L4x2_LOOP_START:
          LOAD4x2_1
          KERNEL4x2_I1
          KERNEL4x2_2
          .rept   3
              KERNEL4x2_1
              KERNEL4x2_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L4x2_LOOP_END

          .align  5
  CGEMM_L4x2_LOOP:
          .rept   4
              KERNEL4x2_1
              KERNEL4x2_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L4x2_LOOP

  CGEMM_L4x2_LOOP_END:
          .rept   3
              KERNEL4x2_1
              KERNEL4x2_2
          .endr
          KERNEL4x2_1
          KERNEL4x2_E2
          b       CGEMM_L4x2_SUB1

  CGEMM_L4x2_SUB4:
          KERNEL4x2_SUBI1
          .rept   7
              KERNEL4x2_SUB1
          .endr
          b       CGEMM_L4x2_SUB1

  CGEMM_L4x2_SUB0:
          andi.   L, K, 7
          KERNEL4x2_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L4x2_SAVE
          b       CGEMM_L4x2_SUB2

  CGEMM_L4x2_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L4x2_SAVE

  CGEMM_L4x2_SUB2:
          KERNEL4x2_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L4x2_SUB2

  CGEMM_L4x2_SAVE:
          SAVE4x2

  CGEMM_L4x2_END:
  /*
   * 4x1 tile, executed only when (M & 1) != 0, followed by the end of the
   * per-group loop.
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL4x1_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL4x1_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * Macro bodies are defined outside this section.  `.rept` expands to
   * the same straight-line macro sequence at assembly time.
   */
  CGEMM_L4x1_BEGIN:
          andi.   T1, M, 1
          ble     CGEMM_L4x1_END

          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L4x1_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L4x1_SUB4         /* taken iff L == 1 */

  CGEMM_L4x1_LOOP_START:
          LOAD4x1_1
          KERNEL4x1_I1
          KERNEL4x1_2
          .rept   3
              KERNEL4x1_1
              KERNEL4x1_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L4x1_LOOP_END

          .align  5
  CGEMM_L4x1_LOOP:
          .rept   4
              KERNEL4x1_1
              KERNEL4x1_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L4x1_LOOP

  CGEMM_L4x1_LOOP_END:
          .rept   3
              KERNEL4x1_1
              KERNEL4x1_2
          .endr
          KERNEL4x1_1
          KERNEL4x1_E2
          b       CGEMM_L4x1_SUB1

  CGEMM_L4x1_SUB4:
          KERNEL4x1_SUBI1
          .rept   7
              KERNEL4x1_SUB1
          .endr
          b       CGEMM_L4x1_SUB1

  CGEMM_L4x1_SUB0:
          andi.   L, K, 7
          KERNEL4x1_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L4x1_SAVE
          b       CGEMM_L4x1_SUB2

  CGEMM_L4x1_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L4x1_SAVE

  CGEMM_L4x1_SUB2:
          KERNEL4x1_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L4x1_SUB2

  CGEMM_L4x1_SAVE:
          SAVE4x1

  /*
   * End of one group of four: advance B by K << 5 bytes and repeat while
   * J > 0.  Afterwards, (N & 3) == 0 leaves through L999_H2; otherwise
   * control continues at CGEMM_L2_BEGIN.
   */
  CGEMM_L4x1_END:
          slwi    T1, K, 5
          add     B, B, T1
          addic.  J, J, -1
          bgt     CGEMM_L4_BEGIN

          andi.   T2, N, 3
          ble     L999_H2

  CGEMM_L4_END:
          b       CGEMM_L2_BEGIN

  /* Relay to L999_H2; no branch to this label is visible in this section. */
  L999_H1:
          b       L999_H2
  /*
   * Group of two, handled only when (N & 2) != 0.
   *
   * The N test now runs before the B copy.  Previously the copy ran
   * unconditionally, so with (N & 2) == 0 it expanded K * 4 (+8) words
   * from a panel that the CGEMM_L1 path treats as K * 2 words, and the
   * result was thrown away.  T1 is 0 at CGEMM_L2_END on the skip path,
   * exactly as before.
   *
   * CGEMM_L2_COPYB splats every 32-bit word of the B panel across a
   * 16-byte vector in BBUFFER (32 input bytes -> 128 output bytes per
   * pass).  T1 starts at K * 4 and drops by 8 per pass, so `bgt` yields
   * ceil(K / 2) passes for K >= 1, which covers the K << 4 bytes by which
   * B is advanced at CGEMM_L2x1_END.  The previous `bge` ran one more
   * pass whenever K was even.
   *
   * After the copy: CO/AO are reset from C/A, C is advanced by LDC << 1,
   * and I = M >> 3 selects how many 2x8 tiles follow.
   *
   * o0/o16/o32/o48 are assumed to hold byte offsets 0/16/32/48; they are
   * defined outside this section.
   */
  CGEMM_L2_BEGIN:
          andi.   T1, N, 2
          ble     CGEMM_L2_END

          mr      BO, B
          mr      BBO, BBUFFER
          slwi    T1, K, 2                /* T1 = K * 4 words to expand */

  CGEMM_L2_COPYB:
          dcbtst  BBO, PRE

          lxvw4x  vs3, o0, BO             /* words 0..3 */
          lxvw4x  vs11, o16, BO           /* words 4..7 */

          xxspltw vs4, vs3, 0
          xxspltw vs5, vs3, 1
          xxspltw vs6, vs3, 2
          xxspltw vs7, vs3, 3
          xxspltw vs12, vs11, 0
          xxspltw vs13, vs11, 1
          xxspltw vs14, vs11, 2
          xxspltw vs15, vs11, 3

          stxvw4x vs4, o0, BBO
          stxvw4x vs5, o16, BBO
          stxvw4x vs6, o32, BBO
          stxvw4x vs7, o48, BBO

          addi    BO, BO, 32
          addi    BBO, BBO, 64

          stxvw4x vs12, o0, BBO
          stxvw4x vs13, o16, BBO
          stxvw4x vs14, o32, BBO
          stxvw4x vs15, o48, BBO

          addic.  T1, T1, -8              /* sets CR0 for the branch below */
          addi    BBO, BBO, 64            /* addi leaves CR0 untouched */
          bgt     CGEMM_L2_COPYB          /* was bge: extra pass for even K */

          mr      CO, C
          mr      AO, A
          slwi    T1, LDC, 1
          add     C, C, T1                /* C += LDC << 1 */

          srawi.  I, M, 3                 /* I = M >> 3 */
          ble     CGEMM_L2x8_END
  /*
   * 2x8 tile loop: repeated while I > 0 (I is decremented after SAVE2x8).
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL2x8_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL2x8_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * A `dcbt AO, PRE` prefetch hint precedes every KERNEL2x8_2 except the
   * final _E2.  Macro bodies are defined outside this section.  `.rept`
   * expands to the same straight-line sequence at assembly time.
   */
  CGEMM_L2x8_BEGIN:
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L2x8_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L2x8_SUB4         /* taken iff L == 1 */

  CGEMM_L2x8_LOOP_START:
          dcbt    AO, PRE
          LOAD2x8_1
          KERNEL2x8_I1
          dcbt    AO, PRE
          KERNEL2x8_2
          .rept   3
              KERNEL2x8_1
              dcbt    AO, PRE
              KERNEL2x8_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L2x8_LOOP_END

          .align  5
  CGEMM_L2x8_LOOP:
          .rept   4
              KERNEL2x8_1
              dcbt    AO, PRE
              KERNEL2x8_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L2x8_LOOP

  CGEMM_L2x8_LOOP_END:
          .rept   3
              KERNEL2x8_1
              dcbt    AO, PRE
              KERNEL2x8_2
          .endr
          KERNEL2x8_1
          KERNEL2x8_E2
          b       CGEMM_L2x8_SUB1

  CGEMM_L2x8_SUB4:
          KERNEL2x8_SUBI1
          .rept   7
              KERNEL2x8_SUB1
          .endr
          b       CGEMM_L2x8_SUB1

  CGEMM_L2x8_SUB0:
          andi.   L, K, 7
          KERNEL2x8_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L2x8_SAVE
          b       CGEMM_L2x8_SUB2

  CGEMM_L2x8_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L2x8_SAVE

  CGEMM_L2x8_SUB2:
          KERNEL2x8_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L2x8_SUB2

  CGEMM_L2x8_SAVE:
          SAVE2x8
          addic.  I, I, -1
          bgt     CGEMM_L2x8_BEGIN

  CGEMM_L2x8_END:
  /*
   * 2x4 tile, executed at most once per group.
   *   (M & 7) == 0 : nothing left for this group -> CGEMM_L2x1_END.
   *   (M & 4) == 0 : skip to CGEMM_L2x4_END.
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL2x4_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL2x4_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * Macro bodies are defined outside this section.  `.rept` expands to
   * the same straight-line macro sequence at assembly time.
   */
  CGEMM_L2x4_BEGIN:
          andi.   T2, M, 7
          ble     CGEMM_L2x1_END
          andi.   T1, M, 4
          ble     CGEMM_L2x4_END

          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L2x4_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L2x4_SUB4         /* taken iff L == 1 */

  CGEMM_L2x4_LOOP_START:
          LOAD2x4_1
          KERNEL2x4_I1
          KERNEL2x4_2
          .rept   3
              KERNEL2x4_1
              KERNEL2x4_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L2x4_LOOP_END

          .align  5
  CGEMM_L2x4_LOOP:
          .rept   4
              KERNEL2x4_1
              KERNEL2x4_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L2x4_LOOP

  CGEMM_L2x4_LOOP_END:
          .rept   3
              KERNEL2x4_1
              KERNEL2x4_2
          .endr
          KERNEL2x4_1
          KERNEL2x4_E2
          b       CGEMM_L2x4_SUB1

  CGEMM_L2x4_SUB4:
          KERNEL2x4_SUBI1
          .rept   7
              KERNEL2x4_SUB1
          .endr
          b       CGEMM_L2x4_SUB1

  CGEMM_L2x4_SUB0:
          andi.   L, K, 7
          KERNEL2x4_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L2x4_SAVE
          b       CGEMM_L2x4_SUB2

  CGEMM_L2x4_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L2x4_SAVE

  CGEMM_L2x4_SUB2:
          KERNEL2x4_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L2x4_SUB2

  CGEMM_L2x4_SAVE:
          SAVE2x4

  CGEMM_L2x4_END:
  /*
   * 2x2 tile, executed only when (M & 2) != 0.
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL2x2_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL2x2_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * Macro bodies are defined outside this section.  `.rept` expands to
   * the same straight-line macro sequence at assembly time.
   */
  CGEMM_L2x2_BEGIN:
          andi.   T1, M, 2
          ble     CGEMM_L2x2_END

          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L2x2_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L2x2_SUB4         /* taken iff L == 1 */

  CGEMM_L2x2_LOOP_START:
          LOAD2x2_1
          KERNEL2x2_I1
          KERNEL2x2_2
          .rept   3
              KERNEL2x2_1
              KERNEL2x2_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L2x2_LOOP_END

          .align  5
  CGEMM_L2x2_LOOP:
          .rept   4
              KERNEL2x2_1
              KERNEL2x2_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L2x2_LOOP

  CGEMM_L2x2_LOOP_END:
          .rept   3
              KERNEL2x2_1
              KERNEL2x2_2
          .endr
          KERNEL2x2_1
          KERNEL2x2_E2
          b       CGEMM_L2x2_SUB1

  CGEMM_L2x2_SUB4:
          KERNEL2x2_SUBI1
          .rept   7
              KERNEL2x2_SUB1
          .endr
          b       CGEMM_L2x2_SUB1

  CGEMM_L2x2_SUB0:
          andi.   L, K, 7
          KERNEL2x2_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L2x2_SAVE
          b       CGEMM_L2x2_SUB2

  CGEMM_L2x2_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L2x2_SAVE

  CGEMM_L2x2_SUB2:
          KERNEL2x2_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L2x2_SUB2

  CGEMM_L2x2_SAVE:
          SAVE2x2

  CGEMM_L2x2_END:
  /*
   * 2x1 tile, executed only when (M & 1) != 0, followed by the end of the
   * group-of-two section.
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL2x1_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL2x1_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * Macro bodies are defined outside this section.  `.rept` expands to
   * the same straight-line macro sequence at assembly time.
   */
  CGEMM_L2x1_BEGIN:
          andi.   T1, M, 1
          ble     CGEMM_L2x1_END

          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L2x1_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L2x1_SUB4         /* taken iff L == 1 */

  CGEMM_L2x1_LOOP_START:
          LOAD2x1_1
          KERNEL2x1_I1
          KERNEL2x1_2
          .rept   3
              KERNEL2x1_1
              KERNEL2x1_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L2x1_LOOP_END

          .align  5
  CGEMM_L2x1_LOOP:
          .rept   4
              KERNEL2x1_1
              KERNEL2x1_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L2x1_LOOP

  CGEMM_L2x1_LOOP_END:
          .rept   3
              KERNEL2x1_1
              KERNEL2x1_2
          .endr
          KERNEL2x1_1
          KERNEL2x1_E2
          b       CGEMM_L2x1_SUB1

  CGEMM_L2x1_SUB4:
          KERNEL2x1_SUBI1
          .rept   7
              KERNEL2x1_SUB1
          .endr
          b       CGEMM_L2x1_SUB1

  CGEMM_L2x1_SUB0:
          andi.   L, K, 7
          KERNEL2x1_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L2x1_SAVE
          b       CGEMM_L2x1_SUB2

  CGEMM_L2x1_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L2x1_SAVE

  CGEMM_L2x1_SUB2:
          KERNEL2x1_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L2x1_SUB2

  CGEMM_L2x1_SAVE:
          SAVE2x1

  /* Group of two finished: advance B by K << 4 bytes. */
  CGEMM_L2x1_END:
          slwi    T1, K, 4
          add     B, B, T1

  CGEMM_L2_END:
          b       CGEMM_L1_BEGIN

  /* Relay to L999, which is defined outside this section. */
  L999_H2:
          b       L999
  /*
   * Final single group, handled only when (N & 1) != 0.
   *
   * The N test now runs before the B copy.  Previously the copy ran
   * unconditionally, so with (N & 1) == 0 it still read at least 32 bytes
   * from wherever B pointed after the preceding sections and discarded
   * the result.  T1 is 0 at CGEMM_L1_END on the skip path, exactly as
   * before.
   * NOTE(review): the code following CGEMM_L1_END is outside this section;
   * it is assumed not to depend on BO, BBO or vs3-vs15 - confirm.
   *
   * CGEMM_L1_COPYB splats every 32-bit word of the B panel across a
   * 16-byte vector in BBUFFER (32 input bytes -> 128 output bytes per
   * pass).  T1 starts at K * 2 and drops by 8 per pass, so `bgt` yields
   * ceil(K / 4) passes for K >= 1; the previous `bge` ran one more pass
   * whenever K was a multiple of 4.  The panel is presumably K * 2 words
   * long - the matching B advance is not visible in this section.
   *
   * After the copy: CO/AO are reset from C/A and I = M >> 3 selects how
   * many 1x8 tiles follow.
   *
   * o0/o16/o32/o48 are assumed to hold byte offsets 0/16/32/48; they are
   * defined outside this section.
   */
  CGEMM_L1_BEGIN:
          andi.   T1, N, 1
          ble     CGEMM_L1_END

          mr      BO, B
          mr      BBO, BBUFFER
          slwi    T1, K, 1                /* T1 = K * 2 words to expand */

  CGEMM_L1_COPYB:
          dcbtst  BBO, PRE

          lxvw4x  vs3, o0, BO             /* words 0..3 */
          lxvw4x  vs11, o16, BO           /* words 4..7 */

          xxspltw vs4, vs3, 0
          xxspltw vs5, vs3, 1
          xxspltw vs6, vs3, 2
          xxspltw vs7, vs3, 3
          xxspltw vs12, vs11, 0
          xxspltw vs13, vs11, 1
          xxspltw vs14, vs11, 2
          xxspltw vs15, vs11, 3

          stxvw4x vs4, o0, BBO
          stxvw4x vs5, o16, BBO
          stxvw4x vs6, o32, BBO
          stxvw4x vs7, o48, BBO

          addi    BO, BO, 32
          addi    BBO, BBO, 64

          stxvw4x vs12, o0, BBO
          stxvw4x vs13, o16, BBO
          stxvw4x vs14, o32, BBO
          stxvw4x vs15, o48, BBO

          addic.  T1, T1, -8              /* sets CR0 for the branch below */
          addi    BBO, BBO, 64            /* addi leaves CR0 untouched */
          bgt     CGEMM_L1_COPYB          /* was bge: extra pass when K % 4 == 0 */

          mr      CO, C
          mr      AO, A

          srawi.  I, M, 3                 /* I = M >> 3 */
          ble     CGEMM_L1x8_END
  /*
   * 1x8 tile loop: repeated while I > 0 (I is decremented after SAVE1x8).
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL1x8_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL1x8_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * A `dcbt AO, PRE` prefetch hint precedes every KERNEL1x8_2 except the
   * final _E2.  Macro bodies are defined outside this section.  `.rept`
   * expands to the same straight-line sequence at assembly time.
   */
  CGEMM_L1x8_BEGIN:
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L1x8_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L1x8_SUB4         /* taken iff L == 1 */

  CGEMM_L1x8_LOOP_START:
          dcbt    AO, PRE
          LOAD1x8_1
          KERNEL1x8_I1
          dcbt    AO, PRE
          KERNEL1x8_2
          .rept   3
              KERNEL1x8_1
              dcbt    AO, PRE
              KERNEL1x8_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L1x8_LOOP_END

          .align  5
  CGEMM_L1x8_LOOP:
          .rept   4
              KERNEL1x8_1
              dcbt    AO, PRE
              KERNEL1x8_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L1x8_LOOP

  CGEMM_L1x8_LOOP_END:
          .rept   3
              KERNEL1x8_1
              dcbt    AO, PRE
              KERNEL1x8_2
          .endr
          KERNEL1x8_1
          KERNEL1x8_E2
          b       CGEMM_L1x8_SUB1

  CGEMM_L1x8_SUB4:
          KERNEL1x8_SUBI1
          .rept   7
              KERNEL1x8_SUB1
          .endr
          b       CGEMM_L1x8_SUB1

  CGEMM_L1x8_SUB0:
          andi.   L, K, 7
          KERNEL1x8_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L1x8_SAVE
          b       CGEMM_L1x8_SUB2

  CGEMM_L1x8_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L1x8_SAVE

  CGEMM_L1x8_SUB2:
          KERNEL1x8_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L1x8_SUB2

  CGEMM_L1x8_SAVE:
          SAVE1x8
          addic.  I, I, -1
          bgt     CGEMM_L1x8_BEGIN

  CGEMM_L1x8_END:
  /*
   * 1x4 tile, executed at most once.
   *   (M & 7) == 0 : nothing left -> CGEMM_L1x1_END.
   *   (M & 4) == 0 : skip to CGEMM_L1x4_END.
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL1x4_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL1x4_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * Macro bodies are defined outside this section.  `.rept` expands to
   * the same straight-line macro sequence at assembly time.
   */
  CGEMM_L1x4_BEGIN:
          andi.   T2, M, 7
          ble     CGEMM_L1x1_END
          andi.   T1, M, 4
          ble     CGEMM_L1x4_END

          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L1x4_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L1x4_SUB4         /* taken iff L == 1 */

  CGEMM_L1x4_LOOP_START:
          LOAD1x4_1
          KERNEL1x4_I1
          KERNEL1x4_2
          .rept   3
              KERNEL1x4_1
              KERNEL1x4_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L1x4_LOOP_END

          .align  5
  CGEMM_L1x4_LOOP:
          .rept   4
              KERNEL1x4_1
              KERNEL1x4_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L1x4_LOOP

  CGEMM_L1x4_LOOP_END:
          .rept   3
              KERNEL1x4_1
              KERNEL1x4_2
          .endr
          KERNEL1x4_1
          KERNEL1x4_E2
          b       CGEMM_L1x4_SUB1

  CGEMM_L1x4_SUB4:
          KERNEL1x4_SUBI1
          .rept   7
              KERNEL1x4_SUB1
          .endr
          b       CGEMM_L1x4_SUB1

  CGEMM_L1x4_SUB0:
          andi.   L, K, 7
          KERNEL1x4_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L1x4_SAVE
          b       CGEMM_L1x4_SUB2

  CGEMM_L1x4_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L1x4_SAVE

  CGEMM_L1x4_SUB2:
          KERNEL1x4_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L1x4_SUB2

  CGEMM_L1x4_SAVE:
          SAVE1x4

  CGEMM_L1x4_END:
  /*
   * 1x2 tile, executed only when (M & 2) != 0.
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL1x2_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL1x2_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * Macro bodies are defined outside this section.  `.rept` expands to
   * the same straight-line macro sequence at assembly time.
   */
  CGEMM_L1x2_BEGIN:
          andi.   T1, M, 2
          ble     CGEMM_L1x2_END

          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L1x2_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L1x2_SUB4         /* taken iff L == 1 */

  CGEMM_L1x2_LOOP_START:
          LOAD1x2_1
          KERNEL1x2_I1
          KERNEL1x2_2
          .rept   3
              KERNEL1x2_1
              KERNEL1x2_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L1x2_LOOP_END

          .align  5
  CGEMM_L1x2_LOOP:
          .rept   4
              KERNEL1x2_1
              KERNEL1x2_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L1x2_LOOP

  CGEMM_L1x2_LOOP_END:
          .rept   3
              KERNEL1x2_1
              KERNEL1x2_2
          .endr
          KERNEL1x2_1
          KERNEL1x2_E2
          b       CGEMM_L1x2_SUB1

  CGEMM_L1x2_SUB4:
          KERNEL1x2_SUBI1
          .rept   7
              KERNEL1x2_SUB1
          .endr
          b       CGEMM_L1x2_SUB1

  CGEMM_L1x2_SUB0:
          andi.   L, K, 7
          KERNEL1x2_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L1x2_SAVE
          b       CGEMM_L1x2_SUB2

  CGEMM_L1x2_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L1x2_SAVE

  CGEMM_L1x2_SUB2:
          KERNEL1x2_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L1x2_SUB2

  CGEMM_L1x2_SAVE:
          SAVE1x2

  CGEMM_L1x2_END:
  /*
   * 1x1 tile, executed only when (M & 1) != 0, followed by the end labels
   * of the single-group section.
   *
   * K handling, with L = K >> 3:
   *   L <= 0 : SUB0 - one KERNEL1x1_SUBI1 (unconditional on this path),
   *            then (K & 7) - 1 passes of KERNEL1x1_SUB1.
   *   L == 1 : SUB4 - SUBI1 plus seven SUB1, then the (K & 7) remainder.
   *   L >= 2 : LOOP_START, LOOP executed L - 2 times, LOOP_END, then the
   *            (K & 7) remainder.
   *
   * Macro bodies are defined outside this section.  `.rept` expands to
   * the same straight-line macro sequence at assembly time.
   */
  CGEMM_L1x1_BEGIN:
          andi.   T1, M, 1
          ble     CGEMM_L1x1_END

          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     CGEMM_L1x1_SUB0
          cmpwi   cr0, L, 1
          ble     CGEMM_L1x1_SUB4         /* taken iff L == 1 */

  CGEMM_L1x1_LOOP_START:
          LOAD1x1_1
          KERNEL1x1_I1
          KERNEL1x1_2
          .rept   3
              KERNEL1x1_1
              KERNEL1x1_2
          .endr

          addic.  L, L, -2
          ble     CGEMM_L1x1_LOOP_END

          .align  5
  CGEMM_L1x1_LOOP:
          .rept   4
              KERNEL1x1_1
              KERNEL1x1_2
          .endr

          addic.  L, L, -1
          bgt     CGEMM_L1x1_LOOP

  CGEMM_L1x1_LOOP_END:
          .rept   3
              KERNEL1x1_1
              KERNEL1x1_2
          .endr
          KERNEL1x1_1
          KERNEL1x1_E2
          b       CGEMM_L1x1_SUB1

  CGEMM_L1x1_SUB4:
          KERNEL1x1_SUBI1
          .rept   7
              KERNEL1x1_SUB1
          .endr
          b       CGEMM_L1x1_SUB1

  CGEMM_L1x1_SUB0:
          andi.   L, K, 7
          KERNEL1x1_SUBI1
          addic.  L, L, -1
          ble     CGEMM_L1x1_SAVE
          b       CGEMM_L1x1_SUB2

  CGEMM_L1x1_SUB1:
          andi.   L, K, 7                 /* remainder count */
          ble     CGEMM_L1x1_SAVE

  CGEMM_L1x1_SUB2:
          KERNEL1x1_SUB1
          addic.  L, L, -1
          bgt     CGEMM_L1x1_SUB2

  CGEMM_L1x1_SAVE:
          SAVE1x1

  CGEMM_L1x1_END:
  CGEMM_L1_END: