You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm_logic_8x2_power8.S 13 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997
  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2016/04/22 Werner Saar (wernsaar@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  /*
   * Column-pair loop entry.
   *
   * J = N >> 1 passes are made; when that count is not positive control
   * goes straight to ZGEMM_L2_END.  Each pass starts by walking B (cursor
   * BO) and BBUFFER (cursor BBO) through the ZCOPYB_* macros: K >> 2
   * rounds of ZCOPYB_8x1, then K & 3 rounds of two ZCOPYB_1x1 each.
   * The macro bodies are not visible here; presumably they copy B into
   * BBUFFER in an expanded layout -- confirm against the macro file.
   * Afterwards CO/AO are reset to C/A, C is advanced by 2 * LDC for the
   * next pass, and I = M >> 3 becomes the 8-row tile count.
   */
          srawi.  J, N, 1                 // J = N >> 1, CR0 reflects the result
          ble     ZGEMM_L2_END

  ZGEMM_L2_BEGIN:
          mr      BO, B
          mr      BBO, BBUFFER
          srawi.  T1, K, 2                // bulk rounds = K >> 2
          ble     ZGEMM_L2_COPYB1

  ZGEMM_L2_COPYB8:
          addi    T2, PRE, 128            // second store-prefetch offset
          dcbt    BO, PRE
          dcbtst  BBO, PRE
          dcbtst  BBO, T2
          ZCOPYB_8x1
          addic.  T1, T1, -1
          bgt     ZGEMM_L2_COPYB8

  ZGEMM_L2_COPYB1:
          andi.   T1, K, 3                // leftover rounds = K & 3
          beq     ZGEMM_L2_COPYB_END      // andi. is never negative: zero is the only skip case

  ZGEMM_L2_COPYB_LOOP:
          .rept   2
            ZCOPYB_1x1
          .endr
          addic.  T1, T1, -1
          bgt     ZGEMM_L2_COPYB_LOOP

  ZGEMM_L2_COPYB_END:
          mr      CO, C
          mr      AO, A
          slwi    T1, LDC, 1              // T1 = 2 * LDC
          add     C, C, T1
          srawi.  I, M, 3                 // I = M >> 3
          ble     ZGEMM_L2x8_END
  /*
   * 8-row tile loop for the current column pair; repeated I times.
   *
   * K is consumed in groups of eight kernel steps, L = K >> 3:
   *   L <= 0 : ZGEMM_L2x8_SUB0 -- one KERNEL2x8_SUBI1 step is issued
   *            unconditionally, then (K & 7) - 1 KERNEL2x8_SUB1 steps.
   *   L == 1 : ZGEMM_L2x8_SUB4 -- eight *_SUB* steps, then K & 7 more.
   *   L >= 2 : LOOP_START, L - 2 trips through LOOP, LOOP_END,
   *            then K & 7 KERNEL2x8_SUB1 steps.
   * SAVE2x8 closes every tile.  All LOAD/KERNEL/SAVE macros are defined
   * outside this view; the dcbt placement between them is kept exactly.
   */
  ZGEMM_L2x8_BEGIN:
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     ZGEMM_L2x8_SUB0
          cmpwi   cr0, L, 1
          ble     ZGEMM_L2x8_SUB4

  ZGEMM_L2x8_LOOP_START:
          dcbt    AO, PRE
          dcbt    BO, PRE
          LOAD2x8_1
          dcbt    AO, PRE
          KERNEL2x8_I1
          dcbt    AO, PRE
          dcbt    BO, PRE
          KERNEL2x8_2
          .rept   3
            dcbt    AO, PRE
            KERNEL2x8_1
            dcbt    AO, PRE
            dcbt    BO, PRE
            KERNEL2x8_2
          .endr
          addic.  L, L, -2                // first and last group are outside LOOP
          ble     ZGEMM_L2x8_LOOP_END

          .align  5
  ZGEMM_L2x8_LOOP:
          .rept   4
            dcbt    AO, PRE
            KERNEL2x8_1
            dcbt    AO, PRE
            dcbt    BO, PRE
            KERNEL2x8_2
          .endr
          addic.  L, L, -1
          bgt     ZGEMM_L2x8_LOOP

  ZGEMM_L2x8_LOOP_END:
          .rept   2
            dcbt    AO, PRE
            KERNEL2x8_1
            dcbt    AO, PRE
            dcbt    BO, PRE
            KERNEL2x8_2
          .endr
          dcbt    AO, PRE
          KERNEL2x8_1
          dcbt    AO, PRE
          KERNEL2x8_2
          dcbt    AO, PRE
          KERNEL2x8_1
          KERNEL2x8_E2
          b       ZGEMM_L2x8_SUB1

  ZGEMM_L2x8_SUB4:
          dcbt    AO, PRE
          KERNEL2x8_SUBI1
          .rept   3
            dcbt    AO, PRE
            KERNEL2x8_SUB1
          .endr
          .rept   4
            KERNEL2x8_SUB1
          .endr
          b       ZGEMM_L2x8_SUB1

  ZGEMM_L2x8_SUB0:
          andi.   L, K, 7
          KERNEL2x8_SUBI1                 // issued before the count is tested
          addic.  L, L, -1
          ble     ZGEMM_L2x8_SAVE
          b       ZGEMM_L2x8_SUB2

  ZGEMM_L2x8_SUB1:
          andi.   L, K, 7
          beq     ZGEMM_L2x8_SAVE         // andi. is never negative: zero means no remainder

  ZGEMM_L2x8_SUB2:
          KERNEL2x8_SUB1
          addic.  L, L, -1
          bgt     ZGEMM_L2x8_SUB2

  ZGEMM_L2x8_SAVE:
          SAVE2x8
          addic.  I, I, -1
          bgt     ZGEMM_L2x8_BEGIN

  ZGEMM_L2x8_END:
  /*
   * 4-row remainder tile for the current column pair.
   *
   * If M & 7 is zero there is no row remainder at all and control skips
   * to ZGEMM_L2x1_END; if bit 2 of M is clear this tile is skipped.
   * Otherwise K is walked exactly as in the 8-row tile: groups of eight
   * steps (L = K >> 3) through LOOP_START / LOOP / LOOP_END or SUB4,
   * then K & 7 single steps, then SAVE2x4.  Macros are defined elsewhere.
   */
  ZGEMM_L2x4_BEGIN:
          andi.   T2, M, 7
          beq     ZGEMM_L2x1_END          // andi. is never negative: zero means no rows left
          andi.   T1, M, 4
          beq     ZGEMM_L2x4_END
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     ZGEMM_L2x4_SUB0
          cmpwi   cr0, L, 1
          ble     ZGEMM_L2x4_SUB4

  ZGEMM_L2x4_LOOP_START:
          LOAD2x4_1
          KERNEL2x4_I1
          KERNEL2x4_2
          .rept   3
            KERNEL2x4_1
            KERNEL2x4_2
          .endr
          addic.  L, L, -2                // first and last group are outside LOOP
          ble     ZGEMM_L2x4_LOOP_END

          .align  5
  ZGEMM_L2x4_LOOP:
          .rept   4
            KERNEL2x4_1
            KERNEL2x4_2
          .endr
          addic.  L, L, -1
          bgt     ZGEMM_L2x4_LOOP

  ZGEMM_L2x4_LOOP_END:
          .rept   3
            KERNEL2x4_1
            KERNEL2x4_2
          .endr
          KERNEL2x4_1
          KERNEL2x4_E2
          b       ZGEMM_L2x4_SUB1

  ZGEMM_L2x4_SUB4:
          KERNEL2x4_SUBI1
          .rept   7
            KERNEL2x4_SUB1
          .endr
          b       ZGEMM_L2x4_SUB1

  ZGEMM_L2x4_SUB0:
          andi.   L, K, 7
          KERNEL2x4_SUBI1                 // issued before the count is tested
          addic.  L, L, -1
          ble     ZGEMM_L2x4_SAVE
          b       ZGEMM_L2x4_SUB2

  ZGEMM_L2x4_SUB1:
          andi.   L, K, 7
          beq     ZGEMM_L2x4_SAVE

  ZGEMM_L2x4_SUB2:
          KERNEL2x4_SUB1
          addic.  L, L, -1
          bgt     ZGEMM_L2x4_SUB2

  ZGEMM_L2x4_SAVE:
          SAVE2x4

  ZGEMM_L2x4_END:
  /*
   * 2-row remainder tile for the current column pair; entered only when
   * bit 1 of M is set.  K handling mirrors the wider tiles: L = K >> 3
   * groups of eight steps (LOOP_START / LOOP / LOOP_END, or SUB4 when
   * L == 1), then K & 7 single steps, then SAVE2x2.
   * Macros are defined outside this view.
   */
  ZGEMM_L2x2_BEGIN:
          andi.   T1, M, 2
          beq     ZGEMM_L2x2_END          // andi. is never negative: zero means bit clear
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     ZGEMM_L2x2_SUB0
          cmpwi   cr0, L, 1
          ble     ZGEMM_L2x2_SUB4

  ZGEMM_L2x2_LOOP_START:
          LOAD2x2_1
          KERNEL2x2_I1
          KERNEL2x2_2
          .rept   3
            KERNEL2x2_1
            KERNEL2x2_2
          .endr
          addic.  L, L, -2                // first and last group are outside LOOP
          ble     ZGEMM_L2x2_LOOP_END

          .align  5
  ZGEMM_L2x2_LOOP:
          .rept   4
            KERNEL2x2_1
            KERNEL2x2_2
          .endr
          addic.  L, L, -1
          bgt     ZGEMM_L2x2_LOOP

  ZGEMM_L2x2_LOOP_END:
          .rept   3
            KERNEL2x2_1
            KERNEL2x2_2
          .endr
          KERNEL2x2_1
          KERNEL2x2_E2
          b       ZGEMM_L2x2_SUB1

  ZGEMM_L2x2_SUB4:
          KERNEL2x2_SUBI1
          .rept   7
            KERNEL2x2_SUB1
          .endr
          b       ZGEMM_L2x2_SUB1

  ZGEMM_L2x2_SUB0:
          andi.   L, K, 7
          KERNEL2x2_SUBI1                 // issued before the count is tested
          addic.  L, L, -1
          ble     ZGEMM_L2x2_SAVE
          b       ZGEMM_L2x2_SUB2

  ZGEMM_L2x2_SUB1:
          andi.   L, K, 7
          beq     ZGEMM_L2x2_SAVE

  ZGEMM_L2x2_SUB2:
          KERNEL2x2_SUB1
          addic.  L, L, -1
          bgt     ZGEMM_L2x2_SUB2

  ZGEMM_L2x2_SAVE:
          SAVE2x2

  ZGEMM_L2x2_END:
  /*
   * Single-row remainder tile for the current column pair; entered only
   * when bit 0 of M is set.  K handling mirrors the wider tiles:
   * L = K >> 3 groups of eight steps (LOOP_START / LOOP / LOOP_END, or
   * SUB4 when L == 1), then K & 7 single steps, then SAVE2x1.
   * Macros are defined outside this view.
   */
  ZGEMM_L2x1_BEGIN:
          andi.   T1, M, 1
          beq     ZGEMM_L2x1_END          // andi. is never negative: zero means bit clear
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     ZGEMM_L2x1_SUB0
          cmpwi   cr0, L, 1
          ble     ZGEMM_L2x1_SUB4

  ZGEMM_L2x1_LOOP_START:
          LOAD2x1_1
          KERNEL2x1_I1
          KERNEL2x1_2
          .rept   3
            KERNEL2x1_1
            KERNEL2x1_2
          .endr
          addic.  L, L, -2                // first and last group are outside LOOP
          ble     ZGEMM_L2x1_LOOP_END

          .align  5
  ZGEMM_L2x1_LOOP:
          .rept   4
            KERNEL2x1_1
            KERNEL2x1_2
          .endr
          addic.  L, L, -1
          bgt     ZGEMM_L2x1_LOOP

  ZGEMM_L2x1_LOOP_END:
          .rept   3
            KERNEL2x1_1
            KERNEL2x1_2
          .endr
          KERNEL2x1_1
          KERNEL2x1_E2
          b       ZGEMM_L2x1_SUB1

  ZGEMM_L2x1_SUB4:
          KERNEL2x1_SUBI1
          .rept   7
            KERNEL2x1_SUB1
          .endr
          b       ZGEMM_L2x1_SUB1

  ZGEMM_L2x1_SUB0:
          andi.   L, K, 7
          KERNEL2x1_SUBI1                 // issued before the count is tested
          addic.  L, L, -1
          ble     ZGEMM_L2x1_SAVE
          b       ZGEMM_L2x1_SUB2

  ZGEMM_L2x1_SUB1:
          andi.   L, K, 7
          beq     ZGEMM_L2x1_SAVE

  ZGEMM_L2x1_SUB2:
          KERNEL2x1_SUB1
          addic.  L, L, -1
          bgt     ZGEMM_L2x1_SUB2

  ZGEMM_L2x1_SAVE:
          SAVE2x1

  ZGEMM_L2x1_END:
  /*
   * Bottom of one column-pair pass: B is stepped forward by K << 5 and
   * the pass repeats while J is still positive.  After the last pass an
   * even N leaves through L999 (defined outside this view); an odd N
   * falls into ZGEMM_L2_END and continues with the single-column code.
   */
          slwi    T1, K, 5                // T1 = K * 32
          add     B, B, T1
          addic.  J, J, -1
          bgt     ZGEMM_L2_BEGIN

          andi.   T2, N, 1
          beq     L999                    // andi. is never negative: zero means N is even

  ZGEMM_L2_END:
          b       ZGEMM_L1_BEGIN

  L999_H1:
          b       L999                    // no branch in the visible code targets this label
  /*
   * Single-column tail: copy this column of B into BBUFFER, then set up
   * the 8-row tile loop.
   *
   * Copy loop: each trip loads two doublewords from BO with lxvdsx
   * (load-and-splat), stores each as a full vector at BBO, and advances
   * BO by 16 and BBO by 32.  o0/o8/o16 are defined elsewhere; presumably
   * they hold the byte offsets 0/8/16 -- confirm against the kernel file.
   *
   * The counter starts at K and the loop is bottom-tested.  It now exits
   * on bgt, so it makes exactly K trips (one trip when K <= 0, as before).
   * With the earlier bge it made K + 1 trips, stepping one extra 16-byte
   * element past the K-th entry of B and one extra 32-byte slot past the
   * K-th entry of BBUFFER.  The other B copy loops in this file also
   * terminate on bgt.
   *
   * NOTE(review): the N & 1 test sits after the copy, so the copy also
   * runs when this label is reached with N even (only possible when
   * N >> 1 <= 0).  Left in place to keep register state on exit unchanged.
   */
  ZGEMM_L1_BEGIN:
          mr      BO, B
          mr      BBO, BBUFFER
          slwi    T1, K, 0                // copy counter = low word of K

  ZGEMM_L1_COPYB:
          dcbtst  BBO, PRE
          lxvdsx  vs4, o0, BO             // b0_r
          lxvdsx  vs5, o8, BO             // b0_i
          addi    BO, BO, 16
          stxvd2x vs4, o0, BBO
          stxvd2x vs5, o16, BBO
          addic.  T1, T1, -1              // CR0 <- trips still to do
          addi    BBO, BBO, 32            // addi leaves CR0 untouched
          bgt     ZGEMM_L1_COPYB          // was bge: ran one trip too many

          andi.   T1, N, 1
          ble     ZGEMM_L1_END            // nothing to do unless N is odd
          mr      CO, C
          mr      AO, A
          srawi.  I, M, 3                 // I = M >> 3
          ble     ZGEMM_L1x8_END
  /*
   * 8-row tile loop for the single remaining column; repeated I times.
   *
   * K is consumed in groups of eight kernel steps, L = K >> 3:
   *   L <= 0 : ZGEMM_L1x8_SUB0 -- one KERNEL1x8_SUBI1 step is issued
   *            unconditionally, then (K & 7) - 1 KERNEL1x8_SUB1 steps.
   *   L == 1 : ZGEMM_L1x8_SUB4 -- eight *_SUB* steps, then K & 7 more.
   *   L >= 2 : LOOP_START, L - 2 trips through LOOP, LOOP_END,
   *            then K & 7 KERNEL1x8_SUB1 steps.
   * SAVE1x8 closes every tile.  Only AO is prefetched in this variant.
   * All LOAD/KERNEL/SAVE macros are defined outside this view.
   */
  ZGEMM_L1x8_BEGIN:
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     ZGEMM_L1x8_SUB0
          cmpwi   cr0, L, 1
          ble     ZGEMM_L1x8_SUB4

  ZGEMM_L1x8_LOOP_START:
          dcbt    AO, PRE
          LOAD1x8_1
          dcbt    AO, PRE
          KERNEL1x8_I1
          dcbt    AO, PRE
          KERNEL1x8_2
          .rept   3
            dcbt    AO, PRE
            KERNEL1x8_1
            dcbt    AO, PRE
            KERNEL1x8_2
          .endr
          addic.  L, L, -2                // first and last group are outside LOOP
          ble     ZGEMM_L1x8_LOOP_END

          .align  5
  ZGEMM_L1x8_LOOP:
          .rept   4
            dcbt    AO, PRE
            KERNEL1x8_1
            dcbt    AO, PRE
            KERNEL1x8_2
          .endr
          addic.  L, L, -1
          bgt     ZGEMM_L1x8_LOOP

  ZGEMM_L1x8_LOOP_END:
          .rept   3
            dcbt    AO, PRE
            KERNEL1x8_1
            dcbt    AO, PRE
            KERNEL1x8_2
          .endr
          dcbt    AO, PRE
          KERNEL1x8_1
          KERNEL1x8_E2
          b       ZGEMM_L1x8_SUB1

  ZGEMM_L1x8_SUB4:
          dcbt    AO, PRE
          KERNEL1x8_SUBI1
          .rept   3
            dcbt    AO, PRE
            KERNEL1x8_SUB1
          .endr
          .rept   4
            KERNEL1x8_SUB1
          .endr
          b       ZGEMM_L1x8_SUB1

  ZGEMM_L1x8_SUB0:
          andi.   L, K, 7
          KERNEL1x8_SUBI1                 // issued before the count is tested
          addic.  L, L, -1
          ble     ZGEMM_L1x8_SAVE
          b       ZGEMM_L1x8_SUB2

  ZGEMM_L1x8_SUB1:
          andi.   L, K, 7
          beq     ZGEMM_L1x8_SAVE         // andi. is never negative: zero means no remainder

  ZGEMM_L1x8_SUB2:
          KERNEL1x8_SUB1
          addic.  L, L, -1
          bgt     ZGEMM_L1x8_SUB2

  ZGEMM_L1x8_SAVE:
          SAVE1x8
          addic.  I, I, -1
          bgt     ZGEMM_L1x8_BEGIN

  ZGEMM_L1x8_END:
  /*
   * 4-row remainder tile for the single remaining column.
   *
   * If M & 7 is zero there is no row remainder at all and control skips
   * to ZGEMM_L1x1_END; if bit 2 of M is clear this tile is skipped.
   * Otherwise K is walked in groups of eight steps (L = K >> 3) through
   * LOOP_START / LOOP / LOOP_END or SUB4, then K & 7 single steps, then
   * SAVE1x4.  Macros are defined outside this view.
   */
  ZGEMM_L1x4_BEGIN:
          andi.   T2, M, 7
          beq     ZGEMM_L1x1_END          // andi. is never negative: zero means no rows left
          andi.   T1, M, 4
          beq     ZGEMM_L1x4_END
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     ZGEMM_L1x4_SUB0
          cmpwi   cr0, L, 1
          ble     ZGEMM_L1x4_SUB4

  ZGEMM_L1x4_LOOP_START:
          LOAD1x4_1
          KERNEL1x4_I1
          KERNEL1x4_2
          .rept   3
            KERNEL1x4_1
            KERNEL1x4_2
          .endr
          addic.  L, L, -2                // first and last group are outside LOOP
          ble     ZGEMM_L1x4_LOOP_END

          .align  5
  ZGEMM_L1x4_LOOP:
          .rept   4
            KERNEL1x4_1
            KERNEL1x4_2
          .endr
          addic.  L, L, -1
          bgt     ZGEMM_L1x4_LOOP

  ZGEMM_L1x4_LOOP_END:
          .rept   3
            KERNEL1x4_1
            KERNEL1x4_2
          .endr
          KERNEL1x4_1
          KERNEL1x4_E2
          b       ZGEMM_L1x4_SUB1

  ZGEMM_L1x4_SUB4:
          KERNEL1x4_SUBI1
          .rept   7
            KERNEL1x4_SUB1
          .endr
          b       ZGEMM_L1x4_SUB1

  ZGEMM_L1x4_SUB0:
          andi.   L, K, 7
          KERNEL1x4_SUBI1                 // issued before the count is tested
          addic.  L, L, -1
          ble     ZGEMM_L1x4_SAVE
          b       ZGEMM_L1x4_SUB2

  ZGEMM_L1x4_SUB1:
          andi.   L, K, 7
          beq     ZGEMM_L1x4_SAVE

  ZGEMM_L1x4_SUB2:
          KERNEL1x4_SUB1
          addic.  L, L, -1
          bgt     ZGEMM_L1x4_SUB2

  ZGEMM_L1x4_SAVE:
          SAVE1x4

  ZGEMM_L1x4_END:
  /*
   * 2-row remainder tile for the single remaining column; entered only
   * when bit 1 of M is set.  K is walked in groups of eight steps
   * (L = K >> 3) through LOOP_START / LOOP / LOOP_END, or SUB4 when
   * L == 1, then K & 7 single steps, then SAVE1x2.
   * Macros are defined outside this view.
   */
  ZGEMM_L1x2_BEGIN:
          andi.   T1, M, 2
          beq     ZGEMM_L1x2_END          // andi. is never negative: zero means bit clear
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     ZGEMM_L1x2_SUB0
          cmpwi   cr0, L, 1
          ble     ZGEMM_L1x2_SUB4

  ZGEMM_L1x2_LOOP_START:
          LOAD1x2_1
          KERNEL1x2_I1
          KERNEL1x2_2
          .rept   3
            KERNEL1x2_1
            KERNEL1x2_2
          .endr
          addic.  L, L, -2                // first and last group are outside LOOP
          ble     ZGEMM_L1x2_LOOP_END

          .align  5
  ZGEMM_L1x2_LOOP:
          .rept   4
            KERNEL1x2_1
            KERNEL1x2_2
          .endr
          addic.  L, L, -1
          bgt     ZGEMM_L1x2_LOOP

  ZGEMM_L1x2_LOOP_END:
          .rept   3
            KERNEL1x2_1
            KERNEL1x2_2
          .endr
          KERNEL1x2_1
          KERNEL1x2_E2
          b       ZGEMM_L1x2_SUB1

  ZGEMM_L1x2_SUB4:
          KERNEL1x2_SUBI1
          .rept   7
            KERNEL1x2_SUB1
          .endr
          b       ZGEMM_L1x2_SUB1

  ZGEMM_L1x2_SUB0:
          andi.   L, K, 7
          KERNEL1x2_SUBI1                 // issued before the count is tested
          addic.  L, L, -1
          ble     ZGEMM_L1x2_SAVE
          b       ZGEMM_L1x2_SUB2

  ZGEMM_L1x2_SUB1:
          andi.   L, K, 7
          beq     ZGEMM_L1x2_SAVE

  ZGEMM_L1x2_SUB2:
          KERNEL1x2_SUB1
          addic.  L, L, -1
          bgt     ZGEMM_L1x2_SUB2

  ZGEMM_L1x2_SAVE:
          SAVE1x2

  ZGEMM_L1x2_END:
  /*
   * Single-row remainder tile for the single remaining column; entered
   * only when bit 0 of M is set.  K is walked in groups of eight steps
   * (L = K >> 3) through LOOP_START / LOOP / LOOP_END, or SUB4 when
   * L == 1, then K & 7 single steps, then SAVE1x1.
   * ZGEMM_L1_END marks the end of the single-column code.
   * Macros are defined outside this view.
   */
  ZGEMM_L1x1_BEGIN:
          andi.   T1, M, 1
          beq     ZGEMM_L1x1_END          // andi. is never negative: zero means bit clear
          mr      BO, BBUFFER
          srawi.  L, K, 3
          ble     ZGEMM_L1x1_SUB0
          cmpwi   cr0, L, 1
          ble     ZGEMM_L1x1_SUB4

  ZGEMM_L1x1_LOOP_START:
          LOAD1x1_1
          KERNEL1x1_I1
          KERNEL1x1_2
          .rept   3
            KERNEL1x1_1
            KERNEL1x1_2
          .endr
          addic.  L, L, -2                // first and last group are outside LOOP
          ble     ZGEMM_L1x1_LOOP_END

          .align  5
  ZGEMM_L1x1_LOOP:
          .rept   4
            KERNEL1x1_1
            KERNEL1x1_2
          .endr
          addic.  L, L, -1
          bgt     ZGEMM_L1x1_LOOP

  ZGEMM_L1x1_LOOP_END:
          .rept   3
            KERNEL1x1_1
            KERNEL1x1_2
          .endr
          KERNEL1x1_1
          KERNEL1x1_E2
          b       ZGEMM_L1x1_SUB1

  ZGEMM_L1x1_SUB4:
          KERNEL1x1_SUBI1
          .rept   7
            KERNEL1x1_SUB1
          .endr
          b       ZGEMM_L1x1_SUB1

  ZGEMM_L1x1_SUB0:
          andi.   L, K, 7
          KERNEL1x1_SUBI1                 // issued before the count is tested
          addic.  L, L, -1
          ble     ZGEMM_L1x1_SAVE
          b       ZGEMM_L1x1_SUB2

  ZGEMM_L1x1_SUB1:
          andi.   L, K, 7
          beq     ZGEMM_L1x1_SAVE

  ZGEMM_L1x1_SUB2:
          KERNEL1x1_SUB1
          addic.  L, L, -1
          bgt     ZGEMM_L1x1_SUB2

  ZGEMM_L1x1_SAVE:
          SAVE1x1

  ZGEMM_L1x1_END:

  ZGEMM_L1_END: