You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dgemm_logic_16x4_power8.S 23 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851
  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2016/03/05 Werner Saar (wernsaar@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  /* MY_ALIGN: alignment directive emitted in front of the loop-entry labels below.
   * NOTE(review): on PowerPC targets GNU as reads the .align operand as a
   * power-of-two exponent, so this presumably requests 8-byte alignment - confirm. */
  34. #define MY_ALIGN .align 3
  /*
   * Outer loop over N in groups of four (labels LDGEMM_L4_*).
   * J = N >> 2 is the trip count; the whole section is skipped when J <= 0.
   * Each pass resets AO/CO from A/C, advances C by LDC << 2 for the next pass,
   * issues dcbt prefetch hints at A+128 and A+256, and sets I = M >> 4 as the
   * number of 16-wide M tiles to process.
   * J, N, I, M, A, C, AO, CO, LDC and T1..T3 are symbolic names defined outside this view.
   */
  35. srawi. J, N, 2
  /* srawi. records its result in CR0, so ble is taken when J <= 0 */
  36. ble LDGEMM_L4_END
  37. LDGEMM_L4_BEGIN:
  /* byte offsets used by the two dcbt hints below */
  38. li T1, 128
  39. li T2, 256
  40. mr AO, A
  41. mr CO, C
  /* T3 = LDC * 4; C is moved on now because this pass works through CO */
  42. slwi T3, LDC , 2
  43. add C, C, T3
  44. dcbt A, T1
  45. dcbt A, T2
  /* I = M / 16; with no full 16-wide tile go straight to the smaller tiles */
  46. srawi. I, M, 4
  47. ble LDGEMM_L4x16_END
  /*
   * First 4x16 tile of each LDGEMM_L4 pass. Later tiles use LDGEMM_L4x16_BEGIN.
   * This variant walks K in groups of four (L = K >> 2):
   *   L <= 0 -> LDGEMM_L4x16_SUB0_FIRST (K < 4)
   *   L == 1 -> LDGEMM_L4x16_SUB4_FIRST (four straight-line SUB macros)
   *   L >= 2 -> LOOP_START_FIRST (4 macros), LOOP_FIRST (4 macros per pass,
   *             L-2 passes counted in CTR), LOOP_END_FIRST (4 macros)
   * The K & 3 leftover steps run one at a time in LDGEMM_L4x16_SUB2_FIRST.
   * LOAD4x16_1 / KERNEL4x16_* / SAVE4x16 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  48. MY_ALIGN
  49. LDGEMM_L4x16_BEGIN_FIRST:
  /* L temporarily holds the mask -128 (clears the low 7 address bits) */
  50. li L, -128
  /* T1..T4 = CO, CO+LDC, CO+2*LDC, CO+3*LDC (LDC is used directly as a byte stride) */
  51. mr T1, CO
  52. add T2, T1, LDC
  53. add T3, T2, LDC
  54. add T4, T3, LDC
  /* round each address down to a 128-byte boundary */
  55. and T1, T1, L
  56. and T2, T2, L
  57. and T3, T3, L
  58. and T4, T4, L
  /* prefetch hints for the four addresses; r0 is the index operand and its
   * value is established outside this view */
  59. dcbt T1, r0
  60. dcbt T2, r0
  61. dcbt T3, r0
  62. dcbt T4, r0
  63. mr BO, B
  /* L = K / 4; CR0 from this srawi. is consumed by the ble after the prefetches,
   * since the addi and dcbt instructions in between do not write CR0 */
  64. srawi. L, K, 2
  /* second round of hints, 128 bytes further along each of the four addresses */
  65. addi T1, T1, 128
  66. addi T2, T2, 128
  67. addi T3, T3, 128
  68. addi T4, T4, 128
  69. dcbt T1, r0
  70. dcbt T2, r0
  71. dcbt T3, r0
  72. dcbt T4, r0
  73. ble LDGEMM_L4x16_SUB0_FIRST
  /* L > 0 here, so ble after this compare means L == 1 */
  74. cmpwi cr0, L, 1
  75. ble LDGEMM_L4x16_SUB4_FIRST
  76. MY_ALIGN
  77. LDGEMM_L4x16_LOOP_START_FIRST:
  /* T2 = 512 is the BO prefetch distance used below; o40/o56 are loaded with
   * 40 and 56, presumably as offset registers for the kernel macros - confirm */
  78. li T2, 512
  79. li o40, 40
  80. li o56, 56
  81. dcbt AO, PRE
  82. dcbt BO, T2
  83. LOAD4x16_1
  84. dcbt AO, PRE
  85. KERNEL4x16_I1
  86. dcbt AO, PRE
  /* L -= 2 accounts for this prologue and LOOP_END_FIRST; the CR0 result is
   * tested by the ble further down, which assumes the KERNEL macros leave CR0
   * untouched - TODO confirm against the macro definitions */
  87. addic. L, L, -2
  88. KERNEL4x16_L2
  89. dcbt AO, PRE
  90. KERNEL4x16_L1
  91. dcbt AO, PRE
  92. dcbt BO, T2
  93. KERNEL4x16_L2
  94. ble LDGEMM_L4x16_LOOP_END_FIRST
  /* CTR = remaining group count (> 0 here) for the bdnz loop */
  95. mtctr L
  96. MY_ALIGN
  97. LDGEMM_L4x16_LOOP_FIRST:
  98. dcbt AO, PRE
  99. KERNEL4x16_L1
  100. dcbt AO, PRE
  101. KERNEL4x16_L2
  102. dcbt AO, PRE
  103. KERNEL4x16_L1
  104. dcbt AO, PRE
  105. dcbt BO, T2
  106. KERNEL4x16_L2
  /* decrement CTR and loop while it is non-zero */
  107. bdnz LDGEMM_L4x16_LOOP_FIRST
  108. MY_ALIGN
  109. LDGEMM_L4x16_LOOP_END_FIRST:
  110. KERNEL4x16_L1
  111. KERNEL4x16_L2
  112. KERNEL4x16_1
  113. KERNEL4x16_E2
  114. b LDGEMM_L4x16_SUB1_FIRST
  /* exactly one group of four: SUBI1 followed by three SUB1 */
  115. LDGEMM_L4x16_SUB4_FIRST:
  116. KERNEL4x16_SUBI1
  117. KERNEL4x16_SUB1
  118. KERNEL4x16_SUB1
  119. KERNEL4x16_SUB1
  120. b LDGEMM_L4x16_SUB1_FIRST
  /* K < 4: SUBI1 is issued unconditionally as the first step, then the
   * remaining (K & 3) - 1 steps go through SUB2_FIRST.
   * NOTE(review): SUBI1 also runs when K & 3 is 0; presumably callers
   * guarantee K > 0 - confirm. */
  121. LDGEMM_L4x16_SUB0_FIRST:
  122. andi. L, K, 3
  123. KERNEL4x16_SUBI1
  124. addic. L, L, -1
  125. ble LDGEMM_L4x16_SAVE_FIRST
  126. b LDGEMM_L4x16_SUB2_FIRST
  /* leftover after the unrolled part: L = K & 3; an andi. result is never
   * negative, so ble here is effectively branch-if-zero */
  127. LDGEMM_L4x16_SUB1_FIRST:
  128. andi. L, K, 3
  129. ble LDGEMM_L4x16_SAVE_FIRST
  130. LDGEMM_L4x16_SUB2_FIRST:
  131. KERNEL4x16_SUB1
  132. addic. L, L, -1
  133. bgt LDGEMM_L4x16_SUB2_FIRST
  134. MY_ALIGN
  135. LDGEMM_L4x16_SAVE_FIRST:
  136. SAVE4x16
  /* one 16-wide tile done; when none remain skip the main 4x16 loop,
   * otherwise fall through into LDGEMM_L4x16_BEGIN */
  137. addic. I, I, -1
  138. ble LDGEMM_L4x16_END
  139. LDGEMM_L4x16_END_FIRST:
  /*
   * Main 4x16 tile loop, repeated while I > 0.
   * Walks K in groups of two (L = K >> 1):
   *   L <= 0 -> LDGEMM_L4x16_SUB0 (K < 2)
   *   L == 1 -> LDGEMM_L4x16_SUB4 (SUBI1 + SUB1)
   *   L >= 2 -> LOOP_START (2 macros), LOOP (2 macros per pass, L-2 passes
   *             counted in CTR), LOOP_END (2 macros)
   * The K & 1 leftover step runs in LDGEMM_L4x16_SUB2.
   * The "-" / "+" branch suffixes are static prediction hints (not taken / taken).
   * LOAD4x16_1 / KERNEL4x16_* / SAVE4x16 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  140. MY_ALIGN
  141. LDGEMM_L4x16_BEGIN:
  /* L temporarily holds the mask -128 (clears the low 7 address bits) */
  142. li L, -128
  /* T1..T4 = CO, CO+LDC, CO+2*LDC, CO+3*LDC */
  143. mr T1, CO
  144. add T2, T1, LDC
  145. add T3, T2, LDC
  146. add T4, T3, LDC
  /* round each address down to a 128-byte boundary before the prefetch hints */
  147. and T1, T1, L
  148. and T2, T2, L
  149. and T3, T3, L
  150. and T4, T4, L
  151. dcbt T1, r0
  152. dcbt T2, r0
  153. dcbt T3, r0
  154. dcbt T4, r0
  155. mr BO, B
  /* L = K / 2; CR0 from this srawi. is consumed by the ble- after the
   * prefetches (addi and dcbt do not write CR0) */
  156. srawi. L, K, 1
  157. addi T1, T1, 128
  158. addi T2, T2, 128
  159. addi T3, T3, 128
  160. addi T4, T4, 128
  161. dcbt T1, r0
  162. dcbt T2, r0
  163. dcbt T3, r0
  164. dcbt T4, r0
  165. ble- LDGEMM_L4x16_SUB0
  /* L > 0 here, so ble- after this compare means L == 1 */
  166. cmpwi cr0, L, 1
  167. ble- LDGEMM_L4x16_SUB4
  168. MY_ALIGN
  169. LDGEMM_L4x16_LOOP_START:
  /* o40/o56 are loaded with 40 and 56, presumably as offset registers for the
   * kernel macros - confirm */
  170. li o40, 40
  171. li o56, 56
  172. dcbt AO, PRE
  173. LOAD4x16_1
  174. dcbt AO, PRE
  175. KERNEL4x16_I1
  176. dcbt AO, PRE
  /* L -= 2 accounts for this prologue and LOOP_END; the ble- below reads this
   * CR0 result, assuming KERNEL4x16_L2 leaves CR0 untouched - TODO confirm */
  177. addic. L, L, -2
  178. KERNEL4x16_L2
  179. ble- LDGEMM_L4x16_LOOP_END
  180. mtctr L
  181. MY_ALIGN
  182. LDGEMM_L4x16_LOOP:
  183. dcbt AO, PRE
  184. KERNEL4x16_L1
  185. dcbt AO, PRE
  186. KERNEL4x16_L2
  /* decrement CTR and loop while it is non-zero */
  187. bdnz+ LDGEMM_L4x16_LOOP
  188. MY_ALIGN
  189. LDGEMM_L4x16_LOOP_END:
  190. KERNEL4x16_1
  191. KERNEL4x16_E2
  192. b LDGEMM_L4x16_SUB1
  193. MY_ALIGN
  /* exactly one group of two: SUBI1 followed by one SUB1 */
  194. LDGEMM_L4x16_SUB4:
  195. KERNEL4x16_SUBI1
  196. KERNEL4x16_SUB1
  197. b LDGEMM_L4x16_SUB1
  198. MY_ALIGN
  /* K < 2: SUBI1 is issued unconditionally, then (K & 1) - 1 further steps.
   * NOTE(review): SUBI1 also runs when K & 1 is 0; presumably callers
   * guarantee K > 0 - confirm. */
  199. LDGEMM_L4x16_SUB0:
  200. andi. L, K, 1
  201. KERNEL4x16_SUBI1
  202. addic. L, L, -1
  203. ble LDGEMM_L4x16_SAVE
  204. b LDGEMM_L4x16_SUB2
  205. MY_ALIGN
  /* leftover after the unrolled part: L = K & 1 (andi. result is never
   * negative, so ble is effectively branch-if-zero) */
  206. LDGEMM_L4x16_SUB1:
  207. andi. L, K, 1
  208. ble LDGEMM_L4x16_SAVE
  209. MY_ALIGN
  210. LDGEMM_L4x16_SUB2:
  211. KERNEL4x16_SUB1
  212. addic. L, L, -1
  213. bgt LDGEMM_L4x16_SUB2
  214. MY_ALIGN
  215. LDGEMM_L4x16_SAVE:
  216. SAVE4x16
  /* next 16-wide tile while I > 0 */
  217. addic. I, I, -1
  218. bgt+ LDGEMM_L4x16_BEGIN
  219. LDGEMM_L4x16_END:
  /*
   * 4x8 tile: runs only when bit 3 of M is set (M & 8).
   * The first test (M & 15) jumps past every remaining 4xN tile when M is a
   * multiple of 16.
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L4x8_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L4x8_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L4x8_SUB2.
   * LOAD4x8_1 / KERNEL4x8_* / SAVE4x8 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  220. LDGEMM_L4x8_BEGIN:
  /* andi. results are never negative, so each ble below is branch-if-zero */
  221. andi. T2, M, 15
  222. ble LDGEMM_L4x1_END
  223. andi. T1, M, 8
  224. ble LDGEMM_L4x8_END
  225. mr BO, B
  226. srawi. L, K, 3
  227. ble LDGEMM_L4x8_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  228. cmpwi cr0, L, 1
  229. ble LDGEMM_L4x8_SUB4
  230. LDGEMM_L4x8_LOOP_START:
  231. dcbt AO, PRE
  232. LOAD4x8_1
  233. KERNEL4x8_I1
  234. dcbt AO, PRE
  235. KERNEL4x8_2
  236. KERNEL4x8_1
  237. dcbt AO, PRE
  238. KERNEL4x8_2
  239. KERNEL4x8_1
  240. dcbt AO, PRE
  241. KERNEL4x8_2
  242. KERNEL4x8_1
  243. dcbt AO, PRE
  244. KERNEL4x8_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  245. addic. L, L, -2
  246. ble LDGEMM_L4x8_LOOP_END
  247. MY_ALIGN
  248. LDGEMM_L4x8_LOOP:
  249. KERNEL4x8_1
  250. dcbt AO, PRE
  251. KERNEL4x8_2
  252. KERNEL4x8_1
  253. dcbt AO, PRE
  254. KERNEL4x8_2
  255. KERNEL4x8_1
  256. dcbt AO, PRE
  257. KERNEL4x8_2
  258. KERNEL4x8_1
  259. dcbt AO, PRE
  260. KERNEL4x8_2
  261. addic. L, L, -1
  262. bgt LDGEMM_L4x8_LOOP
  263. LDGEMM_L4x8_LOOP_END:
  264. KERNEL4x8_1
  265. KERNEL4x8_2
  266. KERNEL4x8_1
  267. KERNEL4x8_2
  268. KERNEL4x8_1
  269. KERNEL4x8_2
  270. KERNEL4x8_1
  271. KERNEL4x8_E2
  272. b LDGEMM_L4x8_SUB1
  273. LDGEMM_L4x8_SUB4:
  274. KERNEL4x8_SUBI1
  275. KERNEL4x8_SUB1
  276. KERNEL4x8_SUB1
  277. KERNEL4x8_SUB1
  278. KERNEL4x8_SUB1
  279. KERNEL4x8_SUB1
  280. KERNEL4x8_SUB1
  281. KERNEL4x8_SUB1
  282. b LDGEMM_L4x8_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  283. LDGEMM_L4x8_SUB0:
  284. andi. L, K, 7
  285. KERNEL4x8_SUBI1
  286. addic. L, L, -1
  287. ble LDGEMM_L4x8_SAVE
  288. b LDGEMM_L4x8_SUB2
  /* leftover after the unrolled part: L = K & 7 */
  289. LDGEMM_L4x8_SUB1:
  290. andi. L, K, 7
  291. ble LDGEMM_L4x8_SAVE
  292. LDGEMM_L4x8_SUB2:
  293. KERNEL4x8_SUB1
  294. addic. L, L, -1
  295. bgt LDGEMM_L4x8_SUB2
  296. LDGEMM_L4x8_SAVE:
  297. SAVE4x8
  298. LDGEMM_L4x8_END:
  /*
   * 4x4 tile: runs only when bit 2 of M is set (M & 4).
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L4x4_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L4x4_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L4x4_SUB2.
   * LOAD4x4_1 / KERNEL4x4_* / SAVE4x4 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  299. LDGEMM_L4x4_BEGIN:
  /* andi. result is never negative, so ble is effectively branch-if-zero */
  300. andi. T1, M, 4
  301. ble LDGEMM_L4x4_END
  302. mr BO, B
  303. srawi. L, K, 3
  304. ble LDGEMM_L4x4_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  305. cmpwi cr0, L, 1
  306. ble LDGEMM_L4x4_SUB4
  307. LDGEMM_L4x4_LOOP_START:
  308. dcbt AO, PRE
  309. LOAD4x4_1
  310. KERNEL4x4_I1
  311. KERNEL4x4_2
  312. KERNEL4x4_1
  313. dcbt AO, PRE
  314. KERNEL4x4_2
  315. KERNEL4x4_1
  316. KERNEL4x4_2
  317. KERNEL4x4_1
  318. dcbt AO, PRE
  319. KERNEL4x4_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  320. addic. L, L, -2
  321. ble LDGEMM_L4x4_LOOP_END
  322. MY_ALIGN
  323. LDGEMM_L4x4_LOOP:
  324. KERNEL4x4_1
  325. KERNEL4x4_2
  326. KERNEL4x4_1
  327. dcbt AO, PRE
  328. KERNEL4x4_2
  329. KERNEL4x4_1
  330. KERNEL4x4_2
  331. KERNEL4x4_1
  332. dcbt AO, PRE
  333. KERNEL4x4_2
  334. addic. L, L, -1
  335. bgt LDGEMM_L4x4_LOOP
  336. LDGEMM_L4x4_LOOP_END:
  337. KERNEL4x4_1
  338. KERNEL4x4_2
  339. KERNEL4x4_1
  340. KERNEL4x4_2
  341. KERNEL4x4_1
  342. KERNEL4x4_2
  343. KERNEL4x4_1
  344. KERNEL4x4_E2
  345. b LDGEMM_L4x4_SUB1
  346. LDGEMM_L4x4_SUB4:
  347. KERNEL4x4_SUBI1
  348. KERNEL4x4_SUB1
  349. KERNEL4x4_SUB1
  350. KERNEL4x4_SUB1
  351. KERNEL4x4_SUB1
  352. KERNEL4x4_SUB1
  353. KERNEL4x4_SUB1
  354. KERNEL4x4_SUB1
  355. b LDGEMM_L4x4_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  356. LDGEMM_L4x4_SUB0:
  357. andi. L, K, 7
  358. KERNEL4x4_SUBI1
  359. addic. L, L, -1
  360. ble LDGEMM_L4x4_SAVE
  361. b LDGEMM_L4x4_SUB2
  /* leftover after the unrolled part: L = K & 7 */
  362. LDGEMM_L4x4_SUB1:
  363. andi. L, K, 7
  364. ble LDGEMM_L4x4_SAVE
  365. LDGEMM_L4x4_SUB2:
  366. KERNEL4x4_SUB1
  367. addic. L, L, -1
  368. bgt LDGEMM_L4x4_SUB2
  369. LDGEMM_L4x4_SAVE:
  370. SAVE4x4
  371. LDGEMM_L4x4_END:
  /*
   * 4x2 tile: runs only when bit 1 of M is set (M & 2).
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L4x2_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L4x2_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L4x2_SUB2.
   * Unlike the wider tiles, this section issues no dcbt prefetch hints.
   * LOAD4x2_1 / KERNEL4x2_* / SAVE4x2 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  372. LDGEMM_L4x2_BEGIN:
  /* andi. result is never negative, so ble is effectively branch-if-zero */
  373. andi. T1, M, 2
  374. ble LDGEMM_L4x2_END
  375. mr BO, B
  376. srawi. L, K, 3
  377. ble LDGEMM_L4x2_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  378. cmpwi cr0, L, 1
  379. ble LDGEMM_L4x2_SUB4
  380. LDGEMM_L4x2_LOOP_START:
  381. LOAD4x2_1
  382. KERNEL4x2_I1
  383. KERNEL4x2_2
  384. KERNEL4x2_1
  385. KERNEL4x2_2
  386. KERNEL4x2_1
  387. KERNEL4x2_2
  388. KERNEL4x2_1
  389. KERNEL4x2_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  390. addic. L, L, -2
  391. ble LDGEMM_L4x2_LOOP_END
  392. MY_ALIGN
  393. LDGEMM_L4x2_LOOP:
  394. KERNEL4x2_1
  395. KERNEL4x2_2
  396. KERNEL4x2_1
  397. KERNEL4x2_2
  398. KERNEL4x2_1
  399. KERNEL4x2_2
  400. KERNEL4x2_1
  401. KERNEL4x2_2
  402. addic. L, L, -1
  403. bgt LDGEMM_L4x2_LOOP
  404. LDGEMM_L4x2_LOOP_END:
  405. KERNEL4x2_1
  406. KERNEL4x2_2
  407. KERNEL4x2_1
  408. KERNEL4x2_2
  409. KERNEL4x2_1
  410. KERNEL4x2_2
  411. KERNEL4x2_1
  412. KERNEL4x2_E2
  413. b LDGEMM_L4x2_SUB1
  414. LDGEMM_L4x2_SUB4:
  415. KERNEL4x2_SUBI1
  416. KERNEL4x2_SUB1
  417. KERNEL4x2_SUB1
  418. KERNEL4x2_SUB1
  419. KERNEL4x2_SUB1
  420. KERNEL4x2_SUB1
  421. KERNEL4x2_SUB1
  422. KERNEL4x2_SUB1
  423. b LDGEMM_L4x2_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  424. LDGEMM_L4x2_SUB0:
  425. andi. L, K, 7
  426. KERNEL4x2_SUBI1
  427. addic. L, L, -1
  428. ble LDGEMM_L4x2_SAVE
  429. b LDGEMM_L4x2_SUB2
  /* leftover after the unrolled part: L = K & 7 */
  430. LDGEMM_L4x2_SUB1:
  431. andi. L, K, 7
  432. ble LDGEMM_L4x2_SAVE
  433. LDGEMM_L4x2_SUB2:
  434. KERNEL4x2_SUB1
  435. addic. L, L, -1
  436. bgt LDGEMM_L4x2_SUB2
  437. LDGEMM_L4x2_SAVE:
  438. SAVE4x2
  439. LDGEMM_L4x2_END:
  /*
   * 4x1 tile: runs only when bit 0 of M is set (M & 1).
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L4x1_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L4x1_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L4x1_SUB2.
   * LDGEMM_L4x1_END is also the target used by the 4x8 section to skip all
   * remaining tiles when M is a multiple of 16.
   * LOAD4x1_1 / KERNEL4x1_* / SAVE4x1 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  440. LDGEMM_L4x1_BEGIN:
  /* andi. result is never negative, so ble is effectively branch-if-zero */
  441. andi. T1, M, 1
  442. ble LDGEMM_L4x1_END
  443. mr BO, B
  444. srawi. L, K, 3
  445. ble LDGEMM_L4x1_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  446. cmpwi cr0, L, 1
  447. ble LDGEMM_L4x1_SUB4
  448. LDGEMM_L4x1_LOOP_START:
  449. LOAD4x1_1
  450. KERNEL4x1_I1
  451. KERNEL4x1_2
  452. KERNEL4x1_1
  453. KERNEL4x1_2
  454. KERNEL4x1_1
  455. KERNEL4x1_2
  456. KERNEL4x1_1
  457. KERNEL4x1_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  458. addic. L, L, -2
  459. ble LDGEMM_L4x1_LOOP_END
  460. MY_ALIGN
  461. LDGEMM_L4x1_LOOP:
  462. KERNEL4x1_1
  463. KERNEL4x1_2
  464. KERNEL4x1_1
  465. KERNEL4x1_2
  466. KERNEL4x1_1
  467. KERNEL4x1_2
  468. KERNEL4x1_1
  469. KERNEL4x1_2
  470. addic. L, L, -1
  471. bgt LDGEMM_L4x1_LOOP
  472. LDGEMM_L4x1_LOOP_END:
  473. KERNEL4x1_1
  474. KERNEL4x1_2
  475. KERNEL4x1_1
  476. KERNEL4x1_2
  477. KERNEL4x1_1
  478. KERNEL4x1_2
  479. KERNEL4x1_1
  480. KERNEL4x1_E2
  481. b LDGEMM_L4x1_SUB1
  482. LDGEMM_L4x1_SUB4:
  483. KERNEL4x1_SUBI1
  484. KERNEL4x1_SUB1
  485. KERNEL4x1_SUB1
  486. KERNEL4x1_SUB1
  487. KERNEL4x1_SUB1
  488. KERNEL4x1_SUB1
  489. KERNEL4x1_SUB1
  490. KERNEL4x1_SUB1
  491. b LDGEMM_L4x1_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  492. LDGEMM_L4x1_SUB0:
  493. andi. L, K, 7
  494. KERNEL4x1_SUBI1
  495. addic. L, L, -1
  496. ble LDGEMM_L4x1_SAVE
  497. b LDGEMM_L4x1_SUB2
  /* leftover after the unrolled part: L = K & 7 */
  498. LDGEMM_L4x1_SUB1:
  499. andi. L, K, 7
  500. ble LDGEMM_L4x1_SAVE
  501. LDGEMM_L4x1_SUB2:
  502. KERNEL4x1_SUB1
  503. addic. L, L, -1
  504. bgt LDGEMM_L4x1_SUB2
  505. LDGEMM_L4x1_SAVE:
  506. SAVE4x1
  507. LDGEMM_L4x1_END:
  /*
   * Tail of one LDGEMM_L4 pass: advance B by K << 5 (K * 32) bytes, then repeat
   * while J > 0. Once the passes are done, N & 3 == 0 means nothing is left and
   * control goes to .L999 (defined outside this view, presumably the routine's
   * exit - confirm); otherwise the leftover part of N is handled from
   * LDGEMM_L2_BEGIN onward.
   */
  508. slwi T1, K, 5
  509. add B, B, T1
  510. addic. J, J, -1
  511. bgt LDGEMM_L4_BEGIN
  /* andi. result is never negative, so ble is effectively branch-if-zero */
  512. andi. T2, N, 3
  513. ble .L999
  /* also the entry point taken when N >> 2 was <= 0 at the very top */
  514. LDGEMM_L4_END:
  515. b LDGEMM_L2_BEGIN
  /* trampoline to .L999; not referenced anywhere in the visible part of the file */
  516. .L999_H1:
  517. b .L999
  /*
   * Section for the N & 2 remainder (labels LDGEMM_L2_*); skipped when bit 1 of
   * N is clear. Resets CO/AO from C/A, advances C by LDC << 1 for whatever
   * follows, and sets I = M >> 4 as the number of 16-wide M tiles.
   */
  518. LDGEMM_L2_BEGIN:
  /* andi. result is never negative, so ble is effectively branch-if-zero */
  519. andi. T1, N, 2
  520. ble LDGEMM_L2_END
  521. mr CO, C
  522. mr AO, A
  /* T1 = LDC * 2 */
  523. slwi T1, LDC , 1
  524. add C, C, T1
  /* I = M / 16; with no full 16-wide tile go straight to the smaller tiles */
  525. srawi. I, M, 4
  526. ble LDGEMM_L2x16_END
  /*
   * 2x16 tile loop, repeated while I > 0.
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L2x16_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L2x16_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L2x16_SUB2.
   * A dcbt prefetch hint at AO + PRE precedes most kernel macros.
   * LOAD2x16_1 / KERNEL2x16_* / SAVE2x16 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  527. LDGEMM_L2x16_BEGIN:
  528. mr BO, B
  529. srawi. L, K, 3
  530. ble LDGEMM_L2x16_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  531. cmpwi cr0, L, 1
  532. ble LDGEMM_L2x16_SUB4
  533. LDGEMM_L2x16_LOOP_START:
  534. dcbt AO, PRE
  535. LOAD2x16_1
  536. dcbt AO, PRE
  537. KERNEL2x16_I1
  538. dcbt AO, PRE
  539. KERNEL2x16_2
  540. dcbt AO, PRE
  541. KERNEL2x16_1
  542. dcbt AO, PRE
  543. KERNEL2x16_2
  544. dcbt AO, PRE
  545. KERNEL2x16_1
  546. dcbt AO, PRE
  547. KERNEL2x16_2
  548. dcbt AO, PRE
  549. KERNEL2x16_1
  550. dcbt AO, PRE
  551. KERNEL2x16_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  552. addic. L, L, -2
  553. ble LDGEMM_L2x16_LOOP_END
  554. MY_ALIGN
  555. LDGEMM_L2x16_LOOP:
  556. dcbt AO, PRE
  557. KERNEL2x16_1
  558. dcbt AO, PRE
  559. KERNEL2x16_2
  560. dcbt AO, PRE
  561. KERNEL2x16_1
  562. dcbt AO, PRE
  563. KERNEL2x16_2
  564. dcbt AO, PRE
  565. KERNEL2x16_1
  566. dcbt AO, PRE
  567. KERNEL2x16_2
  568. dcbt AO, PRE
  569. KERNEL2x16_1
  570. dcbt AO, PRE
  571. KERNEL2x16_2
  572. addic. L, L, -1
  573. bgt LDGEMM_L2x16_LOOP
  574. LDGEMM_L2x16_LOOP_END:
  575. dcbt AO, PRE
  576. KERNEL2x16_1
  577. dcbt AO, PRE
  578. KERNEL2x16_2
  579. dcbt AO, PRE
  580. KERNEL2x16_1
  581. dcbt AO, PRE
  582. KERNEL2x16_2
  583. dcbt AO, PRE
  584. KERNEL2x16_1
  585. dcbt AO, PRE
  586. KERNEL2x16_2
  587. dcbt AO, PRE
  588. KERNEL2x16_1
  589. KERNEL2x16_E2
  590. b LDGEMM_L2x16_SUB1
  591. LDGEMM_L2x16_SUB4:
  592. dcbt AO, PRE
  593. KERNEL2x16_SUBI1
  594. dcbt AO, PRE
  595. KERNEL2x16_SUB1
  596. dcbt AO, PRE
  597. KERNEL2x16_SUB1
  598. dcbt AO, PRE
  599. KERNEL2x16_SUB1
  600. KERNEL2x16_SUB1
  601. KERNEL2x16_SUB1
  602. KERNEL2x16_SUB1
  603. KERNEL2x16_SUB1
  604. b LDGEMM_L2x16_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  605. LDGEMM_L2x16_SUB0:
  606. andi. L, K, 7
  607. KERNEL2x16_SUBI1
  608. addic. L, L, -1
  609. ble LDGEMM_L2x16_SAVE
  610. b LDGEMM_L2x16_SUB2
  /* leftover after the unrolled part: L = K & 7 (andi. result is never
   * negative, so ble is effectively branch-if-zero) */
  611. LDGEMM_L2x16_SUB1:
  612. andi. L, K, 7
  613. ble LDGEMM_L2x16_SAVE
  614. LDGEMM_L2x16_SUB2:
  615. KERNEL2x16_SUB1
  616. addic. L, L, -1
  617. bgt LDGEMM_L2x16_SUB2
  618. LDGEMM_L2x16_SAVE:
  619. SAVE2x16
  /* next 16-wide tile while I > 0 */
  620. addic. I, I, -1
  621. bgt LDGEMM_L2x16_BEGIN
  622. LDGEMM_L2x16_END:
  /*
   * 2x8 tile: runs only when bit 3 of M is set (M & 8).
   * The first test (M & 15) jumps past every remaining 2xN tile when M is a
   * multiple of 16.
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L2x8_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L2x8_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L2x8_SUB2.
   * LOAD2x8_1 / KERNEL2x8_* / SAVE2x8 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  623. LDGEMM_L2x8_BEGIN:
  /* andi. results are never negative, so each ble below is branch-if-zero */
  624. andi. T2, M, 15
  625. ble LDGEMM_L2x1_END
  626. andi. T1, M, 8
  627. ble LDGEMM_L2x8_END
  628. mr BO, B
  629. srawi. L, K, 3
  630. ble LDGEMM_L2x8_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  631. cmpwi cr0, L, 1
  632. ble LDGEMM_L2x8_SUB4
  633. LDGEMM_L2x8_LOOP_START:
  634. dcbt AO, PRE
  635. LOAD2x8_1
  636. KERNEL2x8_I1
  637. dcbt AO, PRE
  638. KERNEL2x8_2
  639. KERNEL2x8_1
  640. dcbt AO, PRE
  641. KERNEL2x8_2
  642. KERNEL2x8_1
  643. dcbt AO, PRE
  644. KERNEL2x8_2
  645. KERNEL2x8_1
  646. dcbt AO, PRE
  647. KERNEL2x8_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  648. addic. L, L, -2
  649. ble LDGEMM_L2x8_LOOP_END
  650. MY_ALIGN
  651. LDGEMM_L2x8_LOOP:
  652. KERNEL2x8_1
  653. dcbt AO, PRE
  654. KERNEL2x8_2
  655. KERNEL2x8_1
  656. dcbt AO, PRE
  657. KERNEL2x8_2
  658. KERNEL2x8_1
  659. dcbt AO, PRE
  660. KERNEL2x8_2
  661. KERNEL2x8_1
  662. dcbt AO, PRE
  663. KERNEL2x8_2
  664. addic. L, L, -1
  665. bgt LDGEMM_L2x8_LOOP
  666. LDGEMM_L2x8_LOOP_END:
  667. KERNEL2x8_1
  668. KERNEL2x8_2
  669. KERNEL2x8_1
  670. KERNEL2x8_2
  671. KERNEL2x8_1
  672. KERNEL2x8_2
  673. KERNEL2x8_1
  674. KERNEL2x8_E2
  675. b LDGEMM_L2x8_SUB1
  676. LDGEMM_L2x8_SUB4:
  677. KERNEL2x8_SUBI1
  678. KERNEL2x8_SUB1
  679. KERNEL2x8_SUB1
  680. KERNEL2x8_SUB1
  681. KERNEL2x8_SUB1
  682. KERNEL2x8_SUB1
  683. KERNEL2x8_SUB1
  684. KERNEL2x8_SUB1
  685. b LDGEMM_L2x8_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  686. LDGEMM_L2x8_SUB0:
  687. andi. L, K, 7
  688. KERNEL2x8_SUBI1
  689. addic. L, L, -1
  690. ble LDGEMM_L2x8_SAVE
  691. b LDGEMM_L2x8_SUB2
  /* leftover after the unrolled part: L = K & 7 */
  692. LDGEMM_L2x8_SUB1:
  693. andi. L, K, 7
  694. ble LDGEMM_L2x8_SAVE
  695. LDGEMM_L2x8_SUB2:
  696. KERNEL2x8_SUB1
  697. addic. L, L, -1
  698. bgt LDGEMM_L2x8_SUB2
  699. LDGEMM_L2x8_SAVE:
  700. SAVE2x8
  701. LDGEMM_L2x8_END:
  /*
   * 2x4 tile: runs only when bit 2 of M is set (M & 4).
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L2x4_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L2x4_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L2x4_SUB2.
   * LOAD2x4_1 / KERNEL2x4_* / SAVE2x4 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  702. LDGEMM_L2x4_BEGIN:
  /* andi. result is never negative, so ble is effectively branch-if-zero */
  703. andi. T1, M, 4
  704. ble LDGEMM_L2x4_END
  705. mr BO, B
  706. srawi. L, K, 3
  707. ble LDGEMM_L2x4_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  708. cmpwi cr0, L, 1
  709. ble LDGEMM_L2x4_SUB4
  710. LDGEMM_L2x4_LOOP_START:
  711. LOAD2x4_1
  712. KERNEL2x4_I1
  713. KERNEL2x4_2
  714. KERNEL2x4_1
  715. KERNEL2x4_2
  716. KERNEL2x4_1
  717. KERNEL2x4_2
  718. KERNEL2x4_1
  719. KERNEL2x4_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  720. addic. L, L, -2
  721. ble LDGEMM_L2x4_LOOP_END
  722. MY_ALIGN
  723. LDGEMM_L2x4_LOOP:
  724. KERNEL2x4_1
  725. KERNEL2x4_2
  726. KERNEL2x4_1
  727. KERNEL2x4_2
  728. KERNEL2x4_1
  729. KERNEL2x4_2
  730. KERNEL2x4_1
  731. KERNEL2x4_2
  732. addic. L, L, -1
  733. bgt LDGEMM_L2x4_LOOP
  734. LDGEMM_L2x4_LOOP_END:
  735. KERNEL2x4_1
  736. KERNEL2x4_2
  737. KERNEL2x4_1
  738. KERNEL2x4_2
  739. KERNEL2x4_1
  740. KERNEL2x4_2
  741. KERNEL2x4_1
  742. KERNEL2x4_E2
  743. b LDGEMM_L2x4_SUB1
  744. LDGEMM_L2x4_SUB4:
  745. KERNEL2x4_SUBI1
  746. KERNEL2x4_SUB1
  747. KERNEL2x4_SUB1
  748. KERNEL2x4_SUB1
  749. KERNEL2x4_SUB1
  750. KERNEL2x4_SUB1
  751. KERNEL2x4_SUB1
  752. KERNEL2x4_SUB1
  753. b LDGEMM_L2x4_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  754. LDGEMM_L2x4_SUB0:
  755. andi. L, K, 7
  756. KERNEL2x4_SUBI1
  757. addic. L, L, -1
  758. ble LDGEMM_L2x4_SAVE
  759. b LDGEMM_L2x4_SUB2
  /* leftover after the unrolled part: L = K & 7 */
  760. LDGEMM_L2x4_SUB1:
  761. andi. L, K, 7
  762. ble LDGEMM_L2x4_SAVE
  763. LDGEMM_L2x4_SUB2:
  764. KERNEL2x4_SUB1
  765. addic. L, L, -1
  766. bgt LDGEMM_L2x4_SUB2
  767. LDGEMM_L2x4_SAVE:
  768. SAVE2x4
  769. LDGEMM_L2x4_END:
  /*
   * 2x2 tile: runs only when bit 1 of M is set (M & 2).
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L2x2_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L2x2_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L2x2_SUB2.
   * LOAD2x2_1 / KERNEL2x2_* / SAVE2x2 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  770. LDGEMM_L2x2_BEGIN:
  /* andi. result is never negative, so ble is effectively branch-if-zero */
  771. andi. T1, M, 2
  772. ble LDGEMM_L2x2_END
  773. mr BO, B
  774. srawi. L, K, 3
  775. ble LDGEMM_L2x2_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  776. cmpwi cr0, L, 1
  777. ble LDGEMM_L2x2_SUB4
  778. LDGEMM_L2x2_LOOP_START:
  779. LOAD2x2_1
  780. KERNEL2x2_I1
  781. KERNEL2x2_2
  782. KERNEL2x2_1
  783. KERNEL2x2_2
  784. KERNEL2x2_1
  785. KERNEL2x2_2
  786. KERNEL2x2_1
  787. KERNEL2x2_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  788. addic. L, L, -2
  789. ble LDGEMM_L2x2_LOOP_END
  790. MY_ALIGN
  791. LDGEMM_L2x2_LOOP:
  792. KERNEL2x2_1
  793. KERNEL2x2_2
  794. KERNEL2x2_1
  795. KERNEL2x2_2
  796. KERNEL2x2_1
  797. KERNEL2x2_2
  798. KERNEL2x2_1
  799. KERNEL2x2_2
  800. addic. L, L, -1
  801. bgt LDGEMM_L2x2_LOOP
  802. LDGEMM_L2x2_LOOP_END:
  803. KERNEL2x2_1
  804. KERNEL2x2_2
  805. KERNEL2x2_1
  806. KERNEL2x2_2
  807. KERNEL2x2_1
  808. KERNEL2x2_2
  809. KERNEL2x2_1
  810. KERNEL2x2_E2
  811. b LDGEMM_L2x2_SUB1
  812. LDGEMM_L2x2_SUB4:
  813. KERNEL2x2_SUBI1
  814. KERNEL2x2_SUB1
  815. KERNEL2x2_SUB1
  816. KERNEL2x2_SUB1
  817. KERNEL2x2_SUB1
  818. KERNEL2x2_SUB1
  819. KERNEL2x2_SUB1
  820. KERNEL2x2_SUB1
  821. b LDGEMM_L2x2_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  822. LDGEMM_L2x2_SUB0:
  823. andi. L, K, 7
  824. KERNEL2x2_SUBI1
  825. addic. L, L, -1
  826. ble LDGEMM_L2x2_SAVE
  827. b LDGEMM_L2x2_SUB2
  /* leftover after the unrolled part: L = K & 7 */
  828. LDGEMM_L2x2_SUB1:
  829. andi. L, K, 7
  830. ble LDGEMM_L2x2_SAVE
  831. LDGEMM_L2x2_SUB2:
  832. KERNEL2x2_SUB1
  833. addic. L, L, -1
  834. bgt LDGEMM_L2x2_SUB2
  835. LDGEMM_L2x2_SAVE:
  836. SAVE2x2
  837. LDGEMM_L2x2_END:
  /*
   * 2x1 tile: runs only when bit 0 of M is set (M & 1).
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L2x1_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L2x1_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L2x1_SUB2.
   * LDGEMM_L2x1_END is also the target used by the 2x8 section to skip all
   * remaining tiles when M is a multiple of 16.
   * LOAD2x1_1 / KERNEL2x1_* / SAVE2x1 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  838. LDGEMM_L2x1_BEGIN:
  /* andi. result is never negative, so ble is effectively branch-if-zero */
  839. andi. T1, M, 1
  840. ble LDGEMM_L2x1_END
  841. mr BO, B
  842. srawi. L, K, 3
  843. ble LDGEMM_L2x1_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  844. cmpwi cr0, L, 1
  845. ble LDGEMM_L2x1_SUB4
  846. LDGEMM_L2x1_LOOP_START:
  847. LOAD2x1_1
  848. KERNEL2x1_I1
  849. KERNEL2x1_2
  850. KERNEL2x1_1
  851. KERNEL2x1_2
  852. KERNEL2x1_1
  853. KERNEL2x1_2
  854. KERNEL2x1_1
  855. KERNEL2x1_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  856. addic. L, L, -2
  857. ble LDGEMM_L2x1_LOOP_END
  858. MY_ALIGN
  859. LDGEMM_L2x1_LOOP:
  860. KERNEL2x1_1
  861. KERNEL2x1_2
  862. KERNEL2x1_1
  863. KERNEL2x1_2
  864. KERNEL2x1_1
  865. KERNEL2x1_2
  866. KERNEL2x1_1
  867. KERNEL2x1_2
  868. addic. L, L, -1
  869. bgt LDGEMM_L2x1_LOOP
  870. LDGEMM_L2x1_LOOP_END:
  871. KERNEL2x1_1
  872. KERNEL2x1_2
  873. KERNEL2x1_1
  874. KERNEL2x1_2
  875. KERNEL2x1_1
  876. KERNEL2x1_2
  877. KERNEL2x1_1
  878. KERNEL2x1_E2
  879. b LDGEMM_L2x1_SUB1
  880. LDGEMM_L2x1_SUB4:
  881. KERNEL2x1_SUBI1
  882. KERNEL2x1_SUB1
  883. KERNEL2x1_SUB1
  884. KERNEL2x1_SUB1
  885. KERNEL2x1_SUB1
  886. KERNEL2x1_SUB1
  887. KERNEL2x1_SUB1
  888. KERNEL2x1_SUB1
  889. b LDGEMM_L2x1_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  890. LDGEMM_L2x1_SUB0:
  891. andi. L, K, 7
  892. KERNEL2x1_SUBI1
  893. addic. L, L, -1
  894. ble LDGEMM_L2x1_SAVE
  895. b LDGEMM_L2x1_SUB2
  /* leftover after the unrolled part: L = K & 7 */
  896. LDGEMM_L2x1_SUB1:
  897. andi. L, K, 7
  898. ble LDGEMM_L2x1_SAVE
  899. LDGEMM_L2x1_SUB2:
  900. KERNEL2x1_SUB1
  901. addic. L, L, -1
  902. bgt LDGEMM_L2x1_SUB2
  903. LDGEMM_L2x1_SAVE:
  904. SAVE2x1
  905. LDGEMM_L2x1_END:
  /* End of the LDGEMM_L2 section: advance B by K << 4 (K * 16) bytes.
   * LDGEMM_L2_END is also the skip target used when bit 1 of N is clear. */
  906. slwi T1, K, 4
  907. add B, B, T1
  908. LDGEMM_L2_END:
  /*
   * Section for the N & 1 remainder (labels LDGEMM_L1_*); skipped when bit 0 of
   * N is clear (LDGEMM_L1_END lies outside this view). Resets CO/AO from C/A
   * and sets I = M >> 4 as the number of 16-wide M tiles. C itself is not
   * advanced here.
   */
  909. LDGEMM_L1_BEGIN:
  /* andi. result is never negative, so ble is effectively branch-if-zero */
  910. andi. T1, N, 1
  911. ble LDGEMM_L1_END
  912. mr CO, C
  913. mr AO, A
  /* I = M / 16; with no full 16-wide tile go straight to the smaller tiles */
  914. srawi. I, M, 4
  915. ble LDGEMM_L1x16_END
  /*
   * 1x16 tile loop, repeated while I > 0.
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L1x16_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L1x16_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L1x16_SUB2.
   * A dcbt prefetch hint at AO + PRE precedes most kernel macros.
   * LOAD1x16_1 / KERNEL1x16_* / SAVE1x16 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  916. LDGEMM_L1x16_BEGIN:
  917. mr BO, B
  918. srawi. L, K, 3
  919. ble LDGEMM_L1x16_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  920. cmpwi cr0, L, 1
  921. ble LDGEMM_L1x16_SUB4
  922. LDGEMM_L1x16_LOOP_START:
  923. dcbt AO, PRE
  924. LOAD1x16_1
  925. dcbt AO, PRE
  926. KERNEL1x16_I1
  927. dcbt AO, PRE
  928. KERNEL1x16_2
  929. dcbt AO, PRE
  930. KERNEL1x16_1
  931. dcbt AO, PRE
  932. KERNEL1x16_2
  933. dcbt AO, PRE
  934. KERNEL1x16_1
  935. dcbt AO, PRE
  936. KERNEL1x16_2
  937. dcbt AO, PRE
  938. KERNEL1x16_1
  939. dcbt AO, PRE
  940. KERNEL1x16_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  941. addic. L, L, -2
  942. ble LDGEMM_L1x16_LOOP_END
  943. MY_ALIGN
  944. LDGEMM_L1x16_LOOP:
  945. dcbt AO, PRE
  946. KERNEL1x16_1
  947. dcbt AO, PRE
  948. KERNEL1x16_2
  949. dcbt AO, PRE
  950. KERNEL1x16_1
  951. dcbt AO, PRE
  952. KERNEL1x16_2
  953. dcbt AO, PRE
  954. KERNEL1x16_1
  955. dcbt AO, PRE
  956. KERNEL1x16_2
  957. dcbt AO, PRE
  958. KERNEL1x16_1
  959. dcbt AO, PRE
  960. KERNEL1x16_2
  961. addic. L, L, -1
  962. bgt LDGEMM_L1x16_LOOP
  963. LDGEMM_L1x16_LOOP_END:
  964. dcbt AO, PRE
  965. KERNEL1x16_1
  966. dcbt AO, PRE
  967. KERNEL1x16_2
  968. dcbt AO, PRE
  969. KERNEL1x16_1
  970. dcbt AO, PRE
  971. KERNEL1x16_2
  972. dcbt AO, PRE
  973. KERNEL1x16_1
  974. dcbt AO, PRE
  975. KERNEL1x16_2
  976. dcbt AO, PRE
  977. KERNEL1x16_1
  978. KERNEL1x16_E2
  979. b LDGEMM_L1x16_SUB1
  980. LDGEMM_L1x16_SUB4:
  981. dcbt AO, PRE
  982. KERNEL1x16_SUBI1
  983. dcbt AO, PRE
  984. KERNEL1x16_SUB1
  985. dcbt AO, PRE
  986. KERNEL1x16_SUB1
  987. dcbt AO, PRE
  988. KERNEL1x16_SUB1
  989. KERNEL1x16_SUB1
  990. KERNEL1x16_SUB1
  991. KERNEL1x16_SUB1
  992. KERNEL1x16_SUB1
  993. b LDGEMM_L1x16_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  994. LDGEMM_L1x16_SUB0:
  995. andi. L, K, 7
  996. KERNEL1x16_SUBI1
  997. addic. L, L, -1
  998. ble LDGEMM_L1x16_SAVE
  999. b LDGEMM_L1x16_SUB2
  /* leftover after the unrolled part: L = K & 7 (andi. result is never
   * negative, so ble is effectively branch-if-zero) */
  1000. LDGEMM_L1x16_SUB1:
  1001. andi. L, K, 7
  1002. ble LDGEMM_L1x16_SAVE
  1003. LDGEMM_L1x16_SUB2:
  1004. KERNEL1x16_SUB1
  1005. addic. L, L, -1
  1006. bgt LDGEMM_L1x16_SUB2
  1007. LDGEMM_L1x16_SAVE:
  1008. SAVE1x16
  /* next 16-wide tile while I > 0 */
  1009. addic. I, I, -1
  1010. bgt LDGEMM_L1x16_BEGIN
  1011. LDGEMM_L1x16_END:
  /*
   * 1x8 tile: runs only when bit 3 of M is set (M & 8).
   * The first test (M & 15) jumps to LDGEMM_L1x1_END (outside this view) when
   * M is a multiple of 16, skipping every remaining 1xN tile.
   * L = K >> 3 counts groups of eight k-steps:
   *   L <= 0 -> LDGEMM_L1x8_SUB0 (K < 8)
   *   L == 1 -> LDGEMM_L1x8_SUB4 (SUBI1 + seven SUB1)
   *   L >= 2 -> LOOP_START (8 macros), LOOP (8 macros per pass, L-2 passes),
   *             LOOP_END (8 macros)
   * The K & 7 leftover steps run one at a time in LDGEMM_L1x8_SUB2.
   * LOAD1x8_1 / KERNEL1x8_* / SAVE1x8 are macros defined outside this view;
   * the step counts assume each KERNEL macro performs one k-step - TODO confirm.
   */
  1012. LDGEMM_L1x8_BEGIN:
  /* andi. results are never negative, so each ble below is branch-if-zero */
  1013. andi. T2, M, 15
  1014. ble LDGEMM_L1x1_END
  1015. andi. T1, M, 8
  1016. ble LDGEMM_L1x8_END
  1017. mr BO, B
  1018. srawi. L, K, 3
  1019. ble LDGEMM_L1x8_SUB0
  /* L > 0 here, so ble after this compare means L == 1 */
  1020. cmpwi cr0, L, 1
  1021. ble LDGEMM_L1x8_SUB4
  1022. LDGEMM_L1x8_LOOP_START:
  1023. dcbt AO, PRE
  1024. LOAD1x8_1
  1025. KERNEL1x8_I1
  1026. dcbt AO, PRE
  1027. KERNEL1x8_2
  1028. KERNEL1x8_1
  1029. dcbt AO, PRE
  1030. KERNEL1x8_2
  1031. KERNEL1x8_1
  1032. dcbt AO, PRE
  1033. KERNEL1x8_2
  1034. KERNEL1x8_1
  1035. dcbt AO, PRE
  1036. KERNEL1x8_2
  /* L -= 2 accounts for this prologue and LOOP_END */
  1037. addic. L, L, -2
  1038. ble LDGEMM_L1x8_LOOP_END
  1039. MY_ALIGN
  1040. LDGEMM_L1x8_LOOP:
  1041. KERNEL1x8_1
  1042. dcbt AO, PRE
  1043. KERNEL1x8_2
  1044. KERNEL1x8_1
  1045. dcbt AO, PRE
  1046. KERNEL1x8_2
  1047. KERNEL1x8_1
  1048. dcbt AO, PRE
  1049. KERNEL1x8_2
  1050. KERNEL1x8_1
  1051. dcbt AO, PRE
  1052. KERNEL1x8_2
  1053. addic. L, L, -1
  1054. bgt LDGEMM_L1x8_LOOP
  1055. LDGEMM_L1x8_LOOP_END:
  1056. KERNEL1x8_1
  1057. KERNEL1x8_2
  1058. KERNEL1x8_1
  1059. KERNEL1x8_2
  1060. KERNEL1x8_1
  1061. KERNEL1x8_2
  1062. KERNEL1x8_1
  1063. KERNEL1x8_E2
  1064. b LDGEMM_L1x8_SUB1
  1065. LDGEMM_L1x8_SUB4:
  1066. KERNEL1x8_SUBI1
  1067. KERNEL1x8_SUB1
  1068. KERNEL1x8_SUB1
  1069. KERNEL1x8_SUB1
  1070. KERNEL1x8_SUB1
  1071. KERNEL1x8_SUB1
  1072. KERNEL1x8_SUB1
  1073. KERNEL1x8_SUB1
  1074. b LDGEMM_L1x8_SUB1
  /* K < 8: SUBI1 is issued unconditionally, then (K & 7) - 1 further steps */
  1075. LDGEMM_L1x8_SUB0:
  1076. andi. L, K, 7
  1077. KERNEL1x8_SUBI1
  1078. addic. L, L, -1
  1079. ble LDGEMM_L1x8_SAVE
  1080. b LDGEMM_L1x8_SUB2
  /* leftover after the unrolled part: L = K & 7 */
  1081. LDGEMM_L1x8_SUB1:
  1082. andi. L, K, 7
  1083. ble LDGEMM_L1x8_SAVE
  1084. LDGEMM_L1x8_SUB2:
  1085. KERNEL1x8_SUB1
  1086. addic. L, L, -1
  1087. bgt LDGEMM_L1x8_SUB2
  1088. LDGEMM_L1x8_SAVE:
  1089. SAVE1x8
  1090. LDGEMM_L1x8_END:
  /*
   * LDGEMM_L1x4_*: driver for the 1x4 tile (taken when bit 2 of M is set).
   *
   * Flow, as far as this section shows:
   *   - M & 4 == 0 : skip this tile (andi. yields a non-negative value, so
   *                  the following "ble" is taken only on zero).
   *   - L = K >> 3 : L <= 0 -> SUB0 (short path), L == 1 -> SUB4,
   *                  L >= 2 -> LOOP_START / LOOP / LOOP_END.
   *   - afterwards the K & 7 leftover is handled by SUB1 / SUB2, then SAVE1x4.
   *
   * LOAD1x4_1, KERNEL1x4_* and SAVE1x4 are macros defined outside this
   * section; presumably each KERNEL1x4_* invocation consumes one k step
   * (TODO confirm against the macro file).
   */
  1091. LDGEMM_L1x4_BEGIN:
  1092. andi. T1, M, 4
  1093. ble LDGEMM_L1x4_END
  /* BO restarts from B for this tile; L = K >> 3 selects the code path. */
  1094. mr BO, B
  1095. srawi. L, K, 3
  1096. ble LDGEMM_L1x4_SUB0
  1097. cmpwi cr0, L, 1
  1098. ble LDGEMM_L1x4_SUB4
  /* Prologue: LOAD1x4_1 followed by 8 KERNEL invocations (I1, 2, 1, 2, ...). */
  1099. LDGEMM_L1x4_LOOP_START:
  1100. LOAD1x4_1
  1101. KERNEL1x4_I1
  1102. KERNEL1x4_2
  1103. KERNEL1x4_1
  1104. KERNEL1x4_2
  1105. KERNEL1x4_1
  1106. KERNEL1x4_2
  1107. KERNEL1x4_1
  1108. KERNEL1x4_2
  /*
   * L -= 2 accounts for the prologue above and the epilogue at LOOP_END;
   * when nothing is left (L was 2) the steady-state loop is skipped.
   */
  1109. addic. L, L, -2
  1110. ble LDGEMM_L1x4_LOOP_END
  /* MY_ALIGN is defined elsewhere; presumably aligns the loop head (TODO confirm). */
  1111. MY_ALIGN
  /* Steady state: 8 KERNEL invocations per pass, repeated while L > 0. */
  1112. LDGEMM_L1x4_LOOP:
  1113. KERNEL1x4_1
  1114. KERNEL1x4_2
  1115. KERNEL1x4_1
  1116. KERNEL1x4_2
  1117. KERNEL1x4_1
  1118. KERNEL1x4_2
  1119. KERNEL1x4_1
  1120. KERNEL1x4_2
  1121. addic. L, L, -1
  1122. bgt LDGEMM_L1x4_LOOP
  /*
   * Epilogue: 8 more invocations, the last one being the _E2 variant.
   * Prologue + epilogue + loop passes add up to 8 * (K >> 3) invocations.
   */
  1123. LDGEMM_L1x4_LOOP_END:
  1124. KERNEL1x4_1
  1125. KERNEL1x4_2
  1126. KERNEL1x4_1
  1127. KERNEL1x4_2
  1128. KERNEL1x4_1
  1129. KERNEL1x4_2
  1130. KERNEL1x4_1
  1131. KERNEL1x4_E2
  1132. b LDGEMM_L1x4_SUB1
  /* K >> 3 == 1: one SUBI1 followed by seven SUB1 invocations (8 in total). */
  1133. LDGEMM_L1x4_SUB4:
  1134. KERNEL1x4_SUBI1
  1135. KERNEL1x4_SUB1
  1136. KERNEL1x4_SUB1
  1137. KERNEL1x4_SUB1
  1138. KERNEL1x4_SUB1
  1139. KERNEL1x4_SUB1
  1140. KERNEL1x4_SUB1
  1141. KERNEL1x4_SUB1
  1142. b LDGEMM_L1x4_SUB1
  /*
   * K >> 3 <= 0: L = K & 7. KERNEL1x4_SUBI1 is issued before the count is
   * tested, then the remaining L - 1 invocations (if any) run in SUB2.
   * NOTE(review): SUBI1 therefore runs once even when K & 7 is 0 on this
   * path - confirm callers never reach here with K == 0.
   */
  1143. LDGEMM_L1x4_SUB0:
  1144. andi. L, K, 7
  1145. KERNEL1x4_SUBI1
  1146. addic. L, L, -1
  1147. ble LDGEMM_L1x4_SAVE
  1148. b LDGEMM_L1x4_SUB2
  /* Leftover after the 8-step paths: L = K & 7, nothing to do when it is 0. */
  1149. LDGEMM_L1x4_SUB1:
  1150. andi. L, K, 7
  1151. ble LDGEMM_L1x4_SAVE
  /* One KERNEL1x4_SUB1 per remaining count in L. */
  1152. LDGEMM_L1x4_SUB2:
  1153. KERNEL1x4_SUB1
  1154. addic. L, L, -1
  1155. bgt LDGEMM_L1x4_SUB2
  /* SAVE1x4 is defined elsewhere; presumably writes the tile back to C (TODO confirm). */
  1156. LDGEMM_L1x4_SAVE:
  1157. SAVE1x4
  1158. LDGEMM_L1x4_END:
  /*
   * LDGEMM_L1x2_*: driver for the 1x2 tile (taken when bit 1 of M is set).
   *
   * Flow, as far as this section shows:
   *   - M & 2 == 0 : skip this tile (andi. yields a non-negative value, so
   *                  the following "ble" is taken only on zero).
   *   - L = K >> 3 : L <= 0 -> SUB0 (short path), L == 1 -> SUB4,
   *                  L >= 2 -> LOOP_START / LOOP / LOOP_END.
   *   - afterwards the K & 7 leftover is handled by SUB1 / SUB2, then SAVE1x2.
   *
   * LOAD1x2_1, KERNEL1x2_* and SAVE1x2 are macros defined outside this
   * section; presumably each KERNEL1x2_* invocation consumes one k step
   * (TODO confirm against the macro file).
   */
  1159. LDGEMM_L1x2_BEGIN:
  1160. andi. T1, M, 2
  1161. ble LDGEMM_L1x2_END
  /* BO restarts from B for this tile; L = K >> 3 selects the code path. */
  1162. mr BO, B
  1163. srawi. L, K, 3
  1164. ble LDGEMM_L1x2_SUB0
  1165. cmpwi cr0, L, 1
  1166. ble LDGEMM_L1x2_SUB4
  /* Prologue: LOAD1x2_1 followed by 8 KERNEL invocations (I1, 2, 1, 2, ...). */
  1167. LDGEMM_L1x2_LOOP_START:
  1168. LOAD1x2_1
  1169. KERNEL1x2_I1
  1170. KERNEL1x2_2
  1171. KERNEL1x2_1
  1172. KERNEL1x2_2
  1173. KERNEL1x2_1
  1174. KERNEL1x2_2
  1175. KERNEL1x2_1
  1176. KERNEL1x2_2
  /*
   * L -= 2 accounts for the prologue above and the epilogue at LOOP_END;
   * when nothing is left (L was 2) the steady-state loop is skipped.
   */
  1177. addic. L, L, -2
  1178. ble LDGEMM_L1x2_LOOP_END
  /* MY_ALIGN is defined elsewhere; presumably aligns the loop head (TODO confirm). */
  1179. MY_ALIGN
  /* Steady state: 8 KERNEL invocations per pass, repeated while L > 0. */
  1180. LDGEMM_L1x2_LOOP:
  1181. KERNEL1x2_1
  1182. KERNEL1x2_2
  1183. KERNEL1x2_1
  1184. KERNEL1x2_2
  1185. KERNEL1x2_1
  1186. KERNEL1x2_2
  1187. KERNEL1x2_1
  1188. KERNEL1x2_2
  1189. addic. L, L, -1
  1190. bgt LDGEMM_L1x2_LOOP
  /*
   * Epilogue: 8 more invocations, the last one being the _E2 variant.
   * Prologue + epilogue + loop passes add up to 8 * (K >> 3) invocations.
   */
  1191. LDGEMM_L1x2_LOOP_END:
  1192. KERNEL1x2_1
  1193. KERNEL1x2_2
  1194. KERNEL1x2_1
  1195. KERNEL1x2_2
  1196. KERNEL1x2_1
  1197. KERNEL1x2_2
  1198. KERNEL1x2_1
  1199. KERNEL1x2_E2
  1200. b LDGEMM_L1x2_SUB1
  /* K >> 3 == 1: one SUBI1 followed by seven SUB1 invocations (8 in total). */
  1201. LDGEMM_L1x2_SUB4:
  1202. KERNEL1x2_SUBI1
  1203. KERNEL1x2_SUB1
  1204. KERNEL1x2_SUB1
  1205. KERNEL1x2_SUB1
  1206. KERNEL1x2_SUB1
  1207. KERNEL1x2_SUB1
  1208. KERNEL1x2_SUB1
  1209. KERNEL1x2_SUB1
  1210. b LDGEMM_L1x2_SUB1
  /*
   * K >> 3 <= 0: L = K & 7. KERNEL1x2_SUBI1 is issued before the count is
   * tested, then the remaining L - 1 invocations (if any) run in SUB2.
   * NOTE(review): SUBI1 therefore runs once even when K & 7 is 0 on this
   * path - confirm callers never reach here with K == 0.
   */
  1211. LDGEMM_L1x2_SUB0:
  1212. andi. L, K, 7
  1213. KERNEL1x2_SUBI1
  1214. addic. L, L, -1
  1215. ble LDGEMM_L1x2_SAVE
  1216. b LDGEMM_L1x2_SUB2
  /* Leftover after the 8-step paths: L = K & 7, nothing to do when it is 0. */
  1217. LDGEMM_L1x2_SUB1:
  1218. andi. L, K, 7
  1219. ble LDGEMM_L1x2_SAVE
  /* One KERNEL1x2_SUB1 per remaining count in L. */
  1220. LDGEMM_L1x2_SUB2:
  1221. KERNEL1x2_SUB1
  1222. addic. L, L, -1
  1223. bgt LDGEMM_L1x2_SUB2
  /* SAVE1x2 is defined elsewhere; presumably writes the tile back to C (TODO confirm). */
  1224. LDGEMM_L1x2_SAVE:
  1225. SAVE1x2
  1226. LDGEMM_L1x2_END:
  /*
   * LDGEMM_L1x1_*: driver for the 1x1 tile (taken when bit 0 of M is set).
   *
   * Flow, as far as this section shows:
   *   - M & 1 == 0 : skip this tile (andi. yields a non-negative value, so
   *                  the following "ble" is taken only on zero).
   *   - L = K >> 3 : L <= 0 -> SUB0 (short path), L == 1 -> SUB4,
   *                  L >= 2 -> LOOP_START / LOOP / LOOP_END.
   *   - afterwards the K & 7 leftover is handled by SUB1 / SUB2, then SAVE1x1.
   *
   * LOAD1x1_1, KERNEL1x1_* and SAVE1x1 are macros defined outside this
   * section; presumably each KERNEL1x1_* invocation consumes one k step
   * (TODO confirm against the macro file).
   */
  1227. LDGEMM_L1x1_BEGIN:
  1228. andi. T1, M, 1
  1229. ble LDGEMM_L1x1_END
  /* BO restarts from B for this tile; L = K >> 3 selects the code path. */
  1230. mr BO, B
  1231. srawi. L, K, 3
  1232. ble LDGEMM_L1x1_SUB0
  1233. cmpwi cr0, L, 1
  1234. ble LDGEMM_L1x1_SUB4
  /* Prologue: LOAD1x1_1 followed by 8 KERNEL invocations (I1, 2, 1, 2, ...). */
  1235. LDGEMM_L1x1_LOOP_START:
  1236. LOAD1x1_1
  1237. KERNEL1x1_I1
  1238. KERNEL1x1_2
  1239. KERNEL1x1_1
  1240. KERNEL1x1_2
  1241. KERNEL1x1_1
  1242. KERNEL1x1_2
  1243. KERNEL1x1_1
  1244. KERNEL1x1_2
  /*
   * L -= 2 accounts for the prologue above and the epilogue at LOOP_END;
   * when nothing is left (L was 2) the steady-state loop is skipped.
   */
  1245. addic. L, L, -2
  1246. ble LDGEMM_L1x1_LOOP_END
  /* MY_ALIGN is defined elsewhere; presumably aligns the loop head (TODO confirm). */
  1247. MY_ALIGN
  /* Steady state: 8 KERNEL invocations per pass, repeated while L > 0. */
  1248. LDGEMM_L1x1_LOOP:
  1249. KERNEL1x1_1
  1250. KERNEL1x1_2
  1251. KERNEL1x1_1
  1252. KERNEL1x1_2
  1253. KERNEL1x1_1
  1254. KERNEL1x1_2
  1255. KERNEL1x1_1
  1256. KERNEL1x1_2
  1257. addic. L, L, -1
  1258. bgt LDGEMM_L1x1_LOOP
  /*
   * Epilogue: 8 more invocations, the last one being the _E2 variant.
   * Prologue + epilogue + loop passes add up to 8 * (K >> 3) invocations.
   */
  1259. LDGEMM_L1x1_LOOP_END:
  1260. KERNEL1x1_1
  1261. KERNEL1x1_2
  1262. KERNEL1x1_1
  1263. KERNEL1x1_2
  1264. KERNEL1x1_1
  1265. KERNEL1x1_2
  1266. KERNEL1x1_1
  1267. KERNEL1x1_E2
  1268. b LDGEMM_L1x1_SUB1
  /* K >> 3 == 1: one SUBI1 followed by seven SUB1 invocations (8 in total). */
  1269. LDGEMM_L1x1_SUB4:
  1270. KERNEL1x1_SUBI1
  1271. KERNEL1x1_SUB1
  1272. KERNEL1x1_SUB1
  1273. KERNEL1x1_SUB1
  1274. KERNEL1x1_SUB1
  1275. KERNEL1x1_SUB1
  1276. KERNEL1x1_SUB1
  1277. KERNEL1x1_SUB1
  1278. b LDGEMM_L1x1_SUB1
  /*
   * K >> 3 <= 0: L = K & 7. KERNEL1x1_SUBI1 is issued before the count is
   * tested, then the remaining L - 1 invocations (if any) run in SUB2.
   * NOTE(review): SUBI1 therefore runs once even when K & 7 is 0 on this
   * path - confirm callers never reach here with K == 0.
   */
  1279. LDGEMM_L1x1_SUB0:
  1280. andi. L, K, 7
  1281. KERNEL1x1_SUBI1
  1282. addic. L, L, -1
  1283. ble LDGEMM_L1x1_SAVE
  1284. b LDGEMM_L1x1_SUB2
  /* Leftover after the 8-step paths: L = K & 7, nothing to do when it is 0. */
  1285. LDGEMM_L1x1_SUB1:
  1286. andi. L, K, 7
  1287. ble LDGEMM_L1x1_SAVE
  /* One KERNEL1x1_SUB1 per remaining count in L. */
  1288. LDGEMM_L1x1_SUB2:
  1289. KERNEL1x1_SUB1
  1290. addic. L, L, -1
  1291. bgt LDGEMM_L1x1_SUB2
  /* SAVE1x1 is defined elsewhere; presumably writes the tile back to C (TODO confirm). */
  1292. LDGEMM_L1x1_SAVE:
  1293. SAVE1x1
  /*
   * LDGEMM_L1x1_END is also the exit taken from LDGEMM_L1x8_BEGIN when
   * M & 15 == 0. LDGEMM_L1_END has no reference inside this section;
   * presumably it is the exit label for the whole L1 part (TODO confirm).
   */
  1294. LDGEMM_L1x1_END:
  1295. LDGEMM_L1_END: