You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrmm_logic_8x2_power8.S 23 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237
  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2016/03/05 Werner Saar (wernsaar@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  // Outer loop over pairs of columns: J = N / 2 iterations, each starting at .LZTRMM_L2_BEGIN.
  34. srawi. J, N, 1 // J = N >> 1; record form sets cr0 for the branch below
  35. ble .LZTRMM_L2_END // J <= 0: no column pair, continue at the single-column section
  36. .LZTRMM_L2_BEGIN:
  37. mr CO, C // CO = C as it is before C is advanced below
  38. mr AO, A // AO restarts at A for every column pair
  39. slwi T1, LDC , 1 // T1 = LDC * 2 (LDC is presumably already a byte stride - TODO confirm)
  40. add C, C, T1 // C += 2 * LDC for the next J iteration
  41. #if defined(LEFT)
  42. mr KK, OFFSET // OFFSET -> KK
  43. #endif
  // 8x2 tile loop: runs I = M / 8 times for the current column pair.
  // The KERNEL2x8_*, LOAD2x8_1 and SAVE2x8 macros are defined outside this section.
  44. srawi. I, M, 3 // I = M >> 3
  45. ble .LZTRMM_L2x8_END // I <= 0: no full 8-row tile
  46. .LZTRMM_L2x8_BEGIN:
  47. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  48. mr BO, B // B -> BO
  49. #else
  50. mr BO, B // B -> BO
  51. slwi T1, KK, 5 // T1 = KK * 32 (2 values of B per k-step, 16 bytes each)
  52. slwi T2, KK, 7 // T2 = KK * 128 (8 values of A per k-step, 16 bytes each)
  53. add BO, BO, T1 // BO += T1
  54. add AO, AO, T2 // AO += T2
  55. #endif
  56. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  57. sub T1, K, KK // K - KK -> TEMP1
  58. #else
  59. mr T1, KK // KK -> KTEMP
  60. #ifdef LEFT
  61. addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
  62. #else
  63. addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
  64. #endif
  65. #endif
  66. mr KKK, T1 // KKK = trip count; read again after SAVE2x8 to form K - KKK
  67. mr K1, T1 // K1 = trip count used for the loop-count computations below
  68. srawi. L, K1, 3 // KTEMP / 8 -> L
  69. ble .LZTRMM_L2x8_SUB0 // L <= 0: no full group of 8
  70. cmpwi cr0, L, 1
  71. ble .LZTRMM_L2x8_SUB4 // L == 1: straight-line SUBI1/SUB1 sequence instead of the loop
  72. .LZTRMM_L2x8_LOOP_START:
  // First group of 8 kernel macros (reached only with L >= 2).
  73. dcbt AO, PRE // data-cache touch hint for address AO + PRE
  74. LOAD2x8_1
  75. dcbt AO, PRE
  76. KERNEL2x8_I1
  77. dcbt AO, PRE
  78. KERNEL2x8_2
  79. dcbt AO, PRE
  80. KERNEL2x8_1
  81. dcbt AO, PRE
  82. KERNEL2x8_2
  83. dcbt AO, PRE
  84. KERNEL2x8_1
  85. dcbt AO, PRE
  86. KERNEL2x8_2
  87. dcbt AO, PRE
  88. KERNEL2x8_1
  89. dcbt AO, PRE
  90. KERNEL2x8_2
  91. addic. L, L, -2 // discount this group and the closing group at LOOP_END
  92. ble .LZTRMM_L2x8_LOOP_END // L was 2: no middle groups
  93. .align 5
  94. .LZTRMM_L2x8_LOOP:
  // Middle groups: 8 kernel macros per pass, L passes.
  95. dcbt AO, PRE
  96. KERNEL2x8_1
  97. dcbt AO, PRE
  98. KERNEL2x8_2
  99. dcbt AO, PRE
  100. KERNEL2x8_1
  101. dcbt AO, PRE
  102. KERNEL2x8_2
  103. dcbt AO, PRE
  104. KERNEL2x8_1
  105. dcbt AO, PRE
  106. KERNEL2x8_2
  107. dcbt AO, PRE
  108. KERNEL2x8_1
  109. dcbt AO, PRE
  110. KERNEL2x8_2
  111. addic. L, L, -1
  112. bgt .LZTRMM_L2x8_LOOP // repeat while L > 0
  113. .LZTRMM_L2x8_LOOP_END:
  // Closing group of 8: same sequence, but the last macro is KERNEL2x8_E2.
  114. dcbt AO, PRE
  115. KERNEL2x8_1
  116. dcbt AO, PRE
  117. KERNEL2x8_2
  118. dcbt AO, PRE
  119. KERNEL2x8_1
  120. dcbt AO, PRE
  121. KERNEL2x8_2
  122. dcbt AO, PRE
  123. KERNEL2x8_1
  124. dcbt AO, PRE
  125. KERNEL2x8_2
  126. dcbt AO, PRE
  127. KERNEL2x8_1
  128. KERNEL2x8_E2
  129. b .LZTRMM_L2x8_SUB1 // continue with the K1 & 7 remainder
  130. .LZTRMM_L2x8_SUB4:
  // L == 1: one KERNEL2x8_SUBI1 followed by seven KERNEL2x8_SUB1 (8 macros in total).
  131. dcbt AO, PRE
  132. KERNEL2x8_SUBI1
  133. dcbt AO, PRE
  134. KERNEL2x8_SUB1
  135. dcbt AO, PRE
  136. KERNEL2x8_SUB1
  137. dcbt AO, PRE
  138. KERNEL2x8_SUB1
  139. KERNEL2x8_SUB1
  140. KERNEL2x8_SUB1
  141. KERNEL2x8_SUB1
  142. KERNEL2x8_SUB1
  143. b .LZTRMM_L2x8_SUB1 // continue with the K1 & 7 remainder
  144. .LZTRMM_L2x8_SUB0:
  // K1 < 8. NOTE(review): KERNEL2x8_SUBI1 runs before L is tested, so it also runs when
  // K1 & 7 is 0 - presumably K1 >= 1 is guaranteed here; confirm against the caller.
  145. andi. L, K1, 7 // K1 & 7 -> L
  146. KERNEL2x8_SUBI1
  147. addic. L, L, -1 // one step already issued by SUBI1
  148. ble .LZTRMM_L2x8_SAVE
  149. b .LZTRMM_L2x8_SUB2
  150. .LZTRMM_L2x8_SUB1:
  151. andi. L, K1, 7 // K1 & 7 -> L
  152. ble .LZTRMM_L2x8_SAVE // remainder is 0
  153. .LZTRMM_L2x8_SUB2:
  154. KERNEL2x8_SUB1 // one macro invocation per remaining count in L
  155. addic. L, L, -1
  156. bgt .LZTRMM_L2x8_SUB2
  157. .LZTRMM_L2x8_SAVE:
  158. SAVE2x8
  159. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  160. sub T1, K, KKK // K - KKK -> TEMP1
  161. slwi T2, T1, 5 // TEMP1 * 32 -> TEMP2 (byte offset in B)
  162. slwi T1, T1, 7 // TEMP1 * 128 -> TEMP1 (byte offset in A)
  163. add BO, BO, T2 // BO += TEMP2
  164. add AO, AO, T1 // AO += TEMP1
  165. #endif
  166. #if defined(LEFT)
  167. addi KK, KK, 8 // KK += Number of values in A
  168. #endif
  169. addic. I, I, -1
  170. bgt .LZTRMM_L2x8_BEGIN // next 8-row tile
  171. .LZTRMM_L2x8_END:
  // 4x2 tail tile: executed once when bit 2 of M is set.
  // The KERNEL2x4_*, LOAD2x4_1 and SAVE2x4 macros are defined outside this section.
  172. .LZTRMM_L2x4_BEGIN:
  173. andi. T2, M, 7
  174. ble .LZTRMM_L2x1_END // M & 7 == 0: skip all tail tiles (4, 2 and 1 rows)
  175. andi. T1, M, 4
  176. ble .LZTRMM_L2x4_END // bit 2 of M clear: no 4-row tile
  177. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  178. mr BO, B // B -> BO
  179. #else
  180. mr BO, B // B -> BO
  181. slwi T1, KK, 5 // T1 = KK * 32 (2 values of B per k-step, 16 bytes each)
  182. slwi T2, KK, 6 // T2 = KK * 64 (4 values of A per k-step, 16 bytes each)
  183. add BO, BO, T1 // BO += T1
  184. add AO, AO, T2 // AO += T2
  185. #endif
  186. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  187. sub T1, K, KK // K - KK -> TEMP1
  188. #else
  189. mr T1, KK // KK -> KTEMP
  190. #ifdef LEFT
  191. addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
  192. #else
  193. addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
  194. #endif
  195. #endif
  196. mr KKK, T1 // KKK = trip count; read again after SAVE2x4 to form K - KKK
  197. mr K1, T1 // K1 = trip count used for the loop-count computations below
  198. srawi. L, K1, 3 // KTEMP / 8 -> L
  199. ble .LZTRMM_L2x4_SUB0 // L <= 0: no full group of 8
  200. cmpwi cr0, L, 1
  201. ble .LZTRMM_L2x4_SUB4 // L == 1: straight-line SUBI1/SUB1 sequence instead of the loop
  202. .LZTRMM_L2x4_LOOP_START:
  // First group of 8 kernel macros (reached only with L >= 2).
  203. LOAD2x4_1
  204. KERNEL2x4_I1
  205. KERNEL2x4_2
  206. KERNEL2x4_1
  207. KERNEL2x4_2
  208. KERNEL2x4_1
  209. KERNEL2x4_2
  210. KERNEL2x4_1
  211. KERNEL2x4_2
  212. addic. L, L, -2 // discount this group and the closing group at LOOP_END
  213. ble .LZTRMM_L2x4_LOOP_END // L was 2: no middle groups
  214. .align 5
  215. .LZTRMM_L2x4_LOOP:
  // Middle groups: 8 kernel macros per pass, L passes.
  216. KERNEL2x4_1
  217. KERNEL2x4_2
  218. KERNEL2x4_1
  219. KERNEL2x4_2
  220. KERNEL2x4_1
  221. KERNEL2x4_2
  222. KERNEL2x4_1
  223. KERNEL2x4_2
  224. addic. L, L, -1
  225. bgt .LZTRMM_L2x4_LOOP // repeat while L > 0
  226. .LZTRMM_L2x4_LOOP_END:
  // Closing group of 8: same sequence, but the last macro is KERNEL2x4_E2.
  227. KERNEL2x4_1
  228. KERNEL2x4_2
  229. KERNEL2x4_1
  230. KERNEL2x4_2
  231. KERNEL2x4_1
  232. KERNEL2x4_2
  233. KERNEL2x4_1
  234. KERNEL2x4_E2
  235. b .LZTRMM_L2x4_SUB1 // continue with the K1 & 7 remainder
  236. .LZTRMM_L2x4_SUB4:
  // L == 1: one KERNEL2x4_SUBI1 followed by seven KERNEL2x4_SUB1 (8 macros in total).
  237. KERNEL2x4_SUBI1
  238. KERNEL2x4_SUB1
  239. KERNEL2x4_SUB1
  240. KERNEL2x4_SUB1
  241. KERNEL2x4_SUB1
  242. KERNEL2x4_SUB1
  243. KERNEL2x4_SUB1
  244. KERNEL2x4_SUB1
  245. b .LZTRMM_L2x4_SUB1 // continue with the K1 & 7 remainder
  246. .LZTRMM_L2x4_SUB0:
  // K1 < 8. NOTE(review): KERNEL2x4_SUBI1 runs before L is tested, so it also runs when
  // K1 & 7 is 0 - presumably K1 >= 1 is guaranteed here; confirm against the caller.
  247. andi. L, K1, 7 // K1 & 7 -> L
  248. KERNEL2x4_SUBI1
  249. addic. L, L, -1 // one step already issued by SUBI1
  250. ble .LZTRMM_L2x4_SAVE
  251. b .LZTRMM_L2x4_SUB2
  252. .LZTRMM_L2x4_SUB1:
  253. andi. L, K1, 7 // K1 & 7 -> L
  254. ble .LZTRMM_L2x4_SAVE // remainder is 0
  255. .LZTRMM_L2x4_SUB2:
  256. KERNEL2x4_SUB1 // one macro invocation per remaining count in L
  257. addic. L, L, -1
  258. bgt .LZTRMM_L2x4_SUB2
  259. .LZTRMM_L2x4_SAVE:
  260. SAVE2x4
  261. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  262. sub T1, K, KKK // K - KKK -> TEMP1
  263. slwi T2, T1, 5 // TEMP1 * 32 -> TEMP2 (byte offset in B)
  264. slwi T1, T1, 6 // TEMP1 * 64 -> TEMP1 (byte offset in A)
  265. add BO, BO, T2 // BO += TEMP2
  266. add AO, AO, T1 // AO += TEMP1
  267. #endif
  268. #if defined(LEFT)
  269. addi KK, KK, 4 // KK += Number of values in A
  270. #endif
  271. .LZTRMM_L2x4_END:
  // 2x2 tail tile: executed once when bit 1 of M is set.
  // The KERNEL2x2_*, LOAD2x2_1 and SAVE2x2 macros are defined outside this section.
  272. .LZTRMM_L2x2_BEGIN:
  273. andi. T1, M, 2
  274. ble .LZTRMM_L2x2_END // bit 1 of M clear: no 2-row tile
  275. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  276. mr BO, B // B -> BO
  277. #else
  278. mr BO, B // B -> BO
  279. slwi T1, KK, 5 // T1 = KK * 32 (2 values of B per k-step, 16 bytes each)
  280. slwi T2, KK, 5 // T2 = KK * 32 (2 values of A per k-step, 16 bytes each)
  281. add BO, BO, T1 // BO += T1
  282. add AO, AO, T2 // AO += T2
  283. #endif
  284. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  285. sub T1, K, KK // K - KK -> TEMP1
  286. #else
  287. mr T1, KK // KK -> KTEMP
  288. #ifdef LEFT
  289. addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
  290. #else
  291. addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
  292. #endif
  293. #endif
  294. mr KKK, T1 // KKK = trip count; read again after SAVE2x2 to form K - KKK
  295. mr K1, T1 // K1 = trip count used for the loop-count computations below
  296. srawi. L, K1, 3 // KTEMP / 8 -> L
  297. ble .LZTRMM_L2x2_SUB0 // L <= 0: no full group of 8
  298. cmpwi cr0, L, 1
  299. ble .LZTRMM_L2x2_SUB4 // L == 1: straight-line SUBI1/SUB1 sequence instead of the loop
  300. .LZTRMM_L2x2_LOOP_START:
  // First group of 8 kernel macros (reached only with L >= 2).
  301. LOAD2x2_1
  302. KERNEL2x2_I1
  303. KERNEL2x2_2
  304. KERNEL2x2_1
  305. KERNEL2x2_2
  306. KERNEL2x2_1
  307. KERNEL2x2_2
  308. KERNEL2x2_1
  309. KERNEL2x2_2
  310. addic. L, L, -2 // discount this group and the closing group at LOOP_END
  311. ble .LZTRMM_L2x2_LOOP_END // L was 2: no middle groups
  312. .align 5
  313. .LZTRMM_L2x2_LOOP:
  // Middle groups: 8 kernel macros per pass, L passes.
  314. KERNEL2x2_1
  315. KERNEL2x2_2
  316. KERNEL2x2_1
  317. KERNEL2x2_2
  318. KERNEL2x2_1
  319. KERNEL2x2_2
  320. KERNEL2x2_1
  321. KERNEL2x2_2
  322. addic. L, L, -1
  323. bgt .LZTRMM_L2x2_LOOP // repeat while L > 0
  324. .LZTRMM_L2x2_LOOP_END:
  // Closing group of 8: same sequence, but the last macro is KERNEL2x2_E2.
  325. KERNEL2x2_1
  326. KERNEL2x2_2
  327. KERNEL2x2_1
  328. KERNEL2x2_2
  329. KERNEL2x2_1
  330. KERNEL2x2_2
  331. KERNEL2x2_1
  332. KERNEL2x2_E2
  333. b .LZTRMM_L2x2_SUB1 // continue with the K1 & 7 remainder
  334. .LZTRMM_L2x2_SUB4:
  // L == 1: one KERNEL2x2_SUBI1 followed by seven KERNEL2x2_SUB1 (8 macros in total).
  335. KERNEL2x2_SUBI1
  336. KERNEL2x2_SUB1
  337. KERNEL2x2_SUB1
  338. KERNEL2x2_SUB1
  339. KERNEL2x2_SUB1
  340. KERNEL2x2_SUB1
  341. KERNEL2x2_SUB1
  342. KERNEL2x2_SUB1
  343. b .LZTRMM_L2x2_SUB1 // continue with the K1 & 7 remainder
  344. .LZTRMM_L2x2_SUB0:
  // K1 < 8. NOTE(review): KERNEL2x2_SUBI1 runs before L is tested, so it also runs when
  // K1 & 7 is 0 - presumably K1 >= 1 is guaranteed here; confirm against the caller.
  345. andi. L, K1, 7 // K1 & 7 -> L
  346. KERNEL2x2_SUBI1
  347. addic. L, L, -1 // one step already issued by SUBI1
  348. ble .LZTRMM_L2x2_SAVE
  349. b .LZTRMM_L2x2_SUB2
  350. .LZTRMM_L2x2_SUB1:
  351. andi. L, K1, 7 // K1 & 7 -> L
  352. ble .LZTRMM_L2x2_SAVE // remainder is 0
  353. .LZTRMM_L2x2_SUB2:
  354. KERNEL2x2_SUB1 // one macro invocation per remaining count in L
  355. addic. L, L, -1
  356. bgt .LZTRMM_L2x2_SUB2
  357. .LZTRMM_L2x2_SAVE:
  358. SAVE2x2
  359. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  360. sub T1, K, KKK // K - KKK -> TEMP1
  361. slwi T2, T1, 5 // TEMP1 * 32 -> TEMP2 (byte offset in B)
  362. slwi T1, T1, 5 // TEMP1 * 32 -> TEMP1 (byte offset in A)
  363. add BO, BO, T2 // BO += TEMP2
  364. add AO, AO, T1 // AO += TEMP1
  365. #endif
  366. #if defined(LEFT)
  367. addi KK, KK, 2 // KK += Number of values in A
  368. #endif
  369. .LZTRMM_L2x2_END:
  // 1x2 tail tile: executed once when bit 0 of M is set.
  // The KERNEL2x1_*, LOAD2x1_1 and SAVE2x1 macros are defined outside this section.
  370. .LZTRMM_L2x1_BEGIN:
  371. andi. T1, M, 1
  372. ble .LZTRMM_L2x1_END // bit 0 of M clear: no 1-row tile
  373. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  374. mr BO, B // B -> BO
  375. #else
  376. mr BO, B // B -> BO
  377. slwi T1, KK, 5 // T1 = KK * 32 (2 values of B per k-step, 16 bytes each)
  378. slwi T2, KK, 4 // T2 = KK * 16 (1 value of A per k-step, 16 bytes)
  379. add BO, BO, T1 // BO += T1
  380. add AO, AO, T2 // AO += T2
  381. #endif
  382. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  383. sub T1, K, KK // K - KK -> TEMP1
  384. #else
  385. mr T1, KK // KK -> KTEMP
  386. #ifdef LEFT
  387. addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
  388. #else
  389. addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
  390. #endif
  391. #endif
  392. mr KKK, T1 // KKK = trip count; read again after SAVE2x1 to form K - KKK
  393. mr K1, T1 // K1 = trip count used for the loop-count computations below
  394. srawi. L, K1, 3 // KTEMP / 8 -> L
  395. ble .LZTRMM_L2x1_SUB0 // L <= 0: no full group of 8
  396. cmpwi cr0, L, 1
  397. ble .LZTRMM_L2x1_SUB4 // L == 1: straight-line SUBI1/SUB1 sequence instead of the loop
  398. .LZTRMM_L2x1_LOOP_START:
  // First group of 8 kernel macros (reached only with L >= 2).
  399. LOAD2x1_1
  400. KERNEL2x1_I1
  401. KERNEL2x1_2
  402. KERNEL2x1_1
  403. KERNEL2x1_2
  404. KERNEL2x1_1
  405. KERNEL2x1_2
  406. KERNEL2x1_1
  407. KERNEL2x1_2
  408. addic. L, L, -2 // discount this group and the closing group at LOOP_END
  409. ble .LZTRMM_L2x1_LOOP_END // L was 2: no middle groups
  410. .align 5
  411. .LZTRMM_L2x1_LOOP:
  // Middle groups: 8 kernel macros per pass, L passes.
  412. KERNEL2x1_1
  413. KERNEL2x1_2
  414. KERNEL2x1_1
  415. KERNEL2x1_2
  416. KERNEL2x1_1
  417. KERNEL2x1_2
  418. KERNEL2x1_1
  419. KERNEL2x1_2
  420. addic. L, L, -1
  421. bgt .LZTRMM_L2x1_LOOP // repeat while L > 0
  422. .LZTRMM_L2x1_LOOP_END:
  // Closing group of 8: same sequence, but the last macro is KERNEL2x1_E2.
  423. KERNEL2x1_1
  424. KERNEL2x1_2
  425. KERNEL2x1_1
  426. KERNEL2x1_2
  427. KERNEL2x1_1
  428. KERNEL2x1_2
  429. KERNEL2x1_1
  430. KERNEL2x1_E2
  431. b .LZTRMM_L2x1_SUB1 // continue with the K1 & 7 remainder
  432. .LZTRMM_L2x1_SUB4:
  // L == 1: one KERNEL2x1_SUBI1 followed by seven KERNEL2x1_SUB1 (8 macros in total).
  433. KERNEL2x1_SUBI1
  434. KERNEL2x1_SUB1
  435. KERNEL2x1_SUB1
  436. KERNEL2x1_SUB1
  437. KERNEL2x1_SUB1
  438. KERNEL2x1_SUB1
  439. KERNEL2x1_SUB1
  440. KERNEL2x1_SUB1
  441. b .LZTRMM_L2x1_SUB1 // continue with the K1 & 7 remainder
  442. .LZTRMM_L2x1_SUB0:
  // K1 < 8. NOTE(review): KERNEL2x1_SUBI1 runs before L is tested, so it also runs when
  // K1 & 7 is 0 - presumably K1 >= 1 is guaranteed here; confirm against the caller.
  443. andi. L, K1, 7 // K1 & 7 -> L
  444. KERNEL2x1_SUBI1
  445. addic. L, L, -1 // one step already issued by SUBI1
  446. ble .LZTRMM_L2x1_SAVE
  447. b .LZTRMM_L2x1_SUB2
  448. .LZTRMM_L2x1_SUB1:
  449. andi. L, K1, 7 // K1 & 7 -> L
  450. ble .LZTRMM_L2x1_SAVE // remainder is 0
  451. .LZTRMM_L2x1_SUB2:
  452. KERNEL2x1_SUB1 // one macro invocation per remaining count in L
  453. addic. L, L, -1
  454. bgt .LZTRMM_L2x1_SUB2
  455. .LZTRMM_L2x1_SAVE:
  456. SAVE2x1
  457. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  458. sub T1, K, KKK // K - KKK -> TEMP1
  459. slwi T2, T1, 5 // TEMP1 * 32 -> TEMP2 (byte offset in B)
  460. slwi T1, T1, 4 // TEMP1 * 16 -> TEMP1 (byte offset in A)
  461. add BO, BO, T2 // BO += TEMP2
  462. add AO, AO, T1 // AO += TEMP1
  463. #endif
  464. #if defined(LEFT)
  465. addi KK, KK, 1 // KK += Number of values in A
  466. #endif
  // End of one column-pair iteration: advance B, update KK, loop on J, then dispatch on N & 1.
  467. .LZTRMM_L2x1_END:
  468. slwi T1, K, 5 // T1 = K * 32
  469. add B, B, T1 // B += K * 32 (2 values of B per k-step, 16 bytes each)
  470. #if !defined(LEFT)
  471. addi KK, KK, 2 // KK += Number of values in B
  472. #endif
  473. addic. J, J, -1
  474. bgt .LZTRMM_L2_BEGIN // next column pair
  475. andi. T2, N, 1
  476. ble .L999 // N even: no single column left; .L999 is defined outside this section
  477. .LZTRMM_L2_END:
  478. b .LZTRMM_L1_BEGIN // reached by fall-through (N odd) or from the initial N / 2 <= 0 test
  479. .L999_H1: // NOTE(review): no branch to this label is visible in this section
  480. b .L999
  // Single-column section: runs once when N is odd.
  481. .LZTRMM_L1_BEGIN:
  482. andi. T1, N, 1
  483. ble .LZTRMM_L1_END // N even: nothing to do
  484. mr CO, C // CO = C
  485. mr AO, A // AO restarts at A
  486. #if defined(LEFT)
  487. mr KK, OFFSET // OFFSET -> KK
  488. #endif
  // 8x1 tile loop: runs I = M / 8 times for the single remaining column.
  // The KERNEL1x8_*, LOAD1x8_1 and SAVE1x8 macros are defined outside this section.
  489. srawi. I, M, 3 // I = M >> 3
  490. ble .LZTRMM_L1x8_END // I <= 0: no full 8-row tile
  491. .LZTRMM_L1x8_BEGIN:
  492. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  493. mr BO, B // B -> BO
  494. #else
  495. mr BO, B // B -> BO
  496. slwi T1, KK, 4 // T1 = KK * 16 (1 value of B per k-step, 16 bytes)
  497. slwi T2, KK, 7 // T2 = KK * 128 (8 values of A per k-step, 16 bytes each)
  498. add BO, BO, T1 // BO += T1
  499. add AO, AO, T2 // AO += T2
  500. #endif
  501. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  502. sub T1, K, KK // K - KK -> TEMP1
  503. #else
  504. mr T1, KK // KK -> KTEMP
  505. #ifdef LEFT
  506. addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
  507. #else
  508. addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
  509. #endif
  510. #endif
  511. mr KKK, T1 // KKK = trip count; read again after SAVE1x8 to form K - KKK
  512. mr K1, T1 // K1 = trip count used for the loop-count computations below
  513. srawi. L, K1, 3 // KTEMP / 8 -> L
  514. ble .LZTRMM_L1x8_SUB0 // L <= 0: no full group of 8
  515. cmpwi cr0, L, 1
  516. ble .LZTRMM_L1x8_SUB4 // L == 1: straight-line SUBI1/SUB1 sequence instead of the loop
  517. .LZTRMM_L1x8_LOOP_START:
  // First group of 8 kernel macros (reached only with L >= 2).
  518. dcbt AO, PRE // data-cache touch hint for address AO + PRE
  519. LOAD1x8_1
  520. dcbt AO, PRE
  521. KERNEL1x8_I1
  522. dcbt AO, PRE
  523. KERNEL1x8_2
  524. dcbt AO, PRE
  525. KERNEL1x8_1
  526. dcbt AO, PRE
  527. KERNEL1x8_2
  528. dcbt AO, PRE
  529. KERNEL1x8_1
  530. dcbt AO, PRE
  531. KERNEL1x8_2
  532. dcbt AO, PRE
  533. KERNEL1x8_1
  534. dcbt AO, PRE
  535. KERNEL1x8_2
  536. addic. L, L, -2 // discount this group and the closing group at LOOP_END
  537. ble .LZTRMM_L1x8_LOOP_END // L was 2: no middle groups
  538. .align 5
  539. .LZTRMM_L1x8_LOOP:
  // Middle groups: 8 kernel macros per pass, L passes.
  540. dcbt AO, PRE
  541. KERNEL1x8_1
  542. dcbt AO, PRE
  543. KERNEL1x8_2
  544. dcbt AO, PRE
  545. KERNEL1x8_1
  546. dcbt AO, PRE
  547. KERNEL1x8_2
  548. dcbt AO, PRE
  549. KERNEL1x8_1
  550. dcbt AO, PRE
  551. KERNEL1x8_2
  552. dcbt AO, PRE
  553. KERNEL1x8_1
  554. dcbt AO, PRE
  555. KERNEL1x8_2
  556. addic. L, L, -1
  557. bgt .LZTRMM_L1x8_LOOP // repeat while L > 0
  558. .LZTRMM_L1x8_LOOP_END:
  // Closing group of 8: same sequence, but the last macro is KERNEL1x8_E2.
  559. dcbt AO, PRE
  560. KERNEL1x8_1
  561. dcbt AO, PRE
  562. KERNEL1x8_2
  563. dcbt AO, PRE
  564. KERNEL1x8_1
  565. dcbt AO, PRE
  566. KERNEL1x8_2
  567. dcbt AO, PRE
  568. KERNEL1x8_1
  569. dcbt AO, PRE
  570. KERNEL1x8_2
  571. dcbt AO, PRE
  572. KERNEL1x8_1
  573. KERNEL1x8_E2
  574. b .LZTRMM_L1x8_SUB1 // continue with the K1 & 7 remainder
  575. .LZTRMM_L1x8_SUB4:
  // L == 1: one KERNEL1x8_SUBI1 followed by seven KERNEL1x8_SUB1 (8 macros in total).
  576. dcbt AO, PRE
  577. KERNEL1x8_SUBI1
  578. dcbt AO, PRE
  579. KERNEL1x8_SUB1
  580. dcbt AO, PRE
  581. KERNEL1x8_SUB1
  582. dcbt AO, PRE
  583. KERNEL1x8_SUB1
  584. KERNEL1x8_SUB1
  585. KERNEL1x8_SUB1
  586. KERNEL1x8_SUB1
  587. KERNEL1x8_SUB1
  588. b .LZTRMM_L1x8_SUB1 // continue with the K1 & 7 remainder
  589. .LZTRMM_L1x8_SUB0:
  // K1 < 8. NOTE(review): KERNEL1x8_SUBI1 runs before L is tested, so it also runs when
  // K1 & 7 is 0 - presumably K1 >= 1 is guaranteed here; confirm against the caller.
  590. andi. L, K1, 7 // K1 & 7 -> L
  591. KERNEL1x8_SUBI1
  592. addic. L, L, -1 // one step already issued by SUBI1
  593. ble .LZTRMM_L1x8_SAVE
  594. b .LZTRMM_L1x8_SUB2
  595. .LZTRMM_L1x8_SUB1:
  596. andi. L, K1, 7 // K1 & 7 -> L
  597. ble .LZTRMM_L1x8_SAVE // remainder is 0
  598. .LZTRMM_L1x8_SUB2:
  599. KERNEL1x8_SUB1 // one macro invocation per remaining count in L
  600. addic. L, L, -1
  601. bgt .LZTRMM_L1x8_SUB2
  602. .LZTRMM_L1x8_SAVE:
  603. SAVE1x8
  604. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  605. sub T1, K, KKK // K - KKK -> TEMP1
  606. slwi T2, T1, 4 // TEMP1 * 16 -> TEMP2 (byte offset in B)
  607. slwi T1, T1, 7 // TEMP1 * 128 -> TEMP1 (byte offset in A)
  608. add BO, BO, T2 // BO += TEMP2
  609. add AO, AO, T1 // AO += TEMP1
  610. #endif
  611. #if defined(LEFT)
  612. addi KK, KK, 8 // KK += Number of values in A
  613. #endif
  614. addic. I, I, -1
  615. bgt .LZTRMM_L1x8_BEGIN // next 8-row tile
  616. .LZTRMM_L1x8_END:
  // 4x1 tail tile: executed once when bit 2 of M is set.
  // The KERNEL1x4_*, LOAD1x4_1 and SAVE1x4 macros are defined outside this section.
  617. .LZTRMM_L1x4_BEGIN:
  618. andi. T2, M, 7
  619. ble .LZTRMM_L1x1_END // M & 7 == 0: skip all tail tiles (4, 2 and 1 rows)
  620. andi. T1, M, 4
  621. ble .LZTRMM_L1x4_END // bit 2 of M clear: no 4-row tile
  622. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  623. mr BO, B // B -> BO
  624. #else
  625. mr BO, B // B -> BO
  626. slwi T1, KK, 4 // T1 = KK * 16 (1 value of B per k-step, 16 bytes)
  627. slwi T2, KK, 6 // T2 = KK * 64 (4 values of A per k-step, 16 bytes each)
  628. add BO, BO, T1 // BO += T1
  629. add AO, AO, T2 // AO += T2
  630. #endif
  631. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  632. sub T1, K, KK // K - KK -> TEMP1
  633. #else
  634. mr T1, KK // KK -> KTEMP
  635. #ifdef LEFT
  636. addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
  637. #else
  638. addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
  639. #endif
  640. #endif
  641. mr KKK, T1 // KKK = trip count; read again after SAVE1x4 to form K - KKK
  642. mr K1, T1 // K1 = trip count used for the loop-count computations below
  643. srawi. L, K1, 3 // KTEMP / 8 -> L
  644. ble .LZTRMM_L1x4_SUB0 // L <= 0: no full group of 8
  645. cmpwi cr0, L, 1
  646. ble .LZTRMM_L1x4_SUB4 // L == 1: straight-line SUBI1/SUB1 sequence instead of the loop
  647. .LZTRMM_L1x4_LOOP_START:
  // First group of 8 kernel macros (reached only with L >= 2).
  648. LOAD1x4_1
  649. KERNEL1x4_I1
  650. KERNEL1x4_2
  651. KERNEL1x4_1
  652. KERNEL1x4_2
  653. KERNEL1x4_1
  654. KERNEL1x4_2
  655. KERNEL1x4_1
  656. KERNEL1x4_2
  657. addic. L, L, -2 // discount this group and the closing group at LOOP_END
  658. ble .LZTRMM_L1x4_LOOP_END // L was 2: no middle groups
  659. .align 5
  660. .LZTRMM_L1x4_LOOP:
  // Middle groups: 8 kernel macros per pass, L passes.
  661. KERNEL1x4_1
  662. KERNEL1x4_2
  663. KERNEL1x4_1
  664. KERNEL1x4_2
  665. KERNEL1x4_1
  666. KERNEL1x4_2
  667. KERNEL1x4_1
  668. KERNEL1x4_2
  669. addic. L, L, -1
  670. bgt .LZTRMM_L1x4_LOOP // repeat while L > 0
  671. .LZTRMM_L1x4_LOOP_END:
  // Closing group of 8: same sequence, but the last macro is KERNEL1x4_E2.
  672. KERNEL1x4_1
  673. KERNEL1x4_2
  674. KERNEL1x4_1
  675. KERNEL1x4_2
  676. KERNEL1x4_1
  677. KERNEL1x4_2
  678. KERNEL1x4_1
  679. KERNEL1x4_E2
  680. b .LZTRMM_L1x4_SUB1 // continue with the K1 & 7 remainder
  681. .LZTRMM_L1x4_SUB4:
  // L == 1: one KERNEL1x4_SUBI1 followed by seven KERNEL1x4_SUB1 (8 macros in total).
  682. KERNEL1x4_SUBI1
  683. KERNEL1x4_SUB1
  684. KERNEL1x4_SUB1
  685. KERNEL1x4_SUB1
  686. KERNEL1x4_SUB1
  687. KERNEL1x4_SUB1
  688. KERNEL1x4_SUB1
  689. KERNEL1x4_SUB1
  690. b .LZTRMM_L1x4_SUB1 // continue with the K1 & 7 remainder
  691. .LZTRMM_L1x4_SUB0:
  // K1 < 8. NOTE(review): KERNEL1x4_SUBI1 runs before L is tested, so it also runs when
  // K1 & 7 is 0 - presumably K1 >= 1 is guaranteed here; confirm against the caller.
  692. andi. L, K1, 7 // K1 & 7 -> L
  693. KERNEL1x4_SUBI1
  694. addic. L, L, -1 // one step already issued by SUBI1
  695. ble .LZTRMM_L1x4_SAVE
  696. b .LZTRMM_L1x4_SUB2
  697. .LZTRMM_L1x4_SUB1:
  698. andi. L, K1, 7 // K1 & 7 -> L
  699. ble .LZTRMM_L1x4_SAVE // remainder is 0
  700. .LZTRMM_L1x4_SUB2:
  701. KERNEL1x4_SUB1 // one macro invocation per remaining count in L
  702. addic. L, L, -1
  703. bgt .LZTRMM_L1x4_SUB2
  704. .LZTRMM_L1x4_SAVE:
  705. SAVE1x4
  706. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  707. sub T1, K, KKK // K - KKK -> TEMP1
  708. slwi T2, T1, 4 // TEMP1 * 16 -> TEMP2 (byte offset in B)
  709. slwi T1, T1, 6 // TEMP1 * 64 -> TEMP1 (byte offset in A)
  710. add BO, BO, T2 // BO += TEMP2
  711. add AO, AO, T1 // AO += TEMP1
  712. #endif
  713. #if defined(LEFT)
  714. addi KK, KK, 4 // KK += Number of values in A
  715. #endif
  716. .LZTRMM_L1x4_END:
  // 2x1 tail tile: executed once when bit 1 of M is set.
  // The KERNEL1x2_*, LOAD1x2_1 and SAVE1x2 macros are defined outside this section.
  717. .LZTRMM_L1x2_BEGIN:
  718. andi. T1, M, 2
  719. ble .LZTRMM_L1x2_END // bit 1 of M clear: no 2-row tile
  720. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  721. mr BO, B // B -> BO
  722. #else
  723. mr BO, B // B -> BO
  724. slwi T1, KK, 4 // T1 = KK * 16 (1 value of B per k-step, 16 bytes)
  725. slwi T2, KK, 5 // T2 = KK * 32 (2 values of A per k-step, 16 bytes each)
  726. add BO, BO, T1 // BO += T1
  727. add AO, AO, T2 // AO += T2
  728. #endif
  729. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  730. sub T1, K, KK // K - KK -> TEMP1
  731. #else
  732. mr T1, KK // KK -> KTEMP
  733. #ifdef LEFT
  734. addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
  735. #else
  736. addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
  737. #endif
  738. #endif
  739. mr KKK, T1 // KKK = trip count; read again after SAVE1x2 to form K - KKK
  740. mr K1, T1 // K1 = trip count used for the loop-count computations below
  741. srawi. L, K1, 3 // KTEMP / 8 -> L
  742. ble .LZTRMM_L1x2_SUB0 // L <= 0: no full group of 8
  743. cmpwi cr0, L, 1
  744. ble .LZTRMM_L1x2_SUB4 // L == 1: straight-line SUBI1/SUB1 sequence instead of the loop
  745. .LZTRMM_L1x2_LOOP_START:
  // First group of 8 kernel macros (reached only with L >= 2).
  746. LOAD1x2_1
  747. KERNEL1x2_I1
  748. KERNEL1x2_2
  749. KERNEL1x2_1
  750. KERNEL1x2_2
  751. KERNEL1x2_1
  752. KERNEL1x2_2
  753. KERNEL1x2_1
  754. KERNEL1x2_2
  755. addic. L, L, -2 // discount this group and the closing group at LOOP_END
  756. ble .LZTRMM_L1x2_LOOP_END // L was 2: no middle groups
  757. .align 5
  758. .LZTRMM_L1x2_LOOP:
  // Middle groups: 8 kernel macros per pass, L passes.
  759. KERNEL1x2_1
  760. KERNEL1x2_2
  761. KERNEL1x2_1
  762. KERNEL1x2_2
  763. KERNEL1x2_1
  764. KERNEL1x2_2
  765. KERNEL1x2_1
  766. KERNEL1x2_2
  767. addic. L, L, -1
  768. bgt .LZTRMM_L1x2_LOOP // repeat while L > 0
  769. .LZTRMM_L1x2_LOOP_END:
  // Closing group of 8: same sequence, but the last macro is KERNEL1x2_E2.
  770. KERNEL1x2_1
  771. KERNEL1x2_2
  772. KERNEL1x2_1
  773. KERNEL1x2_2
  774. KERNEL1x2_1
  775. KERNEL1x2_2
  776. KERNEL1x2_1
  777. KERNEL1x2_E2
  778. b .LZTRMM_L1x2_SUB1 // continue with the K1 & 7 remainder
  779. .LZTRMM_L1x2_SUB4:
  // L == 1: one KERNEL1x2_SUBI1 followed by seven KERNEL1x2_SUB1 (8 macros in total).
  780. KERNEL1x2_SUBI1
  781. KERNEL1x2_SUB1
  782. KERNEL1x2_SUB1
  783. KERNEL1x2_SUB1
  784. KERNEL1x2_SUB1
  785. KERNEL1x2_SUB1
  786. KERNEL1x2_SUB1
  787. KERNEL1x2_SUB1
  788. b .LZTRMM_L1x2_SUB1 // continue with the K1 & 7 remainder
  789. .LZTRMM_L1x2_SUB0:
  // K1 < 8. NOTE(review): KERNEL1x2_SUBI1 runs before L is tested, so it also runs when
  // K1 & 7 is 0 - presumably K1 >= 1 is guaranteed here; confirm against the caller.
  790. andi. L, K1, 7 // K1 & 7 -> L
  791. KERNEL1x2_SUBI1
  792. addic. L, L, -1 // one step already issued by SUBI1
  793. ble .LZTRMM_L1x2_SAVE
  794. b .LZTRMM_L1x2_SUB2
  795. .LZTRMM_L1x2_SUB1:
  796. andi. L, K1, 7 // K1 & 7 -> L
  797. ble .LZTRMM_L1x2_SAVE // remainder is 0
  798. .LZTRMM_L1x2_SUB2:
  799. KERNEL1x2_SUB1 // one macro invocation per remaining count in L
  800. addic. L, L, -1
  801. bgt .LZTRMM_L1x2_SUB2
  802. .LZTRMM_L1x2_SAVE:
  803. SAVE1x2
  804. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  805. sub T1, K, KKK // K - KKK -> TEMP1
  806. slwi T2, T1, 4 // TEMP1 * 16 -> TEMP2 (byte offset in B)
  807. slwi T1, T1, 5 // TEMP1 * 32 -> TEMP1 (byte offset in A)
  808. add BO, BO, T2 // BO += TEMP2
  809. add AO, AO, T1 // AO += TEMP1
  810. #endif
  811. #if defined(LEFT)
  812. addi KK, KK, 2 // KK += Number of values in A
  813. #endif
  814. .LZTRMM_L1x2_END:
  // 1x1 tail tile: executed once when bit 0 of M is set.
  // The KERNEL1x1_*, LOAD1x1_1 and SAVE1x1 macros are defined outside this section.
  815. .LZTRMM_L1x1_BEGIN:
  816. andi. T1, M, 1
  817. ble .LZTRMM_L1x1_END // bit 0 of M clear: no 1-row tile
  818. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  819. mr BO, B // B -> BO
  820. #else
  821. mr BO, B // B -> BO
  822. slwi T1, KK, 4 // T1 = KK * 16 (1 value of B per k-step, 16 bytes)
  823. slwi T2, KK, 4 // T2 = KK * 16 (1 value of A per k-step, 16 bytes)
  824. add BO, BO, T1 // BO += T1
  825. add AO, AO, T2 // AO += T2
  826. #endif
  827. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  828. sub T1, K, KK // K - KK -> TEMP1
  829. #else
  830. mr T1, KK // KK -> KTEMP
  831. #ifdef LEFT
  832. addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
  833. #else
  834. addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
  835. #endif
  836. #endif
  837. mr KKK, T1 // KKK = trip count; read again after SAVE1x1 to form K - KKK
  838. mr K1, T1 // K1 = trip count used for the loop-count computations below
  839. srawi. L, K1, 3 // KTEMP / 8 -> L
  840. ble .LZTRMM_L1x1_SUB0 // L <= 0: no full group of 8
  841. cmpwi cr0, L, 1
  842. ble .LZTRMM_L1x1_SUB4 // L == 1: straight-line SUBI1/SUB1 sequence instead of the loop
  843. .LZTRMM_L1x1_LOOP_START:
  // First group of 8 kernel macros (reached only with L >= 2).
  844. LOAD1x1_1
  845. KERNEL1x1_I1
  846. KERNEL1x1_2
  847. KERNEL1x1_1
  848. KERNEL1x1_2
  849. KERNEL1x1_1
  850. KERNEL1x1_2
  851. KERNEL1x1_1
  852. KERNEL1x1_2
  853. addic. L, L, -2 // discount this group and the closing group at LOOP_END
  854. ble .LZTRMM_L1x1_LOOP_END // L was 2: no middle groups
  855. .align 5
  856. .LZTRMM_L1x1_LOOP:
  // Middle groups: 8 kernel macros per pass, L passes.
  857. KERNEL1x1_1
  858. KERNEL1x1_2
  859. KERNEL1x1_1
  860. KERNEL1x1_2
  861. KERNEL1x1_1
  862. KERNEL1x1_2
  863. KERNEL1x1_1
  864. KERNEL1x1_2
  865. addic. L, L, -1
  866. bgt .LZTRMM_L1x1_LOOP // repeat while L > 0
  867. .LZTRMM_L1x1_LOOP_END:
  // Closing group of 8: same sequence, but the last macro is KERNEL1x1_E2.
  868. KERNEL1x1_1
  869. KERNEL1x1_2
  870. KERNEL1x1_1
  871. KERNEL1x1_2
  872. KERNEL1x1_1
  873. KERNEL1x1_2
  874. KERNEL1x1_1
  875. KERNEL1x1_E2
  876. b .LZTRMM_L1x1_SUB1 // continue with the K1 & 7 remainder
  877. .LZTRMM_L1x1_SUB4:
  // L == 1: one KERNEL1x1_SUBI1 followed by seven KERNEL1x1_SUB1 (8 macros in total).
  878. KERNEL1x1_SUBI1
  879. KERNEL1x1_SUB1
  880. KERNEL1x1_SUB1
  881. KERNEL1x1_SUB1
  882. KERNEL1x1_SUB1
  883. KERNEL1x1_SUB1
  884. KERNEL1x1_SUB1
  885. KERNEL1x1_SUB1
  886. b .LZTRMM_L1x1_SUB1 // continue with the K1 & 7 remainder
  887. .LZTRMM_L1x1_SUB0:
  // K1 < 8. NOTE(review): KERNEL1x1_SUBI1 runs before L is tested, so it also runs when
  // K1 & 7 is 0 - presumably K1 >= 1 is guaranteed here; confirm against the caller.
  888. andi. L, K1, 7 // K1 & 7 -> L
  889. KERNEL1x1_SUBI1
  890. addic. L, L, -1 // one step already issued by SUBI1
  891. ble .LZTRMM_L1x1_SAVE
  892. b .LZTRMM_L1x1_SUB2
  893. .LZTRMM_L1x1_SUB1:
  894. andi. L, K1, 7 // K1 & 7 -> L
  895. ble .LZTRMM_L1x1_SAVE // remainder is 0
  896. .LZTRMM_L1x1_SUB2:
  897. KERNEL1x1_SUB1 // one macro invocation per remaining count in L
  898. addic. L, L, -1
  899. bgt .LZTRMM_L1x1_SUB2
  900. .LZTRMM_L1x1_SAVE:
  901. SAVE1x1
  902. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  903. sub T1, K, KKK // K - KKK -> TEMP1
  904. slwi T2, T1, 4 // TEMP1 * 16 -> TEMP2 (byte offset in B)
  905. slwi T1, T1, 4 // TEMP1 * 16 -> TEMP1 (byte offset in A)
  906. add BO, BO, T2 // BO += TEMP2
  907. add AO, AO, T1 // AO += TEMP1
  908. #endif
  909. #if defined(LEFT)
  910. addi KK, KK, 1 // KK += Number of values in A
  911. #endif
  // End of the single-column section: update KK for the non-LEFT case, then fall through.
  912. .LZTRMM_L1x1_END:
  913. #if !defined(LEFT)
  914. addi KK, KK, 1 // KK += Number of values in B
  915. #endif
  916. .LZTRMM_L1_END: