You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

ctrmm_logic_8x4_power8.S 31 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769
  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2016/04/04 Werner Saar (wernsaar@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  // Outer loop over N in groups of 4: J = N >> 2; the whole L4 section is
  // skipped (branch to CTRMM_L4_END) when J <= 0.
  34. srawi. J, N, 2
  35. ble CTRMM_L4_END
  36. CTRMM_L4_BEGIN:
  // Per-J setup: CO and AO restart from C and A, then C is advanced by LDC << 2.
  // NOTE(review): slwi is a 32-bit shift - confirm LDC << 2 always fits in 32 bits.
  37. mr CO, C
  38. mr AO, A
  39. slwi T1, LDC , 2
  40. add C, C, T1
  41. #if defined(LEFT)
  42. mr KK, OFFSET // OFFSET -> KK
  43. #endif
  // I = M >> 3 is the number of passes through the 4x8 tile loop; none when I <= 0.
  44. srawi. I, M, 3
  45. ble CTRMM_L4x8_END
  // ---- 4x8 tile loop: executed I times (I is decremented at the bottom) ----
  // LOAD4x8_1, KERNEL4x8_* and SAVE4x8 are macros defined outside this excerpt.
  46. CTRMM_L4x8_BEGIN:
  // BO restarts from B; in the #else case BO and AO are additionally advanced
  // by KK << 5 and KK << 6 respectively.
  47. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  48. mr BO, B // B -> BO
  49. #else
  50. mr BO, B // B -> BO
  51. slwi T1, KK, 5 // Number of values in B shifted
  52. slwi T2, KK, 6 // Number of values in A shifted
  53. add BO, BO, T1 // Add values to BO
  54. add AO, AO, T2 // Add values to AO
  55. #endif
  // T1 = step count for this tile: K - KK, or KK + 8 (LEFT) / KK + 4 (!LEFT).
  56. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  57. sub T1, K, KK // K - KK -> TEMP1
  58. #else
  59. mr T1, KK // KK -> KTEMP
  60. #ifdef LEFT
  61. addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
  62. #else
  63. addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
  64. #endif
  65. #endif
  // The count is kept twice: KKK is read again after SAVE4x8, K1 feeds the
  // L = K1 >> 3 and L = K1 & 7 computations below.
  66. mr KKK, T1
  67. mr K1, T1
  68. srawi. L, K1, 3 // KTEMP / 8 -> L
  // L <= 0 (K1 < 8): only the remainder path is needed.
  69. ble CTRMM_L4x8_SUB0
  70. cmpwi cr0, L, 1
  // L <= 1: take the non-pipelined path of eight SUB-kernel invocations.
  71. ble CTRMM_L4x8_SUB4
  // L >= 2: prologue of eight kernel invocations (LOAD + I1, then alternating
  // _2/_1), interleaved with dcbt cache-touch hints at AO+PRE and BO+PRE.
  72. CTRMM_L4x8_LOOP_START:
  73. dcbt AO, PRE
  74. dcbt BO, PRE
  75. LOAD4x8_1
  76. KERNEL4x8_I1
  77. dcbt AO, PRE
  78. KERNEL4x8_2
  79. KERNEL4x8_1
  80. dcbt AO, PRE
  81. KERNEL4x8_2
  82. KERNEL4x8_1
  83. dcbt AO, PRE
  84. KERNEL4x8_2
  85. KERNEL4x8_1
  86. dcbt AO, PRE
  87. dcbt BO, PRE
  88. KERNEL4x8_2
  // L -= 2; if nothing is left for the steady-state loop, go straight to the
  // closing group at LOOP_END.
  89. addic. L, L, -2
  90. ble CTRMM_L4x8_LOOP_END
  91. .align 5
  // Steady state: eight kernel invocations per iteration, repeated while L > 0.
  92. CTRMM_L4x8_LOOP:
  93. KERNEL4x8_1
  94. dcbt AO, PRE
  95. KERNEL4x8_2
  96. KERNEL4x8_1
  97. dcbt AO, PRE
  98. KERNEL4x8_2
  99. KERNEL4x8_1
  100. dcbt AO, PRE
  101. KERNEL4x8_2
  102. KERNEL4x8_1
  103. dcbt AO, PRE
  104. dcbt BO, PRE
  105. KERNEL4x8_2
  106. addic. L, L, -1
  107. bgt CTRMM_L4x8_LOOP
  // Closing group: eight kernel invocations ending in KERNEL4x8_E2, then the
  // K1 & 7 remainder is handled at SUB1.
  108. CTRMM_L4x8_LOOP_END:
  109. KERNEL4x8_1
  110. dcbt AO, PRE
  111. KERNEL4x8_2
  112. KERNEL4x8_1
  113. dcbt AO, PRE
  114. KERNEL4x8_2
  115. KERNEL4x8_1
  116. KERNEL4x8_2
  117. KERNEL4x8_1
  118. KERNEL4x8_E2
  119. b CTRMM_L4x8_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations, then the remainder.
  120. CTRMM_L4x8_SUB4:
  121. KERNEL4x8_SUBI1
  122. KERNEL4x8_SUB1
  123. KERNEL4x8_SUB1
  124. KERNEL4x8_SUB1
  125. KERNEL4x8_SUB1
  126. KERNEL4x8_SUB1
  127. KERNEL4x8_SUB1
  128. KERNEL4x8_SUB1
  129. b CTRMM_L4x8_SUB1
  // K1 < 8 path: SUBI1 is invoked before L is tested, so it runs once even if
  // K1 & 7 is zero. NOTE(review): presumably K1 >= 1 always holds here - confirm.
  130. CTRMM_L4x8_SUB0:
  131. andi. L, K1, 7 // K1 & 7 -> L
  132. KERNEL4x8_SUBI1
  133. addic. L, L, -1
  134. ble CTRMM_L4x8_SAVE
  135. b CTRMM_L4x8_SUB2
  // Remainder after the 8-step groups: skip to SAVE when K1 & 7 is zero.
  136. CTRMM_L4x8_SUB1:
  137. andi. L, K1, 7 // K1 & 7 -> L
  138. ble CTRMM_L4x8_SAVE
  // One SUB1 invocation per remaining count in L.
  139. CTRMM_L4x8_SUB2:
  140. KERNEL4x8_SUB1
  141. addic. L, L, -1
  142. bgt CTRMM_L4x8_SUB2
  143. CTRMM_L4x8_SAVE:
  144. SAVE4x8
  // In this configuration, advance BO and AO by (K - KKK) scaled by the same
  // shifts (5 and 6) used for the tile.
  145. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  146. sub T1, K, KKK // K - KKK -> TEMP1
  147. slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
  148. slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
  149. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  150. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  151. #endif
  152. #if defined(LEFT)
  153. addi KK, KK, 8 // KK += Number of values in A
  154. #endif
  // Next 4x8 tile while I > 0.
  155. addic. I, I, -1
  156. bgt CTRMM_L4x8_BEGIN
  157. CTRMM_L4x8_END:
  // ---- 4x4 tile: runs at most once, when bit 2 of M is set ----
  // LOAD4x4_1, KERNEL4x4_* and SAVE4x4 are macros defined outside this excerpt.
  158. CTRMM_L4x4_BEGIN:
  // M & 7 == 0: no remainder tiles at all, jump past the 4x1 tile.
  159. andi. T2, M, 7
  160. ble CTRMM_L4x1_END
  // M & 4 == 0: skip this tile.
  161. andi. T1, M, 4
  162. ble CTRMM_L4x4_END
  // BO restarts from B; in the #else case BO and AO are both advanced by KK << 5.
  163. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  164. mr BO, B // B -> BO
  165. #else
  166. mr BO, B // B -> BO
  167. slwi T1, KK, 5 // Number of values in B shifted
  168. slwi T2, KK, 5 // Number of values in A shifted
  169. add BO, BO, T1 // Add values to BO
  170. add AO, AO, T2 // Add values to AO
  171. #endif
  // T1 = step count for this tile: K - KK, or KK + 4 (same value for LEFT and !LEFT).
  172. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  173. sub T1, K, KK // K - KK -> TEMP1
  174. #else
  175. mr T1, KK // KK -> KTEMP
  176. #ifdef LEFT
  177. addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
  178. #else
  179. addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
  180. #endif
  181. #endif
  // KKK is read again after SAVE4x4; K1 feeds the L computations below.
  182. mr KKK, T1
  183. mr K1, T1
  184. srawi. L, K1, 3 // KTEMP / 8 -> L
  // L <= 0 (K1 < 8) -> SUB0; L <= 1 -> SUB4; otherwise the pipelined path.
  185. ble CTRMM_L4x4_SUB0
  186. cmpwi cr0, L, 1
  187. ble CTRMM_L4x4_SUB4
  // Prologue: LOAD + I1, then alternating _2/_1 (eight kernel invocations).
  188. CTRMM_L4x4_LOOP_START:
  189. LOAD4x4_1
  190. KERNEL4x4_I1
  191. KERNEL4x4_2
  192. KERNEL4x4_1
  193. KERNEL4x4_2
  194. KERNEL4x4_1
  195. KERNEL4x4_2
  196. KERNEL4x4_1
  197. KERNEL4x4_2
  // L -= 2; go to the closing group if nothing is left for the steady-state loop.
  198. addic. L, L, -2
  199. ble CTRMM_L4x4_LOOP_END
  200. .align 5
  // Steady state: eight kernel invocations per iteration while L > 0.
  201. CTRMM_L4x4_LOOP:
  202. KERNEL4x4_1
  203. KERNEL4x4_2
  204. KERNEL4x4_1
  205. KERNEL4x4_2
  206. KERNEL4x4_1
  207. KERNEL4x4_2
  208. KERNEL4x4_1
  209. KERNEL4x4_2
  210. addic. L, L, -1
  211. bgt CTRMM_L4x4_LOOP
  // Closing group ending in KERNEL4x4_E2, then the K1 & 7 remainder at SUB1.
  212. CTRMM_L4x4_LOOP_END:
  213. KERNEL4x4_1
  214. KERNEL4x4_2
  215. KERNEL4x4_1
  216. KERNEL4x4_2
  217. KERNEL4x4_1
  218. KERNEL4x4_2
  219. KERNEL4x4_1
  220. KERNEL4x4_E2
  221. b CTRMM_L4x4_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations, then the remainder.
  222. CTRMM_L4x4_SUB4:
  223. KERNEL4x4_SUBI1
  224. KERNEL4x4_SUB1
  225. KERNEL4x4_SUB1
  226. KERNEL4x4_SUB1
  227. KERNEL4x4_SUB1
  228. KERNEL4x4_SUB1
  229. KERNEL4x4_SUB1
  230. KERNEL4x4_SUB1
  231. b CTRMM_L4x4_SUB1
  // K1 < 8 path: SUBI1 is invoked before L is tested, so it runs once even if
  // K1 & 7 is zero. NOTE(review): presumably K1 >= 1 always holds here - confirm.
  232. CTRMM_L4x4_SUB0:
  233. andi. L, K1, 7 // K1 & 7 -> L
  234. KERNEL4x4_SUBI1
  235. addic. L, L, -1
  236. ble CTRMM_L4x4_SAVE
  237. b CTRMM_L4x4_SUB2
  // Remainder after the 8-step groups: skip to SAVE when K1 & 7 is zero.
  238. CTRMM_L4x4_SUB1:
  239. andi. L, K1, 7 // K1 & 7 -> L
  240. ble CTRMM_L4x4_SAVE
  // One SUB1 invocation per remaining count in L.
  241. CTRMM_L4x4_SUB2:
  242. KERNEL4x4_SUB1
  243. addic. L, L, -1
  244. bgt CTRMM_L4x4_SUB2
  245. CTRMM_L4x4_SAVE:
  246. SAVE4x4
  // In this configuration, advance BO and AO by (K - KKK) << 5 each.
  247. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  248. sub T1, K, KKK // K - KKK -> TEMP1
  249. slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
  250. slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
  251. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  252. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  253. #endif
  254. #if defined(LEFT)
  255. addi KK, KK, 4 // KK += Number of values in A
  256. #endif
  257. CTRMM_L4x4_END:
  // ---- 4x2 tile: runs at most once, when bit 1 of M is set ----
  // LOAD4x2_1, KERNEL4x2_* and SAVE4x2 are macros defined outside this excerpt.
  258. CTRMM_L4x2_BEGIN:
  // M & 2 == 0: skip this tile.
  259. andi. T1, M, 2
  260. ble CTRMM_L4x2_END
  // BO restarts from B; in the #else case BO += KK << 5 and AO += KK << 4.
  261. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  262. mr BO, B // B -> BO
  263. #else
  264. mr BO, B // B -> BO
  265. slwi T1, KK, 5 // Number of values in B shifted
  266. slwi T2, KK, 4 // Number of values in A shifted
  267. add BO, BO, T1 // Add values to BO
  268. add AO, AO, T2 // Add values to AO
  269. #endif
  // T1 = step count for this tile: K - KK, or KK + 2 (LEFT) / KK + 4 (!LEFT).
  270. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  271. sub T1, K, KK // K - KK -> TEMP1
  272. #else
  273. mr T1, KK // KK -> KTEMP
  274. #ifdef LEFT
  275. addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
  276. #else
  277. addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
  278. #endif
  279. #endif
  // KKK is read again after SAVE4x2; K1 feeds the L computations below.
  280. mr KKK, T1
  281. mr K1, T1
  282. srawi. L, K1, 3 // KTEMP / 8 -> L
  // L <= 0 (K1 < 8) -> SUB0; L <= 1 -> SUB4; otherwise the pipelined path.
  283. ble CTRMM_L4x2_SUB0
  284. cmpwi cr0, L, 1
  285. ble CTRMM_L4x2_SUB4
  // Prologue: LOAD + I1, then alternating _2/_1 (eight kernel invocations).
  286. CTRMM_L4x2_LOOP_START:
  287. LOAD4x2_1
  288. KERNEL4x2_I1
  289. KERNEL4x2_2
  290. KERNEL4x2_1
  291. KERNEL4x2_2
  292. KERNEL4x2_1
  293. KERNEL4x2_2
  294. KERNEL4x2_1
  295. KERNEL4x2_2
  // L -= 2; go to the closing group if nothing is left for the steady-state loop.
  296. addic. L, L, -2
  297. ble CTRMM_L4x2_LOOP_END
  298. .align 5
  // Steady state: eight kernel invocations per iteration while L > 0.
  299. CTRMM_L4x2_LOOP:
  300. KERNEL4x2_1
  301. KERNEL4x2_2
  302. KERNEL4x2_1
  303. KERNEL4x2_2
  304. KERNEL4x2_1
  305. KERNEL4x2_2
  306. KERNEL4x2_1
  307. KERNEL4x2_2
  308. addic. L, L, -1
  309. bgt CTRMM_L4x2_LOOP
  // Closing group ending in KERNEL4x2_E2, then the K1 & 7 remainder at SUB1.
  310. CTRMM_L4x2_LOOP_END:
  311. KERNEL4x2_1
  312. KERNEL4x2_2
  313. KERNEL4x2_1
  314. KERNEL4x2_2
  315. KERNEL4x2_1
  316. KERNEL4x2_2
  317. KERNEL4x2_1
  318. KERNEL4x2_E2
  319. b CTRMM_L4x2_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations, then the remainder.
  320. CTRMM_L4x2_SUB4:
  321. KERNEL4x2_SUBI1
  322. KERNEL4x2_SUB1
  323. KERNEL4x2_SUB1
  324. KERNEL4x2_SUB1
  325. KERNEL4x2_SUB1
  326. KERNEL4x2_SUB1
  327. KERNEL4x2_SUB1
  328. KERNEL4x2_SUB1
  329. b CTRMM_L4x2_SUB1
  // K1 < 8 path: SUBI1 is invoked before L is tested, so it runs once even if
  // K1 & 7 is zero. NOTE(review): presumably K1 >= 1 always holds here - confirm.
  330. CTRMM_L4x2_SUB0:
  331. andi. L, K1, 7 // K1 & 7 -> L
  332. KERNEL4x2_SUBI1
  333. addic. L, L, -1
  334. ble CTRMM_L4x2_SAVE
  335. b CTRMM_L4x2_SUB2
  // Remainder after the 8-step groups: skip to SAVE when K1 & 7 is zero.
  336. CTRMM_L4x2_SUB1:
  337. andi. L, K1, 7 // K1 & 7 -> L
  338. ble CTRMM_L4x2_SAVE
  // One SUB1 invocation per remaining count in L.
  339. CTRMM_L4x2_SUB2:
  340. KERNEL4x2_SUB1
  341. addic. L, L, -1
  342. bgt CTRMM_L4x2_SUB2
  343. CTRMM_L4x2_SAVE:
  344. SAVE4x2
  // In this configuration, BO += (K - KKK) << 5 and AO += (K - KKK) << 4.
  345. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  346. sub T1, K, KKK // K - KKK -> TEMP1
  347. slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
  348. slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
  349. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  350. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  351. #endif
  352. #if defined(LEFT)
  353. addi KK, KK, 2 // KK += Number of values in A
  354. #endif
  355. CTRMM_L4x2_END:
  // ---- 4x1 tile: runs at most once, when bit 0 of M is set ----
  // LOAD4x1_1, KERNEL4x1_* and SAVE4x1 are macros defined outside this excerpt.
  356. CTRMM_L4x1_BEGIN:
  // M & 1 == 0: skip this tile.
  357. andi. T1, M, 1
  358. ble CTRMM_L4x1_END
  // BO restarts from B; in the #else case BO += KK << 5 and AO += KK << 3.
  359. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  360. mr BO, B // B -> BO
  361. #else
  362. mr BO, B // B -> BO
  363. slwi T1, KK, 5 // Number of values in B shifted
  364. slwi T2, KK, 3 // Number of values in A shifted
  365. add BO, BO, T1 // Add values to BO
  366. add AO, AO, T2 // Add values to AO
  367. #endif
  // T1 = step count for this tile: K - KK, or KK + 1 (LEFT) / KK + 4 (!LEFT).
  368. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  369. sub T1, K, KK // K - KK -> TEMP1
  370. #else
  371. mr T1, KK // KK -> KTEMP
  372. #ifdef LEFT
  373. addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
  374. #else
  375. addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
  376. #endif
  377. #endif
  // KKK is read again after SAVE4x1; K1 feeds the L computations below.
  378. mr KKK, T1
  379. mr K1, T1
  380. srawi. L, K1, 3 // KTEMP / 8 -> L
  // L <= 0 (K1 < 8) -> SUB0; L <= 1 -> SUB4; otherwise the pipelined path.
  381. ble CTRMM_L4x1_SUB0
  382. cmpwi cr0, L, 1
  383. ble CTRMM_L4x1_SUB4
  // Prologue: LOAD + I1, then alternating _2/_1 (eight kernel invocations).
  384. CTRMM_L4x1_LOOP_START:
  385. LOAD4x1_1
  386. KERNEL4x1_I1
  387. KERNEL4x1_2
  388. KERNEL4x1_1
  389. KERNEL4x1_2
  390. KERNEL4x1_1
  391. KERNEL4x1_2
  392. KERNEL4x1_1
  393. KERNEL4x1_2
  // L -= 2; go to the closing group if nothing is left for the steady-state loop.
  394. addic. L, L, -2
  395. ble CTRMM_L4x1_LOOP_END
  396. .align 5
  // Steady state: eight kernel invocations per iteration while L > 0.
  397. CTRMM_L4x1_LOOP:
  398. KERNEL4x1_1
  399. KERNEL4x1_2
  400. KERNEL4x1_1
  401. KERNEL4x1_2
  402. KERNEL4x1_1
  403. KERNEL4x1_2
  404. KERNEL4x1_1
  405. KERNEL4x1_2
  406. addic. L, L, -1
  407. bgt CTRMM_L4x1_LOOP
  // Closing group ending in KERNEL4x1_E2, then the K1 & 7 remainder at SUB1.
  408. CTRMM_L4x1_LOOP_END:
  409. KERNEL4x1_1
  410. KERNEL4x1_2
  411. KERNEL4x1_1
  412. KERNEL4x1_2
  413. KERNEL4x1_1
  414. KERNEL4x1_2
  415. KERNEL4x1_1
  416. KERNEL4x1_E2
  417. b CTRMM_L4x1_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations, then the remainder.
  418. CTRMM_L4x1_SUB4:
  419. KERNEL4x1_SUBI1
  420. KERNEL4x1_SUB1
  421. KERNEL4x1_SUB1
  422. KERNEL4x1_SUB1
  423. KERNEL4x1_SUB1
  424. KERNEL4x1_SUB1
  425. KERNEL4x1_SUB1
  426. KERNEL4x1_SUB1
  427. b CTRMM_L4x1_SUB1
  // K1 < 8 path: SUBI1 is invoked before L is tested, so it runs once even if
  // K1 & 7 is zero. NOTE(review): presumably K1 >= 1 always holds here - confirm.
  428. CTRMM_L4x1_SUB0:
  429. andi. L, K1, 7 // K1 & 7 -> L
  430. KERNEL4x1_SUBI1
  431. addic. L, L, -1
  432. ble CTRMM_L4x1_SAVE
  433. b CTRMM_L4x1_SUB2
  // Remainder after the 8-step groups: skip to SAVE when K1 & 7 is zero.
  434. CTRMM_L4x1_SUB1:
  435. andi. L, K1, 7 // K1 & 7 -> L
  436. ble CTRMM_L4x1_SAVE
  // One SUB1 invocation per remaining count in L.
  437. CTRMM_L4x1_SUB2:
  438. KERNEL4x1_SUB1
  439. addic. L, L, -1
  440. bgt CTRMM_L4x1_SUB2
  441. CTRMM_L4x1_SAVE:
  442. SAVE4x1
  // In this configuration, BO += (K - KKK) << 5 and AO += (K - KKK) << 3.
  443. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  444. sub T1, K, KKK // K - KKK -> TEMP1
  445. slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
  446. slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
  447. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  448. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  449. #endif
  450. #if defined(LEFT)
  451. addi KK, KK, 1 // KK += Number of values in A
  452. #endif
  453. CTRMM_L4x1_END:
  // End of one J iteration: advance B by K << 5 and, for !LEFT, KK by 4.
  454. slwi T1, K, 5
  455. add B, B, T1
  456. #if !defined(LEFT)
  457. addi KK, KK, 4 // KK += Number of values in B
  458. #endif
  // Next group of 4 while J > 0.
  459. addic. J, J, -1
  460. bgt CTRMM_L4_BEGIN
  // After the J loop: if N & 3 == 0 nothing remains, leave via L999_H2.
  461. andi. T2, N, 3
  462. ble L999_H2
  // Also the target used when the J loop is skipped entirely; continues with
  // the N & 2 section.
  463. CTRMM_L4_END:
  464. b CTRMM_L2_BEGIN
  // Trampoline to L999_H2. The unconditional branch above means it is only
  // reached by a branch to this label; none is visible in this excerpt.
  465. L999_H1:
  466. b L999_H2
  // ---- N & 2 section: runs once when bit 1 of N is set ----
  467. CTRMM_L2_BEGIN:
  468. andi. T1, N, 2
  469. ble CTRMM_L2_END
  // CO and AO restart from C and A, then C is advanced by LDC << 1.
  // NOTE(review): slwi is a 32-bit shift - confirm LDC << 1 always fits in 32 bits.
  470. mr CO, C
  471. mr AO, A
  472. slwi T1, LDC , 1
  473. add C, C, T1
  474. #if defined(LEFT)
  475. mr KK, OFFSET // OFFSET -> KK
  476. #endif
  // I = M >> 3 is the number of passes through the 2x8 tile loop; none when I <= 0.
  477. srawi. I, M, 3
  478. ble CTRMM_L2x8_END
  // ---- 2x8 tile loop: executed I times (I is decremented at the bottom) ----
  // LOAD2x8_1, KERNEL2x8_* and SAVE2x8 are macros defined outside this excerpt.
  479. CTRMM_L2x8_BEGIN:
  // BO restarts from B; in the #else case BO += KK << 4 and AO += KK << 6.
  480. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  481. mr BO, B // B -> BO
  482. #else
  483. mr BO, B // B -> BO
  484. slwi T1, KK, 4 // Number of values in B shifted
  485. slwi T2, KK, 6 // Number of values in A shifted
  486. add BO, BO, T1 // Add values to BO
  487. add AO, AO, T2 // Add values to AO
  488. #endif
  // T1 = step count for this tile: K - KK, or KK + 8 (LEFT) / KK + 2 (!LEFT).
  489. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  490. sub T1, K, KK // K - KK -> TEMP1
  491. #else
  492. mr T1, KK // KK -> KTEMP
  493. #ifdef LEFT
  494. addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
  495. #else
  496. addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
  497. #endif
  498. #endif
  // KKK is read again after SAVE2x8; K1 feeds the L computations below.
  499. mr KKK, T1
  500. mr K1, T1
  501. srawi. L, K1, 3 // KTEMP / 8 -> L
  // L <= 0 (K1 < 8) -> SUB0; L <= 1 -> SUB4; otherwise the pipelined path.
  502. ble CTRMM_L2x8_SUB0
  503. cmpwi cr0, L, 1
  504. ble CTRMM_L2x8_SUB4
  // Prologue: LOAD + I1, then alternating _2/_1 (eight kernel invocations).
  505. CTRMM_L2x8_LOOP_START:
  506. LOAD2x8_1
  507. KERNEL2x8_I1
  508. KERNEL2x8_2
  509. KERNEL2x8_1
  510. KERNEL2x8_2
  511. KERNEL2x8_1
  512. KERNEL2x8_2
  513. KERNEL2x8_1
  514. KERNEL2x8_2
  // L -= 2; go to the closing group if nothing is left for the steady-state loop.
  515. addic. L, L, -2
  516. ble CTRMM_L2x8_LOOP_END
  517. .align 5
  // Steady state: eight kernel invocations per iteration while L > 0.
  518. CTRMM_L2x8_LOOP:
  519. KERNEL2x8_1
  520. KERNEL2x8_2
  521. KERNEL2x8_1
  522. KERNEL2x8_2
  523. KERNEL2x8_1
  524. KERNEL2x8_2
  525. KERNEL2x8_1
  526. KERNEL2x8_2
  527. addic. L, L, -1
  528. bgt CTRMM_L2x8_LOOP
  // Closing group ending in KERNEL2x8_E2, then the K1 & 7 remainder at SUB1.
  529. CTRMM_L2x8_LOOP_END:
  530. KERNEL2x8_1
  531. KERNEL2x8_2
  532. KERNEL2x8_1
  533. KERNEL2x8_2
  534. KERNEL2x8_1
  535. KERNEL2x8_2
  536. KERNEL2x8_1
  537. KERNEL2x8_E2
  538. b CTRMM_L2x8_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations, then the remainder.
  539. CTRMM_L2x8_SUB4:
  540. KERNEL2x8_SUBI1
  541. KERNEL2x8_SUB1
  542. KERNEL2x8_SUB1
  543. KERNEL2x8_SUB1
  544. KERNEL2x8_SUB1
  545. KERNEL2x8_SUB1
  546. KERNEL2x8_SUB1
  547. KERNEL2x8_SUB1
  548. b CTRMM_L2x8_SUB1
  // K1 < 8 path: SUBI1 is invoked before L is tested, so it runs once even if
  // K1 & 7 is zero. NOTE(review): presumably K1 >= 1 always holds here - confirm.
  549. CTRMM_L2x8_SUB0:
  550. andi. L, K1, 7 // K1 & 7 -> L
  551. KERNEL2x8_SUBI1
  552. addic. L, L, -1
  553. ble CTRMM_L2x8_SAVE
  554. b CTRMM_L2x8_SUB2
  // Remainder after the 8-step groups: skip to SAVE when K1 & 7 is zero.
  555. CTRMM_L2x8_SUB1:
  556. andi. L, K1, 7 // K1 & 7 -> L
  557. ble CTRMM_L2x8_SAVE
  // One SUB1 invocation per remaining count in L.
  558. CTRMM_L2x8_SUB2:
  559. KERNEL2x8_SUB1
  560. addic. L, L, -1
  561. bgt CTRMM_L2x8_SUB2
  562. CTRMM_L2x8_SAVE:
  563. SAVE2x8
  // In this configuration, BO += (K - KKK) << 4 and AO += (K - KKK) << 6.
  564. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  565. sub T1, K, KKK // K - KKK -> TEMP1
  566. slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
  567. slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
  568. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  569. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  570. #endif
  571. #if defined(LEFT)
  572. addi KK, KK, 8 // KK += Number of values in A
  573. #endif
  // Next 2x8 tile while I > 0.
  574. addic. I, I, -1
  575. bgt CTRMM_L2x8_BEGIN
  576. CTRMM_L2x8_END:
  // ---- 2x4 tile: runs at most once, when bit 2 of M is set ----
  // LOAD2x4_1, KERNEL2x4_* and SAVE2x4 are macros defined outside this excerpt.
  577. CTRMM_L2x4_BEGIN:
  // M & 7 == 0: no remainder tiles at all, jump past the 2x1 tile.
  578. andi. T2, M, 7
  579. ble CTRMM_L2x1_END
  // M & 4 == 0: skip this tile.
  580. andi. T1, M, 4
  581. ble CTRMM_L2x4_END
  // BO restarts from B; in the #else case BO += KK << 4 and AO += KK << 5.
  582. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  583. mr BO, B // B -> BO
  584. #else
  585. mr BO, B // B -> BO
  586. slwi T1, KK, 4 // Number of values in B shifted
  587. slwi T2, KK, 5 // Number of values in A shifted
  588. add BO, BO, T1 // Add values to BO
  589. add AO, AO, T2 // Add values to AO
  590. #endif
  // T1 = step count for this tile: K - KK, or KK + 4 (LEFT) / KK + 2 (!LEFT).
  591. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  592. sub T1, K, KK // K - KK -> TEMP1
  593. #else
  594. mr T1, KK // KK -> KTEMP
  595. #ifdef LEFT
  596. addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
  597. #else
  598. addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
  599. #endif
  600. #endif
  // KKK is read again after SAVE2x4; K1 feeds the L computations below.
  601. mr KKK, T1
  602. mr K1, T1
  603. srawi. L, K1, 3 // KTEMP / 8 -> L
  // L <= 0 (K1 < 8) -> SUB0; L <= 1 -> SUB4; otherwise the pipelined path.
  604. ble CTRMM_L2x4_SUB0
  605. cmpwi cr0, L, 1
  606. ble CTRMM_L2x4_SUB4
  // Prologue: LOAD + I1, then alternating _2/_1 (eight kernel invocations).
  607. CTRMM_L2x4_LOOP_START:
  608. LOAD2x4_1
  609. KERNEL2x4_I1
  610. KERNEL2x4_2
  611. KERNEL2x4_1
  612. KERNEL2x4_2
  613. KERNEL2x4_1
  614. KERNEL2x4_2
  615. KERNEL2x4_1
  616. KERNEL2x4_2
  // L -= 2; go to the closing group if nothing is left for the steady-state loop.
  617. addic. L, L, -2
  618. ble CTRMM_L2x4_LOOP_END
  619. .align 5
  // Steady state: eight kernel invocations per iteration while L > 0.
  620. CTRMM_L2x4_LOOP:
  621. KERNEL2x4_1
  622. KERNEL2x4_2
  623. KERNEL2x4_1
  624. KERNEL2x4_2
  625. KERNEL2x4_1
  626. KERNEL2x4_2
  627. KERNEL2x4_1
  628. KERNEL2x4_2
  629. addic. L, L, -1
  630. bgt CTRMM_L2x4_LOOP
  // Closing group ending in KERNEL2x4_E2, then the K1 & 7 remainder at SUB1.
  631. CTRMM_L2x4_LOOP_END:
  632. KERNEL2x4_1
  633. KERNEL2x4_2
  634. KERNEL2x4_1
  635. KERNEL2x4_2
  636. KERNEL2x4_1
  637. KERNEL2x4_2
  638. KERNEL2x4_1
  639. KERNEL2x4_E2
  640. b CTRMM_L2x4_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations, then the remainder.
  641. CTRMM_L2x4_SUB4:
  642. KERNEL2x4_SUBI1
  643. KERNEL2x4_SUB1
  644. KERNEL2x4_SUB1
  645. KERNEL2x4_SUB1
  646. KERNEL2x4_SUB1
  647. KERNEL2x4_SUB1
  648. KERNEL2x4_SUB1
  649. KERNEL2x4_SUB1
  650. b CTRMM_L2x4_SUB1
  // K1 < 8 path: SUBI1 is invoked before L is tested, so it runs once even if
  // K1 & 7 is zero. NOTE(review): presumably K1 >= 1 always holds here - confirm.
  651. CTRMM_L2x4_SUB0:
  652. andi. L, K1, 7 // K1 & 7 -> L
  653. KERNEL2x4_SUBI1
  654. addic. L, L, -1
  655. ble CTRMM_L2x4_SAVE
  656. b CTRMM_L2x4_SUB2
  // Remainder after the 8-step groups: skip to SAVE when K1 & 7 is zero.
  657. CTRMM_L2x4_SUB1:
  658. andi. L, K1, 7 // K1 & 7 -> L
  659. ble CTRMM_L2x4_SAVE
  // One SUB1 invocation per remaining count in L.
  660. CTRMM_L2x4_SUB2:
  661. KERNEL2x4_SUB1
  662. addic. L, L, -1
  663. bgt CTRMM_L2x4_SUB2
  664. CTRMM_L2x4_SAVE:
  665. SAVE2x4
  // In this configuration, BO += (K - KKK) << 4 and AO += (K - KKK) << 5.
  666. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  667. sub T1, K, KKK // K - KKK -> TEMP1
  668. slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
  669. slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
  670. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  671. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  672. #endif
  673. #if defined(LEFT)
  674. addi KK, KK, 4 // KK += Number of values in A
  675. #endif
  676. CTRMM_L2x4_END:
  // ---- 2x2 tile: runs at most once, when bit 1 of M is set ----
  // LOAD2x2_1, KERNEL2x2_* and SAVE2x2 are macros defined outside this excerpt.
  677. CTRMM_L2x2_BEGIN:
  // M & 2 == 0: skip this tile.
  678. andi. T1, M, 2
  679. ble CTRMM_L2x2_END
  // BO restarts from B; in the #else case BO and AO are both advanced by KK << 4.
  680. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  681. mr BO, B // B -> BO
  682. #else
  683. mr BO, B // B -> BO
  684. slwi T1, KK, 4 // Number of values in B shifted
  685. slwi T2, KK, 4 // Number of values in A shifted
  686. add BO, BO, T1 // Add values to BO
  687. add AO, AO, T2 // Add values to AO
  688. #endif
  // T1 = step count for this tile: K - KK, or KK + 2 (same value for LEFT and !LEFT).
  689. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  690. sub T1, K, KK // K - KK -> TEMP1
  691. #else
  692. mr T1, KK // KK -> KTEMP
  693. #ifdef LEFT
  694. addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
  695. #else
  696. addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
  697. #endif
  698. #endif
  // KKK is read again after SAVE2x2; K1 feeds the L computations below.
  699. mr KKK, T1
  700. mr K1, T1
  701. srawi. L, K1, 3 // KTEMP / 8 -> L
  // L <= 0 (K1 < 8) -> SUB0; L <= 1 -> SUB4; otherwise the pipelined path.
  702. ble CTRMM_L2x2_SUB0
  703. cmpwi cr0, L, 1
  704. ble CTRMM_L2x2_SUB4
  // Prologue: LOAD + I1, then alternating _2/_1 (eight kernel invocations).
  705. CTRMM_L2x2_LOOP_START:
  706. LOAD2x2_1
  707. KERNEL2x2_I1
  708. KERNEL2x2_2
  709. KERNEL2x2_1
  710. KERNEL2x2_2
  711. KERNEL2x2_1
  712. KERNEL2x2_2
  713. KERNEL2x2_1
  714. KERNEL2x2_2
  // L -= 2; go to the closing group if nothing is left for the steady-state loop.
  715. addic. L, L, -2
  716. ble CTRMM_L2x2_LOOP_END
  717. .align 5
  // Steady state: eight kernel invocations per iteration while L > 0.
  718. CTRMM_L2x2_LOOP:
  719. KERNEL2x2_1
  720. KERNEL2x2_2
  721. KERNEL2x2_1
  722. KERNEL2x2_2
  723. KERNEL2x2_1
  724. KERNEL2x2_2
  725. KERNEL2x2_1
  726. KERNEL2x2_2
  727. addic. L, L, -1
  728. bgt CTRMM_L2x2_LOOP
  // Closing group ending in KERNEL2x2_E2, then the K1 & 7 remainder at SUB1.
  729. CTRMM_L2x2_LOOP_END:
  730. KERNEL2x2_1
  731. KERNEL2x2_2
  732. KERNEL2x2_1
  733. KERNEL2x2_2
  734. KERNEL2x2_1
  735. KERNEL2x2_2
  736. KERNEL2x2_1
  737. KERNEL2x2_E2
  738. b CTRMM_L2x2_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations, then the remainder.
  739. CTRMM_L2x2_SUB4:
  740. KERNEL2x2_SUBI1
  741. KERNEL2x2_SUB1
  742. KERNEL2x2_SUB1
  743. KERNEL2x2_SUB1
  744. KERNEL2x2_SUB1
  745. KERNEL2x2_SUB1
  746. KERNEL2x2_SUB1
  747. KERNEL2x2_SUB1
  748. b CTRMM_L2x2_SUB1
  // K1 < 8 path: SUBI1 is invoked before L is tested, so it runs once even if
  // K1 & 7 is zero. NOTE(review): presumably K1 >= 1 always holds here - confirm.
  749. CTRMM_L2x2_SUB0:
  750. andi. L, K1, 7 // K1 & 7 -> L
  751. KERNEL2x2_SUBI1
  752. addic. L, L, -1
  753. ble CTRMM_L2x2_SAVE
  754. b CTRMM_L2x2_SUB2
  // Remainder after the 8-step groups: skip to SAVE when K1 & 7 is zero.
  755. CTRMM_L2x2_SUB1:
  756. andi. L, K1, 7 // K1 & 7 -> L
  757. ble CTRMM_L2x2_SAVE
  // One SUB1 invocation per remaining count in L.
  758. CTRMM_L2x2_SUB2:
  759. KERNEL2x2_SUB1
  760. addic. L, L, -1
  761. bgt CTRMM_L2x2_SUB2
  762. CTRMM_L2x2_SAVE:
  763. SAVE2x2
  // In this configuration, advance BO and AO by (K - KKK) << 4 each.
  764. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  765. sub T1, K, KKK // K - KKK -> TEMP1
  766. slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
  767. slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
  768. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  769. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  770. #endif
  771. #if defined(LEFT)
  772. addi KK, KK, 2 // KK += Number of values in A
  773. #endif
  774. CTRMM_L2x2_END:
  // ---- 2x1 tile: runs at most once, when bit 0 of M is set ----
  // LOAD2x1_1, KERNEL2x1_* and SAVE2x1 are macros defined outside this excerpt.
  775. CTRMM_L2x1_BEGIN:
  // M & 1 == 0: skip this tile.
  776. andi. T1, M, 1
  777. ble CTRMM_L2x1_END
  // BO restarts from B; in the #else case BO += KK << 4 and AO += KK << 3.
  778. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  779. mr BO, B // B -> BO
  780. #else
  781. mr BO, B // B -> BO
  782. slwi T1, KK, 4 // Number of values in B shifted
  783. slwi T2, KK, 3 // Number of values in A shifted
  784. add BO, BO, T1 // Add values to BO
  785. add AO, AO, T2 // Add values to AO
  786. #endif
  // T1 = step count for this tile: K - KK, or KK + 1 (LEFT) / KK + 2 (!LEFT).
  787. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  788. sub T1, K, KK // K - KK -> TEMP1
  789. #else
  790. mr T1, KK // KK -> KTEMP
  791. #ifdef LEFT
  792. addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
  793. #else
  794. addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
  795. #endif
  796. #endif
  // KKK is read again after SAVE2x1; K1 feeds the L computations below.
  797. mr KKK, T1
  798. mr K1, T1
  799. srawi. L, K1, 3 // KTEMP / 8 -> L
  // L <= 0 (K1 < 8) -> SUB0; L <= 1 -> SUB4; otherwise the pipelined path.
  800. ble CTRMM_L2x1_SUB0
  801. cmpwi cr0, L, 1
  802. ble CTRMM_L2x1_SUB4
  // Prologue: LOAD + I1, then alternating _2/_1 (eight kernel invocations).
  803. CTRMM_L2x1_LOOP_START:
  804. LOAD2x1_1
  805. KERNEL2x1_I1
  806. KERNEL2x1_2
  807. KERNEL2x1_1
  808. KERNEL2x1_2
  809. KERNEL2x1_1
  810. KERNEL2x1_2
  811. KERNEL2x1_1
  812. KERNEL2x1_2
  // L -= 2; go to the closing group if nothing is left for the steady-state loop.
  813. addic. L, L, -2
  814. ble CTRMM_L2x1_LOOP_END
  815. .align 5
  // Steady state: eight kernel invocations per iteration while L > 0.
  816. CTRMM_L2x1_LOOP:
  817. KERNEL2x1_1
  818. KERNEL2x1_2
  819. KERNEL2x1_1
  820. KERNEL2x1_2
  821. KERNEL2x1_1
  822. KERNEL2x1_2
  823. KERNEL2x1_1
  824. KERNEL2x1_2
  825. addic. L, L, -1
  826. bgt CTRMM_L2x1_LOOP
  // Closing group ending in KERNEL2x1_E2, then the K1 & 7 remainder at SUB1.
  827. CTRMM_L2x1_LOOP_END:
  828. KERNEL2x1_1
  829. KERNEL2x1_2
  830. KERNEL2x1_1
  831. KERNEL2x1_2
  832. KERNEL2x1_1
  833. KERNEL2x1_2
  834. KERNEL2x1_1
  835. KERNEL2x1_E2
  836. b CTRMM_L2x1_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations, then the remainder.
  837. CTRMM_L2x1_SUB4:
  838. KERNEL2x1_SUBI1
  839. KERNEL2x1_SUB1
  840. KERNEL2x1_SUB1
  841. KERNEL2x1_SUB1
  842. KERNEL2x1_SUB1
  843. KERNEL2x1_SUB1
  844. KERNEL2x1_SUB1
  845. KERNEL2x1_SUB1
  846. b CTRMM_L2x1_SUB1
  // K1 < 8 path: SUBI1 is invoked before L is tested, so it runs once even if
  // K1 & 7 is zero. NOTE(review): presumably K1 >= 1 always holds here - confirm.
  847. CTRMM_L2x1_SUB0:
  848. andi. L, K1, 7 // K1 & 7 -> L
  849. KERNEL2x1_SUBI1
  850. addic. L, L, -1
  851. ble CTRMM_L2x1_SAVE
  852. b CTRMM_L2x1_SUB2
  // Remainder after the 8-step groups: skip to SAVE when K1 & 7 is zero.
  853. CTRMM_L2x1_SUB1:
  854. andi. L, K1, 7 // K1 & 7 -> L
  855. ble CTRMM_L2x1_SAVE
  // One SUB1 invocation per remaining count in L.
  856. CTRMM_L2x1_SUB2:
  857. KERNEL2x1_SUB1
  858. addic. L, L, -1
  859. bgt CTRMM_L2x1_SUB2
  860. CTRMM_L2x1_SAVE:
  861. SAVE2x1
  // In this configuration, BO += (K - KKK) << 4 and AO += (K - KKK) << 3.
  862. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  863. sub T1, K, KKK // K - KKK -> TEMP1
  864. slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
  865. slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
  866. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  867. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  868. #endif
  869. #if defined(LEFT)
  870. addi KK, KK, 1 // KK += Number of values in A
  871. #endif
  872. CTRMM_L2x1_END:
  // End of the N & 2 section: advance B by K << 4 and, for !LEFT, KK by 2.
  873. slwi T1, K, 4
  874. add B, B, T1
  875. #if !defined(LEFT)
  876. addi KK, KK, 2 // KK += Number of values in B
  877. #endif
  // Also the target when N & 2 is clear; continues with the N & 1 section.
  878. CTRMM_L2_END:
  879. b CTRMM_L1_BEGIN
  // Trampoline to L999, which is defined outside this excerpt; reached only by
  // branches to this label (e.g. when N & 3 == 0 after the L4 loop).
  880. L999_H2:
  881. b L999
  // Entry of the pass that is executed only when bit 0 of N is set.
  // Resets the C and A working pointers and, with LEFT, reloads KK from OFFSET,
  // then computes how many 8-wide tiles (M >> 3) the 1x8 code has to process.
  882. CTRMM_L1_BEGIN:
  883. andi. T1, N, 1 // T1 = N & 1, CR0 updated
  884. ble CTRMM_L1_END // masked value is 0 -> skip the whole pass
  885. mr CO, C // CO = working copy of C
  886. mr AO, A // AO = working copy of A
  887. #if defined(LEFT)
  888. mr KK, OFFSET // OFFSET -> KK
  889. #endif
  890. srawi. I, M, 3 // I = M >> 3 (count of 8-wide tiles), CR0 updated
  891. ble CTRMM_L1x8_END // no full 8-wide tile -> go straight to the remainder tiles
  // 1x8 tile. I is set to M >> 3 just before this label and the code branches
  // back here from the bottom while I > 0.
  // LOAD1x8_*, KERNEL1x8_* and SAVE1x8 are macros defined outside this chunk.
  892. CTRMM_L1x8_BEGIN:
  // Choose the starting B position (BO) and, in the #else case, advance both
  // BO and AO by KK scaled with the per-side shift.
  893. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  894. mr BO, B // B -> BO
  895. #else
  896. mr BO, B // B -> BO
  897. slwi T1, KK, 3 // Number of values in B shifted
  898. slwi T2, KK, 6 // Number of values in A shifted
  899. add BO, BO, T1 // Add values to BO
  900. add AO, AO, T2 // Add values to AO
  901. #endif
  // T1 = count that drives the kernel sequence below: either K - KK, or
  // KK plus 8 (LEFT) / KK plus 1 (not LEFT).
  902. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  903. sub T1, K, KK // K - KK -> TEMP1
  904. #else
  905. mr T1, KK // KK -> KTEMP
  906. #ifdef LEFT
  907. addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
  908. #else
  909. addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
  910. #endif
  911. #endif
  912. mr KKK, T1 // kept for the K - KKK fix-up after SAVE1x8
  913. mr K1, T1 // K1 feeds both the >> 3 count and the & 7 remainder
  914. srawi. L, K1, 3 // KTEMP / 8 -> L
  915. ble CTRMM_L1x8_SUB0 // L <= 0: no full group of eight
  916. cmpwi cr0, L, 1
  917. ble CTRMM_L1x8_SUB4 // L == 1: single group of eight via the SUB macros
  // L >= 2: opening group (LOAD, then the I1 variant followed by seven
  // alternating _2/_1 invocations). L is reduced by 2 because the opening and
  // closing groups together account for two of the L groups.
  918. CTRMM_L1x8_LOOP_START:
  919. LOAD1x8_1
  920. KERNEL1x8_I1
  921. KERNEL1x8_2
  922. KERNEL1x8_1
  923. KERNEL1x8_2
  924. KERNEL1x8_1
  925. KERNEL1x8_2
  926. KERNEL1x8_1
  927. KERNEL1x8_2
  928. addic. L, L, -2
  929. ble CTRMM_L1x8_LOOP_END // L was exactly 2: no middle groups
  930. .align 5
  // Middle groups: eight kernel macro invocations per pass, one pass per
  // remaining unit of L.
  931. CTRMM_L1x8_LOOP:
  932. KERNEL1x8_1
  933. KERNEL1x8_2
  934. KERNEL1x8_1
  935. KERNEL1x8_2
  936. KERNEL1x8_1
  937. KERNEL1x8_2
  938. KERNEL1x8_1
  939. KERNEL1x8_2
  940. addic. L, L, -1
  941. bgt CTRMM_L1x8_LOOP
  // Closing group: same shape as a middle group but the last invocation is the
  // E2 variant; afterwards the K1 & 7 remainder is handled at SUB1.
  942. CTRMM_L1x8_LOOP_END:
  943. KERNEL1x8_1
  944. KERNEL1x8_2
  945. KERNEL1x8_1
  946. KERNEL1x8_2
  947. KERNEL1x8_1
  948. KERNEL1x8_2
  949. KERNEL1x8_1
  950. KERNEL1x8_E2
  951. b CTRMM_L1x8_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations (eight in total),
  // then the remainder at SUB1.
  952. CTRMM_L1x8_SUB4:
  953. KERNEL1x8_SUBI1
  954. KERNEL1x8_SUB1
  955. KERNEL1x8_SUB1
  956. KERNEL1x8_SUB1
  957. KERNEL1x8_SUB1
  958. KERNEL1x8_SUB1
  959. KERNEL1x8_SUB1
  960. KERNEL1x8_SUB1
  961. b CTRMM_L1x8_SUB1
  // L == 0 path: nothing has been issued yet, so the first invocation is the
  // SUBI1 variant and the remainder count is reduced by one for it.
  // NOTE(review): SUBI1 runs before the remainder is tested, so one kernel
  // macro is issued even if K1 & 7 is 0; presumably K1 >= 1 here - confirm.
  962. CTRMM_L1x8_SUB0:
  963. andi. L, K1, 7 // K1 & 7 -> L
  964. KERNEL1x8_SUBI1
  965. addic. L, L, -1
  966. ble CTRMM_L1x8_SAVE // nothing left after the SUBI1 invocation
  967. b CTRMM_L1x8_SUB2
  // Remainder after one or more full groups of eight.
  968. CTRMM_L1x8_SUB1:
  969. andi. L, K1, 7 // K1 & 7 -> L
  970. ble CTRMM_L1x8_SAVE // remainder is 0
  // One SUB1 invocation per remaining unit of L.
  971. CTRMM_L1x8_SUB2:
  972. KERNEL1x8_SUB1
  973. addic. L, L, -1
  974. bgt CTRMM_L1x8_SUB2
  975. CTRMM_L1x8_SAVE:
  976. SAVE1x8
  // Post-save fix-up: advance BO and AO by (K - KKK) scaled with the same
  // shifts used at the top of the tile.
  977. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  978. sub T1, K, KKK // K - KKK -> TEMP1
  979. slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
  980. slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
  981. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  982. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  983. #endif
  984. #if defined(LEFT)
  985. addi KK, KK, 8 // KK += Number of values in A
  986. #endif
  987. addic. I, I, -1 // one 8-wide tile done
  988. bgt CTRMM_L1x8_BEGIN // more 8-wide tiles remain
  989. CTRMM_L1x8_END:
  // 1x4 tile: executed at most once, when bit 2 of M is set.
  // LOAD1x4_*, KERNEL1x4_* and SAVE1x4 are macros defined outside this chunk.
  990. CTRMM_L1x4_BEGIN:
  991. andi. T2, M, 7 // T2 = M & 7
  992. ble CTRMM_L1x1_END // M & 7 == 0: skip the 4-, 2- and 1-wide tiles altogether
  993. andi. T1, M, 4 // T1 = M & 4
  994. ble CTRMM_L1x4_END // bit 2 clear: no 4-wide tile
  // Choose the starting B position (BO) and, in the #else case, advance both
  // BO and AO by KK scaled with the per-side shift.
  995. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  996. mr BO, B // B -> BO
  997. #else
  998. mr BO, B // B -> BO
  999. slwi T1, KK, 3 // Number of values in B shifted
  1000. slwi T2, KK, 5 // Number of values in A shifted
  1001. add BO, BO, T1 // Add values to BO
  1002. add AO, AO, T2 // Add values to AO
  1003. #endif
  // T1 = count that drives the kernel sequence below: either K - KK, or
  // KK plus 4 (LEFT) / KK plus 1 (not LEFT).
  1004. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1005. sub T1, K, KK // K - KK -> TEMP1
  1006. #else
  1007. mr T1, KK // KK -> KTEMP
  1008. #ifdef LEFT
  1009. addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
  1010. #else
  1011. addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
  1012. #endif
  1013. #endif
  1014. mr KKK, T1 // kept for the K - KKK fix-up after SAVE1x4
  1015. mr K1, T1 // K1 feeds both the >> 3 count and the & 7 remainder
  1016. srawi. L, K1, 3 // KTEMP / 8 -> L
  1017. ble CTRMM_L1x4_SUB0 // L <= 0: no full group of eight
  1018. cmpwi cr0, L, 1
  1019. ble CTRMM_L1x4_SUB4 // L == 1: single group of eight via the SUB macros
  // L >= 2: opening group (LOAD, then the I1 variant followed by seven
  // alternating _2/_1 invocations). L is reduced by 2 because the opening and
  // closing groups together account for two of the L groups.
  1020. CTRMM_L1x4_LOOP_START:
  1021. LOAD1x4_1
  1022. KERNEL1x4_I1
  1023. KERNEL1x4_2
  1024. KERNEL1x4_1
  1025. KERNEL1x4_2
  1026. KERNEL1x4_1
  1027. KERNEL1x4_2
  1028. KERNEL1x4_1
  1029. KERNEL1x4_2
  1030. addic. L, L, -2
  1031. ble CTRMM_L1x4_LOOP_END // L was exactly 2: no middle groups
  1032. .align 5
  // Middle groups: eight kernel macro invocations per pass, one pass per
  // remaining unit of L.
  1033. CTRMM_L1x4_LOOP:
  1034. KERNEL1x4_1
  1035. KERNEL1x4_2
  1036. KERNEL1x4_1
  1037. KERNEL1x4_2
  1038. KERNEL1x4_1
  1039. KERNEL1x4_2
  1040. KERNEL1x4_1
  1041. KERNEL1x4_2
  1042. addic. L, L, -1
  1043. bgt CTRMM_L1x4_LOOP
  // Closing group: same shape as a middle group but the last invocation is the
  // E2 variant; afterwards the K1 & 7 remainder is handled at SUB1.
  1044. CTRMM_L1x4_LOOP_END:
  1045. KERNEL1x4_1
  1046. KERNEL1x4_2
  1047. KERNEL1x4_1
  1048. KERNEL1x4_2
  1049. KERNEL1x4_1
  1050. KERNEL1x4_2
  1051. KERNEL1x4_1
  1052. KERNEL1x4_E2
  1053. b CTRMM_L1x4_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations (eight in total),
  // then the remainder at SUB1.
  1054. CTRMM_L1x4_SUB4:
  1055. KERNEL1x4_SUBI1
  1056. KERNEL1x4_SUB1
  1057. KERNEL1x4_SUB1
  1058. KERNEL1x4_SUB1
  1059. KERNEL1x4_SUB1
  1060. KERNEL1x4_SUB1
  1061. KERNEL1x4_SUB1
  1062. KERNEL1x4_SUB1
  1063. b CTRMM_L1x4_SUB1
  // L == 0 path: nothing has been issued yet, so the first invocation is the
  // SUBI1 variant and the remainder count is reduced by one for it.
  // NOTE(review): SUBI1 runs before the remainder is tested, so one kernel
  // macro is issued even if K1 & 7 is 0; presumably K1 >= 1 here - confirm.
  1064. CTRMM_L1x4_SUB0:
  1065. andi. L, K1, 7 // K1 & 7 -> L
  1066. KERNEL1x4_SUBI1
  1067. addic. L, L, -1
  1068. ble CTRMM_L1x4_SAVE // nothing left after the SUBI1 invocation
  1069. b CTRMM_L1x4_SUB2
  // Remainder after one or more full groups of eight.
  1070. CTRMM_L1x4_SUB1:
  1071. andi. L, K1, 7 // K1 & 7 -> L
  1072. ble CTRMM_L1x4_SAVE // remainder is 0
  // One SUB1 invocation per remaining unit of L.
  1073. CTRMM_L1x4_SUB2:
  1074. KERNEL1x4_SUB1
  1075. addic. L, L, -1
  1076. bgt CTRMM_L1x4_SUB2
  1077. CTRMM_L1x4_SAVE:
  1078. SAVE1x4
  // Post-save fix-up: advance BO and AO by (K - KKK) scaled with the same
  // shifts used at the top of the tile.
  1079. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1080. sub T1, K, KKK // K - KKK -> TEMP1
  1081. slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
  1082. slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
  1083. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  1084. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  1085. #endif
  1086. #if defined(LEFT)
  1087. addi KK, KK, 4 // KK += Number of values in A
  1088. #endif
  1089. CTRMM_L1x4_END:
  // 1x2 tile: executed at most once, when bit 1 of M is set.
  // LOAD1x2_*, KERNEL1x2_* and SAVE1x2 are macros defined outside this chunk.
  1090. CTRMM_L1x2_BEGIN:
  1091. andi. T1, M, 2 // T1 = M & 2
  1092. ble CTRMM_L1x2_END // bit 1 clear: no 2-wide tile
  // Choose the starting B position (BO) and, in the #else case, advance both
  // BO and AO by KK scaled with the per-side shift.
  1093. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1094. mr BO, B // B -> BO
  1095. #else
  1096. mr BO, B // B -> BO
  1097. slwi T1, KK, 3 // Number of values in B shifted
  1098. slwi T2, KK, 4 // Number of values in A shifted
  1099. add BO, BO, T1 // Add values to BO
  1100. add AO, AO, T2 // Add values to AO
  1101. #endif
  // T1 = count that drives the kernel sequence below: either K - KK, or
  // KK plus 2 (LEFT) / KK plus 1 (not LEFT).
  1102. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1103. sub T1, K, KK // K - KK -> TEMP1
  1104. #else
  1105. mr T1, KK // KK -> KTEMP
  1106. #ifdef LEFT
  1107. addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
  1108. #else
  1109. addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
  1110. #endif
  1111. #endif
  1112. mr KKK, T1 // kept for the K - KKK fix-up after SAVE1x2
  1113. mr K1, T1 // K1 feeds both the >> 3 count and the & 7 remainder
  1114. srawi. L, K1, 3 // KTEMP / 8 -> L
  1115. ble CTRMM_L1x2_SUB0 // L <= 0: no full group of eight
  1116. cmpwi cr0, L, 1
  1117. ble CTRMM_L1x2_SUB4 // L == 1: single group of eight via the SUB macros
  // L >= 2: opening group (LOAD, then the I1 variant followed by seven
  // alternating _2/_1 invocations). L is reduced by 2 because the opening and
  // closing groups together account for two of the L groups.
  1118. CTRMM_L1x2_LOOP_START:
  1119. LOAD1x2_1
  1120. KERNEL1x2_I1
  1121. KERNEL1x2_2
  1122. KERNEL1x2_1
  1123. KERNEL1x2_2
  1124. KERNEL1x2_1
  1125. KERNEL1x2_2
  1126. KERNEL1x2_1
  1127. KERNEL1x2_2
  1128. addic. L, L, -2
  1129. ble CTRMM_L1x2_LOOP_END // L was exactly 2: no middle groups
  1130. .align 5
  // Middle groups: eight kernel macro invocations per pass, one pass per
  // remaining unit of L.
  1131. CTRMM_L1x2_LOOP:
  1132. KERNEL1x2_1
  1133. KERNEL1x2_2
  1134. KERNEL1x2_1
  1135. KERNEL1x2_2
  1136. KERNEL1x2_1
  1137. KERNEL1x2_2
  1138. KERNEL1x2_1
  1139. KERNEL1x2_2
  1140. addic. L, L, -1
  1141. bgt CTRMM_L1x2_LOOP
  // Closing group: same shape as a middle group but the last invocation is the
  // E2 variant; afterwards the K1 & 7 remainder is handled at SUB1.
  1142. CTRMM_L1x2_LOOP_END:
  1143. KERNEL1x2_1
  1144. KERNEL1x2_2
  1145. KERNEL1x2_1
  1146. KERNEL1x2_2
  1147. KERNEL1x2_1
  1148. KERNEL1x2_2
  1149. KERNEL1x2_1
  1150. KERNEL1x2_E2
  1151. b CTRMM_L1x2_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations (eight in total),
  // then the remainder at SUB1.
  1152. CTRMM_L1x2_SUB4:
  1153. KERNEL1x2_SUBI1
  1154. KERNEL1x2_SUB1
  1155. KERNEL1x2_SUB1
  1156. KERNEL1x2_SUB1
  1157. KERNEL1x2_SUB1
  1158. KERNEL1x2_SUB1
  1159. KERNEL1x2_SUB1
  1160. KERNEL1x2_SUB1
  1161. b CTRMM_L1x2_SUB1
  // L == 0 path: nothing has been issued yet, so the first invocation is the
  // SUBI1 variant and the remainder count is reduced by one for it.
  // NOTE(review): SUBI1 runs before the remainder is tested, so one kernel
  // macro is issued even if K1 & 7 is 0; presumably K1 >= 1 here - confirm.
  1162. CTRMM_L1x2_SUB0:
  1163. andi. L, K1, 7 // K1 & 7 -> L
  1164. KERNEL1x2_SUBI1
  1165. addic. L, L, -1
  1166. ble CTRMM_L1x2_SAVE // nothing left after the SUBI1 invocation
  1167. b CTRMM_L1x2_SUB2
  // Remainder after one or more full groups of eight.
  1168. CTRMM_L1x2_SUB1:
  1169. andi. L, K1, 7 // K1 & 7 -> L
  1170. ble CTRMM_L1x2_SAVE // remainder is 0
  // One SUB1 invocation per remaining unit of L.
  1171. CTRMM_L1x2_SUB2:
  1172. KERNEL1x2_SUB1
  1173. addic. L, L, -1
  1174. bgt CTRMM_L1x2_SUB2
  1175. CTRMM_L1x2_SAVE:
  1176. SAVE1x2
  // Post-save fix-up: advance BO and AO by (K - KKK) scaled with the same
  // shifts used at the top of the tile.
  1177. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1178. sub T1, K, KKK // K - KKK -> TEMP1
  1179. slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
  1180. slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
  1181. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  1182. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  1183. #endif
  1184. #if defined(LEFT)
  1185. addi KK, KK, 2 // KK += Number of values in A
  1186. #endif
  1187. CTRMM_L1x2_END:
  // 1x1 tile: executed at most once, when bit 0 of M is set.
  // LOAD1x1_*, KERNEL1x1_* and SAVE1x1 are macros defined outside this chunk.
  1188. CTRMM_L1x1_BEGIN:
  1189. andi. T1, M, 1 // T1 = M & 1
  1190. ble CTRMM_L1x1_END // bit 0 clear: no 1-wide tile
  // Choose the starting B position (BO) and, in the #else case, advance both
  // BO and AO by KK scaled with the per-side shift (identical shifts here).
  1191. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1192. mr BO, B // B -> BO
  1193. #else
  1194. mr BO, B // B -> BO
  1195. slwi T1, KK, 3 // Number of values in B shifted
  1196. slwi T2, KK, 3 // Number of values in A shifted
  1197. add BO, BO, T1 // Add values to BO
  1198. add AO, AO, T2 // Add values to AO
  1199. #endif
  // T1 = count that drives the kernel sequence below: either K - KK, or KK + 1.
  // NOTE(review): both arms of the inner #ifdef LEFT add 1, so the conditional
  // has no effect in this tile; it mirrors the layout of the wider tiles.
  1200. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1201. sub T1, K, KK // K - KK -> TEMP1
  1202. #else
  1203. mr T1, KK // KK -> KTEMP
  1204. #ifdef LEFT
  1205. addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
  1206. #else
  1207. addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
  1208. #endif
  1209. #endif
  1210. mr KKK, T1 // kept for the K - KKK fix-up after SAVE1x1
  1211. mr K1, T1 // K1 feeds both the >> 3 count and the & 7 remainder
  1212. srawi. L, K1, 3 // KTEMP / 8 -> L
  1213. ble CTRMM_L1x1_SUB0 // L <= 0: no full group of eight
  1214. cmpwi cr0, L, 1
  1215. ble CTRMM_L1x1_SUB4 // L == 1: single group of eight via the SUB macros
  // L >= 2: opening group (LOAD, then the I1 variant followed by seven
  // alternating _2/_1 invocations). L is reduced by 2 because the opening and
  // closing groups together account for two of the L groups.
  1216. CTRMM_L1x1_LOOP_START:
  1217. LOAD1x1_1
  1218. KERNEL1x1_I1
  1219. KERNEL1x1_2
  1220. KERNEL1x1_1
  1221. KERNEL1x1_2
  1222. KERNEL1x1_1
  1223. KERNEL1x1_2
  1224. KERNEL1x1_1
  1225. KERNEL1x1_2
  1226. addic. L, L, -2
  1227. ble CTRMM_L1x1_LOOP_END // L was exactly 2: no middle groups
  1228. .align 5
  // Middle groups: eight kernel macro invocations per pass, one pass per
  // remaining unit of L.
  1229. CTRMM_L1x1_LOOP:
  1230. KERNEL1x1_1
  1231. KERNEL1x1_2
  1232. KERNEL1x1_1
  1233. KERNEL1x1_2
  1234. KERNEL1x1_1
  1235. KERNEL1x1_2
  1236. KERNEL1x1_1
  1237. KERNEL1x1_2
  1238. addic. L, L, -1
  1239. bgt CTRMM_L1x1_LOOP
  // Closing group: same shape as a middle group but the last invocation is the
  // E2 variant; afterwards the K1 & 7 remainder is handled at SUB1.
  1240. CTRMM_L1x1_LOOP_END:
  1241. KERNEL1x1_1
  1242. KERNEL1x1_2
  1243. KERNEL1x1_1
  1244. KERNEL1x1_2
  1245. KERNEL1x1_1
  1246. KERNEL1x1_2
  1247. KERNEL1x1_1
  1248. KERNEL1x1_E2
  1249. b CTRMM_L1x1_SUB1
  // L == 1 path: SUBI1 followed by seven SUB1 invocations (eight in total),
  // then the remainder at SUB1.
  1250. CTRMM_L1x1_SUB4:
  1251. KERNEL1x1_SUBI1
  1252. KERNEL1x1_SUB1
  1253. KERNEL1x1_SUB1
  1254. KERNEL1x1_SUB1
  1255. KERNEL1x1_SUB1
  1256. KERNEL1x1_SUB1
  1257. KERNEL1x1_SUB1
  1258. KERNEL1x1_SUB1
  1259. b CTRMM_L1x1_SUB1
  // L == 0 path: nothing has been issued yet, so the first invocation is the
  // SUBI1 variant and the remainder count is reduced by one for it.
  // NOTE(review): SUBI1 runs before the remainder is tested, so one kernel
  // macro is issued even if K1 & 7 is 0; presumably K1 >= 1 here - confirm.
  1260. CTRMM_L1x1_SUB0:
  1261. andi. L, K1, 7 // K1 & 7 -> L
  1262. KERNEL1x1_SUBI1
  1263. addic. L, L, -1
  1264. ble CTRMM_L1x1_SAVE // nothing left after the SUBI1 invocation
  1265. b CTRMM_L1x1_SUB2
  // Remainder after one or more full groups of eight.
  1266. CTRMM_L1x1_SUB1:
  1267. andi. L, K1, 7 // K1 & 7 -> L
  1268. ble CTRMM_L1x1_SAVE // remainder is 0
  // One SUB1 invocation per remaining unit of L.
  1269. CTRMM_L1x1_SUB2:
  1270. KERNEL1x1_SUB1
  1271. addic. L, L, -1
  1272. bgt CTRMM_L1x1_SUB2
  1273. CTRMM_L1x1_SAVE:
  1274. SAVE1x1
  // Post-save fix-up: advance BO and AO by (K - KKK) scaled with the same
  // shifts used at the top of the tile.
  1275. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1276. sub T1, K, KKK // K - KKK -> TEMP1
  1277. slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
  1278. slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
  1279. add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
  1280. add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
  1281. #endif
  1282. #if defined(LEFT)
  1283. addi KK, KK, 1 // KK += Number of values in A
  1284. #endif
  // Also the target of the "M & 7 == 0" early exit taken at CTRMM_L1x4_BEGIN,
  // so the KK update below runs whether or not any remainder tile executed.
  1285. CTRMM_L1x1_END:
  1286. #if !defined(LEFT)
  1287. addi KK, KK, 1 // KK += Number of values in B
  1288. #endif
  1289. CTRMM_L1_END: