
strmm_kernel_8x8_cortexa53.S 54 kB

/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
#define origM x0
#define origN x1
#define origK x2
#define origPA x3
#define origPB x4
#define pC x5
#define LDC x6
#define offset x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB x11
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define temp x16
#define tempOffset x17
#define tempK x18
#define alpha0 s10
#define alphaV0 v10.s[0]
#define alpha1 s11
#define alphaV1 v11.s[0]
#define alpha2 s14
#define alphaV2 v14.s[0]
#define alpha3 s15
#define alphaV3 v15.s[0]
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 temp
// 17 tempOffset
// 18 must save tempK
// 19 must save
// 20 must save pA0_2, pA0_3
// 21 must save pA0_6, pA0_7
// 22 must save pA1_2, pA1_3
// 23 must save pA1_6, pA1_7
// 24 must save pB0_2, pB0_3
// 25 must save pB0_6, pB0_7
// 26 must save pB1_2, pB1_3
// 27 must save pB1_6, pB1_7
// 28 must save
// 29 frame
// 30 link
// 31 sp
//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3
//v01 pA0_4, pA0_5, pA0_6, pA0_7
//v02 pA1_0, pA1_1, pA1_2, pA1_3
//v03 pA1_4, pA1_5, pA1_6, pA1_7
//v04 pB0_0, pB0_1, pB0_2, pB0_3
//v05 pB0_4, pB0_5, pB0_6, pB0_7
//v06 pB1_0, pB1_1, pB1_2, pB1_3
//v07 pB1_4, pB1_5, pB1_6, pB1_7
//v08 must save
//v09 must save
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save
//v13 must save
//v14 must save ALPHA2
//v15 must save ALPHA3
//v16 must save C00, C01, C02, C03
//v17 must save C04, C05, C06, C07
//v18 C08, C09, C10, C11
//v19 C12, C13, C14, C15
//v20 C16, C17, C18, C19
//v21 C20, C21, C22, C23
//v22 C24, C25, C26, C27
//v23 C28, C29, C30, C31
//v24 C32, C33, C34, C35
//v25 C36, C37, C38, C39
//v26 C40, C41, C42, C43
//v27 C44, C45, C46, C47
//v28 C48, C49, C50, C51
//v29 C52, C53, C54, C55
//v30 C56, C57, C58, C59
//v31 C60, C61, C62, C63
/*******************************************************************************
* Macro definitions
*******************************************************************************/
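// The 8x8 kernel below is software-pipelined for an in-order core: KERNEL8x8_I
// primes the accumulators with fmul, KERNEL8x8_M1/KERNEL8x8_M2 alternate between
// the {v0,v1,v4,v5} and {v2,v3,v6,v7} operand sets while preloading the other
// set, and KERNEL8x8_E drains the last iteration. Operands are fetched as
// ldr d / ldr x pairs and stitched together with fmov v.d[1], so the integer
// loads can be interleaved with the fmla stream (presumably to keep the
// Cortex-A53 dual-issuing instead of stalling on NEON loads).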
.macro INIT8x8
fmov s16, wzr
fmov s17, wzr
fmov s18, s16
fmov s19, s17
fmov s20, wzr
fmov s21, s16
fmov s22, s17
fmov s23, s18
fmov s24, wzr
fmov s25, s16
fmov s26, s17
fmov s27, s18
fmov s28, wzr
fmov s29, s16
fmov s30, s17
fmov s31, s18
.endm
.macro KERNEL8x8_I
ld1 {v0.4s, v1.4s}, [pA], #32
ld1 {v4.4s, v5.4s}, [pB], #32
ldr d2, [pA], #8
ldr d6, [pB], #8
ldr d3, [pA, #8]
ldr d7, [pB, #8]
ldr x22, [pA], #16
fmul v16.4s, v0.4s, v4.s[0]
ldr x26, [pB], #16
fmul v17.4s, v1.4s, v4.s[0]
ldr x23, [pA], #8
fmul v18.4s, v0.4s, v4.s[1]
ldr x27, [pB], #8
fmul v19.4s, v1.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.s[2]
fmul v21.4s, v1.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.s[3]
fmul v23.4s, v1.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.s[0]
fmul v25.4s, v1.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.s[1]
fmul v27.4s, v1.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.s[2]
fmul v29.4s, v1.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.s[3]
fmul v31.4s, v1.4s, v5.s[3]
.endm
.macro KERNEL8x8_M1
ldr d2, [pA], #8
fmov v0.d[1], x20
ldr d6, [pB], #8
fmov v4.d[1], x24
ldr d3, [pA, #8]
fmov v1.d[1], x21
ldr d7, [pB, #8]
fmov v5.d[1], x25
fmla v16.4s, v0.4s, v4.s[0]
ldr x22, [pA], #16
fmla v17.4s, v1.4s, v4.s[0]
ldr x26, [pB], #16
fmla v18.4s, v0.4s, v4.s[1]
ldr x23, [pA], #8
fmla v19.4s, v1.4s, v4.s[1]
ldr x27, [pB], #8
fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.s[3]
.endm
.macro KERNEL8x8_M2
ldr d0, [pA], #8
fmov v2.d[1], x22
ldr d4, [pB], #8
fmov v6.d[1], x26
ldr d1, [pA, #8]
fmov v3.d[1], x23
ldr d5, [pB, #8]
fmov v7.d[1], x27
fmla v16.4s, v2.4s, v6.s[0]
ldr x20, [pA], #16
fmla v17.4s, v3.4s, v6.s[0]
ldr x24, [pB], #16
fmla v18.4s, v2.4s, v6.s[1]
ldr x21, [pA], #8
fmla v19.4s, v3.4s, v6.s[1]
ldr x25, [pB], #8
fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.s[3]
.endm
.macro KERNEL8x8_E
fmov v2.d[1], x22
fmov v6.d[1], x26
fmov v3.d[1], x23
fmov v7.d[1], x27
fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.s[1]
fmla v19.4s, v3.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.s[3]
.endm
.macro KERNEL8x8_SUB
ld1 {v4.4s}, [pB]
add pB, pB, #16
ld1 {v5.4s}, [pB]
add pB, pB, #16
ld1 {v0.4s}, [pA]
add pA, pA, #16
ld1 {v1.4s}, [pA]
add pA, pA, #16
fmla v16.4s, v0.4s, v4.s[0]
fmla v17.4s, v1.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.s[1]
fmla v19.4s, v1.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.s[3]
.endm
.macro SAVE8x8
add pCRow1, pCRow0, LDC
fmul v0.4s, v16.4s, alphaV0
fmul v1.4s, v17.4s, alphaV1
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow2, pCRow1, LDC
fmul v2.4s, v18.4s, alphaV2
fmul v3.4s, v19.4s, alphaV3
st1 {v2.4s, v3.4s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v4.4s, v20.4s, alphaV0
fmul v5.4s, v21.4s, alphaV1
st1 {v4.4s, v5.4s}, [pCRow2]
add pCRow2, pCRow1, LDC
fmul v6.4s, v22.4s, alphaV2
fmul v7.4s, v23.4s, alphaV3
st1 {v6.4s, v7.4s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v0.4s, v24.4s, alphaV0
fmul v1.4s, v25.4s, alphaV1
st1 {v0.4s, v1.4s}, [pCRow2]
add pCRow2, pCRow1, LDC
fmul v2.4s, v26.4s, alphaV2
fmul v3.4s, v27.4s, alphaV3
st1 {v2.4s, v3.4s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v4.4s, v28.4s, alphaV0
fmul v5.4s, v29.4s, alphaV1
st1 {v4.4s, v5.4s}, [pCRow2]
fmul v6.4s, v30.4s, alphaV2
fmul v7.4s, v31.4s, alphaV3
st1 {v6.4s, v7.4s}, [pCRow1]
add pCRow0, pCRow0, #32
.endm
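// Note that the SAVE macros in this TRMM kernel only scale the accumulators by
// alpha and store them; the previous contents of C are never loaded or
// accumulated (unlike the plain GEMM kernels, which read C and add into it).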
/******************************************************************************/
.macro INIT4x8
fmov s16, wzr
fmov s18, wzr
fmov s20, wzr
fmov s22, s16
fmov s24, wzr
fmov s26, s16
fmov s28, s18
fmov s30, s20
.endm
.macro KERNEL4x8_I
ld1 {v0.4s}, [pA], #16
ld1 {v4.4s, v5.4s}, [pB], #32
ldr d2, [pA], #8
ldr d6, [pB], #8
ldr d7, [pB, #8]
ldr x21, [pA], #8
fmul v16.4s, v0.4s, v4.s[0]
ldr x26, [pB], #16
fmul v18.4s, v0.4s, v4.s[1]
ldr x27, [pB], #8
fmul v20.4s, v0.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.s[3]
.endm
.macro KERNEL4x8_M1
ldr d2, [pA], #8
fmov v0.d[1], x20
ldr d6, [pB], #8
fmov v4.d[1], x24
ldr d7, [pB, #8]
fmov v5.d[1], x25
fmla v16.4s, v0.4s, v4.s[0]
ldr x21, [pA], #8
fmla v18.4s, v0.4s, v4.s[1]
ldr x26, [pB], #16
fmla v20.4s, v0.4s, v4.s[2]
ldr x27, [pB], #8
fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.s[3]
.endm
.macro KERNEL4x8_M2
ldr d0, [pA], #8
fmov v2.d[1], x21
ldr d4, [pB], #8
fmov v6.d[1], x26
ldr d5, [pB, #8]
fmov v7.d[1], x27
fmla v16.4s, v2.4s, v6.s[0]
ldr x20, [pA], #8
fmla v18.4s, v2.4s, v6.s[1]
ldr x24, [pB], #16
fmla v20.4s, v2.4s, v6.s[2]
ldr x25, [pB], #8
fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.s[3]
.endm
.macro KERNEL4x8_E
fmov v2.d[1], x21
fmov v6.d[1], x26
fmov v7.d[1], x27
fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.s[3]
.endm
.macro KERNEL4x8_SUB
ld1 {v4.4s}, [pB]
add pB, pB, #16
ld1 {v5.4s}, [pB]
add pB, pB, #16
ld1 {v0.4s}, [pA]
add pA, pA, #16
fmla v16.4s, v0.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.s[3]
.endm
.macro SAVE4x8
add pCRow1, pCRow0, LDC
fmul v0.4s, v16.4s, alphaV0
st1 {v0.4s}, [pCRow0]
add pCRow2, pCRow1, LDC
fmul v2.4s, v18.4s, alphaV2
st1 {v2.4s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v4.4s, v20.4s, alphaV0
st1 {v4.4s}, [pCRow2]
add pCRow2, pCRow1, LDC
fmul v6.4s, v22.4s, alphaV2
st1 {v6.4s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v0.4s, v24.4s, alphaV0
st1 {v0.4s}, [pCRow2]
add pCRow2, pCRow1, LDC
fmul v2.4s, v26.4s, alphaV2
st1 {v2.4s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v4.4s, v28.4s, alphaV0
st1 {v4.4s}, [pCRow2]
fmul v6.4s, v30.4s, alphaV2
st1 {v6.4s}, [pCRow1]
add pCRow0, pCRow0, #16
.endm
/******************************************************************************/
.macro INIT2x8
fmov s16, wzr
fmov s18, wzr
fmov s20, wzr
fmov s22, s16
fmov s24, wzr
fmov s26, s16
fmov s28, s18
fmov s30, s20
.endm
.macro KERNEL2x8_SUB
ld1 {v4.4s}, [pB]
add pB, pB, #16
ld1 {v5.4s}, [pB]
add pB, pB, #16
ld1 {v0.2s}, [pA]
add pA, pA, #8
fmla v16.2s, v0.2s, v4.s[0]
fmla v18.2s, v0.2s, v4.s[1]
fmla v20.2s, v0.2s, v4.s[2]
fmla v22.2s, v0.2s, v4.s[3]
fmla v24.2s, v0.2s, v5.s[0]
fmla v26.2s, v0.2s, v5.s[1]
fmla v28.2s, v0.2s, v5.s[2]
fmla v30.2s, v0.2s, v5.s[3]
.endm
.macro SAVE2x8
add pCRow1, pCRow0, LDC
fmul v0.2s, v16.2s, alphaV0
st1 {v0.2s}, [pCRow0]
add pCRow2, pCRow1, LDC
fmul v2.2s, v18.2s, alphaV2
st1 {v2.2s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v4.2s, v20.2s, alphaV0
st1 {v4.2s}, [pCRow2]
add pCRow2, pCRow1, LDC
fmul v6.2s, v22.2s, alphaV2
st1 {v6.2s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v0.2s, v24.2s, alphaV0
st1 {v0.2s}, [pCRow2]
add pCRow2, pCRow1, LDC
fmul v2.2s, v26.2s, alphaV2
st1 {v2.2s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v4.2s, v28.2s, alphaV0
st1 {v4.2s}, [pCRow2]
fmul v6.2s, v30.2s, alphaV2
st1 {v6.2s}, [pCRow1]
add pCRow0, pCRow0, #8
.endm
/******************************************************************************/
.macro INIT1x8
fmov s16, wzr
fmov s18, wzr
fmov s20, wzr
fmov s22, s16
fmov s24, wzr
fmov s26, s16
fmov s28, s18
fmov s30, s20
.endm
.macro KERNEL1x8_SUB
ld1 {v4.4s}, [pB]
add pB, pB, #16
ld1 {v5.4s}, [pB]
add pB, pB, #16
ldr s0, [pA]
add pA, pA, #4
fmla s16, s0, v4.s[0]
fmla s18, s0, v4.s[1]
fmla s20, s0, v4.s[2]
fmla s22, s0, v4.s[3]
fmla s24, s0, v5.s[0]
fmla s26, s0, v5.s[1]
fmla s28, s0, v5.s[2]
fmla s30, s0, v5.s[3]
.endm
.macro SAVE1x8
add pCRow1, pCRow0, LDC
fmul s0, s16, alphaV0
str s0, [pCRow0]
add pCRow2, pCRow1, LDC
fmul s2, s18, alphaV2
str s2, [pCRow1]
add pCRow1, pCRow2, LDC
fmul s4, s20, alphaV0
str s4, [pCRow2]
add pCRow2, pCRow1, LDC
fmul s6, s22, alphaV2
str s6, [pCRow1]
add pCRow1, pCRow2, LDC
fmul s0, s24, alphaV0
str s0, [pCRow2]
add pCRow2, pCRow1, LDC
fmul s2, s26, alphaV2
str s2, [pCRow1]
add pCRow1, pCRow2, LDC
fmul s4, s28, alphaV0
str s4, [pCRow2]
fmul s6, s30, alphaV2
str s6, [pCRow1]
add pCRow0, pCRow0, #4
.endm
/******************************************************************************/
.macro INIT8x4
fmov s16, wzr
fmov s17, wzr
fmov s20, wzr
fmov s21, s16
fmov s24, wzr
fmov s25, s16
fmov s28, wzr
fmov s29, s16
.endm
.macro KERNEL8x4_I
ld1 {v8.4s}, [pB], #16
ld1 {v0.4s, v1.4s}, [pA], #32
ldr d9, [pB], #8
ldr d2, [pA], #8
ldr d3, [pA, #8]
fmul v16.4s, v0.4s, v8.s[0]
ldr x25, [pB], #8
fmul v17.4s, v1.4s, v8.s[0]
ldr x22, [pA], #16
fmul v20.4s, v0.4s, v8.s[1]
ldr x23, [pA], #8
fmul v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.s[2]
fmul v25.4s, v1.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.s[3]
fmul v29.4s, v1.4s, v8.s[3]
.endm
.macro KERNEL8x4_M1
ldr d9, [pB], #8
fmov v8.d[1], x24
ldr d2, [pA], #8
fmov v0.d[1], x20
ldr d3, [pA, #8]
fmov v1.d[1], x21
fmla v16.4s, v0.4s, v8.s[0]
ldr x25, [pB], #8
fmla v17.4s, v1.4s, v8.s[0]
ldr x22, [pA], #16
fmla v20.4s, v0.4s, v8.s[1]
ldr x23, [pA], #8
fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v8.s[2]
fmla v25.4s, v1.4s, v8.s[2]
fmla v28.4s, v0.4s, v8.s[3]
fmla v29.4s, v1.4s, v8.s[3]
.endm
.macro KERNEL8x4_M2
ldr d8, [pB], #8
fmov v9.d[1], x25
ldr d0, [pA], #8
fmov v2.d[1], x22
ldr d1, [pA, #8]
fmov v3.d[1], x23
fmla v16.4s, v2.4s, v9.s[0]
ldr x24, [pB], #8
fmla v17.4s, v3.4s, v9.s[0]
ldr x20, [pA], #16
fmla v20.4s, v2.4s, v9.s[1]
ldr x21, [pA], #8
fmla v21.4s, v3.4s, v9.s[1]
fmla v24.4s, v2.4s, v9.s[2]
fmla v25.4s, v3.4s, v9.s[2]
fmla v28.4s, v2.4s, v9.s[3]
fmla v29.4s, v3.4s, v9.s[3]
.endm
.macro KERNEL8x4_E
fmov v9.d[1], x25
fmov v2.d[1], x22
fmov v3.d[1], x23
fmla v16.4s, v2.4s, v9.s[0]
fmla v17.4s, v3.4s, v9.s[0]
fmla v20.4s, v2.4s, v9.s[1]
fmla v21.4s, v3.4s, v9.s[1]
fmla v24.4s, v2.4s, v9.s[2]
fmla v25.4s, v3.4s, v9.s[2]
fmla v28.4s, v2.4s, v9.s[3]
fmla v29.4s, v3.4s, v9.s[3]
.endm
.macro KERNEL8x4_SUB
ld1 {v8.4s}, [pB], #16
ld1 {v0.4s, v1.4s}, [pA], #32
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v8.s[2]
fmla v25.4s, v1.4s, v8.s[2]
fmla v28.4s, v0.4s, v8.s[3]
fmla v29.4s, v1.4s, v8.s[3]
.endm
.macro SAVE8x4
add pCRow1, pCRow0, LDC
fmul v0.4s, v16.4s, alphaV0
fmul v1.4s, v17.4s, alphaV1
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow2, pCRow1, LDC
fmul v4.4s, v20.4s, alphaV0
fmul v5.4s, v21.4s, alphaV1
st1 {v4.4s, v5.4s}, [pCRow1]
add pCRow1, pCRow2, LDC
fmul v0.4s, v24.4s, alphaV0
fmul v1.4s, v25.4s, alphaV1
st1 {v0.4s, v1.4s}, [pCRow2]
fmul v4.4s, v28.4s, alphaV0
fmul v5.4s, v29.4s, alphaV1
st1 {v4.4s, v5.4s}, [pCRow1]
add pCRow0, pCRow0, #32
.endm
/******************************************************************************/
.macro INIT4x4
fmov s16, wzr
fmov s17, s16
fmov s20, s17
fmov s21, s16
fmov s24, s17
fmov s25, s16
fmov s28, s17
fmov s29, s16
.endm
.macro KERNEL4x4_I
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
ld1 {v4.2s, v5.2s}, [pA]
add pA, pA, #16
.endm
.macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
fmul v8.2s, v16.2s, alphaV0
fmul v9.2s, v17.2s, alphaV1
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2s, v20.2s, alphaV2
fmul v13.2s, v21.2s, alphaV3
st1 {v12.2s, v13.2s}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2s, v24.2s, alphaV0
fmul v9.2s, v25.2s, alphaV1
st1 {v8.2s, v9.2s}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2s, v28.2s, alphaV2
fmul v13.2s, v29.2s, alphaV3
st1 {v12.2s, v13.2s}, [pCRow1]
add pCRow0, pCRow0, #16
.endm
/******************************************************************************/
.macro INIT2x4
fmov s16, wzr
fmov s20, s16
fmov s24, s20
fmov s28, s16
.endm
.macro KERNEL2x4_SUB
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
ld1 {v0.2s}, [pA]
add pA, pA, #8
fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
fmul v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2s, v20.2s, alphaV1
st1 {v12.2s}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2s, v24.2s, alphaV2
st1 {v8.2s}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2s, v28.2s, alphaV3
st1 {v12.2s}, [pCRow1]
add pCRow0, pCRow0, #8
.endm
/******************************************************************************/
.macro INIT1x4
fmov s16, wzr
fmov s20, s16
.endm
.macro KERNEL1x4_SUB
ldr s0, [pA]
add pA, pA, #4
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
fmla v16.2s, v8.2s, v0.s[0]
fmla v20.2s, v9.2s, v0.s[0]
.endm
.macro SAVE1x4
add pCRow1, pCRow0, LDC
fmul v8.2s, v16.2s, alphaV0
st1 {v8.s}[0], [pCRow0]
st1 {v8.s}[1], [pCRow1]
add pCRow2, pCRow1, LDC
add pCRow1, pCRow2, LDC
fmul v12.2s, v20.2s, alphaV1
st1 {v12.s}[0], [pCRow2]
st1 {v12.s}[1], [pCRow1]
add pCRow0, pCRow0, #4
.endm
/******************************************************************************/
.macro INIT8x2
fmov s16, wzr
fmov s17, s16
fmov s20, s17
fmov s21, s16
.endm
.macro KERNEL8x2_SUB
ld1 {v8.2s}, [pB]
add pB, pB, #8
ld1 {v0.4s}, [pA]
add pA, pA, #16
ld1 {v1.4s}, [pA]
add pA, pA, #16
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE8x2
add pCRow1, pCRow0, LDC
fmul v0.4s, v16.4s, alphaV0
fmul v1.4s, v17.4s, alphaV1
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow2, pCRow1, LDC
fmul v4.4s, v20.4s, alphaV0
fmul v5.4s, v21.4s, alphaV1
st1 {v4.4s, v5.4s}, [pCRow1]
add pCRow0, pCRow0, #32
.endm
/******************************************************************************/
.macro INIT4x2
fmov s16, wzr
fmov s17, s16
fmov s20, s17
fmov s21, s16
.endm
.macro KERNEL4x2_SUB
ld1 {v8.2s}, [pB]
add pB, pB, #8
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
fmul v8.2s, v16.2s, alphaV0
fmul v9.2s, v17.2s, alphaV1
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2s, v20.2s, alphaV2
fmul v13.2s, v21.2s, alphaV3
st1 {v12.2s, v13.2s}, [pCRow1]
add pCRow0, pCRow0, #16
.endm
/******************************************************************************/
.macro INIT2x2
fmov s16, wzr
fmov s20, s16
.endm
.macro KERNEL2x2_SUB
ld1 {v8.2s}, [pB]
add pB, pB, #8
ld1 {v0.2s}, [pA]
add pA, pA, #8
fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
fmul v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
add pCRow1 , pCRow0, LDC
fmul v12.2s, v20.2s, alphaV1
st1 {v12.2s}, [pCRow1]
add pCRow0, pCRow0, #8
.endm
/******************************************************************************/
.macro INIT1x2
fmov s16, wzr
.endm
.macro KERNEL1x2_SUB
ld1 {v8.2s} , [pB]
add pB , pB, #8
ldr s0 , [pA]
add pA, pA, #4
fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
add pCRow1 , pCRow0, LDC
fmul v8.2s, v16.2s, alphaV0
st1 {v8.s}[0], [pCRow0]
st1 {v8.s}[1], [pCRow1]
add pCRow0, pCRow0, #4
.endm
/******************************************************************************/
.macro INIT8x1
fmov s16, wzr
fmov s17, wzr
.endm
.macro KERNEL8x1_SUB
ldr s8, [pB]
add pB , pB, #4
ld1 {v0.4s}, [pA]
add pA, pA, #16
ld1 {v1.4s}, [pA]
add pA, pA, #16
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
.endm
.macro SAVE8x1
fmul v0.4s, v16.4s, alphaV0
fmul v1.4s, v17.4s, alphaV1
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow0, pCRow0, #32
.endm
/******************************************************************************/
.macro INIT4x1
fmov s16, wzr
fmov s17, s16
.endm
.macro KERNEL4x1_SUB
ldr s8, [pB]
add pB , pB, #4
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
fmul v8.2s, v16.2s, alphaV0
fmul v9.2s, v17.2s, alphaV1
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow0, pCRow0, #16
.endm
/******************************************************************************/
.macro INIT2x1
fmov s16, wzr
.endm
.macro KERNEL2x1_SUB
ldr s8, [pB]
add pB , pB, #4
ld1 {v0.2s}, [pA]
add pA , pA, #8
fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
fmul v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
add pCRow0, pCRow0, #8
.endm
/******************************************************************************/
.macro INIT1x1
fmov s16, wzr
.endm
.macro KERNEL1x1_SUB
ldr s8, [pB]
add pB , pB, #4
ldr s0, [pA]
add pA , pA, #4
fmadd s16, s0, s8, s16
.endm
.macro SAVE1x1
fmul s8, s16, alpha0
str s8, [pCRow0]
add pCRow0, pCRow0, #4
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
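// Driver layout: counterJ walks N in blocks of 8 columns of B, counterI walks M
// in blocks of 8, then 4/2/1 for the remainder, and counterL walks K.
// tempOffset/tempK implement the TRMM offset handling: depending on LEFT/TRANSA,
// pA and pB are advanced past the offset portion of the packed panels and the
// K loop is shortened to tempK iterations before each block is stored.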
  979. PROLOGUE
  980. .Lstrmm_kernel_begin:
  981. .align 5
  982. add sp, sp, #-(11 * 16)
  983. stp d8, d9, [sp, #(0 * 16)]
  984. stp d10, d11, [sp, #(1 * 16)]
  985. stp d12, d13, [sp, #(2 * 16)]
  986. stp d14, d15, [sp, #(3 * 16)]
  987. stp d16, d17, [sp, #(4 * 16)]
  988. stp x18, x19, [sp, #(5 * 16)]
  989. stp x20, x21, [sp, #(6 * 16)]
  990. stp x22, x23, [sp, #(7 * 16)]
  991. stp x24, x25, [sp, #(8 * 16)]
  992. stp x26, x27, [sp, #(9 * 16)]
  993. str x28, [sp, #(10 * 16)]
  994. fmov alpha0, s0
  995. fmov alpha1, s0
  996. fmov alpha2, s0
  997. fmov alpha3, s0
  998. lsl LDC, LDC, #2 // ldc = ldc * 4
  999. #if !defined(LEFT)
  1000. neg tempOffset, offset
  1001. #endif
  1002. mov pB, origPB
  1003. mov counterJ, origN
  1004. asr counterJ, counterJ, #3 // J = J / 8
  1005. cmp counterJ, #0
  1006. ble .Lstrmm_kernel_L4_BEGIN
  1007. /******************************************************************************/
  1008. /******************************************************************************/
  1009. .Lstrmm_kernel_L8_BEGIN:
  1010. mov pCRow0, pC // pCRow0 = C
  1011. add pC, pC, LDC, lsl #3
  1012. #if defined(LEFT)
  1013. mov tempOffset, offset
  1014. #endif
  1015. mov pA, origPA // pA = start of A array
  1016. /******************************************************************************/
  1017. .Lstrmm_kernel_L8_M8_BEGIN:
  1018. mov counterI, origM
  1019. asr counterI, counterI, #3 // counterI = counterI / 8
  1020. cmp counterI, #0
  1021. ble .Lstrmm_kernel_L8_M4_BEGIN
  1022. .Lstrmm_kernel_L8_M8_20:
  1023. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1024. mov pB, origPB
  1025. #else
  1026. mov pB, origPB
  1027. lsl temp, tempOffset, #5
  1028. add pA, pA, temp
  1029. add pB, pB, temp
  1030. #endif
  1031. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1032. sub tempK, origK, tempOffset
  1033. #elif defined(LEFT)
  1034. add tempK, tempOffset, #8
  1035. #else
  1036. add tempK, tempOffset, #8
  1037. #endif
  1038. asr counterL , tempK, #3 // L = K / 8
  1039. cmp counterL , #2 // is there at least 16 to do?
  1040. blt .Lstrmm_kernel_L8_M8_32
  1041. KERNEL8x8_I // do one in the K
  1042. KERNEL8x8_M2 // do another in the K
  1043. KERNEL8x8_M1
  1044. KERNEL8x8_M2
  1045. KERNEL8x8_M1
  1046. KERNEL8x8_M2
  1047. KERNEL8x8_M1
  1048. KERNEL8x8_M2
  1049. subs counterL, counterL, #2
  1050. ble .Lstrmm_kernel_L8_M8_22a
  1051. .align 5
  1052. .Lstrmm_kernel_L8_M8_22:
  1053. KERNEL8x8_M1
  1054. KERNEL8x8_M2
  1055. KERNEL8x8_M1
  1056. KERNEL8x8_M2
  1057. KERNEL8x8_M1
  1058. KERNEL8x8_M2
  1059. KERNEL8x8_M1
  1060. KERNEL8x8_M2
  1061. subs counterL, counterL, #1
  1062. bgt .Lstrmm_kernel_L8_M8_22
  1063. .Lstrmm_kernel_L8_M8_22a:
  1064. KERNEL8x8_M1
  1065. KERNEL8x8_M2
  1066. KERNEL8x8_M1
  1067. KERNEL8x8_M2
  1068. KERNEL8x8_M1
  1069. KERNEL8x8_M2
  1070. KERNEL8x8_M1
  1071. KERNEL8x8_E
  1072. b .Lstrmm_kernel_L8_M8_44
  1073. .Lstrmm_kernel_L8_M8_32:
  1074. tst counterL, #1
  1075. ble .Lstrmm_kernel_L8_M8_40
  1076. KERNEL8x8_I
  1077. KERNEL8x8_M2
  1078. KERNEL8x8_M1
  1079. KERNEL8x8_M2
  1080. KERNEL8x8_M1
  1081. KERNEL8x8_M2
  1082. KERNEL8x8_M1
  1083. KERNEL8x8_E
  1084. b .Lstrmm_kernel_L8_M8_44
  1085. .Lstrmm_kernel_L8_M8_40:
  1086. INIT8x8
  1087. .Lstrmm_kernel_L8_M8_44:
  1088. ands counterL , tempK, #7
  1089. ble .Lstrmm_kernel_L8_M8_100
  1090. .Lstrmm_kernel_L8_M8_46:
  1091. KERNEL8x8_SUB
  1092. subs counterL, counterL, 1
  1093. bgt .Lstrmm_kernel_L8_M8_46
  1094. .Lstrmm_kernel_L8_M8_100:
  1095. SAVE8x8
  1096. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1097. sub tempK, origK, tempOffset
  1098. #if defined(LEFT)
  1099. sub tempK, tempK, #8
  1100. #else
  1101. sub tempK, tempK, #8
  1102. #endif
  1103. lsl temp, tempK, #5
  1104. add pA, pA, temp
  1105. add pB, pB, temp
  1106. #endif
  1107. #if defined(LEFT)
  1108. add tempOffset, tempOffset, #8
  1109. #endif
  1110. .Lstrmm_kernel_L8_M8_END:
  1111. subs counterI, counterI, #1
  1112. bne .Lstrmm_kernel_L8_M8_20
  1113. /******************************************************************************/
  1114. .Lstrmm_kernel_L8_M4_BEGIN:
  1115. mov counterI, origM
  1116. tst counterI , #7
  1117. ble .Lstrmm_kernel_L8_END
  1118. tst counterI, #4
  1119. ble .Lstrmm_kernel_L8_M2_BEGIN
  1120. .Lstrmm_kernel_L8_M4_20:
  1121. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1122. mov pB, origPB
  1123. #else
  1124. mov pB, origPB
  1125. lsl temp, tempOffset, #4
  1126. add pA, pA, temp
  1127. lsl temp, tempOffset, #5
  1128. add pB, pB, temp
  1129. #endif
  1130. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1131. sub tempK, origK, tempOffset
  1132. #elif defined(LEFT)
  1133. add tempK, tempOffset, #4
  1134. #else
  1135. add tempK, tempOffset, #8
  1136. #endif
  1137. asr counterL , tempK, #1 // L = K / 2
  1138. cmp counterL , #2 // is there at least 4 to do?
  1139. blt .Lstrmm_kernel_L8_M4_32
  1140. KERNEL4x8_I // do one in the K
  1141. KERNEL4x8_M2 // do another in the K
  1142. subs counterL, counterL, #2
  1143. ble .Lstrmm_kernel_L8_M4_22a
  1144. .align 5
  1145. .Lstrmm_kernel_L8_M4_22:
  1146. KERNEL4x8_M1
  1147. KERNEL4x8_M2
  1148. subs counterL, counterL, #1
  1149. bgt .Lstrmm_kernel_L8_M4_22
  1150. .Lstrmm_kernel_L8_M4_22a:
  1151. KERNEL4x8_M1
  1152. KERNEL4x8_E
  1153. b .Lstrmm_kernel_L8_M4_44
  1154. .Lstrmm_kernel_L8_M4_32:
  1155. tst counterL, #1
  1156. ble .Lstrmm_kernel_L8_M4_40
  1157. KERNEL4x8_I
  1158. KERNEL4x8_E
  1159. b .Lstrmm_kernel_L8_M4_44
  1160. .Lstrmm_kernel_L8_M4_40:
  1161. INIT4x8
  1162. .Lstrmm_kernel_L8_M4_44:
  1163. ands counterL , tempK, #1
  1164. ble .Lstrmm_kernel_L8_M4_100
  1165. .Lstrmm_kernel_L8_M4_46:
  1166. KERNEL4x8_SUB
  1167. .Lstrmm_kernel_L8_M4_100:
  1168. SAVE4x8
  1169. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1170. sub tempK, origK, tempOffset
  1171. #if defined(LEFT)
  1172. sub tempK, tempK, #4
  1173. #else
  1174. sub tempK, tempK, #8
  1175. #endif
  1176. lsl temp, tempK, #4
  1177. add pA, pA, temp
  1178. lsl temp, tempK, #5
  1179. add pB, pB, temp
  1180. #endif
  1181. #if defined(LEFT)
  1182. add tempOffset, tempOffset, #4
  1183. #endif
  1184. .Lstrmm_kernel_L8_M4_END:
  1185. /******************************************************************************/
  1186. .Lstrmm_kernel_L8_M2_BEGIN:
  1187. mov counterI, origM
  1188. tst counterI , #3
  1189. ble .Lstrmm_kernel_L8_END
  1190. tst counterI, #2 // counterI = counterI / 2
  1191. ble .Lstrmm_kernel_L8_M1_BEGIN
  1192. .Lstrmm_kernel_L8_M2_20:
  1193. INIT2x8
  1194. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1195. mov pB, origPB
  1196. #else
  1197. mov pB, origPB
  1198. lsl temp, tempOffset, #3
  1199. add pA, pA, temp
  1200. lsl temp, tempOffset, #5
  1201. add pB, pB, temp
  1202. #endif
  1203. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1204. sub tempK, origK, tempOffset
  1205. #elif defined(LEFT)
  1206. add tempK, tempOffset, #2
  1207. #else
  1208. add tempK, tempOffset, #8
  1209. #endif
  1210. asr counterL , tempK, #3 // counterL = counterL / 8
  1211. cmp counterL , #0
  1212. ble .Lstrmm_kernel_L8_M2_40
  1213. .Lstrmm_kernel_L8_M2_22:
  1214. KERNEL2x8_SUB
  1215. KERNEL2x8_SUB
  1216. KERNEL2x8_SUB
  1217. KERNEL2x8_SUB
  1218. KERNEL2x8_SUB
  1219. KERNEL2x8_SUB
  1220. KERNEL2x8_SUB
  1221. KERNEL2x8_SUB
  1222. subs counterL, counterL, #1
  1223. bgt .Lstrmm_kernel_L8_M2_22
  1224. .Lstrmm_kernel_L8_M2_40:
  1225. ands counterL , tempK, #7 // counterL = counterL % 8
  1226. ble .Lstrmm_kernel_L8_M2_100
  1227. .Lstrmm_kernel_L8_M2_42:
  1228. KERNEL2x8_SUB
  1229. subs counterL, counterL, #1
  1230. bgt .Lstrmm_kernel_L8_M2_42
  1231. .Lstrmm_kernel_L8_M2_100:
  1232. SAVE2x8
  1233. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1234. sub tempK, origK, tempOffset
  1235. #if defined(LEFT)
  1236. sub tempK, tempK, #2
  1237. #else
  1238. sub tempK, tempK, #8
  1239. #endif
  1240. lsl temp, tempK, #3
  1241. add pA, pA, temp
  1242. lsl temp, tempK, #5
  1243. add pB, pB, temp
  1244. #endif
  1245. #if defined(LEFT)
  1246. add tempOffset, tempOffset, #2
  1247. #endif
  1248. .Lstrmm_kernel_L8_M2_END:
  1249. /******************************************************************************/
  1250. .Lstrmm_kernel_L8_M1_BEGIN:
  1251. tst counterI, #1 // counterI = counterI % 2
  1252. ble .Lstrmm_kernel_L8_END
  1253. .Lstrmm_kernel_L8_M1_20:
  1254. INIT1x8
  1255. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1256. mov pB, origPB
  1257. #else
  1258. mov pB, origPB
  1259. lsl temp, tempOffset, #2
  1260. add pA, pA, temp
  1261. lsl temp, tempOffset, #5
  1262. add pB, pB, temp
  1263. #endif
  1264. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1265. sub tempK, origK, tempOffset
  1266. #elif defined(LEFT)
  1267. add tempK, tempOffset, #1
  1268. #else
  1269. add tempK, tempOffset, #8
  1270. #endif
  1271. asr counterL , tempK, #3 // counterL = counterL / 8
  1272. cmp counterL , #0
  1273. ble .Lstrmm_kernel_L8_M1_40
  1274. .Lstrmm_kernel_L8_M1_22:
  1275. KERNEL1x8_SUB
  1276. KERNEL1x8_SUB
  1277. KERNEL1x8_SUB
  1278. KERNEL1x8_SUB
  1279. KERNEL1x8_SUB
  1280. KERNEL1x8_SUB
  1281. KERNEL1x8_SUB
  1282. KERNEL1x8_SUB
  1283. subs counterL, counterL, #1
  1284. bgt .Lstrmm_kernel_L8_M1_22
  1285. .Lstrmm_kernel_L8_M1_40:
  1286. ands counterL , tempK, #7 // counterL = counterL % 8
  1287. ble .Lstrmm_kernel_L8_M1_100
  1288. .Lstrmm_kernel_L8_M1_42:
  1289. KERNEL1x8_SUB
  1290. subs counterL, counterL, #1
  1291. bgt .Lstrmm_kernel_L8_M1_42
  1292. .Lstrmm_kernel_L8_M1_100:
  1293. SAVE1x8
  1294. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1295. sub tempK, origK, tempOffset
  1296. #if defined(LEFT)
  1297. sub tempK, tempK, #1
  1298. #else
  1299. sub tempK, tempK, #8
  1300. #endif
  1301. lsl temp, tempK, #2
  1302. add pA, pA, temp
  1303. lsl temp, tempK, #5
  1304. add pB, pB, temp
  1305. #endif
  1306. #if defined(LEFT)
  1307. add tempOffset, tempOffset, #1
  1308. #endif
  1309. .Lstrmm_kernel_L8_END:
  1310. lsl temp, origK, #5 // B = B + K * 4 * 8
  1311. add origPB, origPB, temp
  1312. #if !defined(LEFT)
  1313. add tempOffset, tempOffset, #8
  1314. #endif
  1315. subs counterJ, counterJ , #1 // j--
  1316. bgt .Lstrmm_kernel_L8_BEGIN
  1317. /******************************************************************************/
  1318. /******************************************************************************/
  1319. .Lstrmm_kernel_L4_BEGIN:
  1320. mov counterJ , origN
  1321. tst counterJ , #7
  1322. ble .Lstrmm_kernel_L999
  1323. tst counterJ , #4
  1324. ble .Lstrmm_kernel_L2_BEGIN
  1325. mov pCRow0, pC // pCRow0 = pC
  1326. add pC,pC,LDC, lsl #2
  1327. #if defined(LEFT)
  1328. mov tempOffset, offset
  1329. #endif
  1330. mov pA, origPA // pA = A
  1331. /******************************************************************************/
  1332. .Lstrmm_kernel_L4_M8_BEGIN:
  1333. mov counterI, origM
  1334. asr counterI, counterI, #3 // counterI = counterI / 8
  1335. cmp counterI, #0
  1336. ble .Lstrmm_kernel_L4_M4_BEGIN
  1337. .Lstrmm_kernel_L4_M8_20:
  1338. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1339. mov pB, origPB
  1340. #else
  1341. mov pB, origPB
  1342. lsl temp, tempOffset, #5
  1343. add pA, pA, temp
  1344. lsl temp, tempOffset, #4
  1345. add pB, pB, temp
  1346. #endif
  1347. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1348. sub tempK, origK, tempOffset
  1349. #elif defined(LEFT)
  1350. add tempK, tempOffset, #8
  1351. #else
  1352. add tempK, tempOffset, #4
  1353. #endif
  1354. asr counterL , tempK, #1 // L = K / 2
  1355. cmp counterL , #2 // is there at least 4 to do?
  1356. blt .Lstrmm_kernel_L4_M8_32
  1357. KERNEL8x4_I // do one in the K
  1358. KERNEL8x4_M2 // do another in the K
  1359. subs counterL, counterL, #2
  1360. ble .Lstrmm_kernel_L4_M8_22a
  1361. .align 5
  1362. .Lstrmm_kernel_L4_M8_22:
  1363. KERNEL8x4_M1
  1364. KERNEL8x4_M2
  1365. subs counterL, counterL, #1
  1366. bgt .Lstrmm_kernel_L4_M8_22
  1367. .Lstrmm_kernel_L4_M8_22a:
  1368. KERNEL8x4_M1
  1369. KERNEL8x4_E
  1370. b .Lstrmm_kernel_L4_M8_44
  1371. .Lstrmm_kernel_L4_M8_32:
  1372. tst counterL, #1
  1373. ble .Lstrmm_kernel_L4_M8_40
  1374. KERNEL8x4_I
  1375. KERNEL8x4_E
  1376. b .Lstrmm_kernel_L4_M8_44
  1377. .Lstrmm_kernel_L4_M8_40:
  1378. INIT8x4
  1379. .Lstrmm_kernel_L4_M8_44:
  1380. ands counterL , tempK, #1
  1381. ble .Lstrmm_kernel_L4_M8_100
  1382. .Lstrmm_kernel_L4_M8_46:
  1383. KERNEL8x4_SUB
  1384. .Lstrmm_kernel_L4_M8_100:
  1385. SAVE8x4
  1386. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1387. sub tempK, origK, tempOffset
  1388. #if defined(LEFT)
  1389. sub tempK, tempK, #8
  1390. #else
  1391. sub tempK, tempK, #4
  1392. #endif
  1393. lsl temp, tempK, #5
  1394. add pA, pA, temp
  1395. lsl temp, tempK, #4
  1396. add pB, pB, temp
  1397. #endif
  1398. #if defined(LEFT)
  1399. add tempOffset, tempOffset, #8
  1400. #endif
  1401. .Lstrmm_kernel_L4_M8_END:
  1402. subs counterI, counterI, #1
  1403. bne .Lstrmm_kernel_L4_M8_20
  1404. /******************************************************************************/
  1405. .Lstrmm_kernel_L4_M4_BEGIN:
  1406. mov counterI, origM
  1407. tst counterI , #7
  1408. ble .Lstrmm_kernel_L4_END
  1409. tst counterI, #4
  1410. ble .Lstrmm_kernel_L4_M2_BEGIN
  1411. .Lstrmm_kernel_L4_M4_20:
  1412. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1413. mov pB, origPB
  1414. #else
  1415. mov pB, origPB
  1416. lsl temp, tempOffset, #4
  1417. add pB, pB, temp
  1418. add pA, pA, temp
  1419. #endif
  1420. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1421. sub tempK, origK, tempOffset
  1422. #elif defined(LEFT)
  1423. add tempK, tempOffset, #4
  1424. #else
  1425. add tempK, tempOffset, #4
  1426. #endif
  1427. asr counterL , tempK, #1 // L = K / 2
  1428. cmp counterL , #2 // is there at least 4 to do?
  1429. blt .Lstrmm_kernel_L4_M4_32
	KERNEL4x4_I // do one in the K
	KERNEL4x4_M2 // do another in the K
	subs counterL, counterL, #2
	ble .Lstrmm_kernel_L4_M4_22a

	.align 5
.Lstrmm_kernel_L4_M4_22:
	KERNEL4x4_M1
	KERNEL4x4_M2
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L4_M4_22

.Lstrmm_kernel_L4_M4_22a:
	KERNEL4x4_M1
	KERNEL4x4_E
	b .Lstrmm_kernel_L4_M4_44

.Lstrmm_kernel_L4_M4_32:
	tst counterL, #1
	ble .Lstrmm_kernel_L4_M4_40
	KERNEL4x4_I
	KERNEL4x4_E
	b .Lstrmm_kernel_L4_M4_44

.Lstrmm_kernel_L4_M4_40:
	INIT4x4

.Lstrmm_kernel_L4_M4_44:
	ands counterL, tempK, #1
	ble .Lstrmm_kernel_L4_M4_100

.Lstrmm_kernel_L4_M4_46:
	KERNEL4x4_SUB

.Lstrmm_kernel_L4_M4_100:
	SAVE4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #4
#else
	sub tempK, tempK, #4
#endif
	lsl temp, tempK, #4
	add pA, pA, temp
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #4
#endif

.Lstrmm_kernel_L4_M4_END:

/******************************************************************************/

.Lstrmm_kernel_L4_M2_BEGIN:
	mov counterI, origM
	tst counterI, #3
	ble .Lstrmm_kernel_L4_END
	tst counterI, #2 // M mod 4 >= 2 ?
	ble .Lstrmm_kernel_L4_M1_BEGIN

.Lstrmm_kernel_L4_M2_20:
	INIT2x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #3
	add pA, pA, temp
	lsl temp, tempOffset, #4
	add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #2
#else
	add tempK, tempOffset, #4
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L4_M2_40
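// 2x4 tile: plain multiply-accumulate kernel with the k loop unrolled eight
// times; counterL = tempK / 8 full groups here, and the remainder (tempK % 8)
// is handled one KERNEL2x4_SUB at a time at _40/_42.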
.Lstrmm_kernel_L4_M2_22:
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L4_M2_22

.Lstrmm_kernel_L4_M2_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L4_M2_100

.Lstrmm_kernel_L4_M2_42:
	KERNEL2x4_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L4_M2_42

.Lstrmm_kernel_L4_M2_100:
	SAVE2x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #2
#else
	sub tempK, tempK, #4
#endif
	lsl temp, tempK, #3
	add pA, pA, temp
	lsl temp, tempK, #4
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #2
#endif

.Lstrmm_kernel_L4_M2_END:

/******************************************************************************/
.Lstrmm_kernel_L4_M1_BEGIN:
	tst counterI, #1 // M odd ?
	ble .Lstrmm_kernel_L4_END

.Lstrmm_kernel_L4_M1_20:
	INIT1x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #4
	add pB, pB, temp
	lsl temp, tempOffset, #2
	add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #1
#else
	add tempK, tempOffset, #4
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L4_M1_40

.Lstrmm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L4_M1_22

.Lstrmm_kernel_L4_M1_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L4_M1_100

.Lstrmm_kernel_L4_M1_42:
	KERNEL1x4_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L4_M1_42

.Lstrmm_kernel_L4_M1_100:
	SAVE1x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #1
#else
	sub tempK, tempK, #4
#endif
	lsl temp, tempK, #2
	add pA, pA, temp
	lsl temp, tempK, #4
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #1
#endif
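// End of one 4-column pass over N: advance the packed B pointer past the panel
// just consumed (K * 4 floats * 4 bytes) and, when the triangular matrix is on
// the right, move the diagonal offset down by those 4 columns.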
.Lstrmm_kernel_L4_END:
	add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
#if !defined(LEFT)
	add tempOffset, tempOffset, #4
#endif

/******************************************************************************/
/******************************************************************************/
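// Second-level tail in the N direction: one extra pass over a pair of columns
// when N mod 4 is 2 or 3, using the same 8/4/2/1-row tiles with a 2-wide B panel.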
.Lstrmm_kernel_L2_BEGIN: // less than 4 columns left in the N direction
	mov counterJ, origN
	tst counterJ, #3
	ble .Lstrmm_kernel_L999
	tst counterJ, #2
	ble .Lstrmm_kernel_L1_BEGIN

	mov pCRow0, pC // pCRow0 = pC
	add pC, pC, LDC, lsl #1
#if defined(LEFT)
	mov tempOffset, offset
#endif
	mov pA, origPA // pA = A

/******************************************************************************/

.Lstrmm_kernel_L2_M8_BEGIN:
	mov counterI, origM
	asr counterI, counterI, #3 // counterI = counterI / 8
	cmp counterI, #0
	ble .Lstrmm_kernel_L2_M4_BEGIN

.Lstrmm_kernel_L2_M8_20:
	INIT8x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #5
	add pA, pA, temp
	lsl temp, tempOffset, #3
	add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #8
#else
	add tempK, tempOffset, #2
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L2_M8_40

	.align 5
.Lstrmm_kernel_L2_M8_22:
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L2_M8_22

.Lstrmm_kernel_L2_M8_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L2_M8_100

.Lstrmm_kernel_L2_M8_42:
	KERNEL8x2_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L2_M8_42

.Lstrmm_kernel_L2_M8_100:
	SAVE8x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #8
#else
	sub tempK, tempK, #2
#endif
	lsl temp, tempK, #5
	add pA, pA, temp
	lsl temp, tempK, #3
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #8
#endif

.Lstrmm_kernel_L2_M8_END:
	subs counterI, counterI, #1
	bgt .Lstrmm_kernel_L2_M8_20

/******************************************************************************/
.Lstrmm_kernel_L2_M4_BEGIN:
	mov counterI, origM
	tst counterI, #7
	ble .Lstrmm_kernel_L2_END
	tst counterI, #4
	ble .Lstrmm_kernel_L2_M2_BEGIN

.Lstrmm_kernel_L2_M4_20:
	INIT4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #3
	add pB, pB, temp
	lsl temp, tempOffset, #4
	add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #4
#else
	add tempK, tempOffset, #2
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L2_M4_40

	.align 5
.Lstrmm_kernel_L2_M4_22:
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L2_M4_22

.Lstrmm_kernel_L2_M4_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L2_M4_100

.Lstrmm_kernel_L2_M4_42:
	KERNEL4x2_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L2_M4_42

.Lstrmm_kernel_L2_M4_100:
	SAVE4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #4
#else
	sub tempK, tempK, #2
#endif
	lsl temp, tempK, #4
	add pA, pA, temp
	lsl temp, tempK, #3
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #4
#endif

.Lstrmm_kernel_L2_M4_END:

/******************************************************************************/
.Lstrmm_kernel_L2_M2_BEGIN:
	mov counterI, origM
	tst counterI, #3
	ble .Lstrmm_kernel_L2_END
	tst counterI, #2 // M mod 4 >= 2 ?
	ble .Lstrmm_kernel_L2_M1_BEGIN

.Lstrmm_kernel_L2_M2_20:
	INIT2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #3
	add pB, pB, temp
	lsl temp, tempOffset, #3
	add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #2
#else
	add tempK, tempOffset, #2
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L2_M2_40

.Lstrmm_kernel_L2_M2_22:
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L2_M2_22

.Lstrmm_kernel_L2_M2_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L2_M2_100

.Lstrmm_kernel_L2_M2_42:
	KERNEL2x2_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L2_M2_42

.Lstrmm_kernel_L2_M2_100:
	SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #2
#else
	sub tempK, tempK, #2
#endif
	lsl temp, tempK, #3
	add pA, pA, temp
	lsl temp, tempK, #3
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #2
#endif

.Lstrmm_kernel_L2_M2_END:

/******************************************************************************/
.Lstrmm_kernel_L2_M1_BEGIN:
	tst counterI, #1 // M odd ?
	ble .Lstrmm_kernel_L2_END

.Lstrmm_kernel_L2_M1_20:
	INIT1x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #3
	add pB, pB, temp
	lsl temp, tempOffset, #2
	add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #1
#else
	add tempK, tempOffset, #2
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L2_M1_40

.Lstrmm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L2_M1_22

.Lstrmm_kernel_L2_M1_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L2_M1_100

.Lstrmm_kernel_L2_M1_42:
	KERNEL1x2_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L2_M1_42

.Lstrmm_kernel_L2_M1_100:
	SAVE1x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #1
#else
	sub tempK, tempK, #2
#endif
	lsl temp, tempK, #2
	add pA, pA, temp
	lsl temp, tempK, #3
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #1
#endif

.Lstrmm_kernel_L2_END:
#if !defined(LEFT)
	add tempOffset, tempOffset, #2
#endif
	add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4

/******************************************************************************/
/******************************************************************************/
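// Last N tail: a single remaining column when N is odd, again tiled over
// 8/4/2/1 rows of M with a 1-wide B panel.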
.Lstrmm_kernel_L1_BEGIN:
	mov counterJ, origN
	tst counterJ, #1
	ble .Lstrmm_kernel_L999 // done

	mov pCRow0, pC // pCRow0 = C
	add pC, pC, LDC // update pC to point to the next column
#if defined(LEFT)
	mov tempOffset, offset
#endif
	mov pA, origPA // pA = A

/******************************************************************************/

.Lstrmm_kernel_L1_M8_BEGIN:
	mov counterI, origM
	asr counterI, counterI, #3
	cmp counterI, #0
	ble .Lstrmm_kernel_L1_M4_BEGIN

.Lstrmm_kernel_L1_M8_20:
	INIT8x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #5
	add pA, pA, temp
	lsl temp, tempOffset, #2
	add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #8
#else
	add tempK, tempOffset, #1
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L1_M8_40

	.align 5
.Lstrmm_kernel_L1_M8_22:
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L1_M8_22

.Lstrmm_kernel_L1_M8_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L1_M8_100

.Lstrmm_kernel_L1_M8_42:
	KERNEL8x1_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L1_M8_42

.Lstrmm_kernel_L1_M8_100:
	SAVE8x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #8
#else
	sub tempK, tempK, #1
#endif
	lsl temp, tempK, #5
	add pA, pA, temp
	lsl temp, tempK, #2
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #8
#endif

.Lstrmm_kernel_L1_M8_END:
	subs counterI, counterI, #1
	bgt .Lstrmm_kernel_L1_M8_20

/******************************************************************************/
.Lstrmm_kernel_L1_M4_BEGIN:
	mov counterI, origM
	tst counterI, #7
	ble .Lstrmm_kernel_L1_END
	tst counterI, #4
	ble .Lstrmm_kernel_L1_M2_BEGIN

.Lstrmm_kernel_L1_M4_20:
	INIT4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #2
	add pB, pB, temp
	lsl temp, tempOffset, #4
	add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #4
#else
	add tempK, tempOffset, #1
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L1_M4_40

	.align 5
.Lstrmm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L1_M4_22

.Lstrmm_kernel_L1_M4_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L1_M4_100

.Lstrmm_kernel_L1_M4_42:
	KERNEL4x1_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L1_M4_42

.Lstrmm_kernel_L1_M4_100:
	SAVE4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #4
#else
	sub tempK, tempK, #1
#endif
	lsl temp, tempK, #4
	add pA, pA, temp
	lsl temp, tempK, #2
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #4
#endif

.Lstrmm_kernel_L1_M4_END:

/******************************************************************************/
.Lstrmm_kernel_L1_M2_BEGIN:
	mov counterI, origM
	tst counterI, #3
	ble .Lstrmm_kernel_L1_END
	tst counterI, #2 // M mod 4 >= 2 ?
	ble .Lstrmm_kernel_L1_M1_BEGIN

.Lstrmm_kernel_L1_M2_20:
	INIT2x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #2
	add pB, pB, temp
	lsl temp, tempOffset, #3
	add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #2
#else
	add tempK, tempOffset, #1
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L1_M2_40

.Lstrmm_kernel_L1_M2_22:
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L1_M2_22

.Lstrmm_kernel_L1_M2_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L1_M2_100

.Lstrmm_kernel_L1_M2_42:
	KERNEL2x1_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L1_M2_42

.Lstrmm_kernel_L1_M2_100:
	SAVE2x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub tempK, origK, tempOffset
#if defined(LEFT)
	sub tempK, tempK, #2
#else
	sub tempK, tempK, #1
#endif
	lsl temp, tempK, #3
	add pA, pA, temp
	lsl temp, tempK, #2
	add pB, pB, temp
#endif
#if defined(LEFT)
	add tempOffset, tempOffset, #2
#endif

.Lstrmm_kernel_L1_M2_END:

/******************************************************************************/
.Lstrmm_kernel_L1_M1_BEGIN:
	tst counterI, #1 // M odd ?
	ble .Lstrmm_kernel_L1_END

.Lstrmm_kernel_L1_M1_20:
	INIT1x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov pB, origPB
#else
	mov pB, origPB
	lsl temp, tempOffset, #2
	add pB, pB, temp
	lsl temp, tempOffset, #2
	add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub tempK, origK, tempOffset
#elif defined(LEFT)
	add tempK, tempOffset, #1
#else
	add tempK, tempOffset, #1
#endif
	asr counterL, tempK, #3 // counterL = tempK / 8
	cmp counterL, #0
	ble .Lstrmm_kernel_L1_M1_40

.Lstrmm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L1_M1_22

.Lstrmm_kernel_L1_M1_40:
	ands counterL, tempK, #7 // counterL = tempK % 8
	ble .Lstrmm_kernel_L1_M1_100

.Lstrmm_kernel_L1_M1_42:
	KERNEL1x1_SUB
	subs counterL, counterL, #1
	bgt .Lstrmm_kernel_L1_M1_42

.Lstrmm_kernel_L1_M1_100:
	SAVE1x1

.Lstrmm_kernel_L1_END:

/******************************************************************************/
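// Common exit: restore the floating-point and general-purpose registers spilled
// in the prologue, release the 11 * 16 = 176-byte stack area and return 0.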
.Lstrmm_kernel_L999:
	mov x0, #0 // set return value
	ldp d8, d9, [sp, #(0 * 16)]
	ldp d10, d11, [sp, #(1 * 16)]
	ldp d12, d13, [sp, #(2 * 16)]
	ldp d14, d15, [sp, #(3 * 16)]
	ldp d16, d17, [sp, #(4 * 16)]
	ldp x18, x19, [sp, #(5 * 16)]
	ldp x20, x21, [sp, #(6 * 16)]
	ldp x22, x23, [sp, #(7 * 16)]
	ldp x24, x25, [sp, #(8 * 16)]
	ldp x26, x27, [sp, #(9 * 16)]
	ldr x28, [sp, #(10 * 16)]
	add sp, sp, #(11*16)
	ret

	EPILOGUE