You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

strmm_kernel_8x8.S 54 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pA x15
  47. #define temp x16
  48. #define tempOffset x17
  49. #define tempK x18
  50. #define alpha0 s10
  51. #define alphaV0 v10.s[0]
  52. #define alpha1 s11
  53. #define alphaV1 v11.s[0]
  54. #define alpha2 s14
  55. #define alphaV2 v14.s[0]
  56. #define alpha3 s15
  57. #define alphaV3 v15.s[0]
  58. // 00 origM
  59. // 01 origN
  60. // 02 origK
  61. // 03 origPA
  62. // 04 origPB
  63. // 05 pC
  64. // 06 origLDC -> LDC
  65. // 07 offset
  66. // 08 counterL
  67. // 09 counterI
  68. // 10 counterJ
  69. // 11 pB
  70. // 12 pCRow0
  71. // 13 pCRow1
  72. // 14 pCRow2
  73. // 15 pA
  74. // 16 temp
  75. // 17 tempOffset
  76. // 18 must save tempK
  77. // 19 must save
  78. // 20 must save
  79. // 21 must save
  80. // 22 must save
  81. // 23 must save
  82. // 24 must save
  83. // 25 must save
  84. // 26 must save
  85. // 27 must save
  86. // 28 must save
  87. // 29 frame
  88. // 30 link
  89. // 31 sp
  90. //v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3
  91. //v01 pA0_4, pA0_5, pA0_6, pA0_7
  92. //v02 pA1_0, pA1_1, pA1_2, pA1_3
  93. //v03 pA1_4, pA1_5, pA1_6, pA1_7
  94. //v04 pB0_0, pB0_1, pB0_2, pB0_3
  95. //v05 pB0_4, pB0_5, pB0_6, pB0_7
  96. //v06 pB1_0, pB1_1, pB1_2, pB1_3
  97. //v07 pB1_4, pB1_5, pB1_6, pB1_7
  98. //v08 must save
  99. //v09 must save
  100. //v10 must save ALPHA0
  101. //v11 must save ALPHA1
  102. //v12 must save
  103. //v13 must save
  104. //v14 must save ALPHA2
  105. //v15 must save ALPHA3
  106. //v16 must save C00, C01, C02, C03
  107. //v17 must save C04, C05, C06, C07
  108. //v18 C08, C09, C10, C11
  109. //v19 C12, C13, C14, C15
  110. //v20 C16, C17, C18, C19
  111. //v21 C20, C21, C22, C23
  112. //v22 C24, C25, C26, C27
  113. //v23 C28, C29, C30, C31
  114. //v24 C32, C33, C34, C35
  115. //v25 C36, C37, C38, C39
  116. //v26 C40, C41, C42, C43
  117. //v27 C44, C45, C46, C47
  118. //v28 C48, C49, C50, C51
  119. //v29 C52, C53, C54, C55
  120. //v30 C56, C57, C58, C59
  121. //v31 C60, C61, C62, C63
  122. /*******************************************************************************
  123. * Macro definitions
  124. *******************************************************************************/
  125. .macro INIT8x8
  126. fmov s16, wzr
  127. fmov s17, wzr
  128. fmov s18, s16
  129. fmov s19, s17
  130. fmov s20, wzr
  131. fmov s21, s16
  132. fmov s22, s17
  133. fmov s23, s18
  134. fmov s24, wzr
  135. fmov s25, s16
  136. fmov s26, s17
  137. fmov s27, s18
  138. fmov s28, wzr
  139. fmov s29, s16
  140. fmov s30, s17
  141. fmov s31, s18
  142. .endm
  143. .macro KERNEL8x8_I
  144. ld1 {v4.4s}, [pB]
  145. add pB, pB, #16
  146. ld1 {v5.4s}, [pB]
  147. add pB, pB, #16
  148. ld1 {v0.4s}, [pA]
  149. add pA, pA, #16
  150. ld1 {v1.4s}, [pA]
  151. add pA, pA, #16
  152. fmul v16.4s, v0.4s, v4.s[0]
  153. fmul v17.4s, v1.4s, v4.s[0]
  154. fmul v18.4s, v0.4s, v4.s[1]
  155. fmul v19.4s, v1.4s, v4.s[1]
  156. fmul v20.4s, v0.4s, v4.s[2]
  157. fmul v21.4s, v1.4s, v4.s[2]
  158. fmul v22.4s, v0.4s, v4.s[3]
  159. fmul v23.4s, v1.4s, v4.s[3]
  160. fmul v24.4s, v0.4s, v5.s[0]
  161. fmul v25.4s, v1.4s, v5.s[0]
  162. fmul v26.4s, v0.4s, v5.s[1]
  163. fmul v27.4s, v1.4s, v5.s[1]
  164. fmul v28.4s, v0.4s, v5.s[2]
  165. fmul v29.4s, v1.4s, v5.s[2]
  166. fmul v30.4s, v0.4s, v5.s[3]
  167. fmul v31.4s, v1.4s, v5.s[3]
  168. ld1 {v6.4s}, [pB]
  169. add pB, pB, #16
  170. ld1 {v7.4s}, [pB]
  171. add pB, pB, #16
  172. ld1 {v2.4s}, [pA]
  173. add pA, pA, #16
  174. ld1 {v3.4s}, [pA]
  175. add pA, pA, #16
  176. .endm
  177. .macro KERNEL8x8_M1
  178. fmla v16.4s, v0.4s, v4.s[0]
  179. fmla v17.4s, v1.4s, v4.s[0]
  180. fmla v18.4s, v0.4s, v4.s[1]
  181. fmla v19.4s, v1.4s, v4.s[1]
  182. fmla v20.4s, v0.4s, v4.s[2]
  183. fmla v21.4s, v1.4s, v4.s[2]
  184. fmla v22.4s, v0.4s, v4.s[3]
  185. fmla v23.4s, v1.4s, v4.s[3]
  186. fmla v24.4s, v0.4s, v5.s[0]
  187. fmla v25.4s, v1.4s, v5.s[0]
  188. fmla v26.4s, v0.4s, v5.s[1]
  189. fmla v27.4s, v1.4s, v5.s[1]
  190. fmla v28.4s, v0.4s, v5.s[2]
  191. fmla v29.4s, v1.4s, v5.s[2]
  192. fmla v30.4s, v0.4s, v5.s[3]
  193. fmla v31.4s, v1.4s, v5.s[3]
  194. ld1 {v6.4s}, [pB]
  195. add pB, pB, #16
  196. ld1 {v7.4s}, [pB]
  197. add pB, pB, #16
  198. ld1 {v2.4s}, [pA]
  199. add pA, pA, #16
  200. ld1 {v3.4s}, [pA]
  201. add pA, pA, #16
  202. .endm
  203. .macro KERNEL8x8_M2
  204. fmla v16.4s, v2.4s, v6.s[0]
  205. fmla v17.4s, v3.4s, v6.s[0]
  206. fmla v18.4s, v2.4s, v6.s[1]
  207. fmla v19.4s, v3.4s, v6.s[1]
  208. fmla v20.4s, v2.4s, v6.s[2]
  209. fmla v21.4s, v3.4s, v6.s[2]
  210. fmla v22.4s, v2.4s, v6.s[3]
  211. fmla v23.4s, v3.4s, v6.s[3]
  212. fmla v24.4s, v2.4s, v7.s[0]
  213. fmla v25.4s, v3.4s, v7.s[0]
  214. fmla v26.4s, v2.4s, v7.s[1]
  215. fmla v27.4s, v3.4s, v7.s[1]
  216. fmla v28.4s, v2.4s, v7.s[2]
  217. fmla v29.4s, v3.4s, v7.s[2]
  218. fmla v30.4s, v2.4s, v7.s[3]
  219. fmla v31.4s, v3.4s, v7.s[3]
  220. ld1 {v4.4s}, [pB]
  221. add pB, pB, #16
  222. ld1 {v5.4s}, [pB]
  223. add pB, pB, #16
  224. ld1 {v0.4s}, [pA]
  225. add pA, pA, #16
  226. ld1 {v1.4s}, [pA]
  227. add pA, pA, #16
  228. .endm
  229. .macro KERNEL8x8_E
  230. fmla v16.4s, v2.4s, v6.s[0]
  231. fmla v17.4s, v3.4s, v6.s[0]
  232. fmla v18.4s, v2.4s, v6.s[1]
  233. fmla v19.4s, v3.4s, v6.s[1]
  234. fmla v20.4s, v2.4s, v6.s[2]
  235. fmla v21.4s, v3.4s, v6.s[2]
  236. fmla v22.4s, v2.4s, v6.s[3]
  237. fmla v23.4s, v3.4s, v6.s[3]
  238. fmla v24.4s, v2.4s, v7.s[0]
  239. fmla v25.4s, v3.4s, v7.s[0]
  240. fmla v26.4s, v2.4s, v7.s[1]
  241. fmla v27.4s, v3.4s, v7.s[1]
  242. fmla v28.4s, v2.4s, v7.s[2]
  243. fmla v29.4s, v3.4s, v7.s[2]
  244. fmla v30.4s, v2.4s, v7.s[3]
  245. fmla v31.4s, v3.4s, v7.s[3]
  246. .endm
  247. .macro KERNEL8x8_SUB
  248. ld1 {v4.4s}, [pB]
  249. add pB, pB, #16
  250. ld1 {v5.4s}, [pB]
  251. add pB, pB, #16
  252. ld1 {v0.4s}, [pA]
  253. add pA, pA, #16
  254. ld1 {v1.4s}, [pA]
  255. add pA, pA, #16
  256. fmla v16.4s, v0.4s, v4.s[0]
  257. fmla v17.4s, v1.4s, v4.s[0]
  258. fmla v18.4s, v0.4s, v4.s[1]
  259. fmla v19.4s, v1.4s, v4.s[1]
  260. fmla v20.4s, v0.4s, v4.s[2]
  261. fmla v21.4s, v1.4s, v4.s[2]
  262. fmla v22.4s, v0.4s, v4.s[3]
  263. fmla v23.4s, v1.4s, v4.s[3]
  264. fmla v24.4s, v0.4s, v5.s[0]
  265. fmla v25.4s, v1.4s, v5.s[0]
  266. fmla v26.4s, v0.4s, v5.s[1]
  267. fmla v27.4s, v1.4s, v5.s[1]
  268. fmla v28.4s, v0.4s, v5.s[2]
  269. fmla v29.4s, v1.4s, v5.s[2]
  270. fmla v30.4s, v0.4s, v5.s[3]
  271. fmla v31.4s, v1.4s, v5.s[3]
  272. .endm
  273. .macro SAVE8x8
  274. add pCRow1, pCRow0, LDC
  275. fmul v0.4s, v16.4s, alphaV0
  276. fmul v1.4s, v17.4s, alphaV1
  277. st1 {v0.4s, v1.4s}, [pCRow0]
  278. add pCRow2, pCRow1, LDC
  279. fmul v2.4s, v18.4s, alphaV2
  280. fmul v3.4s, v19.4s, alphaV3
  281. st1 {v2.4s, v3.4s}, [pCRow1]
  282. add pCRow1, pCRow2, LDC
  283. fmul v4.4s, v20.4s, alphaV0
  284. fmul v5.4s, v21.4s, alphaV1
  285. st1 {v4.4s, v5.4s}, [pCRow2]
  286. add pCRow2, pCRow1, LDC
  287. fmul v6.4s, v22.4s, alphaV2
  288. fmul v7.4s, v23.4s, alphaV3
  289. st1 {v6.4s, v7.4s}, [pCRow1]
  290. add pCRow1, pCRow2, LDC
  291. fmul v0.4s, v24.4s, alphaV0
  292. fmul v1.4s, v25.4s, alphaV1
  293. st1 {v0.4s, v1.4s}, [pCRow2]
  294. add pCRow2, pCRow1, LDC
  295. fmul v2.4s, v26.4s, alphaV2
  296. fmul v3.4s, v27.4s, alphaV3
  297. st1 {v2.4s, v3.4s}, [pCRow1]
  298. add pCRow1, pCRow2, LDC
  299. fmul v4.4s, v28.4s, alphaV0
  300. fmul v5.4s, v29.4s, alphaV1
  301. st1 {v4.4s, v5.4s}, [pCRow2]
  302. fmul v6.4s, v30.4s, alphaV2
  303. fmul v7.4s, v31.4s, alphaV3
  304. st1 {v6.4s, v7.4s}, [pCRow1]
  305. add pCRow0, pCRow0, #32
  306. .endm
  307. /******************************************************************************/
  308. .macro INIT4x8
  309. fmov s16, wzr
  310. fmov s18, wzr
  311. fmov s20, wzr
  312. fmov s22, s16
  313. fmov s24, wzr
  314. fmov s26, s16
  315. fmov s28, s18
  316. fmov s30, s20
  317. .endm
  318. .macro KERNEL4x8_I
  319. ld1 {v4.4s}, [pB]
  320. add pB, pB, #16
  321. ld1 {v5.4s}, [pB]
  322. add pB, pB, #16
  323. ld1 {v0.4s}, [pA]
  324. add pA, pA, #16
  325. fmul v16.4s, v0.4s, v4.s[0]
  326. fmul v18.4s, v0.4s, v4.s[1]
  327. fmul v20.4s, v0.4s, v4.s[2]
  328. fmul v22.4s, v0.4s, v4.s[3]
  329. fmul v24.4s, v0.4s, v5.s[0]
  330. fmul v26.4s, v0.4s, v5.s[1]
  331. fmul v28.4s, v0.4s, v5.s[2]
  332. fmul v30.4s, v0.4s, v5.s[3]
  333. ld1 {v6.4s}, [pB]
  334. add pB, pB, #16
  335. ld1 {v7.4s}, [pB]
  336. add pB, pB, #16
  337. ld1 {v2.4s}, [pA]
  338. add pA, pA, #16
  339. .endm
  340. .macro KERNEL4x8_M1
  341. fmla v16.4s, v0.4s, v4.s[0]
  342. fmla v18.4s, v0.4s, v4.s[1]
  343. fmla v20.4s, v0.4s, v4.s[2]
  344. fmla v22.4s, v0.4s, v4.s[3]
  345. fmla v24.4s, v0.4s, v5.s[0]
  346. fmla v26.4s, v0.4s, v5.s[1]
  347. fmla v28.4s, v0.4s, v5.s[2]
  348. fmla v30.4s, v0.4s, v5.s[3]
  349. ld1 {v6.4s}, [pB]
  350. add pB, pB, #16
  351. ld1 {v7.4s}, [pB]
  352. add pB, pB, #16
  353. ld1 {v2.4s}, [pA]
  354. add pA, pA, #16
  355. .endm
  356. .macro KERNEL4x8_M2
  357. fmla v16.4s, v2.4s, v6.s[0]
  358. fmla v18.4s, v2.4s, v6.s[1]
  359. fmla v20.4s, v2.4s, v6.s[2]
  360. fmla v22.4s, v2.4s, v6.s[3]
  361. fmla v24.4s, v2.4s, v7.s[0]
  362. fmla v26.4s, v2.4s, v7.s[1]
  363. fmla v28.4s, v2.4s, v7.s[2]
  364. fmla v30.4s, v2.4s, v7.s[3]
  365. ld1 {v4.4s}, [pB]
  366. add pB, pB, #16
  367. ld1 {v5.4s}, [pB]
  368. add pB, pB, #16
  369. ld1 {v0.4s}, [pA]
  370. add pA, pA, #16
  371. .endm
  372. .macro KERNEL4x8_E
  373. fmla v16.4s, v2.4s, v6.s[0]
  374. fmla v18.4s, v2.4s, v6.s[1]
  375. fmla v20.4s, v2.4s, v6.s[2]
  376. fmla v22.4s, v2.4s, v6.s[3]
  377. fmla v24.4s, v2.4s, v7.s[0]
  378. fmla v26.4s, v2.4s, v7.s[1]
  379. fmla v28.4s, v2.4s, v7.s[2]
  380. fmla v30.4s, v2.4s, v7.s[3]
  381. .endm
  382. .macro KERNEL4x8_SUB
  383. ld1 {v4.4s}, [pB]
  384. add pB, pB, #16
  385. ld1 {v5.4s}, [pB]
  386. add pB, pB, #16
  387. ld1 {v0.4s}, [pA]
  388. add pA, pA, #16
  389. fmla v16.4s, v0.4s, v4.s[0]
  390. fmla v18.4s, v0.4s, v4.s[1]
  391. fmla v20.4s, v0.4s, v4.s[2]
  392. fmla v22.4s, v0.4s, v4.s[3]
  393. fmla v24.4s, v0.4s, v5.s[0]
  394. fmla v26.4s, v0.4s, v5.s[1]
  395. fmla v28.4s, v0.4s, v5.s[2]
  396. fmla v30.4s, v0.4s, v5.s[3]
  397. .endm
  398. .macro SAVE4x8
  399. add pCRow1, pCRow0, LDC
  400. fmul v0.4s, v16.4s, alphaV0
  401. st1 {v0.4s}, [pCRow0]
  402. add pCRow2, pCRow1, LDC
  403. fmul v2.4s, v18.4s, alphaV2
  404. st1 {v2.4s}, [pCRow1]
  405. add pCRow1, pCRow2, LDC
  406. fmul v4.4s, v20.4s, alphaV0
  407. st1 {v4.4s}, [pCRow2]
  408. add pCRow2, pCRow1, LDC
  409. fmul v6.4s, v22.4s, alphaV2
  410. st1 {v6.4s}, [pCRow1]
  411. add pCRow1, pCRow2, LDC
  412. fmul v0.4s, v24.4s, alphaV0
  413. st1 {v0.4s}, [pCRow2]
  414. add pCRow2, pCRow1, LDC
  415. fmul v2.4s, v26.4s, alphaV2
  416. st1 {v2.4s}, [pCRow1]
  417. add pCRow1, pCRow2, LDC
  418. fmul v4.4s, v28.4s, alphaV0
  419. st1 {v4.4s}, [pCRow2]
  420. fmul v6.4s, v30.4s, alphaV2
  421. st1 {v6.4s}, [pCRow1]
  422. add pCRow0, pCRow0, #16
  423. .endm
  424. /******************************************************************************/
  425. .macro INIT2x8
  426. fmov s16, wzr
  427. fmov s18, wzr
  428. fmov s20, wzr
  429. fmov s22, s16
  430. fmov s24, wzr
  431. fmov s26, s16
  432. fmov s28, s18
  433. fmov s30, s20
  434. .endm
  435. .macro KERNEL2x8_SUB
  436. ld1 {v4.4s}, [pB]
  437. add pB, pB, #16
  438. ld1 {v5.4s}, [pB]
  439. add pB, pB, #16
  440. ld1 {v0.2s}, [pA]
  441. add pA, pA, #8
  442. fmla v16.2s, v0.2s, v4.s[0]
  443. fmla v18.2s, v0.2s, v4.s[1]
  444. fmla v20.2s, v0.2s, v4.s[2]
  445. fmla v22.2s, v0.2s, v4.s[3]
  446. fmla v24.2s, v0.2s, v5.s[0]
  447. fmla v26.2s, v0.2s, v5.s[1]
  448. fmla v28.2s, v0.2s, v5.s[2]
  449. fmla v30.2s, v0.2s, v5.s[3]
  450. .endm
  451. .macro SAVE2x8
  452. add pCRow1, pCRow0, LDC
  453. fmul v0.2s, v16.2s, alphaV0
  454. st1 {v0.2s}, [pCRow0]
  455. add pCRow2, pCRow1, LDC
  456. fmul v2.2s, v18.2s, alphaV2
  457. st1 {v2.2s}, [pCRow1]
  458. add pCRow1, pCRow2, LDC
  459. fmul v4.2s, v20.2s, alphaV0
  460. st1 {v4.2s}, [pCRow2]
  461. add pCRow2, pCRow1, LDC
  462. fmul v6.2s, v22.2s, alphaV2
  463. st1 {v6.2s}, [pCRow1]
  464. add pCRow1, pCRow2, LDC
  465. fmul v0.2s, v24.2s, alphaV0
  466. st1 {v0.2s}, [pCRow2]
  467. add pCRow2, pCRow1, LDC
  468. fmul v2.2s, v26.2s, alphaV2
  469. st1 {v2.2s}, [pCRow1]
  470. add pCRow1, pCRow2, LDC
  471. fmul v4.2s, v28.2s, alphaV0
  472. st1 {v4.2s}, [pCRow2]
  473. fmul v6.2s, v30.2s, alphaV2
  474. st1 {v6.2s}, [pCRow1]
  475. add pCRow0, pCRow0, #8
  476. .endm
  477. /******************************************************************************/
  478. .macro INIT1x8
  479. fmov s16, wzr
  480. fmov s18, wzr
  481. fmov s20, wzr
  482. fmov s22, s16
  483. fmov s24, wzr
  484. fmov s26, s16
  485. fmov s28, s18
  486. fmov s30, s20
  487. .endm
  488. .macro KERNEL1x8_SUB
  489. ld1 {v4.4s}, [pB]
  490. add pB, pB, #16
  491. ld1 {v5.4s}, [pB]
  492. add pB, pB, #16
  493. ldr s0, [pA]
  494. add pA, pA, #4
  495. fmla s16, s0, v4.s[0]
  496. fmla s18, s0, v4.s[1]
  497. fmla s20, s0, v4.s[2]
  498. fmla s22, s0, v4.s[3]
  499. fmla s24, s0, v5.s[0]
  500. fmla s26, s0, v5.s[1]
  501. fmla s28, s0, v5.s[2]
  502. fmla s30, s0, v5.s[3]
  503. .endm
  504. .macro SAVE1x8
  505. add pCRow1, pCRow0, LDC
  506. fmul s0, s16, alphaV0
  507. str s0, [pCRow0]
  508. add pCRow2, pCRow1, LDC
  509. fmul s2, s18, alphaV2
  510. str s2, [pCRow1]
  511. add pCRow1, pCRow2, LDC
  512. fmul s4, s20, alphaV0
  513. str s4, [pCRow2]
  514. add pCRow2, pCRow1, LDC
  515. fmul s6, s22, alphaV2
  516. str s6, [pCRow1]
  517. add pCRow1, pCRow2, LDC
  518. fmul s0, s24, alphaV0
  519. str s0, [pCRow2]
  520. add pCRow2, pCRow1, LDC
  521. fmul s2, s26, alphaV2
  522. str s2, [pCRow1]
  523. add pCRow1, pCRow2, LDC
  524. fmul s4, s28, alphaV0
  525. str s4, [pCRow2]
  526. fmul s6, s30, alphaV2
  527. str s6, [pCRow1]
  528. add pCRow0, pCRow0, #4
  529. .endm
  530. /******************************************************************************/
  531. .macro INIT8x4
  532. fmov s16, wzr
  533. fmov s17, wzr
  534. fmov s20, wzr
  535. fmov s21, s16
  536. fmov s24, wzr
  537. fmov s25, s16
  538. fmov s28, wzr
  539. fmov s29, s16
  540. .endm
  541. .macro KERNEL8x4_I
  542. ld1 {v8.2s, v9.2s}, [pB]
  543. add pB, pB, #16
  544. ld1 {v0.4s}, [pA]
  545. add pA, pA, #16
  546. ld1 {v1.4s}, [pA]
  547. add pA, pA, #16
  548. fmul v16.4s, v0.4s, v8.s[0]
  549. fmul v17.4s, v1.4s, v8.s[0]
  550. fmul v20.4s, v0.4s, v8.s[1]
  551. fmul v21.4s, v1.4s, v8.s[1]
  552. fmul v24.4s, v0.4s, v9.s[0]
  553. fmul v25.4s, v1.4s, v9.s[0]
  554. fmul v28.4s, v0.4s, v9.s[1]
  555. fmul v29.4s, v1.4s, v9.s[1]
  556. ld1 {v12.2s, v13.2s}, [pB]
  557. add pB, pB, #16
  558. ld1 {v4.4s}, [pA]
  559. add pA, pA, #16
  560. ld1 {v5.4s}, [pA]
  561. add pA, pA, #16
  562. .endm
  563. .macro KERNEL8x4_M1
  564. fmla v16.4s, v0.4s, v8.s[0]
  565. fmla v17.4s, v1.4s, v8.s[0]
  566. fmla v20.4s, v0.4s, v8.s[1]
  567. fmla v21.4s, v1.4s, v8.s[1]
  568. fmla v24.4s, v0.4s, v9.s[0]
  569. fmla v25.4s, v1.4s, v9.s[0]
  570. fmla v28.4s, v0.4s, v9.s[1]
  571. fmla v29.4s, v1.4s, v9.s[1]
  572. ld1 {v12.2s, v13.2s}, [pB]
  573. add pB, pB, #16
  574. ld1 {v4.4s}, [pA]
  575. add pA, pA, #16
  576. ld1 {v5.4s}, [pA]
  577. add pA, pA, #16
  578. .endm
  579. .macro KERNEL8x4_M2
  580. fmla v16.4s, v4.4s, v12.s[0]
  581. fmla v17.4s, v5.4s, v12.s[0]
  582. fmla v20.4s, v4.4s, v12.s[1]
  583. fmla v21.4s, v5.4s, v12.s[1]
  584. fmla v24.4s, v4.4s, v13.s[0]
  585. fmla v25.4s, v5.4s, v13.s[0]
  586. fmla v28.4s, v4.4s, v13.s[1]
  587. fmla v29.4s, v5.4s, v13.s[1]
  588. ld1 {v8.2s, v9.2s}, [pB]
  589. add pB, pB, #16
  590. ld1 {v0.4s}, [pA]
  591. add pA, pA, #16
  592. ld1 {v1.4s}, [pA]
  593. add pA, pA, #16
  594. .endm
  595. .macro KERNEL8x4_E
  596. fmla v16.4s, v4.4s, v12.s[0]
  597. fmla v17.4s, v5.4s, v12.s[0]
  598. fmla v20.4s, v4.4s, v12.s[1]
  599. fmla v21.4s, v5.4s, v12.s[1]
  600. fmla v24.4s, v4.4s, v13.s[0]
  601. fmla v25.4s, v5.4s, v13.s[0]
  602. fmla v28.4s, v4.4s, v13.s[1]
  603. fmla v29.4s, v5.4s, v13.s[1]
  604. .endm
  605. .macro KERNEL8x4_SUB
  606. ld1 {v8.2s, v9.2s}, [pB]
  607. add pB, pB, #16
  608. ld1 {v0.4s}, [pA]
  609. add pA, pA, #16
  610. ld1 {v1.4s}, [pA]
  611. add pA, pA, #16
  612. fmla v16.4s, v0.4s, v8.s[0]
  613. fmla v17.4s, v1.4s, v8.s[0]
  614. fmla v20.4s, v0.4s, v8.s[1]
  615. fmla v21.4s, v1.4s, v8.s[1]
  616. fmla v24.4s, v0.4s, v9.s[0]
  617. fmla v25.4s, v1.4s, v9.s[0]
  618. fmla v28.4s, v0.4s, v9.s[1]
  619. fmla v29.4s, v1.4s, v9.s[1]
  620. .endm
  621. .macro SAVE8x4
  622. add pCRow1, pCRow0, LDC
  623. fmul v0.4s, v16.4s, alphaV0
  624. fmul v1.4s, v17.4s, alphaV1
  625. st1 {v0.4s, v1.4s}, [pCRow0]
  626. add pCRow2, pCRow1, LDC
  627. fmul v4.4s, v20.4s, alphaV0
  628. fmul v5.4s, v21.4s, alphaV1
  629. st1 {v4.4s, v5.4s}, [pCRow1]
  630. add pCRow1, pCRow2, LDC
  631. fmul v0.4s, v24.4s, alphaV0
  632. fmul v1.4s, v25.4s, alphaV1
  633. st1 {v0.4s, v1.4s}, [pCRow2]
  634. fmul v4.4s, v28.4s, alphaV0
  635. fmul v5.4s, v29.4s, alphaV1
  636. st1 {v4.4s, v5.4s}, [pCRow1]
  637. add pCRow0, pCRow0, #32
  638. .endm
  639. /******************************************************************************/
// ---- 4x4 tile of C ----------------------------------------------------------
// Accumulators (each .2s register = two consecutive rows of one column of C):
//   col 0: v16 (rows 0-1), v17 (rows 2-3)     col 1: v20, v21
//   col 2: v24, v25                           col 3: v28, v29
// Zero all eight accumulators.  Writing an S register zeroes the rest of the
// 128-bit V register, so each fmov clears the full vector accumulator.
.macro INIT4x4
fmov s16, wzr
fmov s17, s16
fmov s20, s17
fmov s21, s16
fmov s24, s17
fmov s25, s16
fmov s28, s17
fmov s29, s16
.endm
// First step of the software-pipelined K loop: initialize the accumulators
// with fmul (overwrite, so no INIT4x4 is needed on this path) and preload the
// next K step's B/A values into v12/v13 and v4/v5.
.macro KERNEL4x4_I
ld1 {v8.2s, v9.2s}, [pB]                 // B(k, 0..3)
add pB, pB, #16
ld1 {v0.2s, v1.2s}, [pA]                 // A(0..3, k)
add pA, pA, #16
fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]               // preload next step's B
add pB, pB, #16
ld1 {v4.2s, v5.2s}, [pA]                 // preload next step's A
add pA, pA, #16
.endm
// Pipelined step A: consume v0/v1 (A) and v8/v9 (B) loaded by the previous
// step while fetching the following step into v4/v5 and v12/v13.
.macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]               // prefetch B well ahead of use
fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.s[0]
.endm
// Pipelined step B (mirror of M1): consume v4/v5 and v12/v13 while fetching
// the following step into v0/v1 and v8/v9.
.macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]               // prefetch A well ahead of use
fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.s[0]
.endm
// Pipeline drain: consume the last preloaded v4/v5/v12/v13 with no further
// loads.  Follows an M1 (or an I) step.
.macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.s[0]
.endm
// Standalone single K step (used for the K%2 remainder, outside the pipeline).
.macro KERNEL4x4_SUB
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.s[0]
.endm
// Scale the accumulators by alpha and store the 4x4 block one column at a
// time (pCRow* advance by LDC per column).  TRMM kernel: C is overwritten,
// never loaded.  alphaV0..alphaV3 all hold the same alpha (see prologue).
.macro SAVE4x4
fmul v8.2s, v16.2s, alphaV0
fmul v9.2s, v17.2s, alphaV1
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2s, v20.2s, alphaV2
fmul v13.2s, v21.2s, alphaV3
st1 {v12.2s, v13.2s}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2s, v24.2s, alphaV0
fmul v9.2s, v25.2s, alphaV1
st1 {v8.2s, v9.2s}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2s, v28.2s, alphaV2
fmul v13.2s, v29.2s, alphaV3
st1 {v12.2s, v13.2s}, [pCRow1]
add pCRow0, pCRow0, #16                  // advance 4 rows (4 floats) in C
.endm
  740. /******************************************************************************/
// ---- 2x4 tile of C ----------------------------------------------------------
// Accumulators: v16/v20/v24/v28 = columns 0..3, each .2s holding rows 0-1.
.macro INIT2x4
fmov s16, wzr
fmov s20, s16
fmov s24, s20
fmov s28, s16
.endm
// One K step: 2 A values broadcast against a 4-wide B row.
.macro KERNEL2x4_SUB
ld1 {v8.2s, v9.2s}, [pB]                 // B(k, 0..3)
add pB, pB, #16
ld1 {v0.2s}, [pA]                        // A(0..1, k)
add pA, pA, #8
fmla v16.2s, v0.2s, v8.s[0]              // column 0
fmla v20.2s, v0.2s, v8.s[1]              // column 1
fmla v24.2s, v0.2s, v9.s[0]              // column 2
fmla v28.2s, v0.2s, v9.s[1]              // column 3
.endm
// Scale by alpha and store the four columns (TRMM: overwrite C, no load).
.macro SAVE2x4
fmul v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2s, v20.2s, alphaV1
st1 {v12.2s}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2s, v24.2s, alphaV2
st1 {v8.2s}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2s, v28.2s, alphaV3
st1 {v12.2s}, [pCRow1]
add pCRow0, pCRow0, #8                   // advance 2 rows (2 floats) in C
.endm
  771. /******************************************************************************/
// ---- 1x4 tile of C ----------------------------------------------------------
// Accumulators: v16.2s = columns 0-1, v20.2s = columns 2-3 (one row of C).
.macro INIT1x4
fmov s16, wzr
fmov s20, s16
.endm
// One K step: a single A value broadcast against a 4-wide B row.
.macro KERNEL1x4_SUB
ldr s0, [pA]                             // A(0, k)
add pA, pA, #4
ld1 {v8.2s, v9.2s}, [pB]                 // B(k, 0..3)
add pB, pB, #16
fmla v16.2s, v8.2s, v0.s[0]
fmla v20.2s, v9.2s, v0.s[0]
.endm
// Scale by alpha and scatter the four scalars, one per column pointer.
.macro SAVE1x4
add pCRow1, pCRow0, LDC
fmul v8.2s, v16.2s, alphaV0
st1 {v8.s}[0], [pCRow0]                  // column 0
st1 {v8.s}[1], [pCRow1]                  // column 1
add pCRow2, pCRow1, LDC
add pCRow1, pCRow2, LDC
fmul v12.2s, v20.2s, alphaV1
st1 {v12.s}[0], [pCRow2]                 // column 2
st1 {v12.s}[1], [pCRow1]                 // column 3
add pCRow0, pCRow0, #4                   // advance 1 row (1 float) in C
.endm
  796. /******************************************************************************/
// ---- 8x2 tile of C ----------------------------------------------------------
// Accumulators: col 0 = v16 (rows 0-3) / v17 (rows 4-7); col 1 = v20 / v21.
.macro INIT8x2
fmov s16, wzr
fmov s17, s16
fmov s20, s17
fmov s21, s16
.endm
// One K step: 8 A values against a 2-wide B row.
.macro KERNEL8x2_SUB
ld1 {v8.2s}, [pB]                        // B(k, 0..1)
add pB, pB, #8
ld1 {v0.4s}, [pA]                        // A(0..3, k)
add pA, pA, #16
ld1 {v1.4s}, [pA]                        // A(4..7, k)
add pA, pA, #16
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.s[1]
.endm
// Scale by alpha and store both columns (TRMM: overwrite C, no load).
.macro SAVE8x2
add pCRow1, pCRow0, LDC
fmul v0.4s, v16.4s, alphaV0
fmul v1.4s, v17.4s, alphaV1
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow2, pCRow1, LDC
fmul v4.4s, v20.4s, alphaV0
fmul v5.4s, v21.4s, alphaV1
st1 {v4.4s, v5.4s}, [pCRow1]
add pCRow0, pCRow0, #32                  // advance 8 rows (8 floats) in C
.endm
  826. /******************************************************************************/
// ---- 4x2 tile of C ----------------------------------------------------------
// Accumulators: col 0 = v16 (rows 0-1) / v17 (rows 2-3); col 1 = v20 / v21.
.macro INIT4x2
fmov s16, wzr
fmov s17, s16
fmov s20, s17
fmov s21, s16
.endm
// One K step: 4 A values against a 2-wide B row.
.macro KERNEL4x2_SUB
ld1 {v8.2s}, [pB]                        // B(k, 0..1)
add pB, pB, #8
ld1 {v0.2s, v1.2s}, [pA]                 // A(0..3, k)
add pA, pA, #16
fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.s[1]
.endm
// Scale by alpha and store both columns (TRMM: overwrite C, no load).
.macro SAVE4x2
fmul v8.2s, v16.2s, alphaV0
fmul v9.2s, v17.2s, alphaV1
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2s, v20.2s, alphaV2
fmul v13.2s, v21.2s, alphaV3
st1 {v12.2s, v13.2s}, [pCRow1]
add pCRow0, pCRow0, #16                  // advance 4 rows (4 floats) in C
.endm
  853. /******************************************************************************/
// ---- 2x2 tile of C ----------------------------------------------------------
// Accumulators: v16.2s = column 0 (rows 0-1), v20.2s = column 1.
.macro INIT2x2
fmov s16, wzr
fmov s20, s16
.endm
// One K step: 2 A values against a 2-wide B row.
.macro KERNEL2x2_SUB
ld1 {v8.2s}, [pB]                        // B(k, 0..1)
add pB, pB, #8
ld1 {v0.2s}, [pA]                        // A(0..1, k)
add pA, pA, #8
fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.s[1]
.endm
// Scale by alpha and store both columns (TRMM: overwrite C, no load).
.macro SAVE2x2
fmul v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
add pCRow1 , pCRow0, LDC
fmul v12.2s, v20.2s, alphaV1
st1 {v12.2s}, [pCRow1]
add pCRow0, pCRow0, #8                   // advance 2 rows (2 floats) in C
.endm
  874. /******************************************************************************/
// ---- 1x2 tile of C ----------------------------------------------------------
// Accumulator: v16.2s = columns 0-1 of the single row of C.
.macro INIT1x2
fmov s16, wzr
.endm
// One K step: one A value broadcast against a 2-wide B row.
.macro KERNEL1x2_SUB
ld1 {v8.2s} , [pB]                       // B(k, 0..1)
add pB , pB, #8
ldr s0 , [pA]                            // A(0, k)
add pA, pA, #4
fmla v16.2s, v8.2s, v0.s[0]
.endm
// Scale by alpha and scatter the two scalars to their column pointers.
.macro SAVE1x2
add pCRow1 , pCRow0, LDC
fmul v8.2s, v16.2s, alphaV0
st1 {v8.s}[0], [pCRow0]                  // column 0
st1 {v8.s}[1], [pCRow1]                  // column 1
add pCRow0, pCRow0, #4                   // advance 1 row (1 float) in C
.endm
  892. /******************************************************************************/
// ---- 8x1 tile of C ----------------------------------------------------------
// Accumulators: v16.4s = rows 0-3, v17.4s = rows 4-7 of the single column.
.macro INIT8x1
fmov s16, wzr
fmov s17, wzr
.endm
// One K step: 8 A values times one B value.
.macro KERNEL8x1_SUB
ldr s8, [pB]                             // B(k, 0)
add pB , pB, #4
ld1 {v0.4s}, [pA]                        // A(0..3, k)
add pA, pA, #16
ld1 {v1.4s}, [pA]                        // A(4..7, k)
add pA, pA, #16
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
.endm
// Scale by alpha and store the column (TRMM: overwrite C, no load).
.macro SAVE8x1
fmul v0.4s, v16.4s, alphaV0
fmul v1.4s, v17.4s, alphaV1
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow0, pCRow0, #32                  // advance 8 rows (8 floats) in C
.endm
  913. /******************************************************************************/
// ---- 4x1 tile of C ----------------------------------------------------------
// Accumulators: v16.2s = rows 0-1, v17.2s = rows 2-3 of the single column.
.macro INIT4x1
fmov s16, wzr
fmov s17, s16
.endm
// One K step: 4 A values times one B value.
.macro KERNEL4x1_SUB
ldr s8, [pB]                             // B(k, 0)
add pB , pB, #4
ld1 {v0.2s, v1.2s}, [pA]                 // A(0..3, k)
add pA , pA, #16
fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.s[0]
.endm
// Scale by alpha and store the column (TRMM: overwrite C, no load).
.macro SAVE4x1
fmul v8.2s, v16.2s, alphaV0
fmul v9.2s, v17.2s, alphaV1
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow0, pCRow0, #16                  // advance 4 rows (4 floats) in C
.endm
  932. /******************************************************************************/
// ---- 2x1 tile of C ----------------------------------------------------------
// Accumulator: v16.2s = rows 0-1 of the single column.
.macro INIT2x1
fmov s16, wzr
.endm
// One K step: 2 A values times one B value.
.macro KERNEL2x1_SUB
ldr s8, [pB]                             // B(k, 0)
add pB , pB, #4
ld1 {v0.2s}, [pA]                        // A(0..1, k)
add pA , pA, #8
fmla v16.2s, v0.2s, v8.s[0]
.endm
// Scale by alpha and store the column (TRMM: overwrite C, no load).
.macro SAVE2x1
fmul v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
add pCRow0, pCRow0, #8                   // advance 2 rows (2 floats) in C
.endm
  948. /******************************************************************************/
// ---- 1x1 tile of C ----------------------------------------------------------
// Accumulator: scalar s16.
.macro INIT1x1
fmov s16, wzr
.endm
// One K step: scalar multiply-accumulate s16 += A(0,k) * B(k,0).
.macro KERNEL1x1_SUB
ldr s8, [pB]
add pB , pB, #4
ldr s0, [pA]
add pA , pA, #4
fmadd s16, s0, s8, s16
.endm
// Scale by alpha and store the single element (TRMM: overwrite C, no load).
.macro SAVE1x1
fmul s8, s16, alpha0
str s8, [pCRow0]
add pCRow0, pCRow0, #4                   // advance 1 row (1 float) in C
.endm
  964. /*******************************************************************************
  965. * End of macro definitions
  966. *******************************************************************************/
PROLOGUE
.Lstrmm_kernel_begin:
.align 5
// Allocate an 11*16-byte frame and save callee-saved registers.  AAPCS64
// requires d8-d15 and x19-x28; d16/d17 and x18 are saved here as well
// (the matching restore sequence is in the epilogue, outside this section).
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
// Broadcast alpha (arrives in s0 per AAPCS64) into the four alpha registers
// consumed by the SAVE* macros — all four hold the same value.
fmov alpha0, s0
fmov alpha1, s0
fmov alpha2, s0
fmov alpha3, s0
lsl LDC, LDC, #2 // ldc = ldc * 4
#if !defined(LEFT)
neg tempOffset, offset                   // right-side TRMM: start at -offset
#endif
mov pB, origPB
// Outer loop bound: counterJ = N / 8 column panels.
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Lstrmm_kernel_L4_BEGIN              // N < 8: go straight to the remainder
  995. /******************************************************************************/
  996. /******************************************************************************/
// ============ N-loop body: one panel of 8 columns of C ======================
.Lstrmm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3                  // pC += 8 columns
#if defined(LEFT)
mov tempOffset, offset
#endif
mov pA, origPA // pA = start of A array
/******************************************************************************/
// ---- M-loop: 8x8 tiles (counterI = M / 8) ----
.Lstrmm_kernel_L8_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble .Lstrmm_kernel_L8_M4_BEGIN
.Lstrmm_kernel_L8_M8_20:
// TRMM bookkeeping: skip the K steps outside the triangle (32 bytes per step
// for an 8-wide panel, hence the #5 shifts) and compute tempK = number of
// K steps this tile actually runs.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #5
add pA, pA, temp
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #8
#else
add tempK, tempOffset, #8
#endif
// Software-pipelined K loop, unrolled by 2 (I -> [M1 M2]* -> M1 E).
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt .Lstrmm_kernel_L8_M8_32
KERNEL8x8_I // do one in the K
KERNEL8x8_M2 // do another in the K
subs counterL, counterL, #2
ble .Lstrmm_kernel_L8_M8_22a
.align 5
.Lstrmm_kernel_L8_M8_22:
KERNEL8x8_M1
KERNEL8x8_M2
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L8_M8_22
.Lstrmm_kernel_L8_M8_22a:
KERNEL8x8_M1
KERNEL8x8_E
b .Lstrmm_kernel_L8_M8_44
.Lstrmm_kernel_L8_M8_32:
tst counterL, #1                         // exactly one K pair left?
ble .Lstrmm_kernel_L8_M8_40
KERNEL8x8_I
KERNEL8x8_E
b .Lstrmm_kernel_L8_M8_44
.Lstrmm_kernel_L8_M8_40:
INIT8x8                                  // K < 2: just clear accumulators
.Lstrmm_kernel_L8_M8_44:
ands counterL , tempK, #1                // odd remaining K step
ble .Lstrmm_kernel_L8_M8_100
.Lstrmm_kernel_L8_M8_46:
KERNEL8x8_SUB
.Lstrmm_kernel_L8_M8_100:
SAVE8x8
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #8
#else
sub tempK, tempK, #8
#endif
lsl temp, tempK, #5
add pA, pA, temp
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
.Lstrmm_kernel_L8_M8_END:
subs counterI, counterI, #1
bne .Lstrmm_kernel_L8_M8_20
/******************************************************************************/
// ---- M remainder: one 4x8 tile if M & 4 ----
.Lstrmm_kernel_L8_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble .Lstrmm_kernel_L8_END
tst counterI, #4
ble .Lstrmm_kernel_L8_M2_BEGIN
.Lstrmm_kernel_L8_M4_20:
// TRMM bookkeeping: A is 4 wide (16 bytes/step, shift #4), B is 8 wide
// (32 bytes/step, shift #5).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #4
add pA, pA, temp
lsl temp, tempOffset, #5
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #4
#else
add tempK, tempOffset, #8
#endif
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt .Lstrmm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2
ble .Lstrmm_kernel_L8_M4_22a
.align 5
.Lstrmm_kernel_L8_M4_22:
KERNEL4x8_M1
KERNEL4x8_M2
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L8_M4_22
.Lstrmm_kernel_L8_M4_22a:
KERNEL4x8_M1
KERNEL4x8_E
b .Lstrmm_kernel_L8_M4_44
.Lstrmm_kernel_L8_M4_32:
tst counterL, #1
ble .Lstrmm_kernel_L8_M4_40
KERNEL4x8_I
KERNEL4x8_E
b .Lstrmm_kernel_L8_M4_44
.Lstrmm_kernel_L8_M4_40:
INIT4x8
.Lstrmm_kernel_L8_M4_44:
ands counterL , tempK, #1
ble .Lstrmm_kernel_L8_M4_100
.Lstrmm_kernel_L8_M4_46:
KERNEL4x8_SUB
.Lstrmm_kernel_L8_M4_100:
SAVE4x8
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #4
#else
sub tempK, tempK, #8
#endif
lsl temp, tempK, #4
add pA, pA, temp
lsl temp, tempK, #5
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
.Lstrmm_kernel_L8_M4_END:
/******************************************************************************/
// ---- M remainder: one 2x8 tile if M & 2 ----
.Lstrmm_kernel_L8_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble .Lstrmm_kernel_L8_END
tst counterI, #2 // counterI = counterI / 2
ble .Lstrmm_kernel_L8_M1_BEGIN
.Lstrmm_kernel_L8_M2_20:
INIT2x8
// TRMM bookkeeping: A is 2 wide (8 bytes/step, shift #3), B 8 wide (#5).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #3
add pA, pA, temp
lsl temp, tempOffset, #5
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #2
#else
add tempK, tempOffset, #8
#endif
// Simple K loop unrolled by 8, then a scalar remainder loop.
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble .Lstrmm_kernel_L8_M2_40
.Lstrmm_kernel_L8_M2_22:
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L8_M2_22
.Lstrmm_kernel_L8_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble .Lstrmm_kernel_L8_M2_100
.Lstrmm_kernel_L8_M2_42:
KERNEL2x8_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L8_M2_42
.Lstrmm_kernel_L8_M2_100:
SAVE2x8
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #2
#else
sub tempK, tempK, #8
#endif
lsl temp, tempK, #3
add pA, pA, temp
lsl temp, tempK, #5
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
.Lstrmm_kernel_L8_M2_END:
/******************************************************************************/
// ---- M remainder: one 1x8 tile if M & 1 ----
.Lstrmm_kernel_L8_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble .Lstrmm_kernel_L8_END
.Lstrmm_kernel_L8_M1_20:
INIT1x8
// TRMM bookkeeping: A is 1 wide (4 bytes/step, shift #2), B 8 wide (#5).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #2
add pA, pA, temp
lsl temp, tempOffset, #5
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #1
#else
add tempK, tempOffset, #8
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble .Lstrmm_kernel_L8_M1_40
.Lstrmm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L8_M1_22
.Lstrmm_kernel_L8_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble .Lstrmm_kernel_L8_M1_100
.Lstrmm_kernel_L8_M1_42:
KERNEL1x8_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L8_M1_42
.Lstrmm_kernel_L8_M1_100:
SAVE1x8
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #1
#else
sub tempK, tempK, #8
#endif
lsl temp, tempK, #2
add pA, pA, temp
lsl temp, tempK, #5
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
// End of one 8-column panel: step origPB to the next packed B panel.
.Lstrmm_kernel_L8_END:
lsl temp, origK, #5 // B = B + K * 4 * 8
add origPB, origPB, temp
#if !defined(LEFT)
add tempOffset, tempOffset, #8
#endif
subs counterJ, counterJ , #1 // j--
bgt .Lstrmm_kernel_L8_BEGIN
  1279. /******************************************************************************/
  1280. /******************************************************************************/
// ============ N remainder: one panel of 4 columns if N & 4 ==================
.Lstrmm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #7
ble .Lstrmm_kernel_L999                  // N multiple of 8: all done
tst counterJ , #4
ble .Lstrmm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = pC
add pC,pC,LDC, lsl #2                    // pC += 4 columns
#if defined(LEFT)
mov tempOffset, offset
#endif
mov pA, origPA // pA = A
/******************************************************************************/
// ---- M-loop: 8x4 tiles (counterI = M / 8) ----
.Lstrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble .Lstrmm_kernel_L4_M4_BEGIN
.Lstrmm_kernel_L4_M8_20:
// TRMM bookkeeping: A is 8 wide (32 bytes/step, shift #5), B 4 wide (#4).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #5
add pA, pA, temp
lsl temp, tempOffset, #4
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #8
#else
add tempK, tempOffset, #4
#endif
// Software-pipelined K loop, unrolled by 2 (I -> [M1 M2]* -> M1 E).
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt .Lstrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
subs counterL, counterL, #2
ble .Lstrmm_kernel_L4_M8_22a
.align 5
.Lstrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L4_M8_22
.Lstrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b .Lstrmm_kernel_L4_M8_44
.Lstrmm_kernel_L4_M8_32:
tst counterL, #1
ble .Lstrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_E
b .Lstrmm_kernel_L4_M8_44
.Lstrmm_kernel_L4_M8_40:
INIT8x4
.Lstrmm_kernel_L4_M8_44:
ands counterL , tempK, #1
ble .Lstrmm_kernel_L4_M8_100
.Lstrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
.Lstrmm_kernel_L4_M8_100:
SAVE8x4
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #8
#else
sub tempK, tempK, #4
#endif
lsl temp, tempK, #5
add pA, pA, temp
lsl temp, tempK, #4
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
.Lstrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne .Lstrmm_kernel_L4_M8_20
/******************************************************************************/
// ---- M remainder: one 4x4 tile if M & 4 ----
.Lstrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble .Lstrmm_kernel_L4_END
tst counterI, #4
ble .Lstrmm_kernel_L4_M2_BEGIN
.Lstrmm_kernel_L4_M4_20:
// TRMM bookkeeping: A and B are both 4 wide (16 bytes/step, one shift #4).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #4
add pB, pB, temp
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #4
#else
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt .Lstrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble .Lstrmm_kernel_L4_M4_22a
.align 5
.Lstrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L4_M4_22
.Lstrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b .Lstrmm_kernel_L4_M4_44
.Lstrmm_kernel_L4_M4_32:
tst counterL, #1
ble .Lstrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b .Lstrmm_kernel_L4_M4_44
.Lstrmm_kernel_L4_M4_40:
INIT4x4
.Lstrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble .Lstrmm_kernel_L4_M4_100
.Lstrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
.Lstrmm_kernel_L4_M4_100:
SAVE4x4
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #4
#else
sub tempK, tempK, #4
#endif
lsl temp, tempK, #4
add pA, pA, temp
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
.Lstrmm_kernel_L4_M4_END:
/******************************************************************************/
// ---- M remainder: one 2x4 tile if M & 2 ----
.Lstrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble .Lstrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble .Lstrmm_kernel_L4_M1_BEGIN
.Lstrmm_kernel_L4_M2_20:
INIT2x4
// TRMM bookkeeping: A is 2 wide (shift #3), B 4 wide (shift #4).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #3
add pA, pA, temp
lsl temp, tempOffset, #4
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #2
#else
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble .Lstrmm_kernel_L4_M2_40
.Lstrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L4_M2_22
.Lstrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble .Lstrmm_kernel_L4_M2_100
.Lstrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L4_M2_42
.Lstrmm_kernel_L4_M2_100:
SAVE2x4
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #2
#else
sub tempK, tempK, #4
#endif
lsl temp, tempK, #3
add pA, pA, temp
lsl temp, tempK, #4
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
.Lstrmm_kernel_L4_M2_END:
/******************************************************************************/
// ---- M remainder: one 1x4 tile if M & 1 ----
.Lstrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble .Lstrmm_kernel_L4_END
.Lstrmm_kernel_L4_M1_20:
INIT1x4
// TRMM bookkeeping: A is 1 wide (shift #2), B 4 wide (shift #4).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #4
add pB, pB, temp
lsl temp, tempOffset, #2
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #1
#else
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble .Lstrmm_kernel_L4_M1_40
.Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L4_M1_22
.Lstrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble .Lstrmm_kernel_L4_M1_100
.Lstrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L4_M1_42
.Lstrmm_kernel_L4_M1_100:
SAVE1x4
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #1
#else
sub tempK, tempK, #4
#endif
lsl temp, tempK, #2
add pA, pA, temp
lsl temp, tempK, #4
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
// End of the 4-column panel: step origPB to the next packed B panel.
.Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
#if !defined(LEFT)
add tempOffset, tempOffset, #4
#endif
  1565. /******************************************************************************/
  1566. /******************************************************************************/
// ============ N remainder: one panel of 2 columns if N & 2 ==================
// (section continues past this chunk; the 1-column panel follows elsewhere)
.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble .Lstrmm_kernel_L999                  // N multiple of 4: all done
tst counterJ , #2
ble .Lstrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
add pC,pC,LDC, lsl #1                    // pC += 2 columns
#if defined(LEFT)
mov tempOffset, offset
#endif
mov pA, origPA // pA = A
/******************************************************************************/
// ---- M-loop: 8x2 tiles (counterI = M / 8) ----
.Lstrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI,#0
ble .Lstrmm_kernel_L2_M4_BEGIN
.Lstrmm_kernel_L2_M8_20:
INIT8x2
// TRMM bookkeeping: A is 8 wide (shift #5), B 2 wide (shift #3).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #5
add pA, pA, temp
lsl temp, tempOffset, #3
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #8
#else
add tempK, tempOffset, #2
#endif
// Simple K loop unrolled by 8, then a scalar remainder loop.
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble .Lstrmm_kernel_L2_M8_40
.align 5
.Lstrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L2_M8_22
.Lstrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble .Lstrmm_kernel_L2_M8_100
.Lstrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L2_M8_42
.Lstrmm_kernel_L2_M8_100:
SAVE8x2
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #8
#else
sub tempK, tempK, #2
#endif
lsl temp, tempK, #5
add pA, pA, temp
lsl temp, tempK, #3
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
.Lstrmm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt .Lstrmm_kernel_L2_M8_20
/******************************************************************************/
// ---- M remainder: one 4x2 tile if M & 4 ----
.Lstrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble .Lstrmm_kernel_L2_END
tst counterI, #4
ble .Lstrmm_kernel_L2_M2_BEGIN
.Lstrmm_kernel_L2_M4_20:
INIT4x2
// TRMM bookkeeping: A is 4 wide (shift #4), B 2 wide (shift #3).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #3
add pB, pB, temp
lsl temp, tempOffset, #4
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #4
#else
add tempK, tempOffset, #2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble .Lstrmm_kernel_L2_M4_40
.align 5
.Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L2_M4_22
.Lstrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble .Lstrmm_kernel_L2_M4_100
.Lstrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L2_M4_42
.Lstrmm_kernel_L2_M4_100:
SAVE4x2
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #4
#else
sub tempK, tempK, #2
#endif
lsl temp, tempK, #4
add pA, pA, temp
lsl temp, tempK, #3
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
.Lstrmm_kernel_L2_M4_END:
/******************************************************************************/
// ---- M remainder: one 2x2 tile if M & 2 ----
.Lstrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble .Lstrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble .Lstrmm_kernel_L2_M1_BEGIN
.Lstrmm_kernel_L2_M2_20:
INIT2x2
// TRMM bookkeeping: A and B both 2 wide (8 bytes/step, shift #3 each).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #3
add pB, pB, temp
lsl temp, tempOffset, #3
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #2
#else
add tempK, tempOffset, #2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble .Lstrmm_kernel_L2_M2_40
.Lstrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L2_M2_22
.Lstrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble .Lstrmm_kernel_L2_M2_100
.Lstrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt .Lstrmm_kernel_L2_M2_42
.Lstrmm_kernel_L2_M2_100:
SAVE2x2
// Advance pA/pB past the K steps this tile did not execute.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #2
#else
sub tempK, tempK, #2
#endif
lsl temp, tempK, #3
add pA, pA, temp
lsl temp, tempK, #3
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
.Lstrmm_kernel_L2_M2_END:
  1774. /******************************************************************************/
  1775. .Lstrmm_kernel_L2_M1_BEGIN:
  1776. tst counterI, #1 // counterI = counterI % 2
  1777. ble .Lstrmm_kernel_L2_END
  1778. .Lstrmm_kernel_L2_M1_20:
  1779. INIT1x2
  1780. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1781. mov pB, origPB
  1782. #else
  1783. mov pB, origPB
  1784. lsl temp, tempOffset, #3
  1785. add pB, pB, temp
  1786. lsl temp, tempOffset, #2
  1787. add pA, pA, temp
  1788. #endif
  1789. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1790. sub tempK, origK, tempOffset
  1791. #elif defined(LEFT)
  1792. add tempK, tempOffset, #1
  1793. #else
  1794. add tempK, tempOffset, #2
  1795. #endif
  1796. asr counterL , tempK, #3 // counterL = counterL / 8
  1797. cmp counterL, #0
  1798. ble .Lstrmm_kernel_L2_M1_40
  1799. .Lstrmm_kernel_L2_M1_22:
  1800. KERNEL1x2_SUB
  1801. KERNEL1x2_SUB
  1802. KERNEL1x2_SUB
  1803. KERNEL1x2_SUB
  1804. KERNEL1x2_SUB
  1805. KERNEL1x2_SUB
  1806. KERNEL1x2_SUB
  1807. KERNEL1x2_SUB
  1808. subs counterL, counterL, #1
  1809. bgt .Lstrmm_kernel_L2_M1_22
  1810. .Lstrmm_kernel_L2_M1_40:
  1811. ands counterL , tempK, #7 // counterL = counterL % 8
  1812. ble .Lstrmm_kernel_L2_M1_100
  1813. .Lstrmm_kernel_L2_M1_42:
  1814. KERNEL1x2_SUB
  1815. subs counterL, counterL, #1
  1816. bgt .Lstrmm_kernel_L2_M1_42
  1817. .Lstrmm_kernel_L2_M1_100:
  1818. SAVE1x2
  1819. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1820. sub tempK, origK, tempOffset
  1821. #if defined(LEFT)
  1822. sub tempK, tempK, #1
  1823. #else
  1824. sub tempK, tempK, #2
  1825. #endif
  1826. lsl temp, tempK, #2
  1827. add pA, pA, temp
  1828. lsl temp, tempK, #3
  1829. add pB, pB, temp
  1830. #endif
  1831. #if defined(LEFT)
  1832. add tempOffset, tempOffset, #1
  1833. #endif
  1834. .Lstrmm_kernel_L2_END:
  1835. #if !defined(LEFT)
  1836. add tempOffset, tempOffset, #2
  1837. #endif
  1838. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  1839. /******************************************************************************/
  1840. /******************************************************************************/
  1841. .Lstrmm_kernel_L1_BEGIN:
  1842. mov counterJ , origN
  1843. tst counterJ , #1
  1844. ble .Lstrmm_kernel_L999 // done
  1845. mov pCRow0, pC // pCRow0 = C
  1846. add pC , pC , LDC // Update pC to point to next
  1847. #if defined(LEFT)
  1848. mov tempOffset, offset
  1849. #endif
  1850. mov pA, origPA // pA = A
  1851. /******************************************************************************/
  1852. .Lstrmm_kernel_L1_M8_BEGIN:
  1853. mov counterI, origM
  1854. asr counterI, counterI, #3
  1855. cmp counterI, #0
  1856. ble .Lstrmm_kernel_L1_M4_BEGIN
  1857. .Lstrmm_kernel_L1_M8_20:
  1858. INIT8x1
  1859. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1860. mov pB, origPB
  1861. #else
  1862. mov pB, origPB
  1863. lsl temp, tempOffset, #5
  1864. add pA, pA, temp
  1865. lsl temp, tempOffset, #2
  1866. add pB, pB, temp
  1867. #endif
  1868. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1869. sub tempK, origK, tempOffset
  1870. #elif defined(LEFT)
  1871. add tempK, tempOffset, #8
  1872. #else
  1873. add tempK, tempOffset, #1
  1874. #endif
  1875. asr counterL , tempK, #3 // counterL = counterL / 8
  1876. cmp counterL , #0
  1877. ble .Lstrmm_kernel_L1_M8_40
  1878. .align 5
  1879. .Lstrmm_kernel_L1_M8_22:
  1880. KERNEL8x1_SUB
  1881. KERNEL8x1_SUB
  1882. KERNEL8x1_SUB
  1883. KERNEL8x1_SUB
  1884. KERNEL8x1_SUB
  1885. KERNEL8x1_SUB
  1886. KERNEL8x1_SUB
  1887. KERNEL8x1_SUB
  1888. subs counterL, counterL, #1
  1889. bgt .Lstrmm_kernel_L1_M8_22
  1890. .Lstrmm_kernel_L1_M8_40:
  1891. ands counterL , tempK, #7 // counterL = counterL % 8
  1892. ble .Lstrmm_kernel_L1_M8_100
  1893. .Lstrmm_kernel_L1_M8_42:
  1894. KERNEL8x1_SUB
  1895. subs counterL, counterL, #1
  1896. bgt .Lstrmm_kernel_L1_M8_42
  1897. .Lstrmm_kernel_L1_M8_100:
  1898. SAVE8x1
  1899. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1900. sub tempK, origK, tempOffset
  1901. #if defined(LEFT)
  1902. sub tempK, tempK, #8
  1903. #else
  1904. sub tempK, tempK, #1
  1905. #endif
  1906. lsl temp, tempK, #5
  1907. add pA, pA, temp
  1908. lsl temp, tempK, #2
  1909. add pB, pB, temp
  1910. #endif
  1911. #if defined(LEFT)
  1912. add tempOffset, tempOffset, #8
  1913. #endif
  1914. .Lstrmm_kernel_L1_M8_END:
  1915. subs counterI, counterI, #1
  1916. bgt .Lstrmm_kernel_L1_M8_20
  1917. /******************************************************************************/
  1918. .Lstrmm_kernel_L1_M4_BEGIN:
  1919. mov counterI, origM
  1920. tst counterI , #7
  1921. ble .Lstrmm_kernel_L1_END
  1922. tst counterI, #4
  1923. ble .Lstrmm_kernel_L1_M2_BEGIN
  1924. .Lstrmm_kernel_L1_M4_20:
  1925. INIT4x1
  1926. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1927. mov pB, origPB
  1928. #else
  1929. mov pB, origPB
  1930. lsl temp, tempOffset, #2
  1931. add pB, pB, temp
  1932. lsl temp, tempOffset, #4
  1933. add pA, pA, temp
  1934. #endif
  1935. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1936. sub tempK, origK, tempOffset
  1937. #elif defined(LEFT)
  1938. add tempK, tempOffset, #4
  1939. #else
  1940. add tempK, tempOffset, #1
  1941. #endif
  1942. asr counterL , tempK, #3 // counterL = counterL / 8
  1943. cmp counterL , #0
  1944. ble .Lstrmm_kernel_L1_M4_40
  1945. .align 5
  1946. .Lstrmm_kernel_L1_M4_22:
  1947. KERNEL4x1_SUB
  1948. KERNEL4x1_SUB
  1949. KERNEL4x1_SUB
  1950. KERNEL4x1_SUB
  1951. KERNEL4x1_SUB
  1952. KERNEL4x1_SUB
  1953. KERNEL4x1_SUB
  1954. KERNEL4x1_SUB
  1955. subs counterL, counterL, #1
  1956. bgt .Lstrmm_kernel_L1_M4_22
  1957. .Lstrmm_kernel_L1_M4_40:
  1958. ands counterL , tempK, #7 // counterL = counterL % 8
  1959. ble .Lstrmm_kernel_L1_M4_100
  1960. .Lstrmm_kernel_L1_M4_42:
  1961. KERNEL4x1_SUB
  1962. subs counterL, counterL, #1
  1963. bgt .Lstrmm_kernel_L1_M4_42
  1964. .Lstrmm_kernel_L1_M4_100:
  1965. SAVE4x1
  1966. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1967. sub tempK, origK, tempOffset
  1968. #if defined(LEFT)
  1969. sub tempK, tempK, #4
  1970. #else
  1971. sub tempK, tempK, #1
  1972. #endif
  1973. lsl temp, tempK, #4
  1974. add pA, pA, temp
  1975. lsl temp, tempK, #2
  1976. add pB, pB, temp
  1977. #endif
  1978. #if defined(LEFT)
  1979. add tempOffset, tempOffset, #4
  1980. #endif
  1981. .Lstrmm_kernel_L1_M4_END:
  1982. /******************************************************************************/
  1983. .Lstrmm_kernel_L1_M2_BEGIN:
  1984. mov counterI, origM
  1985. tst counterI , #3
  1986. ble .Lstrmm_kernel_L1_END
  1987. tst counterI, #2 // counterI = counterI / 2
  1988. ble .Lstrmm_kernel_L1_M1_BEGIN
  1989. .Lstrmm_kernel_L1_M2_20:
  1990. INIT2x1
  1991. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1992. mov pB, origPB
  1993. #else
  1994. mov pB, origPB
  1995. lsl temp, tempOffset, #2
  1996. add pB, pB, temp
  1997. lsl temp, tempOffset, #3
  1998. add pA, pA, temp
  1999. #endif
  2000. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2001. sub tempK, origK, tempOffset
  2002. #elif defined(LEFT)
  2003. add tempK, tempOffset, #2
  2004. #else
  2005. add tempK, tempOffset, #1
  2006. #endif
  2007. asr counterL , tempK, #3 // counterL = counterL / 8
  2008. cmp counterL , #0
  2009. ble .Lstrmm_kernel_L1_M2_40
  2010. .Lstrmm_kernel_L1_M2_22:
  2011. KERNEL2x1_SUB
  2012. KERNEL2x1_SUB
  2013. KERNEL2x1_SUB
  2014. KERNEL2x1_SUB
  2015. KERNEL2x1_SUB
  2016. KERNEL2x1_SUB
  2017. KERNEL2x1_SUB
  2018. KERNEL2x1_SUB
  2019. subs counterL, counterL, #1
  2020. bgt .Lstrmm_kernel_L1_M2_22
  2021. .Lstrmm_kernel_L1_M2_40:
  2022. ands counterL , tempK, #7 // counterL = counterL % 8
  2023. ble .Lstrmm_kernel_L1_M2_100
  2024. .Lstrmm_kernel_L1_M2_42:
  2025. KERNEL2x1_SUB
  2026. subs counterL, counterL, #1
  2027. bgt .Lstrmm_kernel_L1_M2_42
  2028. .Lstrmm_kernel_L1_M2_100:
  2029. SAVE2x1
  2030. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2031. sub tempK, origK, tempOffset
  2032. #if defined(LEFT)
  2033. sub tempK, tempK, #2
  2034. #else
  2035. sub tempK, tempK, #1
  2036. #endif
  2037. lsl temp, tempK, #3
  2038. add pA, pA, temp
  2039. lsl temp, tempK, #2
  2040. add pB, pB, temp
  2041. #endif
  2042. #if defined(LEFT)
  2043. add tempOffset, tempOffset, #2
  2044. #endif
  2045. .Lstrmm_kernel_L1_M2_END:
  2046. /******************************************************************************/
  2047. .Lstrmm_kernel_L1_M1_BEGIN:
  2048. tst counterI, #1 // counterI = counterI % 2
  2049. ble .Lstrmm_kernel_L1_END
  2050. .Lstrmm_kernel_L1_M1_20:
  2051. INIT1x1
  2052. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2053. mov pB, origPB
  2054. #else
  2055. mov pB, origPB
  2056. lsl temp, tempOffset, #2
  2057. add pB, pB, temp
  2058. lsl temp, tempOffset, #2
  2059. add pA, pA, temp
  2060. #endif
  2061. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2062. sub tempK, origK, tempOffset
  2063. #elif defined(LEFT)
  2064. add tempK, tempOffset, #1
  2065. #else
  2066. add tempK, tempOffset, #1
  2067. #endif
  2068. asr counterL , tempK, #3 // counterL = counterL / 8
  2069. cmp counterL , #0
  2070. ble .Lstrmm_kernel_L1_M1_40
  2071. .Lstrmm_kernel_L1_M1_22:
  2072. KERNEL1x1_SUB
  2073. KERNEL1x1_SUB
  2074. KERNEL1x1_SUB
  2075. KERNEL1x1_SUB
  2076. KERNEL1x1_SUB
  2077. KERNEL1x1_SUB
  2078. KERNEL1x1_SUB
  2079. KERNEL1x1_SUB
  2080. subs counterL, counterL, #1
  2081. bgt .Lstrmm_kernel_L1_M1_22
  2082. .Lstrmm_kernel_L1_M1_40:
  2083. ands counterL , tempK, #7 // counterL = counterL % 8
  2084. ble .Lstrmm_kernel_L1_M1_100
  2085. .Lstrmm_kernel_L1_M1_42:
  2086. KERNEL1x1_SUB
  2087. subs counterL, counterL, #1
  2088. bgt .Lstrmm_kernel_L1_M1_42
  2089. .Lstrmm_kernel_L1_M1_100:
  2090. SAVE1x1
  2091. .Lstrmm_kernel_L1_END:
  2092. /******************************************************************************/
  2093. .Lstrmm_kernel_L999:
  2094. mov x0, #0 // set return value
  2095. ldp d8, d9, [sp, #(0 * 16)]
  2096. ldp d10, d11, [sp, #(1 * 16)]
  2097. ldp d12, d13, [sp, #(2 * 16)]
  2098. ldp d14, d15, [sp, #(3 * 16)]
  2099. ldp d16, d17, [sp, #(4 * 16)]
  2100. ldp x18, x19, [sp, #(5 * 16)]
  2101. ldp x20, x21, [sp, #(6 * 16)]
  2102. ldp x22, x23, [sp, #(7 * 16)]
  2103. ldp x24, x25, [sp, #(8 * 16)]
  2104. ldp x26, x27, [sp, #(9 * 16)]
  2105. ldr x28, [sp, #(10 * 16)]
  2106. add sp, sp, #(11*16)
  2107. ret
  2108. EPILOGUE