/* File: ctrmm_kernel_8x4.S */
  /*******************************************************************************
  Copyright (c) 2015, The OpenBLAS Project
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:

  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.

  3. Neither the name of the OpenBLAS project nor the names of
  its contributors may be used to endorse or promote products
  derived from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *******************************************************************************/
  #define ASSEMBLER
  #include "common.h"

  /*
   * Argument registers on entry, listed in the order of the prototype below:
   *   X0  X1  X2  s0  s1  X3  x4  x5  x6  x7
   *
   * int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk,
   *           FLOAT alpha0, FLOAT alpha1,
   *           FLOAT* ba, FLOAT* bb, FLOAT* C,
   *           BLASLONG ldc, BLASLONG offset)
   */

  /* General-purpose register aliases. */
  #define origM           x0
  #define origN           x1
  #define origK           x2
  #define origPA          x3
  #define origPB          x4
  #define pC              x5
  #define LDC             x6
  #define offset          x7
  #define counterL        x8
  #define counterI        x9
  #define counterJ        x10
  #define pB              x11
  #define pCRow0          x12
  #define pCRow1          x13
  #define pCRow2          x14
  #define pCRow3          x15
  #define pA              x16
  /* alphaR/alphaI are 32-bit integer-register copies of alpha; the SAVE
     macros fmov them into alpha0_R/alpha0_I before scaling. */
  #define alphaR          w17
  #define alphaI          w19
  #define temp            x20
  #define tempOffset      x21
  #define tempK           x22

  /* Vector-side alpha: scalar view (fmov target) and lane view (fmul/fmla
     by-element operand) of the same registers v10 and v11. */
  #define alpha0_R        s10
  #define alphaV0_R       v10.s[0]
  #define alpha0_I        s11
  #define alphaV0_I       v11.s[0]

  /* Byte offsets used as prfm prefetch distances from pA, pB and pCRowN. */
  #define A_PRE_SIZE      2560
  #define B_PRE_SIZE      448
  #define C_PRE_SIZE      128

  /*
   * Sign selection for the four partial products of a complex multiply.
   * In the kernels the first vector operand comes from pA and the by-element
   * operand from pB; ld2 de-interleaves each into two registers (real and
   * imaginary parts, assuming interleaved (re, im) storage):
   *   OP_rr : A_re * B_re  -> real accumulator
   *   OP_ii : A_im * B_im  -> real accumulator
   *   OP_ri : A_re * B_im  -> imaginary accumulator
   *   OP_ir : A_im * B_re  -> imaginary accumulator
   * fmla adds the product, fmls subtracts it.  The four groups below give
   * the signs of A*B, A*conj(B), conj(A)*B and conj(A)*conj(B) respectively.
   * NOTE(review): there is no #else branch, so one of the listed variant
   * macros must be defined by the build for OP_* to exist.
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define OP_rr           fmla
  #define OP_ii           fmls
  #define OP_ri           fmla
  #define OP_ir           fmla
  #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define OP_rr           fmla
  #define OP_ii           fmla
  #define OP_ri           fmls
  #define OP_ir           fmla
  #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define OP_rr           fmla
  #define OP_ii           fmla
  #define OP_ri           fmla
  #define OP_ir           fmls
  #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  #define OP_rr           fmla
  #define OP_ii           fmls
  #define OP_ri           fmls
  #define OP_ir           fmls
  #endif
  // General-purpose register map (kept in sync with the #define block above)
  // 00 origM
  // 01 origN
  // 02 origK
  // 03 origPA
  // 04 origPB
  // 05 pC
  // 06 origLDC -> LDC
  // 07 offset
  // 08 counterL
  // 09 counterI
  // 10 counterJ
  // 11 pB
  // 12 pCRow0
  // 13 pCRow1
  // 14 pCRow2
  // 15 pCRow3
  // 16 pA
  // 17 alphaR (as w17)
  // 18 platform register under AAPCS64; not assigned by the #defines above
  // 19 must save alphaI (as w19)
  // 20 must save temp
  // 21 must save tempOffset
  // 22 must save tempK
  // 23 must save
  // 24 must save
  // 25 must save
  // 26 must save
  // 27 must save
  // 28 must save
  // 29 frame
  // 30 link
  // 31 sp
  // Vector register map ("must save" = callee-saved under AAPCS64, v8-v15 only)
  //v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R
  //v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I
  //v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R
  //v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I
  //v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R
  //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
  //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
  //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
  //v08 must save pB0_00_R, pB0_01_R
  //v09 must save pB0_00_I, pB0_01_I
  //v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R
  //v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I
  //v12 must save pB1_00_R, pB1_01_R
  //v13 must save pB1_00_I, pB1_01_I
  //v14 must save pB1_02_R, pB1_03_R
  //v15 must save pB1_02_I, pB1_03_I
  //v16 pC_00_R, pC_01_R, pC_02_R, pC_03_R
  //v17 pC_00_I, pC_01_I, pC_02_I, pC_03_I
  //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
  //v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I
  //v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R
  //v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I
  //v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R
  //v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I
  //v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R
  //v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I
  //v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R
  //v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I
  //v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R
  //v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I
  //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
  //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
  /*******************************************************************************
  * Macro definitions
  *******************************************************************************/

  // INIT8x4: clear the sixteen 8x4 accumulators v16-v31.
  // Each fmov writes 0.0 to the scalar lane, either from wzr or by copying
  // an already-cleared register; on AArch64 a scalar write also clears the
  // remaining lanes of the destination vector register.
  // Clobbers: v16-v31.
  .macro INIT8x4
        fmov    s16, wzr
        fmov    s17, wzr
        fmov    s18, wzr
        fmov    s19, s16
        fmov    s20, wzr
        fmov    s21, s16
        fmov    s22, s17
        fmov    s23, s18
        fmov    s24, wzr
        fmov    s25, s16
        fmov    s26, s17
        fmov    s27, s18
        fmov    s28, wzr
        fmov    s29, s16
        fmov    s30, s17
        fmov    s31, s18
  .endm
  // KERNEL8x4_I: first step of the software-pipelined 8x4 kernel.
  //
  // Loads the current operands (A: v0/v1 and v2/v3, 8 complex values split
  // into real/imaginary by ld2; B: v8/v9 and v10/v11, 4 complex values) and
  // initialises the accumulators v16-v31 with fmul instead of accumulating,
  // so no prior INIT8x4 is needed.  It also preloads the operands of the
  // following step (A: v4-v7, B: v12-v15), which KERNEL8x4_M2 / KERNEL8x4_E
  // consume.
  //
  // Accumulator layout (even register = real part, odd = imaginary part):
  //   v16/v17, v18/v19 : A[0..3], A[4..7] times B element 0
  //   v20/v21, v22/v23 : A[0..3], A[4..7] times B element 1
  //   v24/v25, v26/v27 : A[0..3], A[4..7] times B element 2
  //   v28/v29, v30/v31 : A[0..3], A[4..7] times B element 3
  //
  // In the variants where OP_ri is fmls the A_re*B_im product must enter the
  // imaginary accumulator negated; that is done by zeroing the register with
  // eor and then applying fmls, otherwise a plain fmul is used.
  //
  // Advances pA by 128 bytes and pB by 64 bytes.
  .macro KERNEL8x4_I
        ld2     {v8.2s, v9.2s}, [pB]
        add     pB, pB, #16
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32

        // A[0..3] * B0
        fmul    v16.4s, v0.4s, v8.s[0]
        OP_ii   v16.4s, v1.4s, v9.s[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
        fmls    v17.4s, v0.4s, v9.s[0]
  #else
        fmul    v17.4s, v0.4s, v9.s[0]
  #endif
        OP_ir   v17.4s, v1.4s, v8.s[0]

        ld2     {v10.2s, v11.2s}, [pB]
        add     pB, pB, #16

        // A[4..7] * B0
        fmul    v18.4s, v2.4s, v8.s[0]
        OP_ii   v18.4s, v3.4s, v9.s[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v19.16b, v19.16b, v19.16b
        fmls    v19.4s, v2.4s, v9.s[0]
  #else
        fmul    v19.4s, v2.4s, v9.s[0]
  #endif
        OP_ir   v19.4s, v3.4s, v8.s[0]

        ld2     {v12.2s, v13.2s}, [pB]          // B for the next step
        add     pB, pB, #16

        // A[0..3] * B1
        fmul    v20.4s, v0.4s, v8.s[1]
        OP_ii   v20.4s, v1.4s, v9.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
        fmls    v21.4s, v0.4s, v9.s[1]
  #else
        fmul    v21.4s, v0.4s, v9.s[1]
  #endif
        OP_ir   v21.4s, v1.4s, v8.s[1]

        ld2     {v14.2s, v15.2s}, [pB]          // B for the next step
        add     pB, pB, #16

        // A[4..7] * B1
        fmul    v22.4s, v2.4s, v8.s[1]
        OP_ii   v22.4s, v3.4s, v9.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v23.16b, v23.16b, v23.16b
        fmls    v23.4s, v2.4s, v9.s[1]
  #else
        fmul    v23.4s, v2.4s, v9.s[1]
  #endif
        OP_ir   v23.4s, v3.4s, v8.s[1]

        ld2     {v4.4s, v5.4s}, [pA]            // A for the next step
        add     pA, pA, #32

        // A[0..3] * B2
        fmul    v24.4s, v0.4s, v10.s[0]
        OP_ii   v24.4s, v1.4s, v11.s[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
        fmls    v25.4s, v0.4s, v11.s[0]
  #else
        fmul    v25.4s, v0.4s, v11.s[0]
  #endif
        OP_ir   v25.4s, v1.4s, v10.s[0]

        ld2     {v6.4s, v7.4s}, [pA]            // A for the next step
        add     pA, pA, #32

        // A[4..7] * B2
        fmul    v26.4s, v2.4s, v10.s[0]
        OP_ii   v26.4s, v3.4s, v11.s[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v27.16b, v27.16b, v27.16b
        fmls    v27.4s, v2.4s, v11.s[0]
  #else
        fmul    v27.4s, v2.4s, v11.s[0]
  #endif
        OP_ir   v27.4s, v3.4s, v10.s[0]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

        // A[0..3] * B3
        fmul    v28.4s, v0.4s, v10.s[1]
        OP_ii   v28.4s, v1.4s, v11.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
        fmls    v29.4s, v0.4s, v11.s[1]
  #else
        fmul    v29.4s, v0.4s, v11.s[1]
  #endif
        OP_ir   v29.4s, v1.4s, v10.s[1]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]

        // A[4..7] * B3
        fmul    v30.4s, v2.4s, v10.s[1]
        OP_ii   v30.4s, v3.4s, v11.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v31.16b, v31.16b, v31.16b
        fmls    v31.4s, v2.4s, v11.s[1]
  #else
        fmul    v31.4s, v2.4s, v11.s[1]
  #endif
        OP_ir   v31.4s, v3.4s, v10.s[1]
  .endm
  // KERNEL8x4_M1: pipelined middle step, first half of the M1/M2 pair.
  // Accumulates into v16-v31 from the operands already held in v0-v3 (A)
  // and v8-v11 (B), while loading the operands of the following step into
  // v4-v7 (A) and v12-v15 (B).  Prefetches ahead on pA.
  // Advances pA by 64 bytes and pB by 32 bytes.
  .macro KERNEL8x4_M1
        // A[0..3] * B0
        OP_rr   v16.4s, v0.4s, v8.s[0]
        OP_ii   v16.4s, v1.4s, v9.s[0]
        OP_ri   v17.4s, v0.4s, v9.s[0]
        OP_ir   v17.4s, v1.4s, v8.s[0]

        ld2     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16

        // A[4..7] * B0
        OP_rr   v18.4s, v2.4s, v8.s[0]
        OP_ii   v18.4s, v3.4s, v9.s[0]
        OP_ri   v19.4s, v2.4s, v9.s[0]
        OP_ir   v19.4s, v3.4s, v8.s[0]

        ld2     {v4.4s, v5.4s}, [pA]
        add     pA, pA, #32

        // A[0..3] * B1
        OP_rr   v20.4s, v0.4s, v8.s[1]
        OP_ii   v20.4s, v1.4s, v9.s[1]
        OP_ri   v21.4s, v0.4s, v9.s[1]
        OP_ir   v21.4s, v1.4s, v8.s[1]

        ld2     {v6.4s, v7.4s}, [pA]
        add     pA, pA, #32

        // A[4..7] * B1
        OP_rr   v22.4s, v2.4s, v8.s[1]
        OP_ii   v22.4s, v3.4s, v9.s[1]
        OP_ri   v23.4s, v2.4s, v9.s[1]
        OP_ir   v23.4s, v3.4s, v8.s[1]

        ld2     {v14.2s, v15.2s}, [pB]
        add     pB, pB, #16

        // A[0..3] * B2
        OP_rr   v24.4s, v0.4s, v10.s[0]
        OP_ii   v24.4s, v1.4s, v11.s[0]
        OP_ri   v25.4s, v0.4s, v11.s[0]
        OP_ir   v25.4s, v1.4s, v10.s[0]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

        // A[4..7] * B2
        OP_rr   v26.4s, v2.4s, v10.s[0]
        OP_ii   v26.4s, v3.4s, v11.s[0]
        OP_ri   v27.4s, v2.4s, v11.s[0]
        OP_ir   v27.4s, v3.4s, v10.s[0]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]

        // A[0..3] * B3
        OP_rr   v28.4s, v0.4s, v10.s[1]
        OP_ii   v28.4s, v1.4s, v11.s[1]
        OP_ri   v29.4s, v0.4s, v11.s[1]
        OP_ir   v29.4s, v1.4s, v10.s[1]

        // A[4..7] * B3
        OP_rr   v30.4s, v2.4s, v10.s[1]
        OP_ii   v30.4s, v3.4s, v11.s[1]
        OP_ri   v31.4s, v2.4s, v11.s[1]
        OP_ir   v31.4s, v3.4s, v10.s[1]
  .endm
  // KERNEL8x4_M2: pipelined middle step, second half of the M1/M2 pair.
  // Accumulates into v16-v31 from the operands already held in v4-v7 (A)
  // and v12-v15 (B), while loading the operands of the following step into
  // v0-v3 (A) and v8-v11 (B).  Prefetches ahead on pB.
  // Advances pA by 64 bytes and pB by 32 bytes.
  .macro KERNEL8x4_M2
        // A[0..3] * B0
        OP_rr   v16.4s, v4.4s, v12.s[0]
        OP_ii   v16.4s, v5.4s, v13.s[0]
        OP_ri   v17.4s, v4.4s, v13.s[0]
        OP_ir   v17.4s, v5.4s, v12.s[0]

        ld2     {v8.2s, v9.2s}, [pB]
        add     pB, pB, #16

        // A[4..7] * B0
        OP_rr   v18.4s, v6.4s, v12.s[0]
        OP_ii   v18.4s, v7.4s, v13.s[0]
        OP_ri   v19.4s, v6.4s, v13.s[0]
        OP_ir   v19.4s, v7.4s, v12.s[0]

        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32

        // A[0..3] * B1
        OP_rr   v20.4s, v4.4s, v12.s[1]
        OP_ii   v20.4s, v5.4s, v13.s[1]
        OP_ri   v21.4s, v4.4s, v13.s[1]
        OP_ir   v21.4s, v5.4s, v12.s[1]

        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32

        // A[4..7] * B1
        OP_rr   v22.4s, v6.4s, v12.s[1]
        OP_ii   v22.4s, v7.4s, v13.s[1]
        OP_ri   v23.4s, v6.4s, v13.s[1]
        OP_ir   v23.4s, v7.4s, v12.s[1]

        ld2     {v10.2s, v11.2s}, [pB]
        add     pB, pB, #16

        // A[0..3] * B2
        OP_rr   v24.4s, v4.4s, v14.s[0]
        OP_ii   v24.4s, v5.4s, v15.s[0]
        OP_ri   v25.4s, v4.4s, v15.s[0]
        OP_ir   v25.4s, v5.4s, v14.s[0]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

        // A[4..7] * B2
        OP_rr   v26.4s, v6.4s, v14.s[0]
        OP_ii   v26.4s, v7.4s, v15.s[0]
        OP_ri   v27.4s, v6.4s, v15.s[0]
        OP_ir   v27.4s, v7.4s, v14.s[0]

        // A[0..3] * B3
        OP_rr   v28.4s, v4.4s, v14.s[1]
        OP_ii   v28.4s, v5.4s, v15.s[1]
        OP_ri   v29.4s, v4.4s, v15.s[1]
        OP_ir   v29.4s, v5.4s, v14.s[1]

        // A[4..7] * B3
        OP_rr   v30.4s, v6.4s, v14.s[1]
        OP_ii   v30.4s, v7.4s, v15.s[1]
        OP_ri   v31.4s, v6.4s, v15.s[1]
        OP_ir   v31.4s, v7.4s, v14.s[1]
  .endm
  // KERNEL8x4_E: final step of the pipelined 8x4 sequence.
  // Accumulates into v16-v31 from the operands already held in v4-v7 (A)
  // and v12-v15 (B) and performs no further loads, so pA and pB are left
  // unchanged.  Issues one prefetch ahead on pB.
  .macro KERNEL8x4_E
        // A[0..3] * B0
        OP_rr   v16.4s, v4.4s, v12.s[0]
        OP_ii   v16.4s, v5.4s, v13.s[0]
        OP_ri   v17.4s, v4.4s, v13.s[0]
        OP_ir   v17.4s, v5.4s, v12.s[0]

        // A[4..7] * B0
        OP_rr   v18.4s, v6.4s, v12.s[0]
        OP_ii   v18.4s, v7.4s, v13.s[0]
        OP_ri   v19.4s, v6.4s, v13.s[0]
        OP_ir   v19.4s, v7.4s, v12.s[0]

        // A[0..3] * B1
        OP_rr   v20.4s, v4.4s, v12.s[1]
        OP_ii   v20.4s, v5.4s, v13.s[1]
        OP_ri   v21.4s, v4.4s, v13.s[1]
        OP_ir   v21.4s, v5.4s, v12.s[1]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

        // A[4..7] * B1
        OP_rr   v22.4s, v6.4s, v12.s[1]
        OP_ii   v22.4s, v7.4s, v13.s[1]
        OP_ri   v23.4s, v6.4s, v13.s[1]
        OP_ir   v23.4s, v7.4s, v12.s[1]

        // A[0..3] * B2
        OP_rr   v24.4s, v4.4s, v14.s[0]
        OP_ii   v24.4s, v5.4s, v15.s[0]
        OP_ri   v25.4s, v4.4s, v15.s[0]
        OP_ir   v25.4s, v5.4s, v14.s[0]

        // A[4..7] * B2
        OP_rr   v26.4s, v6.4s, v14.s[0]
        OP_ii   v26.4s, v7.4s, v15.s[0]
        OP_ri   v27.4s, v6.4s, v15.s[0]
        OP_ir   v27.4s, v7.4s, v14.s[0]

        // A[0..3] * B3
        OP_rr   v28.4s, v4.4s, v14.s[1]
        OP_ii   v28.4s, v5.4s, v15.s[1]
        OP_ri   v29.4s, v4.4s, v15.s[1]
        OP_ir   v29.4s, v5.4s, v14.s[1]

        // A[4..7] * B3
        OP_rr   v30.4s, v6.4s, v14.s[1]
        OP_ii   v30.4s, v7.4s, v15.s[1]
        OP_ri   v31.4s, v6.4s, v15.s[1]
        OP_ir   v31.4s, v7.4s, v14.s[1]
  .endm
  // KERNEL8x4_SUB: one self-contained (non-pipelined) 8x4 step.
  // Loads 8 complex A values (v0/v1, v2/v3) and 4 complex B values
  // (v8/v9, v10/v11) and accumulates all products into v16-v31, which must
  // already hold valid partial sums (for example after INIT8x4).
  // Advances pA by 64 bytes and pB by 32 bytes; prefetches ahead on both.
  .macro KERNEL8x4_SUB
        ld2     {v8.2s, v9.2s}, [pB]
        add     pB, pB, #16
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32

        // A[0..3] * B0
        OP_rr   v16.4s, v0.4s, v8.s[0]
        OP_ii   v16.4s, v1.4s, v9.s[0]
        OP_ri   v17.4s, v0.4s, v9.s[0]
        OP_ir   v17.4s, v1.4s, v8.s[0]

        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32

        // A[0..3] * B1
        OP_rr   v20.4s, v0.4s, v8.s[1]
        OP_ii   v20.4s, v1.4s, v9.s[1]
        OP_ri   v21.4s, v0.4s, v9.s[1]
        OP_ir   v21.4s, v1.4s, v8.s[1]

        ld2     {v10.2s, v11.2s}, [pB]
        add     pB, pB, #16

        // A[4..7] * B0
        OP_rr   v18.4s, v2.4s, v8.s[0]
        OP_ii   v18.4s, v3.4s, v9.s[0]
        OP_ri   v19.4s, v2.4s, v9.s[0]
        OP_ir   v19.4s, v3.4s, v8.s[0]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

        // A[4..7] * B1
        OP_rr   v22.4s, v2.4s, v8.s[1]
        OP_ii   v22.4s, v3.4s, v9.s[1]
        OP_ri   v23.4s, v2.4s, v9.s[1]
        OP_ir   v23.4s, v3.4s, v8.s[1]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]

        // A[0..3] * B2
        OP_rr   v24.4s, v0.4s, v10.s[0]
        OP_ii   v24.4s, v1.4s, v11.s[0]
        OP_ri   v25.4s, v0.4s, v11.s[0]
        OP_ir   v25.4s, v1.4s, v10.s[0]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

        // A[4..7] * B2
        OP_rr   v26.4s, v2.4s, v10.s[0]
        OP_ii   v26.4s, v3.4s, v11.s[0]
        OP_ri   v27.4s, v2.4s, v11.s[0]
        OP_ir   v27.4s, v3.4s, v10.s[0]

        // A[0..3] * B3
        OP_rr   v28.4s, v0.4s, v10.s[1]
        OP_ii   v28.4s, v1.4s, v11.s[1]
        OP_ri   v29.4s, v0.4s, v11.s[1]
        OP_ir   v29.4s, v1.4s, v10.s[1]

        // A[4..7] * B3
        OP_rr   v30.4s, v2.4s, v10.s[1]
        OP_ii   v30.4s, v3.4s, v11.s[1]
        OP_ri   v31.4s, v2.4s, v11.s[1]
        OP_ir   v31.4s, v3.4s, v10.s[1]
  .endm
  // SAVE8x4: scale the 8x4 accumulators by complex alpha and store them.
  //
  // For every accumulator pair (acc_R, acc_I):
  //   out_R = acc_R * alpha_R - acc_I * alpha_I
  //   out_I = acc_R * alpha_I + acc_I * alpha_R
  // st2 re-interleaves (out_R, out_I) on the way to memory.  The destination
  // is only written, never read: the previous contents are overwritten.
  //
  // v16-v19 go to pCRow0, v20-v23 to pCRow1, v24-v27 to pCRow2 and v28-v31
  // to pCRow3; each pointer ends up advanced by 64 bytes (8 complex floats).
  // pCRow0-pCRow3 must already be set up by the caller.
  // Clobbers: v0-v7, and v10/v11 (overwritten with alpha).
  .macro SAVE8x4
        fmov    alpha0_R, alphaR
        fmov    alpha0_I, alphaI

        prfm    PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

        fmul    v0.4s, v16.4s, alphaV0_R
        fmls    v0.4s, v17.4s, alphaV0_I
        fmul    v1.4s, v16.4s, alphaV0_I
        fmla    v1.4s, v17.4s, alphaV0_R
        st2     {v0.4s, v1.4s}, [pCRow0]

        add     pCRow0, pCRow0, #32

        fmul    v2.4s, v18.4s, alphaV0_R
        fmls    v2.4s, v19.4s, alphaV0_I
        fmul    v3.4s, v18.4s, alphaV0_I
        fmla    v3.4s, v19.4s, alphaV0_R
        st2     {v2.4s, v3.4s}, [pCRow0]

        add     pCRow0, pCRow0, #32
        prfm    PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

        fmul    v4.4s, v20.4s, alphaV0_R
        fmls    v4.4s, v21.4s, alphaV0_I
        fmul    v5.4s, v20.4s, alphaV0_I
        fmla    v5.4s, v21.4s, alphaV0_R
        st2     {v4.4s, v5.4s}, [pCRow1]

        add     pCRow1, pCRow1, #32

        fmul    v6.4s, v22.4s, alphaV0_R
        fmls    v6.4s, v23.4s, alphaV0_I
        fmul    v7.4s, v22.4s, alphaV0_I
        fmla    v7.4s, v23.4s, alphaV0_R
        st2     {v6.4s, v7.4s}, [pCRow1]

        add     pCRow1, pCRow1, #32
        prfm    PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

        fmul    v0.4s, v24.4s, alphaV0_R
        fmls    v0.4s, v25.4s, alphaV0_I
        fmul    v1.4s, v24.4s, alphaV0_I
        fmla    v1.4s, v25.4s, alphaV0_R
        st2     {v0.4s, v1.4s}, [pCRow2]

        add     pCRow2, pCRow2, #32

        fmul    v2.4s, v26.4s, alphaV0_R
        fmls    v2.4s, v27.4s, alphaV0_I
        fmul    v3.4s, v26.4s, alphaV0_I
        fmla    v3.4s, v27.4s, alphaV0_R
        st2     {v2.4s, v3.4s}, [pCRow2]

        add     pCRow2, pCRow2, #32
        prfm    PLDL2KEEP, [pCRow3, #C_PRE_SIZE]

        fmul    v4.4s, v28.4s, alphaV0_R
        fmls    v4.4s, v29.4s, alphaV0_I
        fmul    v5.4s, v28.4s, alphaV0_I
        fmla    v5.4s, v29.4s, alphaV0_R
        st2     {v4.4s, v5.4s}, [pCRow3]

        add     pCRow3, pCRow3, #32

        fmul    v6.4s, v30.4s, alphaV0_R
        fmls    v6.4s, v31.4s, alphaV0_I
        fmul    v7.4s, v30.4s, alphaV0_I
        fmla    v7.4s, v31.4s, alphaV0_R
        st2     {v6.4s, v7.4s}, [pCRow3]

        add     pCRow3, pCRow3, #32
  .endm
  /******************************************************************************/

  // INIT4x4: clear the eight 4x4 accumulators v16, v17, v20, v21, v24, v25,
  // v28 and v29.  s16 is written from wzr and the others are copied from an
  // already-cleared register; on AArch64 a scalar write also clears the
  // remaining lanes of the destination vector register.
  .macro INIT4x4
        fmov    s16, wzr
        fmov    s17, s16
        fmov    s20, s17
        fmov    s21, s16
        fmov    s24, s17
        fmov    s25, s16
        fmov    s28, s17
        fmov    s29, s16
  .endm
  // KERNEL4x4_I: first step of the software-pipelined 4x4 kernel.
  //
  // Loads 4 complex B values (v8 = real parts, v9 = imaginary parts) and
  // 4 complex A values (v0/v1), and initialises the accumulators with fmul
  // so no prior INIT4x4 is needed:
  //   v16/v17 : A * B element 0      v20/v21 : A * B element 1
  //   v24/v25 : A * B element 2      v28/v29 : A * B element 3
  // In the variants where OP_ri is fmls, the A_re*B_im term is negated by
  // zeroing the register with eor and applying fmls.
  // Finally preloads the operands of the following step into v12/v13 (B)
  // and v4/v5 (A).  Advances pA and pB by 64 bytes each.
  .macro KERNEL4x4_I
        ld2     {v8.4s, v9.4s}, [pB]
        add     pB, pB, #32
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32

        // A * B0
        fmul    v16.4s, v0.4s, v8.s[0]
        OP_ii   v16.4s, v1.4s, v9.s[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
        fmls    v17.4s, v0.4s, v9.s[0]
  #else
        fmul    v17.4s, v0.4s, v9.s[0]
  #endif
        OP_ir   v17.4s, v1.4s, v8.s[0]

        // A * B1
        fmul    v20.4s, v0.4s, v8.s[1]
        OP_ii   v20.4s, v1.4s, v9.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
        fmls    v21.4s, v0.4s, v9.s[1]
  #else
        fmul    v21.4s, v0.4s, v9.s[1]
  #endif
        OP_ir   v21.4s, v1.4s, v8.s[1]

        // A * B2
        fmul    v24.4s, v0.4s, v8.s[2]
        OP_ii   v24.4s, v1.4s, v9.s[2]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
        fmls    v25.4s, v0.4s, v9.s[2]
  #else
        fmul    v25.4s, v0.4s, v9.s[2]
  #endif
        OP_ir   v25.4s, v1.4s, v8.s[2]

        // A * B3
        fmul    v28.4s, v0.4s, v8.s[3]
        OP_ii   v28.4s, v1.4s, v9.s[3]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
        fmls    v29.4s, v0.4s, v9.s[3]
  #else
        fmul    v29.4s, v0.4s, v9.s[3]
  #endif
        OP_ir   v29.4s, v1.4s, v8.s[3]

        // Operands for the next step
        ld2     {v12.4s, v13.4s}, [pB]
        add     pB, pB, #32
        ld2     {v4.4s, v5.4s}, [pA]
        add     pA, pA, #32
  .endm
  // KERNEL4x4_M1: pipelined middle step, first half of the M1/M2 pair.
  // Accumulates into v16/17, v20/21, v24/25, v28/29 from the operands held
  // in v0/v1 (A) and v8/v9 (B), while loading the next step into v4/v5 (A)
  // and v12/v13 (B).  Prefetches 512 bytes ahead of pA.
  // Advances pA and pB by 32 bytes each.
  .macro KERNEL4x4_M1
        // A * B0
        OP_rr   v16.4s, v0.4s, v8.s[0]
        OP_ii   v16.4s, v1.4s, v9.s[0]
        OP_ri   v17.4s, v0.4s, v9.s[0]
        OP_ir   v17.4s, v1.4s, v8.s[0]

        ld2     {v12.4s, v13.4s}, [pB]          // For next round
        add     pB, pB, #32

        // A * B1
        OP_rr   v20.4s, v0.4s, v8.s[1]
        OP_ii   v20.4s, v1.4s, v9.s[1]
        OP_ri   v21.4s, v0.4s, v9.s[1]
        OP_ir   v21.4s, v1.4s, v8.s[1]

        ld2     {v4.4s, v5.4s}, [pA]            // For next round
        add     pA, pA, #32

        // A * B2
        OP_rr   v24.4s, v0.4s, v8.s[2]
        OP_ii   v24.4s, v1.4s, v9.s[2]
        OP_ri   v25.4s, v0.4s, v9.s[2]
        OP_ir   v25.4s, v1.4s, v8.s[2]

        prfm    PLDL1KEEP, [pA, #512]

        // A * B3
        OP_rr   v28.4s, v0.4s, v8.s[3]
        OP_ii   v28.4s, v1.4s, v9.s[3]
        OP_ri   v29.4s, v0.4s, v9.s[3]
        OP_ir   v29.4s, v1.4s, v8.s[3]
  .endm
  // KERNEL4x4_M2: pipelined middle step, second half of the M1/M2 pair.
  // Accumulates into v16/17, v20/21, v24/25, v28/29 from the operands held
  // in v4/v5 (A) and v12/v13 (B), while loading the next step into v0/v1 (A)
  // and v8/v9 (B).  Prefetches 512 bytes ahead of pB.
  // Advances pA and pB by 32 bytes each.
  .macro KERNEL4x4_M2
        // A * B0
        OP_rr   v16.4s, v4.4s, v12.s[0]
        OP_ii   v16.4s, v5.4s, v13.s[0]
        OP_ri   v17.4s, v4.4s, v13.s[0]
        OP_ir   v17.4s, v5.4s, v12.s[0]

        ld2     {v8.4s, v9.4s}, [pB]            // For next round
        add     pB, pB, #32

        // A * B1
        OP_rr   v20.4s, v4.4s, v12.s[1]
        OP_ii   v20.4s, v5.4s, v13.s[1]
        OP_ri   v21.4s, v4.4s, v13.s[1]
        OP_ir   v21.4s, v5.4s, v12.s[1]

        ld2     {v0.4s, v1.4s}, [pA]            // For next round
        add     pA, pA, #32

        // A * B2
        OP_rr   v24.4s, v4.4s, v12.s[2]
        OP_ii   v24.4s, v5.4s, v13.s[2]
        OP_ri   v25.4s, v4.4s, v13.s[2]
        OP_ir   v25.4s, v5.4s, v12.s[2]

        prfm    PLDL1KEEP, [pB, #512]

        // A * B3
        OP_rr   v28.4s, v4.4s, v12.s[3]
        OP_ii   v28.4s, v5.4s, v13.s[3]
        OP_ri   v29.4s, v4.4s, v13.s[3]
        OP_ir   v29.4s, v5.4s, v12.s[3]
  .endm
  // KERNEL4x4_E: final step of the pipelined 4x4 sequence.
  // Accumulates into v16/17, v20/21, v24/25, v28/29 from the operands
  // already held in v4/v5 (A) and v12/v13 (B).  Performs no loads, so pA
  // and pB are left unchanged.
  .macro KERNEL4x4_E
        // A * B0
        OP_rr   v16.4s, v4.4s, v12.s[0]
        OP_ii   v16.4s, v5.4s, v13.s[0]
        OP_ri   v17.4s, v4.4s, v13.s[0]
        OP_ir   v17.4s, v5.4s, v12.s[0]

        // A * B1
        OP_rr   v20.4s, v4.4s, v12.s[1]
        OP_ii   v20.4s, v5.4s, v13.s[1]
        OP_ri   v21.4s, v4.4s, v13.s[1]
        OP_ir   v21.4s, v5.4s, v12.s[1]

        // A * B2
        OP_rr   v24.4s, v4.4s, v12.s[2]
        OP_ii   v24.4s, v5.4s, v13.s[2]
        OP_ri   v25.4s, v4.4s, v13.s[2]
        OP_ir   v25.4s, v5.4s, v12.s[2]

        // A * B3
        OP_rr   v28.4s, v4.4s, v12.s[3]
        OP_ii   v28.4s, v5.4s, v13.s[3]
        OP_ri   v29.4s, v4.4s, v13.s[3]
        OP_ir   v29.4s, v5.4s, v12.s[3]
  .endm
  // KERNEL4x4_SUB: one self-contained (non-pipelined) 4x4 step.
  // Loads 4 complex B values (v8/v9) and 4 complex A values (v0/v1) and
  // accumulates all products into v16/17, v20/21, v24/25, v28/29, which
  // must already hold valid partial sums (for example after INIT4x4).
  // Advances pA and pB by 32 bytes each.
  .macro KERNEL4x4_SUB
        ld2     {v8.4s, v9.4s}, [pB]
        add     pB, pB, #32
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32

        // A * B0
        OP_rr   v16.4s, v0.4s, v8.s[0]
        OP_ii   v16.4s, v1.4s, v9.s[0]
        OP_ri   v17.4s, v0.4s, v9.s[0]
        OP_ir   v17.4s, v1.4s, v8.s[0]

        // A * B1
        OP_rr   v20.4s, v0.4s, v8.s[1]
        OP_ii   v20.4s, v1.4s, v9.s[1]
        OP_ri   v21.4s, v0.4s, v9.s[1]
        OP_ir   v21.4s, v1.4s, v8.s[1]

        // A * B2
        OP_rr   v24.4s, v0.4s, v8.s[2]
        OP_ii   v24.4s, v1.4s, v9.s[2]
        OP_ri   v25.4s, v0.4s, v9.s[2]
        OP_ir   v25.4s, v1.4s, v8.s[2]

        // A * B3
        OP_rr   v28.4s, v0.4s, v8.s[3]
        OP_ii   v28.4s, v1.4s, v9.s[3]
        OP_ri   v29.4s, v0.4s, v9.s[3]
        OP_ir   v29.4s, v1.4s, v8.s[3]
  .endm
  // SAVE4x4: scale the 4x4 accumulators by complex alpha and store them.
  //
  // For every accumulator pair (acc_R, acc_I):
  //   out_R = acc_R * alpha_R - acc_I * alpha_I
  //   out_I = acc_R * alpha_I + acc_I * alpha_R
  // st2 re-interleaves (out_R, out_I) on the way to memory; the destination
  // is only written, never read.
  //
  // pCRow1 is used as a walking pointer: it starts at pCRow0 and is stepped
  // by LDC between the four stores (v16/17, v20/21, v24/25, v28/29).
  // pCRow0 itself is advanced by 32 bytes (4 complex floats) at the end.
  // Clobbers: v0, v1, v4, v5, v10/v11 (overwritten with alpha), pCRow1.
  .macro SAVE4x4
        fmov    alpha0_R, alphaR
        fmov    alpha0_I, alphaI

        mov     pCRow1, pCRow0

        fmul    v0.4s, v16.4s, alphaV0_R
        fmls    v0.4s, v17.4s, alphaV0_I
        fmul    v1.4s, v16.4s, alphaV0_I
        fmla    v1.4s, v17.4s, alphaV0_R
        st2     {v0.4s, v1.4s}, [pCRow1]

        add     pCRow1, pCRow1, LDC

        fmul    v4.4s, v20.4s, alphaV0_R
        fmls    v4.4s, v21.4s, alphaV0_I
        fmul    v5.4s, v20.4s, alphaV0_I
        fmla    v5.4s, v21.4s, alphaV0_R
        st2     {v4.4s, v5.4s}, [pCRow1]

        add     pCRow1, pCRow1, LDC

        fmul    v0.4s, v24.4s, alphaV0_R
        fmls    v0.4s, v25.4s, alphaV0_I
        fmul    v1.4s, v24.4s, alphaV0_I
        fmla    v1.4s, v25.4s, alphaV0_R
        st2     {v0.4s, v1.4s}, [pCRow1]

        add     pCRow1, pCRow1, LDC

        fmul    v4.4s, v28.4s, alphaV0_R
        fmls    v4.4s, v29.4s, alphaV0_I
        fmul    v5.4s, v28.4s, alphaV0_I
        fmla    v5.4s, v29.4s, alphaV0_R
        st2     {v4.4s, v5.4s}, [pCRow1]

        add     pCRow0, pCRow0, #32
  .endm
  665. /******************************************************************************/
  // INIT2x4 - clear the eight accumulators of the 2x4 tile.
  // Each register is zeroed directly from wzr; a scalar fmov write also clears
  // the upper lanes, so the full vector registers end up zero.
  .macro INIT2x4
        fmov    s28, wzr
        fmov    s29, wzr
        fmov    s24, wzr
        fmov    s25, wzr
        fmov    s20, wzr
        fmov    s21, wzr
        fmov    s16, wzr
        fmov    s17, wzr
  .endm
  // KERNEL2x4_SUB - one k-step of the 2x4 tile.
  // Loads 32 bytes from pB into v8/v9 (four lanes each) and 16 bytes from pA
  // into the low halves of v0/v1, with ld2 de-interleaving even/odd floats
  // (presumably real/imaginary parts). Advances pB by 32 and pA by 16, then
  // applies four OP_* updates per B lane on 2-lane vectors.
  // Accumulates into v16/v17, v20/v21, v24/v25, v28/v29.
  676. .macro KERNEL2x4_SUB
  677. ld2 {v8.4s, v9.4s}, [pB]
  678. add pB, pB, #32
  679. ld2 {v0.2s, v1.2s}, [pA]
  680. add pA, pA, #16
  // B lane 0 -> v16 / v17
  681. OP_rr v16.2s, v0.2s, v8.s[0]
  682. OP_ii v16.2s, v1.2s, v9.s[0]
  683. OP_ri v17.2s, v0.2s, v9.s[0]
  684. OP_ir v17.2s, v1.2s, v8.s[0]
  // B lane 1 -> v20 / v21
  685. OP_rr v20.2s, v0.2s, v8.s[1]
  686. OP_ii v20.2s, v1.2s, v9.s[1]
  687. OP_ri v21.2s, v0.2s, v9.s[1]
  688. OP_ir v21.2s, v1.2s, v8.s[1]
  // B lane 2 -> v24 / v25
  689. OP_rr v24.2s, v0.2s, v8.s[2]
  690. OP_ii v24.2s, v1.2s, v9.s[2]
  691. OP_ri v25.2s, v0.2s, v9.s[2]
  692. OP_ir v25.2s, v1.2s, v8.s[2]
  // B lane 3 -> v28 / v29
  693. OP_rr v28.2s, v0.2s, v8.s[3]
  694. OP_ii v28.2s, v1.2s, v9.s[3]
  695. OP_ri v29.2s, v0.2s, v9.s[3]
  696. OP_ir v29.2s, v1.2s, v8.s[3]
  697. .endm
  // SAVE2x4 - scale the 2x4 accumulator tile by alpha and store it to C.
  // Per accumulator pair (a, b): first = a*alphaV0_R - b*alphaV0_I,
  // second = a*alphaV0_I + b*alphaV0_R; st2 writes them interleaved
  // (16 bytes per store). C is not read, so the destination is overwritten.
  // Four stores LDC apart starting at pCRow0; pCRow0 then advances 16 bytes.
  // Clobbers v0, v1, v4, v5, pCRow1 and the alpha0_R / alpha0_I registers.
  698. .macro SAVE2x4
  699. fmov alpha0_R, alphaR // reload alpha before use
  700. fmov alpha0_I, alphaI
  701. mov pCRow1, pCRow0 // pCRow1 walks the four store addresses
  // store 1: accumulators v16 / v17
  702. fmul v0.2s, v16.2s, alphaV0_R
  703. fmls v0.2s, v17.2s, alphaV0_I
  704. fmul v1.2s, v16.2s, alphaV0_I
  705. fmla v1.2s, v17.2s, alphaV0_R
  706. st2 {v0.2s, v1.2s}, [pCRow1]
  707. add pCRow1, pCRow1, LDC
  // store 2: accumulators v20 / v21
  708. fmul v4.2s, v20.2s, alphaV0_R
  709. fmls v4.2s, v21.2s, alphaV0_I
  710. fmul v5.2s, v20.2s, alphaV0_I
  711. fmla v5.2s, v21.2s, alphaV0_R
  712. st2 {v4.2s, v5.2s}, [pCRow1]
  713. add pCRow1, pCRow1, LDC
  // store 3: accumulators v24 / v25
  714. fmul v0.2s, v24.2s, alphaV0_R
  715. fmls v0.2s, v25.2s, alphaV0_I
  716. fmul v1.2s, v24.2s, alphaV0_I
  717. fmla v1.2s, v25.2s, alphaV0_R
  718. st2 {v0.2s, v1.2s}, [pCRow1]
  719. add pCRow1, pCRow1, LDC
  // store 4: accumulators v28 / v29
  720. fmul v4.2s, v28.2s, alphaV0_R
  721. fmls v4.2s, v29.2s, alphaV0_I
  722. fmul v5.2s, v28.2s, alphaV0_I
  723. fmla v5.2s, v29.2s, alphaV0_R
  724. st2 {v4.2s, v5.2s}, [pCRow1]
  725. add pCRow0, pCRow0, #16 // step past the 16 bytes written per store
  726. .endm
  727. /******************************************************************************/
  // INIT1x4 - clear the eight accumulators of the 1x4 tile.
  // Every register is zeroed straight from wzr (the scalar write also clears
  // the upper lanes of the vector register).
  .macro INIT1x4
        fmov    s16, wzr
        fmov    s20, wzr
        fmov    s24, wzr
        fmov    s28, wzr
        fmov    s17, wzr
        fmov    s21, wzr
        fmov    s25, wzr
        fmov    s29, wzr
  .endm
  // KERNEL1x4_SUB - one k-step of the 1x4 tile.
  // Loads 32 bytes from pB into v8/v9 (ld2 de-interleaves even/odd floats) and
  // one float pair from pA into lane 0 of v0/v1; the other lanes of v0/v1 are
  // left as they were. Advances pB by 32 and pA by 8, then applies four scalar
  // OP_* updates per B lane.
  // Accumulates into s16/s17, s20/s21, s24/s25, s28/s29.
  738. .macro KERNEL1x4_SUB
  739. ld2 {v8.4s, v9.4s}, [pB]
  740. add pB, pB, #32
  741. ld2 {v0.s, v1.s}[0], [pA]
  742. add pA, pA, #8
  // B lane 0 -> s16 / s17
  743. OP_rr s16, s0, v8.s[0]
  744. OP_ii s16, s1, v9.s[0]
  745. OP_ri s17, s0, v9.s[0]
  746. OP_ir s17, s1, v8.s[0]
  // B lane 1 -> s20 / s21
  747. OP_rr s20, s0, v8.s[1]
  748. OP_ii s20, s1, v9.s[1]
  749. OP_ri s21, s0, v9.s[1]
  750. OP_ir s21, s1, v8.s[1]
  // B lane 2 -> s24 / s25
  751. OP_rr s24, s0, v8.s[2]
  752. OP_ii s24, s1, v9.s[2]
  753. OP_ri s25, s0, v9.s[2]
  754. OP_ir s25, s1, v8.s[2]
  // B lane 3 -> s28 / s29
  755. OP_rr s28, s0, v8.s[3]
  756. OP_ii s28, s1, v9.s[3]
  757. OP_ri s29, s0, v9.s[3]
  758. OP_ir s29, s1, v8.s[3]
  759. .endm
  // SAVE1x4 - scale the 1x4 accumulator tile by alpha and store it to C.
  // Per accumulator pair (a, b): first = a*alphaV0_R - b*alphaV0_I,
  // second = a*alphaV0_I + b*alphaV0_R; st2 writes lane 0 of both result
  // registers as one 8-byte pair. C is not read, so it is overwritten.
  // Four stores LDC apart starting at pCRow0; pCRow0 then advances 8 bytes.
  // Clobbers s0, s1, s4, s5, pCRow1 and the alpha0_R / alpha0_I registers.
  760. .macro SAVE1x4
  761. fmov alpha0_R, alphaR // reload alpha before use
  762. fmov alpha0_I, alphaI
  763. mov pCRow1, pCRow0 // pCRow1 walks the four store addresses
  // store 1: accumulators s16 / s17
  764. fmul s0, s16, alphaV0_R
  765. fmls s0, s17, alphaV0_I
  766. fmul s1, s16, alphaV0_I
  767. fmla s1, s17, alphaV0_R
  768. st2 {v0.s, v1.s}[0], [pCRow1]
  769. add pCRow1, pCRow1, LDC
  // store 2: accumulators s20 / s21
  770. fmul s4, s20, alphaV0_R
  771. fmls s4, s21, alphaV0_I
  772. fmul s5, s20, alphaV0_I
  773. fmla s5, s21, alphaV0_R
  774. st2 {v4.s, v5.s}[0], [pCRow1]
  775. add pCRow1, pCRow1, LDC
  // store 3: accumulators s24 / s25
  776. fmul s0, s24, alphaV0_R
  777. fmls s0, s25, alphaV0_I
  778. fmul s1, s24, alphaV0_I
  779. fmla s1, s25, alphaV0_R
  780. st2 {v0.s, v1.s}[0], [pCRow1]
  781. add pCRow1, pCRow1, LDC
  // store 4: accumulators s28 / s29
  782. fmul s4, s28, alphaV0_R
  783. fmls s4, s29, alphaV0_I
  784. fmul s5, s28, alphaV0_I
  785. fmla s5, s29, alphaV0_R
  786. st2 {v4.s, v5.s}[0], [pCRow1]
  787. add pCRow0, pCRow0, #8 // step past the 8 bytes written per store
  788. .endm
  789. /******************************************************************************/
  // INIT8x2 - clear accumulators v16..v23 for the 8x2 tile.
  // Each one is zeroed directly from wzr; the scalar write clears the whole
  // vector register.
  .macro INIT8x2
        fmov    s23, wzr
        fmov    s22, wzr
        fmov    s21, wzr
        fmov    s20, wzr
        fmov    s19, wzr
        fmov    s18, wzr
        fmov    s17, wzr
        fmov    s16, wzr
  .endm
  // KERNEL8x2_SUB - one k-step of the 8x2 tile.
  // Loads 16 bytes from pB into the low halves of v8/v9 and 64 bytes from pA
  // as two ld2 groups: v0/v1 for the first half, v2/v3 for the second half
  // (ld2 de-interleaves even/odd floats, presumably real/imaginary parts).
  // Advances pB by 16 and pA by 64, then applies eight OP_* updates per B lane.
  // Accumulates into v16/v17 and v18/v19 (B lane 0), v20/v21 and v22/v23
  // (B lane 1).
  800. .macro KERNEL8x2_SUB
  801. ld2 {v8.2s, v9.2s}, [pB]
  802. add pB, pB, #16
  803. ld2 {v0.4s, v1.4s}, [pA]
  804. add pA, pA, #32
  805. ld2 {v2.4s, v3.4s}, [pA]
  806. add pA, pA, #32
  // B lane 0, first A half -> v16 / v17
  807. OP_rr v16.4s, v0.4s, v8.s[0]
  808. OP_ii v16.4s, v1.4s, v9.s[0]
  809. OP_ri v17.4s, v0.4s, v9.s[0]
  810. OP_ir v17.4s, v1.4s, v8.s[0]
  // B lane 0, second A half -> v18 / v19
  811. OP_rr v18.4s, v2.4s, v8.s[0]
  812. OP_ii v18.4s, v3.4s, v9.s[0]
  813. OP_ri v19.4s, v2.4s, v9.s[0]
  814. OP_ir v19.4s, v3.4s, v8.s[0]
  // B lane 1, first A half -> v20 / v21
  815. OP_rr v20.4s, v0.4s, v8.s[1]
  816. OP_ii v20.4s, v1.4s, v9.s[1]
  817. OP_ri v21.4s, v0.4s, v9.s[1]
  818. OP_ir v21.4s, v1.4s, v8.s[1]
  // B lane 1, second A half -> v22 / v23
  819. OP_rr v22.4s, v2.4s, v8.s[1]
  820. OP_ii v22.4s, v3.4s, v9.s[1]
  821. OP_ri v23.4s, v2.4s, v9.s[1]
  822. OP_ir v23.4s, v3.4s, v8.s[1]
  823. .endm
  // SAVE8x2 - scale the 8x2 accumulator tile by alpha and store it to C.
  // Per accumulator pair (a, b): first = a*alphaV0_R - b*alphaV0_I,
  // second = a*alphaV0_I + b*alphaV0_R; st2 writes them interleaved.
  // Two 64-byte lines are written, LDC apart, each as two 32-byte stores
  // (pCRow1 for the first half, pCRow2 = pCRow1 + 32 for the second).
  // C is not read, so the destination is overwritten. pCRow0 then advances
  // 64 bytes. Clobbers v0-v7, pCRow1, pCRow2 and alpha0_R / alpha0_I.
  824. .macro SAVE8x2
  825. fmov alpha0_R, alphaR // reload alpha before use
  826. fmov alpha0_I, alphaI
  827. mov pCRow1, pCRow0
  // line 1, first half: accumulators v16 / v17
  828. fmul v0.4s, v16.4s, alphaV0_R
  829. fmls v0.4s, v17.4s, alphaV0_I
  830. fmul v1.4s, v16.4s, alphaV0_I
  831. fmla v1.4s, v17.4s, alphaV0_R
  832. st2 {v0.4s, v1.4s}, [pCRow1]
  833. add pCRow2, pCRow1, #32 // address of the second half of this line
  // line 1, second half: accumulators v18 / v19
  834. fmul v2.4s, v18.4s, alphaV0_R
  835. fmls v2.4s, v19.4s, alphaV0_I
  836. fmul v3.4s, v18.4s, alphaV0_I
  837. fmla v3.4s, v19.4s, alphaV0_R
  838. st2 {v2.4s, v3.4s}, [pCRow2]
  839. add pCRow1, pCRow1, LDC // move to line 2
  // line 2, first half: accumulators v20 / v21
  840. fmul v4.4s, v20.4s, alphaV0_R
  841. fmls v4.4s, v21.4s, alphaV0_I
  842. fmul v5.4s, v20.4s, alphaV0_I
  843. fmla v5.4s, v21.4s, alphaV0_R
  844. st2 {v4.4s, v5.4s}, [pCRow1]
  845. add pCRow2, pCRow1, #32 // address of the second half of this line
  // line 2, second half: accumulators v22 / v23
  846. fmul v6.4s, v22.4s, alphaV0_R
  847. fmls v6.4s, v23.4s, alphaV0_I
  848. fmul v7.4s, v22.4s, alphaV0_I
  849. fmla v7.4s, v23.4s, alphaV0_R
  850. st2 {v6.4s, v7.4s}, [pCRow2]
  851. add pCRow0, pCRow0, #64 // step past the 64 bytes written per line
  852. .endm
  853. /******************************************************************************/
  // INIT4x2 - clear the four accumulators (v16, v17, v20, v21) of the 4x2
  // tile. All are zeroed from wzr, which clears the full vector registers.
  .macro INIT4x2
        fmov    s20, wzr
        fmov    s21, wzr
        fmov    s16, wzr
        fmov    s17, wzr
  .endm
  // KERNEL4x2_SUB - one k-step of the 4x2 tile.
  // Loads 16 bytes from pB into the low halves of v8/v9 and 32 bytes from pA
  // into v0/v1 (ld2 de-interleaves even/odd floats, presumably real/imaginary
  // parts). Advances pB by 16 and pA by 32, then applies four OP_* updates per
  // B lane. Accumulates into v16/v17 (lane 0) and v20/v21 (lane 1).
  860. .macro KERNEL4x2_SUB
  861. ld2 {v8.2s, v9.2s}, [pB]
  862. add pB, pB, #16
  863. ld2 {v0.4s, v1.4s}, [pA]
  864. add pA, pA, #32
  // B lane 0 -> v16 / v17
  865. OP_rr v16.4s, v0.4s, v8.s[0]
  866. OP_ii v16.4s, v1.4s, v9.s[0]
  867. OP_ri v17.4s, v0.4s, v9.s[0]
  868. OP_ir v17.4s, v1.4s, v8.s[0]
  // B lane 1 -> v20 / v21
  869. OP_rr v20.4s, v0.4s, v8.s[1]
  870. OP_ii v20.4s, v1.4s, v9.s[1]
  871. OP_ri v21.4s, v0.4s, v9.s[1]
  872. OP_ir v21.4s, v1.4s, v8.s[1]
  873. .endm
  // SAVE4x2 - scale the 4x2 accumulator tile by alpha and store it to C.
  // Per accumulator pair (a, b): first = a*alphaV0_R - b*alphaV0_I,
  // second = a*alphaV0_I + b*alphaV0_R; st2 writes them interleaved
  // (32 bytes per store). C is not read, so the destination is overwritten.
  // Two stores LDC apart starting at pCRow0; pCRow0 then advances 32 bytes.
  // Clobbers v0, v1, v4, v5, pCRow1 and the alpha0_R / alpha0_I registers.
  874. .macro SAVE4x2
  875. fmov alpha0_R, alphaR // reload alpha before use
  876. fmov alpha0_I, alphaI
  877. mov pCRow1, pCRow0
  // store 1: accumulators v16 / v17
  878. fmul v0.4s, v16.4s, alphaV0_R
  879. fmls v0.4s, v17.4s, alphaV0_I
  880. fmul v1.4s, v16.4s, alphaV0_I
  881. fmla v1.4s, v17.4s, alphaV0_R
  882. st2 {v0.4s, v1.4s}, [pCRow1]
  883. add pCRow1, pCRow1, LDC
  // store 2: accumulators v20 / v21
  884. fmul v4.4s, v20.4s, alphaV0_R
  885. fmls v4.4s, v21.4s, alphaV0_I
  886. fmul v5.4s, v20.4s, alphaV0_I
  887. fmla v5.4s, v21.4s, alphaV0_R
  888. st2 {v4.4s, v5.4s}, [pCRow1]
  889. add pCRow0, pCRow0, #32 // step past the 32 bytes written per store
  890. .endm
  891. /******************************************************************************/
  // INIT2x2 - clear the four accumulators (v16, v17, v20, v21) of the 2x2
  // tile. All are zeroed from wzr, which clears the full vector registers.
  .macro INIT2x2
        fmov    s17, wzr
        fmov    s16, wzr
        fmov    s21, wzr
        fmov    s20, wzr
  .endm
  // KERNEL2x2_SUB - one k-step of the 2x2 tile.
  // Loads 16 bytes from pB into the low halves of v8/v9 and 16 bytes from pA
  // into the low halves of v0/v1 (ld2 de-interleaves even/odd floats,
  // presumably real/imaginary parts). Advances both pointers by 16, then
  // applies four OP_* updates per B lane on 2-lane vectors.
  // Accumulates into v16/v17 (lane 0) and v20/v21 (lane 1).
  898. .macro KERNEL2x2_SUB
  899. ld2 {v8.2s, v9.2s}, [pB]
  900. add pB, pB, #16
  901. ld2 {v0.2s, v1.2s}, [pA]
  902. add pA, pA, #16
  // B lane 0 -> v16 / v17
  903. OP_rr v16.2s, v0.2s, v8.s[0]
  904. OP_ii v16.2s, v1.2s, v9.s[0]
  905. OP_ri v17.2s, v0.2s, v9.s[0]
  906. OP_ir v17.2s, v1.2s, v8.s[0]
  // B lane 1 -> v20 / v21
  907. OP_rr v20.2s, v0.2s, v8.s[1]
  908. OP_ii v20.2s, v1.2s, v9.s[1]
  909. OP_ri v21.2s, v0.2s, v9.s[1]
  910. OP_ir v21.2s, v1.2s, v8.s[1]
  911. .endm
  // SAVE2x2 - scale the 2x2 accumulator tile by alpha and store it to C.
  // Per accumulator pair (a, b): first = a*alphaV0_R - b*alphaV0_I,
  // second = a*alphaV0_I + b*alphaV0_R; st2 writes them interleaved
  // (16 bytes per store). C is not read, so the destination is overwritten.
  // Two stores LDC apart starting at pCRow0; pCRow0 then advances 16 bytes.
  // Clobbers v0, v1, v4, v5, pCRow1 and the alpha0_R / alpha0_I registers.
  912. .macro SAVE2x2
  913. fmov alpha0_R, alphaR // reload alpha before use
  914. fmov alpha0_I, alphaI
  915. mov pCRow1, pCRow0
  // store 1: accumulators v16 / v17
  916. fmul v0.2s, v16.2s, alphaV0_R
  917. fmls v0.2s, v17.2s, alphaV0_I
  918. fmul v1.2s, v16.2s, alphaV0_I
  919. fmla v1.2s, v17.2s, alphaV0_R
  920. st2 {v0.2s, v1.2s}, [pCRow1]
  921. add pCRow1, pCRow1, LDC
  // store 2: accumulators v20 / v21
  922. fmul v4.2s, v20.2s, alphaV0_R
  923. fmls v4.2s, v21.2s, alphaV0_I
  924. fmul v5.2s, v20.2s, alphaV0_I
  925. fmla v5.2s, v21.2s, alphaV0_R
  926. st2 {v4.2s, v5.2s}, [pCRow1]
  927. add pCRow0, pCRow0, #16 // step past the 16 bytes written per store
  928. .endm
  929. /******************************************************************************/
  // INIT1x2 - clear the four accumulators (v16, v17, v20, v21) of the 1x2
  // tile. s16 is zeroed from wzr and the rest are copies of that zero; every
  // scalar write also clears the upper lanes of its vector register.
  .macro INIT1x2
        fmov    s16, wzr
        fmov    s17, s16
        fmov    s20, s16
        fmov    s21, s16
  .endm
  // KERNEL1x2_SUB - one k-step of the 1x2 tile.
  // Loads 16 bytes from pB into the low halves of v8/v9 (ld2 de-interleaves
  // even/odd floats) and one float pair from pA into lane 0 of v0/v1.
  // Advances pB by 16 and pA by 8, then applies four scalar OP_* updates per
  // B lane. Accumulates into s16/s17 (lane 0) and s20/s21 (lane 1).
  936. .macro KERNEL1x2_SUB
  937. ld2 {v8.2s, v9.2s}, [pB]
  938. add pB, pB, #16
  939. ld2 {v0.s, v1.s}[0], [pA]
  940. add pA, pA, #8
  // B lane 0 -> s16 / s17
  941. OP_rr s16, s0, v8.s[0]
  942. OP_ii s16, s1, v9.s[0]
  943. OP_ri s17, s0, v9.s[0]
  944. OP_ir s17, s1, v8.s[0]
  // B lane 1 -> s20 / s21
  945. OP_rr s20, s0, v8.s[1]
  946. OP_ii s20, s1, v9.s[1]
  947. OP_ri s21, s0, v9.s[1]
  948. OP_ir s21, s1, v8.s[1]
  949. .endm
  // SAVE1x2 - scale the 1x2 accumulator tile by alpha and store it to C.
  // Per accumulator pair (a, b): first = a*alphaV0_R - b*alphaV0_I,
  // second = a*alphaV0_I + b*alphaV0_R; st2 writes lane 0 of both result
  // registers as one 8-byte pair. C is not read, so it is overwritten.
  // Two stores LDC apart starting at pCRow0; pCRow0 then advances 8 bytes.
  // Clobbers s0, s1, s4, s5, pCRow1 and the alpha0_R / alpha0_I registers.
  950. .macro SAVE1x2
  951. fmov alpha0_R, alphaR // reload alpha before use
  952. fmov alpha0_I, alphaI
  953. mov pCRow1, pCRow0
  // store 1: accumulators s16 / s17
  954. fmul s0, s16, alphaV0_R
  955. fmls s0, s17, alphaV0_I
  956. fmul s1, s16, alphaV0_I
  957. fmla s1, s17, alphaV0_R
  958. st2 {v0.s, v1.s}[0], [pCRow1]
  959. add pCRow1, pCRow1, LDC
  // store 2: accumulators s20 / s21
  960. fmul s4, s20, alphaV0_R
  961. fmls s4, s21, alphaV0_I
  962. fmul s5, s20, alphaV0_I
  963. fmla s5, s21, alphaV0_R
  964. st2 {v4.s, v5.s}[0], [pCRow1]
  965. add pCRow0, pCRow0, #8 // step past the 8 bytes written per store
  966. .endm
  967. /******************************************************************************/
  // INIT8x1 - clear accumulators v16..v19 for the 8x1 tile.
  // Each one is zeroed from wzr, which clears the full vector register.
  .macro INIT8x1
        fmov    s19, wzr
        fmov    s18, wzr
        fmov    s17, wzr
        fmov    s16, wzr
  .endm
  // KERNEL8x1_SUB - one k-step of the 8x1 tile.
  // B is loaded with ld1 (no de-interleave): the two consecutive floats at pB
  // land in v8.s[0] and v8.s[1], which take the roles that v8.s[0] / v9.s[0]
  // play in the ld2-based kernels. A is loaded as two ld2 groups, v0/v1 and
  // v2/v3, 32 bytes each. Advances pB by 8 and pA by 64.
  // Accumulates into v16/v17 (first A half) and v18/v19 (second A half).
  974. .macro KERNEL8x1_SUB
  975. ld1 {v8.2s}, [pB]
  976. add pB, pB, #8
  977. ld2 {v0.4s, v1.4s}, [pA]
  978. add pA, pA, #32
  979. ld2 {v2.4s, v3.4s}, [pA]
  980. add pA, pA, #32
  // first A half -> v16 / v17
  981. OP_rr v16.4s, v0.4s, v8.s[0]
  982. OP_ii v16.4s, v1.4s, v8.s[1]
  983. OP_ri v17.4s, v0.4s, v8.s[1]
  984. OP_ir v17.4s, v1.4s, v8.s[0]
  // second A half -> v18 / v19
  985. OP_rr v18.4s, v2.4s, v8.s[0]
  986. OP_ii v18.4s, v3.4s, v8.s[1]
  987. OP_ri v19.4s, v2.4s, v8.s[1]
  988. OP_ir v19.4s, v3.4s, v8.s[0]
  989. .endm
  // SAVE8x1 - scale the 8x1 accumulator tile by alpha and store it to C.
  // Per accumulator pair (a, b): first = a*alphaV0_R - b*alphaV0_I,
  // second = a*alphaV0_I + b*alphaV0_R; st2 writes them interleaved.
  // One 64-byte line is written as two consecutive 32-byte stores starting at
  // pCRow0; LDC is not used. C is not read, so the destination is overwritten.
  // pCRow0 then advances 64 bytes. Clobbers v0-v3, pCRow1 and the
  // alpha0_R / alpha0_I registers.
  990. .macro SAVE8x1
  991. fmov alpha0_R, alphaR // reload alpha before use
  992. fmov alpha0_I, alphaI
  993. mov pCRow1, pCRow0
  // first half: accumulators v16 / v17
  994. fmul v0.4s, v16.4s, alphaV0_R
  995. fmls v0.4s, v17.4s, alphaV0_I
  996. fmul v1.4s, v16.4s, alphaV0_I
  997. fmla v1.4s, v17.4s, alphaV0_R
  998. st2 {v0.4s, v1.4s}, [pCRow1]
  999. add pCRow1, pCRow1, #32 // second half follows contiguously
  // second half: accumulators v18 / v19
  1000. fmul v2.4s, v18.4s, alphaV0_R
  1001. fmls v2.4s, v19.4s, alphaV0_I
  1002. fmul v3.4s, v18.4s, alphaV0_I
  1003. fmla v3.4s, v19.4s, alphaV0_R
  1004. st2 {v2.4s, v3.4s}, [pCRow1]
  1005. add pCRow0, pCRow0, #64 // step past the 64 bytes just written
  1006. .endm
  1007. /******************************************************************************/
  // INIT4x1 - clear the two accumulators (v16, v17) of the 4x1 tile.
  // Both are zeroed from wzr, which clears the full vector registers.
  .macro INIT4x1
        fmov    s17, wzr
        fmov    s16, wzr
  .endm
  // KERNEL4x1_SUB - one k-step of the 4x1 tile.
  // Loads one float pair from pB into lane 0 of v8/v9 (other lanes untouched)
  // and 32 bytes from pA into v0/v1 (ld2 de-interleaves even/odd floats,
  // presumably real/imaginary parts). Advances pB by 8 and pA by 32, then
  // applies the four OP_* updates. Accumulates into v16/v17.
  1012. .macro KERNEL4x1_SUB
  1013. ld2 {v8.s, v9.s}[0], [pB]
  1014. add pB, pB, #8
  1015. ld2 {v0.4s, v1.4s}, [pA]
  1016. add pA, pA, #32
  1017. OP_rr v16.4s, v0.4s, v8.s[0]
  1018. OP_ii v16.4s, v1.4s, v9.s[0]
  1019. OP_ri v17.4s, v0.4s, v9.s[0]
  1020. OP_ir v17.4s, v1.4s, v8.s[0]
  1021. .endm
  // SAVE4x1 - scale the 4x1 accumulator tile by alpha and store it to C.
  // Computes first = v16*alphaV0_R - v17*alphaV0_I and
  // second = v16*alphaV0_I + v17*alphaV0_R, then st2 writes them interleaved
  // as one 32-byte store at pCRow0. C is not read, so it is overwritten.
  // pCRow0 then advances 32 bytes. Clobbers v0, v1, pCRow1 and the
  // alpha0_R / alpha0_I registers.
  1022. .macro SAVE4x1
  1023. fmov alpha0_R, alphaR // reload alpha before use
  1024. fmov alpha0_I, alphaI
  1025. mov pCRow1, pCRow0
  1026. fmul v0.4s, v16.4s, alphaV0_R
  1027. fmls v0.4s, v17.4s, alphaV0_I
  1028. fmul v1.4s, v16.4s, alphaV0_I
  1029. fmla v1.4s, v17.4s, alphaV0_R
  1030. st2 {v0.4s, v1.4s}, [pCRow1]
  1031. add pCRow0, pCRow0, #32 // step past the 32 bytes just written
  1032. .endm
  1033. /******************************************************************************/
  // INIT2x1 - clear the two accumulators (v16, v17) of the 2x1 tile.
  // s16 is zeroed from wzr and s17 is a copy of that zero; both scalar writes
  // also clear the upper lanes of the vector registers.
  .macro INIT2x1
        fmov    s16, wzr
        fmov    s17, s16
  .endm
  // KERNEL2x1_SUB - one k-step of the 2x1 tile.
  // Loads one float pair from pB into lane 0 of v8/v9 (other lanes untouched)
  // and 16 bytes from pA into the low halves of v0/v1 (ld2 de-interleaves
  // even/odd floats). Advances pB by 8 and pA by 16, then applies the four
  // OP_* updates on 2-lane vectors. Accumulates into v16/v17.
  1038. .macro KERNEL2x1_SUB
  1039. ld2 {v8.s, v9.s}[0], [pB]
  1040. add pB, pB, #8
  1041. ld2 {v0.2s, v1.2s}, [pA]
  1042. add pA, pA, #16
  1043. OP_rr v16.2s, v0.2s, v8.s[0]
  1044. OP_ii v16.2s, v1.2s, v9.s[0]
  1045. OP_ri v17.2s, v0.2s, v9.s[0]
  1046. OP_ir v17.2s, v1.2s, v8.s[0]
  1047. .endm
  // SAVE2x1 - scale the 2x1 accumulator tile by alpha and store it to C.
  // Computes first = v16*alphaV0_R - v17*alphaV0_I and
  // second = v16*alphaV0_I + v17*alphaV0_R on 2-lane vectors, then st2 writes
  // them interleaved as one 16-byte store at pCRow0. C is not read, so it is
  // overwritten. pCRow0 then advances 16 bytes. Clobbers v0, v1, pCRow1 and
  // the alpha0_R / alpha0_I registers.
  1048. .macro SAVE2x1
  1049. fmov alpha0_R, alphaR // reload alpha before use
  1050. fmov alpha0_I, alphaI
  1051. mov pCRow1, pCRow0
  1052. fmul v0.2s, v16.2s, alphaV0_R
  1053. fmls v0.2s, v17.2s, alphaV0_I
  1054. fmul v1.2s, v16.2s, alphaV0_I
  1055. fmla v1.2s, v17.2s, alphaV0_R
  1056. st2 {v0.2s, v1.2s}, [pCRow1]
  1057. add pCRow0, pCRow0, #16 // step past the 16 bytes just written
  1058. .endm
  1059. /******************************************************************************/
  // INIT1x1 - clear the two accumulators (v16, v17) of the 1x1 tile.
  // s17 is zeroed from wzr and s16 is a copy of that zero; both scalar writes
  // also clear the upper lanes of the vector registers.
  .macro INIT1x1
        fmov    s17, wzr
        fmov    s16, s17
  .endm
  // KERNEL1x1_SUB - one k-step of the 1x1 tile.
  // Loads one float pair from pB into lane 0 of v8/v9 and one float pair from
  // pA into lane 0 of v0/v1 (other lanes untouched). Advances both pointers by
  // 8, then applies the four scalar OP_* updates. Accumulates into s16/s17.
  1064. .macro KERNEL1x1_SUB
  1065. ld2 {v8.s, v9.s}[0], [pB]
  1066. add pB, pB, #8
  1067. ld2 {v0.s, v1.s}[0], [pA]
  1068. add pA, pA, #8
  1069. OP_rr s16, s0, v8.s[0]
  1070. OP_ii s16, s1, v9.s[0]
  1071. OP_ri s17, s0, v9.s[0]
  1072. OP_ir s17, s1, v8.s[0]
  1073. .endm
  // SAVE1x1 - scale the 1x1 accumulator by alpha and store it to C.
  // Computes first = s16*alphaV0_R - s17*alphaV0_I and
  // second = s16*alphaV0_I + s17*alphaV0_R, then st2 writes lane 0 of both
  // result registers as one 8-byte pair at pCRow0. C is not read, so it is
  // overwritten. pCRow0 then advances 8 bytes. Clobbers s0, s1, pCRow1 and
  // the alpha0_R / alpha0_I registers.
  1074. .macro SAVE1x1
  1075. fmov alpha0_R, alphaR // reload alpha before use
  1076. fmov alpha0_I, alphaI
  1077. mov pCRow1, pCRow0
  1078. fmul s0, s16, alphaV0_R
  1079. fmls s0, s17, alphaV0_I
  1080. fmul s1, s16, alphaV0_I
  1081. fmla s1, s17, alphaV0_R
  1082. st2 {v0.s, v1.s}[0], [pCRow1]
  1083. add pCRow0, pCRow0, #8 // step past the 8 bytes just written
  1084. .endm
  1085. /*******************************************************************************
  1086. * End of macro definitions
  1087. *******************************************************************************/
  1088. PROLOGUE
  1089. .align 5
  1090. add sp, sp, #-(11 * 16)
  1091. stp d8, d9, [sp, #(0 * 16)]
  1092. stp d10, d11, [sp, #(1 * 16)]
  1093. stp d12, d13, [sp, #(2 * 16)]
  1094. stp d14, d15, [sp, #(3 * 16)]
  1095. stp d16, d17, [sp, #(4 * 16)]
  1096. stp x18, x19, [sp, #(5 * 16)]
  1097. stp x20, x21, [sp, #(6 * 16)]
  1098. stp x22, x23, [sp, #(7 * 16)]
  1099. stp x24, x25, [sp, #(8 * 16)]
  1100. stp x26, x27, [sp, #(9 * 16)]
  1101. str x28, [sp, #(10 * 16)]
  1102. prfm PLDL1KEEP, [origPB]
  1103. prfm PLDL1KEEP, [origPA]
  1104. fmov alphaR, s0
  1105. fmov alphaI, s1
  1106. lsl LDC, LDC, #3 // ldc = ldc * 8
  1107. #if !defined(LEFT)
  1108. neg tempOffset, offset
  1109. #endif
  1110. mov pB, origPB
  1111. mov counterJ, origN
  1112. asr counterJ, counterJ, #2 // J = J / 4
  1113. cmp counterJ, #0
  1114. ble .Lctrmm_kernel_L2_BEGIN
  1115. /******************************************************************************/
  1116. .Lctrmm_kernel_L4_BEGIN:
  1117. mov pCRow0, pC
  1118. add pCRow1, pCRow0, LDC
  1119. add pCRow2, pCRow1, LDC
  1120. add pCRow3, pCRow2, LDC
  1121. add pC, pCRow3, LDC
  1122. #if defined(LEFT)
  1123. mov tempOffset, offset
  1124. #endif
  1125. mov pA, origPA // pA = start of A array
  1126. .Lctrmm_kernel_L4_M8_BEGIN:
  1127. mov counterI, origM
  1128. asr counterI, counterI, #3 // counterI = counterI / 8
  1129. cmp counterI, #0
  1130. ble .Lctrmm_kernel_L4_M4_BEGIN
  1131. .Lctrmm_kernel_L4_M8_20:
  1132. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1133. mov pB, origPB
  1134. #else
  1135. mov pB, origPB
  1136. lsl temp, tempOffset, #6
  1137. add pA, pA, temp
  1138. lsl temp, tempOffset, #5
  1139. add pB, pB, temp
  1140. #endif
  1141. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1142. sub tempK, origK, tempOffset
  1143. #elif defined(LEFT)
  1144. add tempK, tempOffset, #8
  1145. #else
  1146. add tempK, tempOffset, #4
  1147. #endif
  1148. asr counterL , tempK, #3
  1149. cmp counterL , #2
  1150. blt .Lctrmm_kernel_L4_M8_32
  1151. KERNEL8x4_I
  1152. KERNEL8x4_M2
  1153. KERNEL8x4_M1
  1154. KERNEL8x4_M2
  1155. KERNEL8x4_M1
  1156. KERNEL8x4_M2
  1157. KERNEL8x4_M1
  1158. KERNEL8x4_M2
  1159. subs counterL, counterL, #2 // subtract 2
  1160. ble .Lctrmm_kernel_L4_M8_22a
  1161. .align 5
  1162. .Lctrmm_kernel_L4_M8_22:
  1163. KERNEL8x4_M1
  1164. KERNEL8x4_M2
  1165. KERNEL8x4_M1
  1166. KERNEL8x4_M2
  1167. KERNEL8x4_M1
  1168. KERNEL8x4_M2
  1169. KERNEL8x4_M1
  1170. KERNEL8x4_M2
  1171. subs counterL, counterL, #1
  1172. bgt .Lctrmm_kernel_L4_M8_22
  1173. .align 5
  1174. .Lctrmm_kernel_L4_M8_22a:
  1175. KERNEL8x4_M1
  1176. KERNEL8x4_M2
  1177. KERNEL8x4_M1
  1178. KERNEL8x4_M2
  1179. KERNEL8x4_M1
  1180. KERNEL8x4_M2
  1181. KERNEL8x4_M1
  1182. KERNEL8x4_E
  1183. b .Lctrmm_kernel_L4_M8_44
  1184. .align 5
  1185. .Lctrmm_kernel_L4_M8_32:
  1186. tst counterL, #1
  1187. ble .Lctrmm_kernel_L4_M8_40
  1188. KERNEL8x4_I
  1189. KERNEL8x4_M2
  1190. KERNEL8x4_M1
  1191. KERNEL8x4_M2
  1192. KERNEL8x4_M1
  1193. KERNEL8x4_M2
  1194. KERNEL8x4_M1
  1195. KERNEL8x4_E
  1196. b .Lctrmm_kernel_L4_M8_44
  1197. .Lctrmm_kernel_L4_M8_40:
  1198. INIT8x4
  1199. .Lctrmm_kernel_L4_M8_44:
  1200. ands counterL , tempK, #7
  1201. ble .Lctrmm_kernel_L4_M8_100
  1202. .align 5
  1203. .Lctrmm_kernel_L4_M8_46:
  1204. KERNEL8x4_SUB
  1205. subs counterL, counterL, #1
  1206. bne .Lctrmm_kernel_L4_M8_46
  1207. .Lctrmm_kernel_L4_M8_100:
  1208. SAVE8x4
  1209. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1210. sub tempK, origK, tempOffset
  1211. #if defined(LEFT)
  1212. sub tempK, tempK, #8
  1213. #else
  1214. sub tempK, tempK, #4
  1215. #endif
  1216. lsl temp, tempK, #6
  1217. add pA, pA, temp
  1218. lsl temp, tempK, #5
  1219. add pB, pB, temp
  1220. #endif
  1221. #if defined(LEFT)
  1222. add tempOffset, tempOffset, #8
  1223. #endif
  1224. prfm PLDL1KEEP, [pA]
  1225. prfm PLDL1KEEP, [pA, #64]
  1226. prfm PLDL1KEEP, [origPB]
  1227. .Lctrmm_kernel_L4_M8_END:
  1228. subs counterI, counterI, #1
  1229. bne .Lctrmm_kernel_L4_M8_20
  1230. .Lctrmm_kernel_L4_M4_BEGIN:
  1231. mov counterI, origM
  1232. tst counterI , #7
  1233. ble .Lctrmm_kernel_L4_END
  1234. tst counterI, #4
  1235. ble .Lctrmm_kernel_L4_M2_BEGIN
  1236. .Lctrmm_kernel_L4_M4_20:
  1237. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1238. mov pB, origPB
  1239. #else
  1240. mov pB, origPB
  1241. lsl temp, tempOffset, #5
  1242. add pB, pB, temp
  1243. add pA, pA, temp
  1244. #endif
  1245. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1246. sub tempK, origK, tempOffset
  1247. #elif defined(LEFT)
  1248. add tempK, tempOffset, #4
  1249. #else
  1250. add tempK, tempOffset, #4
  1251. #endif
  1252. asr counterL , tempK, #1 // L = K / 2
  1253. cmp counterL , #2 // is there at least 4 to do?
  1254. blt .Lctrmm_kernel_L4_M4_32
  1255. KERNEL4x4_I // do one in the K
  1256. KERNEL4x4_M2 // do another in the K
  1257. subs counterL, counterL, #2
  1258. ble .Lctrmm_kernel_L4_M4_22a
  1259. .align 5
  1260. .Lctrmm_kernel_L4_M4_22:
  1261. KERNEL4x4_M1
  1262. KERNEL4x4_M2
  1263. subs counterL, counterL, #1
  1264. bgt .Lctrmm_kernel_L4_M4_22
  1265. .Lctrmm_kernel_L4_M4_22a:
  1266. KERNEL4x4_M1
  1267. KERNEL4x4_E
  1268. b .Lctrmm_kernel_L4_M4_44
  1269. .Lctrmm_kernel_L4_M4_32:
  1270. tst counterL, #1
  1271. ble .Lctrmm_kernel_L4_M4_40
  1272. KERNEL4x4_I
  1273. KERNEL4x4_E
  1274. b .Lctrmm_kernel_L4_M4_44
  1275. .Lctrmm_kernel_L4_M4_40:
  1276. INIT4x4
  1277. .Lctrmm_kernel_L4_M4_44:
  1278. ands counterL , tempK, #1
  1279. ble .Lctrmm_kernel_L4_M4_100
  1280. .Lctrmm_kernel_L4_M4_46:
  1281. KERNEL4x4_SUB
  1282. .Lctrmm_kernel_L4_M4_100:
  1283. SAVE4x4
  1284. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1285. sub tempK, origK, tempOffset
  1286. #if defined(LEFT)
  1287. sub tempK, tempK, #4
  1288. #else
  1289. sub tempK, tempK, #4
  1290. #endif
  1291. lsl temp, tempK, #5
  1292. add pA, pA, temp
  1293. add pB, pB, temp
  1294. #endif
  1295. #if defined(LEFT)
  1296. add tempOffset, tempOffset, #4
  1297. #endif
  1298. .Lctrmm_kernel_L4_M4_END:
  1299. .Lctrmm_kernel_L4_M2_BEGIN:
  1300. mov counterI, origM
  1301. tst counterI , #3
  1302. ble .Lctrmm_kernel_L4_END
  1303. tst counterI, #2 // counterI = counterI / 2
  1304. ble .Lctrmm_kernel_L4_M1_BEGIN
  1305. .Lctrmm_kernel_L4_M2_20:
  1306. INIT2x4
  1307. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1308. mov pB, origPB
  1309. #else
  1310. mov pB, origPB
  1311. lsl temp, tempOffset, #4
  1312. add pA, pA, temp
  1313. lsl temp, tempOffset, #5
  1314. add pB, pB, temp
  1315. #endif
  1316. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1317. sub tempK, origK, tempOffset
  1318. #elif defined(LEFT)
  1319. add tempK, tempOffset, #2
  1320. #else
  1321. add tempK, tempOffset, #4
  1322. #endif
  1323. asr counterL , tempK, #3 // counterL = counterL / 8
  1324. cmp counterL , #0
  1325. ble .Lctrmm_kernel_L4_M2_40
  1326. .Lctrmm_kernel_L4_M2_22:
  1327. KERNEL2x4_SUB
  1328. KERNEL2x4_SUB
  1329. KERNEL2x4_SUB
  1330. KERNEL2x4_SUB
  1331. KERNEL2x4_SUB
  1332. KERNEL2x4_SUB
  1333. KERNEL2x4_SUB
  1334. KERNEL2x4_SUB
  1335. subs counterL, counterL, #1
  1336. bgt .Lctrmm_kernel_L4_M2_22
  1337. .Lctrmm_kernel_L4_M2_40:
  1338. ands counterL , tempK, #7 // counterL = counterL % 8
  1339. ble .Lctrmm_kernel_L4_M2_100
  1340. .Lctrmm_kernel_L4_M2_42:
  1341. KERNEL2x4_SUB
  1342. subs counterL, counterL, #1
  1343. bgt .Lctrmm_kernel_L4_M2_42
  1344. .Lctrmm_kernel_L4_M2_100:
  1345. SAVE2x4
  1346. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1347. sub tempK, origK, tempOffset
  1348. #if defined(LEFT)
  1349. sub tempK, tempK, #2
  1350. #else
  1351. sub tempK, tempK, #4
  1352. #endif
  1353. lsl temp, tempK, #4
  1354. add pA, pA, temp
  1355. lsl temp, tempK, #5
  1356. add pB, pB, temp
  1357. #endif
  1358. #if defined(LEFT)
  1359. add tempOffset, tempOffset, #2
  1360. #endif
  1361. .Lctrmm_kernel_L4_M2_END:
  1362. .Lctrmm_kernel_L4_M1_BEGIN:
  1363. tst counterI, #1 // counterI = counterI % 2
  1364. ble .Lctrmm_kernel_L4_END
  1365. .Lctrmm_kernel_L4_M1_20:
  1366. INIT1x4
  1367. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1368. mov pB, origPB
  1369. #else
  1370. mov pB, origPB
  1371. lsl temp, tempOffset, #5
  1372. add pB, pB, temp
  1373. lsl temp, tempOffset, #3
  1374. add pA, pA, temp
  1375. #endif
  1376. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1377. sub tempK, origK, tempOffset
  1378. #elif defined(LEFT)
  1379. add tempK, tempOffset, #1
  1380. #else
  1381. add tempK, tempOffset, #4
  1382. #endif
  1383. asr counterL , tempK, #3 // counterL = counterL / 8
  1384. cmp counterL , #0
  1385. ble .Lctrmm_kernel_L4_M1_40
  1386. .Lctrmm_kernel_L4_M1_22:
  1387. KERNEL1x4_SUB
  1388. KERNEL1x4_SUB
  1389. KERNEL1x4_SUB
  1390. KERNEL1x4_SUB
  1391. KERNEL1x4_SUB
  1392. KERNEL1x4_SUB
  1393. KERNEL1x4_SUB
  1394. KERNEL1x4_SUB
  1395. subs counterL, counterL, #1
  1396. bgt .Lctrmm_kernel_L4_M1_22
  1397. .Lctrmm_kernel_L4_M1_40:
  1398. ands counterL , tempK, #7 // counterL = counterL % 8
  1399. ble .Lctrmm_kernel_L4_M1_100
  1400. .Lctrmm_kernel_L4_M1_42:
  1401. KERNEL1x4_SUB
  1402. subs counterL, counterL, #1
  1403. bgt .Lctrmm_kernel_L4_M1_42
  1404. .Lctrmm_kernel_L4_M1_100:
  1405. SAVE1x4
  1406. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1407. sub tempK, origK, tempOffset
  1408. #if defined(LEFT)
  1409. sub tempK, tempK, #1
  1410. #else
  1411. sub tempK, tempK, #4
  1412. #endif
  1413. lsl temp, tempK, #3
  1414. add pA, pA, temp
  1415. lsl temp, tempK, #5
  1416. add pB, pB, temp
  1417. #endif
  1418. #if defined(LEFT)
  1419. add tempOffset, tempOffset, #1
  1420. #endif
  1421. .Lctrmm_kernel_L4_END:
  1422. lsl temp, origK, #5
  1423. add origPB, origPB, temp // B = B + K * 4 * 8
  1424. #if !defined(LEFT)
  1425. add tempOffset, tempOffset, #4
  1426. #endif
  1427. subs counterJ, counterJ , #1 // j--
  1428. bgt .Lctrmm_kernel_L4_BEGIN
  1429. /******************************************************************************/
  1430. .Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
  1431. mov counterJ , origN
  1432. tst counterJ , #3
  1433. ble .Lctrmm_kernel_L999 // error, N was less than 4?
  1434. tst counterJ , #2
  1435. ble .Lctrmm_kernel_L1_BEGIN
  1436. mov pCRow0, pC // pCRow0 = pC
  1437. add pC,pC,LDC, lsl #1
  1438. #if defined(LEFT)
  1439. mov tempOffset, offset
  1440. #endif
  1441. mov pA, origPA // pA = A
  1442. .Lctrmm_kernel_L2_M8_BEGIN:
  1443. mov counterI, origM
  1444. asr counterI, counterI, #3 // counterI = counterI / 8
  1445. cmp counterI, #0
  1446. ble .Lctrmm_kernel_L2_M4_BEGIN
  1447. .Lctrmm_kernel_L2_M8_20:
  1448. INIT8x2
  1449. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1450. mov pB, origPB
  1451. #else
  1452. mov pB, origPB
  1453. lsl temp, tempOffset, #6
  1454. add pA, pA, temp
  1455. lsl temp, tempOffset, #4
  1456. add pB, pB, temp
  1457. #endif
  1458. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1459. sub tempK, origK, tempOffset
  1460. #elif defined(LEFT)
  1461. add tempK, tempOffset, #8
  1462. #else
  1463. add tempK, tempOffset, #2
  1464. #endif
  1465. asr counterL , tempK, #3 // counterL = counterL / 8
  1466. cmp counterL,#0
  1467. ble .Lctrmm_kernel_L2_M8_40
  1468. .align 5
  1469. .Lctrmm_kernel_L2_M8_22:
  1470. KERNEL8x2_SUB
  1471. KERNEL8x2_SUB
  1472. KERNEL8x2_SUB
  1473. KERNEL8x2_SUB
  1474. KERNEL8x2_SUB
  1475. KERNEL8x2_SUB
  1476. KERNEL8x2_SUB
  1477. KERNEL8x2_SUB
  1478. subs counterL, counterL, #1
  1479. bgt .Lctrmm_kernel_L2_M8_22
  1480. .Lctrmm_kernel_L2_M8_40:
  1481. ands counterL , tempK, #7 // counterL = counterL % 8
  1482. ble .Lctrmm_kernel_L2_M8_100
  1483. .Lctrmm_kernel_L2_M8_42:
  1484. KERNEL8x2_SUB
  1485. subs counterL, counterL, #1
  1486. bgt .Lctrmm_kernel_L2_M8_42
  1487. .Lctrmm_kernel_L2_M8_100:
  1488. SAVE8x2
  1489. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1490. sub tempK, origK, tempOffset
  1491. #if defined(LEFT)
  1492. sub tempK, tempK, #8
  1493. #else
  1494. sub tempK, tempK, #2
  1495. #endif
  1496. lsl temp, tempK, #6
  1497. add pA, pA, temp
  1498. lsl temp, tempK, #4
  1499. add pB, pB, temp
  1500. #endif
  1501. #if defined(LEFT)
  1502. add tempOffset, tempOffset, #8
  1503. #endif
  1504. .Lctrmm_kernel_L2_M8_END:
  1505. subs counterI, counterI, #1
  1506. bgt .Lctrmm_kernel_L2_M8_20
  1507. .Lctrmm_kernel_L2_M4_BEGIN:
  1508. mov counterI, origM
  1509. tst counterI , #7
  1510. ble .Lctrmm_kernel_L2_END
  1511. tst counterI, #4 // counterI = counterI / 2
  1512. ble .Lctrmm_kernel_L2_M2_BEGIN
  1513. .Lctrmm_kernel_L2_M4_20:
  1514. INIT4x2
  1515. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1516. mov pB, origPB
  1517. #else
  1518. mov pB, origPB
  1519. lsl temp, tempOffset, #4
  1520. add pB, pB, temp
  1521. lsl temp, tempOffset, #5
  1522. add pA, pA, temp
  1523. #endif
  1524. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1525. sub tempK, origK, tempOffset
  1526. #elif defined(LEFT)
  1527. add tempK, tempOffset, #4
  1528. #else
  1529. add tempK, tempOffset, #2
  1530. #endif
  1531. asr counterL , tempK, #3 // counterL = counterL / 8
  1532. cmp counterL,#0
  1533. ble .Lctrmm_kernel_L2_M4_40
  1534. .align 5
  1535. .Lctrmm_kernel_L2_M4_22:
  1536. KERNEL4x2_SUB
  1537. KERNEL4x2_SUB
  1538. KERNEL4x2_SUB
  1539. KERNEL4x2_SUB
  1540. KERNEL4x2_SUB
  1541. KERNEL4x2_SUB
  1542. KERNEL4x2_SUB
  1543. KERNEL4x2_SUB
  1544. subs counterL, counterL, #1
  1545. bgt .Lctrmm_kernel_L2_M4_22
  1546. .Lctrmm_kernel_L2_M4_40:
  1547. ands counterL , tempK, #7 // counterL = counterL % 8
  1548. ble .Lctrmm_kernel_L2_M4_100
  1549. .Lctrmm_kernel_L2_M4_42:
  1550. KERNEL4x2_SUB
  1551. subs counterL, counterL, #1
  1552. bgt .Lctrmm_kernel_L2_M4_42
  1553. .Lctrmm_kernel_L2_M4_100:
  1554. SAVE4x2
  1555. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1556. sub tempK, origK, tempOffset
  1557. #if defined(LEFT)
  1558. sub tempK, tempK, #4
  1559. #else
  1560. sub tempK, tempK, #2
  1561. #endif
  1562. lsl temp, tempK, #5
  1563. add pA, pA, temp
  1564. lsl temp, tempK, #4
  1565. add pB, pB, temp
  1566. #endif
  1567. #if defined(LEFT)
  1568. add tempOffset, tempOffset, #4
  1569. #endif
  1570. .Lctrmm_kernel_L2_M4_END:
  1571. .Lctrmm_kernel_L2_M2_BEGIN:
  1572. mov counterI, origM
  1573. tst counterI , #3
  1574. ble .Lctrmm_kernel_L2_END
  1575. tst counterI, #2 // counterI = counterI / 2
  1576. ble .Lctrmm_kernel_L2_M1_BEGIN
  1577. .Lctrmm_kernel_L2_M2_20:
  1578. INIT2x2
  1579. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1580. mov pB, origPB
  1581. #else
  1582. mov pB, origPB
  1583. lsl temp, tempOffset, #4
  1584. add pB, pB, temp
  1585. lsl temp, tempOffset, #4
  1586. add pA, pA, temp
  1587. #endif
  1588. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1589. sub tempK, origK, tempOffset
  1590. #elif defined(LEFT)
  1591. add tempK, tempOffset, #2
  1592. #else
  1593. add tempK, tempOffset, #2
  1594. #endif
  1595. asr counterL , tempK, #3 // counterL = counterL / 8
  1596. cmp counterL,#0
  1597. ble .Lctrmm_kernel_L2_M2_40
  1598. .Lctrmm_kernel_L2_M2_22:
  1599. KERNEL2x2_SUB
  1600. KERNEL2x2_SUB
  1601. KERNEL2x2_SUB
  1602. KERNEL2x2_SUB
  1603. KERNEL2x2_SUB
  1604. KERNEL2x2_SUB
  1605. KERNEL2x2_SUB
  1606. KERNEL2x2_SUB
  1607. subs counterL, counterL, #1
  1608. bgt .Lctrmm_kernel_L2_M2_22
  1609. .Lctrmm_kernel_L2_M2_40:
  1610. ands counterL , tempK, #7 // counterL = counterL % 8
  1611. ble .Lctrmm_kernel_L2_M2_100
  1612. .Lctrmm_kernel_L2_M2_42:
  1613. KERNEL2x2_SUB
  1614. subs counterL, counterL, #1
  1615. bgt .Lctrmm_kernel_L2_M2_42
  1616. .Lctrmm_kernel_L2_M2_100:
  1617. SAVE2x2
  1618. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1619. sub tempK, origK, tempOffset
  1620. #if defined(LEFT)
  1621. sub tempK, tempK, #2
  1622. #else
  1623. sub tempK, tempK, #2
  1624. #endif
  1625. lsl temp, tempK, #4
  1626. add pA, pA, temp
  1627. lsl temp, tempK, #4
  1628. add pB, pB, temp
  1629. #endif
  1630. #if defined(LEFT)
  1631. add tempOffset, tempOffset, #2
  1632. #endif
  1633. .Lctrmm_kernel_L2_M2_END:
  1634. .Lctrmm_kernel_L2_M1_BEGIN: // optional 1x2 tile of the L2 pass: runs only when bit 0 of counterI is set
  1635. tst counterI, #1 // set flags from counterI & 1; counterI itself is not modified
  1636. ble .Lctrmm_kernel_L2_END // bit 0 clear (Z set): no 1x2 tile, go to the end of the L2 pass
  1637. .Lctrmm_kernel_L2_M1_20:
  1638. INIT1x2 // macro defined outside this view; presumably clears the 1x2 accumulators - TODO confirm
  1639. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1640. mov pB, origPB // pB = start of the current B panel, pA left as is
  1641. #else
  1642. mov pB, origPB // pB = start of the current B panel ...
  1643. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1644. add pB, pB, temp // ... then pB += tempOffset * 16 bytes
  1645. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1646. add pA, pA, temp // pA += tempOffset * 8 bytes
  1647. #endif
  1648. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1649. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1650. #elif defined(LEFT)
  1651. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1652. #else
  1653. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1654. #endif
  1655. asr counterL , tempK, #3 // counterL = tempK / 8 (number of passes through the 8x-unrolled loop)
  1656. cmp counterL, #0
  1657. ble .Lctrmm_kernel_L2_M1_40 // counterL <= 0: skip the unrolled loop
  1658. .Lctrmm_kernel_L2_M1_22: // unrolled loop: 8 KERNEL1x2_SUB invocations per pass
  1659. KERNEL1x2_SUB
  1660. KERNEL1x2_SUB
  1661. KERNEL1x2_SUB
  1662. KERNEL1x2_SUB
  1663. KERNEL1x2_SUB
  1664. KERNEL1x2_SUB
  1665. KERNEL1x2_SUB
  1666. KERNEL1x2_SUB
  1667. subs counterL, counterL, #1 // one pass done; flags drive the loop branch
  1668. bgt .Lctrmm_kernel_L2_M1_22
  1669. .Lctrmm_kernel_L2_M1_40:
  1670. ands counterL , tempK, #7 // counterL = tempK & 7 (iterations left over after the unrolled loop)
  1671. ble .Lctrmm_kernel_L2_M1_100 // nothing left over: go store the tile
  1672. .Lctrmm_kernel_L2_M1_42: // remainder loop: one KERNEL1x2_SUB per pass
  1673. KERNEL1x2_SUB
  1674. subs counterL, counterL, #1
  1675. bgt .Lctrmm_kernel_L2_M1_42
  1676. .Lctrmm_kernel_L2_M1_100:
  1677. SAVE1x2 // macro defined outside this view; presumably writes the 1x2 result to C - TODO confirm
  1678. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1679. sub tempK, origK, tempOffset // tempK = origK - tempOffset ...
  1680. #if defined(LEFT)
  1681. sub tempK, tempK, #1 // ... minus 1 when LEFT
  1682. #else
  1683. sub tempK, tempK, #2 // ... minus 2 otherwise
  1684. #endif
  1685. lsl temp, tempK, #3 // temp = tempK * 8
  1686. add pA, pA, temp // pA += tempK * 8 bytes
  1687. lsl temp, tempK, #4 // temp = tempK * 16
  1688. add pB, pB, temp // pB += tempK * 16 bytes
  1689. #endif
  1690. #if defined(LEFT)
  1691. add tempOffset, tempOffset, #1 // LEFT: tempOffset += 1 after this 1-wide tile
  1692. #endif
  1693. .Lctrmm_kernel_L2_END: // end of the L2 pass: update tempOffset (non-LEFT builds) and step origPB
  1694. #if !defined(LEFT)
  1695. add tempOffset, tempOffset, #2 // !LEFT: tempOffset += 2
  1696. #endif
  1697. add origPB, origPB, origK, lsl #4 // origPB += origK * 16 bytes (B = B + K * 2 * 8)
  1698. /******************************************************************************/
  1699. .Lctrmm_kernel_L1_BEGIN: // L1 pass: runs only when bit 0 of origN is set
  1700. mov counterJ , origN
  1701. tst counterJ , #1 // set flags from origN & 1
  1702. ble .Lctrmm_kernel_L999 // bit 0 clear (Z set): done, go to the exit path
  1703. mov pCRow0, pC // pCRow0 = C
  1704. add pC , pC , LDC // pC += LDC (update pC to point to the next position)
  1705. #if defined(LEFT)
  1706. mov tempOffset, offset // LEFT: restart tempOffset from offset for this pass
  1707. #endif
  1708. mov pA, origPA // pA = A
  1709. .Lctrmm_kernel_L1_M8_BEGIN: // 8x1 tiles of the L1 pass: origM / 8 iterations of the block below
  1710. mov counterI, origM
  1711. asr counterI, counterI, #3 // counterI = origM / 8 (number of 8x1 tiles)
  1712. cmp counterI, #0
  1713. ble .Lctrmm_kernel_L1_M4_BEGIN // no full 8x1 tile: continue with the smaller tiles
  1714. .Lctrmm_kernel_L1_M8_20: // start of one 8x1 tile
  1715. INIT8x1 // macro defined outside this view; presumably clears the 8x1 accumulators - TODO confirm
  1716. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1717. mov pB, origPB // pB = start of the current B panel, pA left as is
  1718. #else
  1719. mov pB, origPB // pB = start of the current B panel ...
  1720. lsl temp, tempOffset, #6 // temp = tempOffset * 64
  1721. add pA, pA, temp // pA += tempOffset * 64 bytes
  1722. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1723. add pB, pB, temp // ... then pB += tempOffset * 8 bytes
  1724. #endif
  1725. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1726. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1727. #elif defined(LEFT)
  1728. add tempK, tempOffset, #8 // tempK = tempOffset + 8
  1729. #else
  1730. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1731. #endif
  1732. asr counterL , tempK, #3 // counterL = tempK / 8 (number of passes through the 8x-unrolled loop)
  1733. cmp counterL , #0
  1734. ble .Lctrmm_kernel_L1_M8_40 // counterL <= 0: skip the unrolled loop
  1735. .align 5 // align the loop head to a 2^5 = 32-byte boundary
  1736. .Lctrmm_kernel_L1_M8_22: // unrolled loop: 8 KERNEL8x1_SUB invocations per pass
  1737. KERNEL8x1_SUB
  1738. KERNEL8x1_SUB
  1739. KERNEL8x1_SUB
  1740. KERNEL8x1_SUB
  1741. KERNEL8x1_SUB
  1742. KERNEL8x1_SUB
  1743. KERNEL8x1_SUB
  1744. KERNEL8x1_SUB
  1745. subs counterL, counterL, #1 // one pass done; flags drive the loop branch
  1746. bgt .Lctrmm_kernel_L1_M8_22
  1747. .Lctrmm_kernel_L1_M8_40:
  1748. ands counterL , tempK, #7 // counterL = tempK & 7 (iterations left over after the unrolled loop)
  1749. ble .Lctrmm_kernel_L1_M8_100 // nothing left over: go store the tile
  1750. .Lctrmm_kernel_L1_M8_42: // remainder loop: one KERNEL8x1_SUB per pass
  1751. KERNEL8x1_SUB
  1752. subs counterL, counterL, #1
  1753. bgt .Lctrmm_kernel_L1_M8_42
  1754. .Lctrmm_kernel_L1_M8_100:
  1755. SAVE8x1 // macro defined outside this view; presumably writes the 8x1 result to C - TODO confirm
  1756. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1757. sub tempK, origK, tempOffset // tempK = origK - tempOffset ...
  1758. #if defined(LEFT)
  1759. sub tempK, tempK, #8 // ... minus 8 when LEFT
  1760. #else
  1761. sub tempK, tempK, #1 // ... minus 1 otherwise
  1762. #endif
  1763. lsl temp, tempK, #6 // temp = tempK * 64
  1764. add pA, pA, temp // pA += tempK * 64 bytes
  1765. lsl temp, tempK, #3 // temp = tempK * 8
  1766. add pB, pB, temp // pB += tempK * 8 bytes
  1767. #endif
  1768. #if defined(LEFT)
  1769. add tempOffset, tempOffset, #8 // LEFT: tempOffset += 8 after this 8-wide tile
  1770. #endif
  1771. .Lctrmm_kernel_L1_M8_END:
  1772. subs counterI, counterI, #1 // one 8x1 tile finished
  1773. bgt .Lctrmm_kernel_L1_M8_20 // more 8x1 tiles remain
  1774. .Lctrmm_kernel_L1_M4_BEGIN: // optional 4x1 tile of the L1 pass: runs only when bit 2 of origM is set
  1775. mov counterI, origM
  1776. tst counterI , #7 // set flags from origM & 7
  1777. ble .Lctrmm_kernel_L1_END // origM is a multiple of 8: nothing left for this pass
  1778. tst counterI, #4 // set flags from origM & 4 (is a 4x1 tile needed?)
  1779. ble .Lctrmm_kernel_L1_M2_BEGIN // bit 2 clear (Z set): skip the 4x1 tile
  1780. .Lctrmm_kernel_L1_M4_20:
  1781. INIT4x1 // macro defined outside this view; presumably clears the 4x1 accumulators - TODO confirm
  1782. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1783. mov pB, origPB // pB = start of the current B panel, pA left as is
  1784. #else
  1785. mov pB, origPB // pB = start of the current B panel ...
  1786. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1787. add pB, pB, temp // ... then pB += tempOffset * 8 bytes
  1788. lsl temp, tempOffset, #5 // temp = tempOffset * 32
  1789. add pA, pA, temp // pA += tempOffset * 32 bytes
  1790. #endif
  1791. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1792. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1793. #elif defined(LEFT)
  1794. add tempK, tempOffset, #4 // tempK = tempOffset + 4
  1795. #else
  1796. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1797. #endif
  1798. asr counterL , tempK, #3 // counterL = tempK / 8 (number of passes through the 8x-unrolled loop)
  1799. cmp counterL , #0
  1800. ble .Lctrmm_kernel_L1_M4_40 // counterL <= 0: skip the unrolled loop
  1801. .align 5 // align the loop head to a 2^5 = 32-byte boundary
  1802. .Lctrmm_kernel_L1_M4_22: // unrolled loop: 8 KERNEL4x1_SUB invocations per pass
  1803. KERNEL4x1_SUB
  1804. KERNEL4x1_SUB
  1805. KERNEL4x1_SUB
  1806. KERNEL4x1_SUB
  1807. KERNEL4x1_SUB
  1808. KERNEL4x1_SUB
  1809. KERNEL4x1_SUB
  1810. KERNEL4x1_SUB
  1811. subs counterL, counterL, #1 // one pass done; flags drive the loop branch
  1812. bgt .Lctrmm_kernel_L1_M4_22
  1813. .Lctrmm_kernel_L1_M4_40:
  1814. ands counterL , tempK, #7 // counterL = tempK & 7 (iterations left over after the unrolled loop)
  1815. ble .Lctrmm_kernel_L1_M4_100 // nothing left over: go store the tile
  1816. .Lctrmm_kernel_L1_M4_42: // remainder loop: one KERNEL4x1_SUB per pass
  1817. KERNEL4x1_SUB
  1818. subs counterL, counterL, #1
  1819. bgt .Lctrmm_kernel_L1_M4_42
  1820. .Lctrmm_kernel_L1_M4_100:
  1821. SAVE4x1 // macro defined outside this view; presumably writes the 4x1 result to C - TODO confirm
  1822. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1823. sub tempK, origK, tempOffset // tempK = origK - tempOffset ...
  1824. #if defined(LEFT)
  1825. sub tempK, tempK, #4 // ... minus 4 when LEFT
  1826. #else
  1827. sub tempK, tempK, #1 // ... minus 1 otherwise
  1828. #endif
  1829. lsl temp, tempK, #5 // temp = tempK * 32
  1830. add pA, pA, temp // pA += tempK * 32 bytes
  1831. lsl temp, tempK, #3 // temp = tempK * 8
  1832. add pB, pB, temp // pB += tempK * 8 bytes
  1833. #endif
  1834. #if defined(LEFT)
  1835. add tempOffset, tempOffset, #4 // LEFT: tempOffset += 4 after this 4-wide tile
  1836. #endif
  1837. .Lctrmm_kernel_L1_M4_END:
  1838. .Lctrmm_kernel_L1_M2_BEGIN: // optional 2x1 tile of the L1 pass: runs only when bit 1 of origM is set
  1839. mov counterI, origM
  1840. tst counterI , #3 // set flags from origM & 3
  1841. ble .Lctrmm_kernel_L1_END // origM is a multiple of 4: nothing left for this pass
  1842. tst counterI, #2 // set flags from origM & 2 (is a 2x1 tile needed?)
  1843. ble .Lctrmm_kernel_L1_M1_BEGIN // bit 1 clear (Z set): skip the 2x1 tile
  1844. .Lctrmm_kernel_L1_M2_20:
  1845. INIT2x1 // macro defined outside this view; presumably clears the 2x1 accumulators - TODO confirm
  1846. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1847. mov pB, origPB // pB = start of the current B panel, pA left as is
  1848. #else
  1849. mov pB, origPB // pB = start of the current B panel ...
  1850. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1851. add pB, pB, temp // ... then pB += tempOffset * 8 bytes
  1852. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1853. add pA, pA, temp // pA += tempOffset * 16 bytes
  1854. #endif
  1855. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1856. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1857. #elif defined(LEFT)
  1858. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1859. #else
  1860. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1861. #endif
  1862. asr counterL , tempK, #3 // counterL = tempK / 8 (number of passes through the 8x-unrolled loop)
  1863. cmp counterL , #0
  1864. ble .Lctrmm_kernel_L1_M2_40 // counterL <= 0: skip the unrolled loop
  1865. .Lctrmm_kernel_L1_M2_22: // unrolled loop: 8 KERNEL2x1_SUB invocations per pass
  1866. KERNEL2x1_SUB
  1867. KERNEL2x1_SUB
  1868. KERNEL2x1_SUB
  1869. KERNEL2x1_SUB
  1870. KERNEL2x1_SUB
  1871. KERNEL2x1_SUB
  1872. KERNEL2x1_SUB
  1873. KERNEL2x1_SUB
  1874. subs counterL, counterL, #1 // one pass done; flags drive the loop branch
  1875. bgt .Lctrmm_kernel_L1_M2_22
  1876. .Lctrmm_kernel_L1_M2_40:
  1877. ands counterL , tempK, #7 // counterL = tempK & 7 (iterations left over after the unrolled loop)
  1878. ble .Lctrmm_kernel_L1_M2_100 // nothing left over: go store the tile
  1879. .Lctrmm_kernel_L1_M2_42: // remainder loop: one KERNEL2x1_SUB per pass
  1880. KERNEL2x1_SUB
  1881. subs counterL, counterL, #1
  1882. bgt .Lctrmm_kernel_L1_M2_42
  1883. .Lctrmm_kernel_L1_M2_100:
  1884. SAVE2x1 // macro defined outside this view; presumably writes the 2x1 result to C - TODO confirm
  1885. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1886. sub tempK, origK, tempOffset // tempK = origK - tempOffset ...
  1887. #if defined(LEFT)
  1888. sub tempK, tempK, #2 // ... minus 2 when LEFT
  1889. #else
  1890. sub tempK, tempK, #1 // ... minus 1 otherwise
  1891. #endif
  1892. lsl temp, tempK, #4 // temp = tempK * 16
  1893. add pA, pA, temp // pA += tempK * 16 bytes
  1894. lsl temp, tempK, #3 // temp = tempK * 8
  1895. add pB, pB, temp // pB += tempK * 8 bytes
  1896. #endif
  1897. #if defined(LEFT)
  1898. add tempOffset, tempOffset, #2 // LEFT: tempOffset += 2 after this 2-wide tile
  1899. #endif
  1900. .Lctrmm_kernel_L1_M2_END:
  1901. .Lctrmm_kernel_L1_M1_BEGIN: // optional 1x1 tile of the L1 pass: runs only when bit 0 of counterI is set
  1902. tst counterI, #1 // set flags from counterI & 1 (counterI was loaded from origM at L1_M2_BEGIN)
  1903. ble .Lctrmm_kernel_L1_END // bit 0 clear (Z set): no 1x1 tile
  1904. .Lctrmm_kernel_L1_M1_20:
  1905. INIT1x1 // macro defined outside this view; presumably clears the 1x1 accumulator - TODO confirm
  1906. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1907. mov pB, origPB // pB = start of the current B panel, pA left as is
  1908. #else
  1909. mov pB, origPB // pB = start of the current B panel ...
  1910. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1911. add pB, pB, temp // ... then pB += tempOffset * 8 bytes
  1912. lsl temp, tempOffset, #3 // temp = tempOffset * 8 again (same stride for A here)
  1913. add pA, pA, temp // pA += tempOffset * 8 bytes
  1914. #endif
  1915. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1916. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1917. #elif defined(LEFT)
  1918. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1919. #else
  1920. add tempK, tempOffset, #1 // tempK = tempOffset + 1 (same value as the LEFT branch)
  1921. #endif
  1922. asr counterL , tempK, #3 // counterL = tempK / 8 (number of passes through the 8x-unrolled loop)
  1923. cmp counterL , #0
  1924. ble .Lctrmm_kernel_L1_M1_40 // counterL <= 0: skip the unrolled loop
  1925. .Lctrmm_kernel_L1_M1_22: // unrolled loop: 8 KERNEL1x1_SUB invocations per pass
  1926. KERNEL1x1_SUB
  1927. KERNEL1x1_SUB
  1928. KERNEL1x1_SUB
  1929. KERNEL1x1_SUB
  1930. KERNEL1x1_SUB
  1931. KERNEL1x1_SUB
  1932. KERNEL1x1_SUB
  1933. KERNEL1x1_SUB
  1934. subs counterL, counterL, #1 // one pass done; flags drive the loop branch
  1935. bgt .Lctrmm_kernel_L1_M1_22
  1936. .Lctrmm_kernel_L1_M1_40:
  1937. ands counterL , tempK, #7 // counterL = tempK & 7 (iterations left over after the unrolled loop)
  1938. ble .Lctrmm_kernel_L1_M1_100 // nothing left over: go store the tile
  1939. .Lctrmm_kernel_L1_M1_42: // remainder loop: one KERNEL1x1_SUB per pass
  1940. KERNEL1x1_SUB
  1941. subs counterL, counterL, #1
  1942. bgt .Lctrmm_kernel_L1_M1_42
  1943. .Lctrmm_kernel_L1_M1_100:
  1944. SAVE1x1 // macro defined outside this view; presumably writes the 1x1 result to C - TODO confirm
  1945. .Lctrmm_kernel_L1_END: // no pA/pB/tempOffset update after the last tile; control falls into the exit path
  1946. .Lctrmm_kernel_L999: // common exit: reload registers from the stack frame, free it and return 0
  1947. mov x0, #0 // set return value
  1948. ldp d8, d9, [sp, #(0 * 16)] // reload d8-d17 from frame slots 0-4 (presumably stored by the prologue, outside this view)
  1949. ldp d10, d11, [sp, #(1 * 16)]
  1950. ldp d12, d13, [sp, #(2 * 16)]
  1951. ldp d14, d15, [sp, #(3 * 16)]
  1952. ldp d16, d17, [sp, #(4 * 16)]
  1953. ldp x18, x19, [sp, #(5 * 16)] // reload x18-x28 from frame slots 5-10
  1954. ldp x20, x21, [sp, #(6 * 16)]
  1955. ldp x22, x23, [sp, #(7 * 16)]
  1956. ldp x24, x25, [sp, #(8 * 16)]
  1957. ldp x26, x27, [sp, #(9 * 16)]
  1958. ldr x28, [sp, #(10 * 16)] // x28 is loaded alone; the upper half of slot 10 is not read
  1959. add sp, sp, #(11*16) // release the 11 * 16 = 176-byte frame
  1960. ret
  1961. EPILOGUE // macro defined outside this view