
dtrsm_kernel_macro.S

  1. /*******************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. /************** Dgemm Kernel 16x4 ****************/
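/* Naming: KERNEL<u>x<M>x<N> performs <u> iterations of the K loop for an
 * M x N DGEMM micro-tile using 256-bit LASX registers. Throughout this
 * file, the U and D register aliases, A0, B0, A_PRE, B_PRE and the
 * PTR_ADDI, PTR_SRAI and GSUB helpers are assumed to be defined by the
 * kernel file that includes these macros:
 * U0-U7  : A data and B values for the current K step
 * U8-U15 : the same operands preloaded for the next K step
 * D0-D15 : micro-tile accumulators
 * The KERNEL2 variants are software-pipelined: the first half runs the
 * FMAs on the U8-U15 set loaded previously while fetching U0-U7, the
 * second half does the reverse; the 16x4 kernels also issue preld
 * prefetches on A0 and B0. */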
  28. .macro KERNEL2x16x4
  29. xvld U0, A0, 0x00
  30. xvfmadd.d D0, U8, U12, D0
  31. xvfmadd.d D1, U9, U12, D1
  32. xvld U1, A0, 0x20
  33. xvfmadd.d D2, U10, U12, D2
  34. xvfmadd.d D3, U11, U12, D3
  35. xvld U2, A0, 0x40
  36. xvfmadd.d D4, U8, U13, D4
  37. xvfmadd.d D5, U9, U13, D5
  38. xvld U3, A0, 0x60
  39. xvfmadd.d D6, U10, U13, D6
  40. xvfmadd.d D7, U11, U13, D7
  41. xvldrepl.d U4, B0, 0x00
  42. xvfmadd.d D8, U8, U14, D8
  43. xvfmadd.d D9, U9, U14, D9
  44. preld 0, B0, B_PRE
  45. xvldrepl.d U5, B0, 0x08
  46. xvfmadd.d D10, U10, U14, D10
  47. xvfmadd.d D11, U11, U14, D11
  48. preld 0, A0, A_PRE
  49. xvldrepl.d U6, B0, 0x10
  50. xvfmadd.d D12, U8, U15, D12
  51. xvfmadd.d D13, U9, U15, D13
  52. preld 0, A0, A_PRE + 0x40
  53. xvldrepl.d U7, B0, 0x18
  54. xvfmadd.d D14, U10, U15, D14
  55. xvfmadd.d D15, U11, U15, D15
  56. addi.d A0, A0, 0x80
  57. addi.d B0, B0, 0x20
  58. xvld U8, A0, 0x00
  59. xvfmadd.d D0, U0, U4, D0
  60. xvfmadd.d D1, U1, U4, D1
  61. xvld U9, A0, 0x20
  62. xvfmadd.d D2, U2, U4, D2
  63. xvfmadd.d D3, U3, U4, D3
  64. xvld U10, A0, 0x40
  65. xvfmadd.d D4, U0, U5, D4
  66. xvfmadd.d D5, U1, U5, D5
  67. xvld U11, A0, 0x60
  68. xvfmadd.d D6, U2, U5, D6
  69. xvfmadd.d D7, U3, U5, D7
  70. xvldrepl.d U12, B0, 0x00
  71. xvfmadd.d D8, U0, U6, D8
  72. xvfmadd.d D9, U1, U6, D9
  73. preld 0, B0, B_PRE
  74. xvldrepl.d U13, B0, 0x08
  75. xvfmadd.d D10, U2, U6, D10
  76. xvfmadd.d D11, U3, U6, D11
  77. preld 0, A0, A_PRE
  78. xvldrepl.d U14, B0, 0x10
  79. xvfmadd.d D12, U0, U7, D12
  80. xvfmadd.d D13, U1, U7, D13
  81. preld 0, A0, A_PRE + 0x40
  82. xvldrepl.d U15, B0, 0x18
  83. xvfmadd.d D14, U2, U7, D14
  84. xvfmadd.d D15, U3, U7, D15
  85. addi.d A0, A0, 0x80
  86. addi.d B0, B0, 0x20
  87. .endm
  88. .macro KERNEL2x16x4_END
  89. xvld U0, A0, 0x00
  90. xvfmadd.d D0, U8, U12, D0
  91. xvfmadd.d D1, U9, U12, D1
  92. xvld U1, A0, 0x20
  93. xvfmadd.d D2, U10, U12, D2
  94. xvfmadd.d D3, U11, U12, D3
  95. xvld U2, A0, 0x40
  96. xvfmadd.d D4, U8, U13, D4
  97. xvfmadd.d D5, U9, U13, D5
  98. xvld U3, A0, 0x60
  99. xvfmadd.d D6, U10, U13, D6
  100. xvfmadd.d D7, U11, U13, D7
  101. xvldrepl.d U4, B0, 0x00
  102. xvfmadd.d D8, U8, U14, D8
  103. xvfmadd.d D9, U9, U14, D9
  104. preld 0, B0, B_PRE
  105. xvldrepl.d U5, B0, 0x08
  106. xvfmadd.d D10, U10, U14, D10
  107. xvfmadd.d D11, U11, U14, D11
  108. preld 0, A0, A_PRE
  109. xvldrepl.d U6, B0, 0x10
  110. xvfmadd.d D12, U8, U15, D12
  111. xvfmadd.d D13, U9, U15, D13
  112. preld 0, A0, A_PRE + 0x40
  113. xvldrepl.d U7, B0, 0x18
  114. xvfmadd.d D14, U10, U15, D14
  115. xvfmadd.d D15, U11, U15, D15
  116. addi.d A0, A0, 0x80
  117. addi.d B0, B0, 0x20
  118. xvfmadd.d D0, U0, U4, D0
  119. xvfmadd.d D1, U1, U4, D1
  120. xvfmadd.d D2, U2, U4, D2
  121. xvfmadd.d D3, U3, U4, D3
  122. xvfmadd.d D4, U0, U5, D4
  123. xvfmadd.d D5, U1, U5, D5
  124. xvfmadd.d D6, U2, U5, D6
  125. xvfmadd.d D7, U3, U5, D7
  126. xvfmadd.d D8, U0, U6, D8
  127. xvfmadd.d D9, U1, U6, D9
  128. preld 0, B0, B_PRE
  129. xvfmadd.d D10, U2, U6, D10
  130. xvfmadd.d D11, U3, U6, D11
  131. preld 0, A0, A_PRE
  132. xvfmadd.d D12, U0, U7, D12
  133. xvfmadd.d D13, U1, U7, D13
  134. preld 0, A0, A_PRE + 0x40
  135. xvfmadd.d D14, U2, U7, D14
  136. xvfmadd.d D15, U3, U7, D15
  137. .endm
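/* KERNEL2x16x4_END drains the pipeline: it still performs two K steps,
 * but the second half reuses the freshly loaded U0-U7 without preloading
 * a new U8-U15 set and without the trailing pointer advance, balancing
 * the extra load and advance done before the main loop is entered. */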
  138. .macro KERNEL8x16x4
  139. .rept 4
  140. KERNEL2x16x4
  141. .endr
  142. .endm
  143. .macro KERNEL8x16x4_END
  144. .rept 3
  145. KERNEL2x16x4
  146. .endr
  147. KERNEL2x16x4_END
  148. .endm
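/* The remaining N=4 kernels follow the same two-step pipelined pattern
 * with narrower A tiles: 8x4 and 4x4 vector-load A and broadcast B, while
 * 2x4 and 1x4 broadcast the one or two A elements and vector-load the
 * four B values, so each accumulator then holds one row of the tile. */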
  149. .macro KERNEL2x8x4
  150. xvld U0, A0, 0x00
  151. xvld U1, A0, 0x20
  152. xvldrepl.d U4, B0, 0x00
  153. xvfmadd.d D0, U8, U12, D0
  154. xvfmadd.d D1, U9, U12, D1
  155. xvldrepl.d U5, B0, 0x08
  156. xvfmadd.d D4, U8, U13, D4
  157. xvfmadd.d D5, U9, U13, D5
  158. xvldrepl.d U6, B0, 0x10
  159. xvfmadd.d D8, U8, U14, D8
  160. xvfmadd.d D9, U9, U14, D9
  161. xvldrepl.d U7, B0, 0x18
  162. xvfmadd.d D12, U8, U15, D12
  163. xvfmadd.d D13, U9, U15, D13
  164. addi.d A0, A0, 0x40
  165. addi.d B0, B0, 0x20
  166. xvld U8, A0, 0x00
  167. xvld U9, A0, 0x20
  168. xvldrepl.d U12, B0, 0x00
  169. xvfmadd.d D0, U0, U4, D0
  170. xvfmadd.d D1, U1, U4, D1
  171. xvldrepl.d U13, B0, 0x08
  172. xvfmadd.d D4, U0, U5, D4
  173. xvfmadd.d D5, U1, U5, D5
  174. xvldrepl.d U14, B0, 0x10
  175. xvfmadd.d D8, U0, U6, D8
  176. xvfmadd.d D9, U1, U6, D9
  177. xvldrepl.d U15, B0, 0x18
  178. xvfmadd.d D12, U0, U7, D12
  179. xvfmadd.d D13, U1, U7, D13
  180. addi.d A0, A0, 0x40
  181. addi.d B0, B0, 0x20
  182. .endm
  183. .macro KERNEL2x8x4_END
  184. xvld U0, A0, 0x00
  185. xvld U1, A0, 0x20
  186. xvldrepl.d U4, B0, 0x00
  187. xvfmadd.d D0, U8, U12, D0
  188. xvfmadd.d D1, U9, U12, D1
  189. xvldrepl.d U5, B0, 0x08
  190. xvfmadd.d D4, U8, U13, D4
  191. xvfmadd.d D5, U9, U13, D5
  192. xvldrepl.d U6, B0, 0x10
  193. xvfmadd.d D8, U8, U14, D8
  194. xvfmadd.d D9, U9, U14, D9
  195. xvldrepl.d U7, B0, 0x18
  196. xvfmadd.d D12, U8, U15, D12
  197. xvfmadd.d D13, U9, U15, D13
  198. addi.d A0, A0, 0x40
  199. addi.d B0, B0, 0x20
  200. xvfmadd.d D0, U0, U4, D0
  201. xvfmadd.d D1, U1, U4, D1
  202. xvfmadd.d D4, U0, U5, D4
  203. xvfmadd.d D5, U1, U5, D5
  204. xvfmadd.d D8, U0, U6, D8
  205. xvfmadd.d D9, U1, U6, D9
  206. xvfmadd.d D12, U0, U7, D12
  207. xvfmadd.d D13, U1, U7, D13
  208. .endm
  209. .macro KERNEL8x8x4
  210. .rept 4
  211. KERNEL2x8x4
  212. .endr
  213. .endm
  214. .macro KERNEL8x8x4_END
  215. .rept 3
  216. KERNEL2x8x4
  217. .endr
  218. KERNEL2x8x4_END
  219. .endm
  220. .macro KERNEL2x4x4
  221. xvld U0, A0, 0x00
  222. xvldrepl.d U4, B0, 0x00
  223. xvfmadd.d D0, U8, U12, D0
  224. xvldrepl.d U5, B0, 0x08
  225. xvfmadd.d D4, U8, U13, D4
  226. xvldrepl.d U6, B0, 0x10
  227. xvfmadd.d D8, U8, U14, D8
  228. xvldrepl.d U7, B0, 0x18
  229. xvfmadd.d D12, U8, U15, D12
  230. addi.d A0, A0, 0x20
  231. addi.d B0, B0, 0x20
  232. xvld U8, A0, 0x00
  233. xvldrepl.d U12, B0, 0x00
  234. xvfmadd.d D0, U0, U4, D0
  235. xvldrepl.d U13, B0, 0x08
  236. xvfmadd.d D4, U0, U5, D4
  237. xvldrepl.d U14, B0, 0x10
  238. xvfmadd.d D8, U0, U6, D8
  239. xvldrepl.d U15, B0, 0x18
  240. xvfmadd.d D12, U0, U7, D12
  241. addi.d A0, A0, 0x20
  242. addi.d B0, B0, 0x20
  243. .endm
  244. .macro KERNEL2x4x4_END
  245. xvld U0, A0, 0x00
  246. xvldrepl.d U4, B0, 0x00
  247. xvfmadd.d D0, U8, U12, D0
  248. xvldrepl.d U5, B0, 0x08
  249. xvfmadd.d D4, U8, U13, D4
  250. xvldrepl.d U6, B0, 0x10
  251. xvfmadd.d D8, U8, U14, D8
  252. xvldrepl.d U7, B0, 0x18
  253. xvfmadd.d D12, U8, U15, D12
  254. addi.d A0, A0, 0x20
  255. addi.d B0, B0, 0x20
  256. xvfmadd.d D0, U0, U4, D0
  257. xvfmadd.d D4, U0, U5, D4
  258. xvfmadd.d D8, U0, U6, D8
  259. xvfmadd.d D12, U0, U7, D12
  260. .endm
  261. .macro KERNEL8x4x4
  262. .rept 4
  263. KERNEL2x4x4
  264. .endr
  265. .endm
  266. .macro KERNEL8x4x4_END
  267. .rept 3
  268. KERNEL2x4x4
  269. .endr
  270. KERNEL2x4x4_END
  271. .endm
  272. .macro KERNEL2x2x4
  273. xvldrepl.d U0, A0, 0x00
  274. xvldrepl.d U1, A0, 0x08
  275. xvfmadd.d D0, U8, U12, D0
  276. xvfmadd.d D1, U9, U12, D1
  277. xvld U4, B0, 0x00
  278. addi.d A0, A0, 0x10
  279. addi.d B0, B0, 0x20
  280. xvldrepl.d U8, A0, 0x00
  281. xvldrepl.d U9, A0, 0x08
  282. xvfmadd.d D0, U0, U4, D0
  283. xvfmadd.d D1, U1, U4, D1
  284. xvld U12, B0, 0x00
  285. addi.d A0, A0, 0x10
  286. addi.d B0, B0, 0x20
  287. .endm
  288. .macro KERNEL2x2x4_END
  289. xvldrepl.d U0, A0, 0x00
  290. xvldrepl.d U1, A0, 0x08
  291. xvfmadd.d D0, U8, U12, D0
  292. xvfmadd.d D1, U9, U12, D1
  293. xvld U4, B0, 0x00
  294. addi.d A0, A0, 0x10
  295. addi.d B0, B0, 0x20
  296. xvfmadd.d D0, U0, U4, D0
  297. xvfmadd.d D1, U1, U4, D1
  298. .endm
  299. .macro KERNEL8x2x4
  300. .rept 4
  301. KERNEL2x2x4
  302. .endr
  303. .endm
  304. .macro KERNEL8x2x4_END
  305. .rept 3
  306. KERNEL2x2x4
  307. .endr
  308. KERNEL2x2x4_END
  309. .endm
  310. .macro KERNEL2x1x4
  311. xvldrepl.d U0, A0, 0x00
  312. xvfmadd.d D0, U8, U12, D0
  313. xvld U4, B0, 0x00
  314. addi.d A0, A0, 0x08
  315. addi.d B0, B0, 0x20
  316. xvldrepl.d U8, A0, 0x00
  317. xvfmadd.d D0, U0, U4, D0
  318. xvld U12, B0, 0x00
  319. addi.d A0, A0, 0x08
  320. addi.d B0, B0, 0x20
  321. .endm
  322. .macro KERNEL2x1x4_END
  323. xvldrepl.d U0, A0, 0x00
  324. xvfmadd.d D0, U8, U12, D0
  325. xvld U4, B0, 0x00
  326. addi.d A0, A0, 0x08
  327. addi.d B0, B0, 0x20
  328. xvfmadd.d D0, U0, U4, D0
  329. .endm
  330. .macro KERNEL8x1x4
  331. .rept 4
  332. KERNEL2x1x4
  333. .endr
  334. .endm
  335. .macro KERNEL8x1x4_END
  336. .rept 3
  337. KERNEL2x1x4
  338. .endr
  339. KERNEL2x1x4_END
  340. .endm
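/* N=2 kernels: only two B values per K step are live (U4/U5, resp.
 * U12/U13), B0 advances by 0x10 per step, and D0-D3 accumulate column 0
 * while D4-D7 accumulate column 1 (fewer of each for the narrower M). */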
  341. .macro KERNEL2x16x2
  342. xvld U0, A0, 0x00
  343. xvfmadd.d D0, U8, U12, D0
  344. xvfmadd.d D1, U9, U12, D1
  345. xvld U1, A0, 0x20
  346. xvfmadd.d D2, U10, U12, D2
  347. xvfmadd.d D3, U11, U12, D3
  348. xvld U2, A0, 0x40
  349. xvfmadd.d D4, U8, U13, D4
  350. xvfmadd.d D5, U9, U13, D5
  351. xvld U3, A0, 0x60
  352. xvfmadd.d D6, U10, U13, D6
  353. xvfmadd.d D7, U11, U13, D7
  354. xvldrepl.d U4, B0, 0x00
  355. xvldrepl.d U5, B0, 0x08
  356. addi.d A0, A0, 0x80
  357. addi.d B0, B0, 0x10
  358. xvld U8, A0, 0x00
  359. xvfmadd.d D0, U0, U4, D0
  360. xvfmadd.d D1, U1, U4, D1
  361. xvld U9, A0, 0x20
  362. xvfmadd.d D2, U2, U4, D2
  363. xvfmadd.d D3, U3, U4, D3
  364. xvld U10, A0, 0x40
  365. xvfmadd.d D4, U0, U5, D4
  366. xvfmadd.d D5, U1, U5, D5
  367. xvld U11, A0, 0x60
  368. xvfmadd.d D6, U2, U5, D6
  369. xvfmadd.d D7, U3, U5, D7
  370. xvldrepl.d U12, B0, 0x00
  371. xvldrepl.d U13, B0, 0x08
  372. addi.d A0, A0, 0x80
  373. addi.d B0, B0, 0x10
  374. .endm
  375. .macro KERNEL2x16x2_END
  376. xvld U0, A0, 0x00
  377. xvfmadd.d D0, U8, U12, D0
  378. xvfmadd.d D1, U9, U12, D1
  379. xvld U1, A0, 0x20
  380. xvfmadd.d D2, U10, U12, D2
  381. xvfmadd.d D3, U11, U12, D3
  382. xvld U2, A0, 0x40
  383. xvfmadd.d D4, U8, U13, D4
  384. xvfmadd.d D5, U9, U13, D5
  385. xvld U3, A0, 0x60
  386. xvfmadd.d D6, U10, U13, D6
  387. xvfmadd.d D7, U11, U13, D7
  388. xvldrepl.d U4, B0, 0x00
  389. xvldrepl.d U5, B0, 0x08
  390. addi.d A0, A0, 0x80
  391. addi.d B0, B0, 0x10
  392. xvfmadd.d D0, U0, U4, D0
  393. xvfmadd.d D1, U1, U4, D1
  394. xvfmadd.d D2, U2, U4, D2
  395. xvfmadd.d D3, U3, U4, D3
  396. xvfmadd.d D4, U0, U5, D4
  397. xvfmadd.d D5, U1, U5, D5
  398. xvfmadd.d D6, U2, U5, D6
  399. xvfmadd.d D7, U3, U5, D7
  400. .endm
  401. .macro KERNEL8x16x2
  402. .rept 4
  403. KERNEL2x16x2
  404. .endr
  405. .endm
  406. .macro KERNEL8x16x2_END
  407. .rept 3
  408. KERNEL2x16x2
  409. .endr
  410. KERNEL2x16x2_END
  411. .endm
  412. .macro KERNEL2x8x2
  413. xvld U0, A0, 0x00
  414. xvfmadd.d D0, U8, U12, D0
  415. xvfmadd.d D1, U9, U12, D1
  416. xvld U1, A0, 0x20
  417. xvfmadd.d D4, U8, U13, D4
  418. xvfmadd.d D5, U9, U13, D5
  419. xvldrepl.d U4, B0, 0x00
  420. xvldrepl.d U5, B0, 0x08
  421. addi.d A0, A0, 0x40
  422. addi.d B0, B0, 0x10
  423. xvld U8, A0, 0x00
  424. xvfmadd.d D0, U0, U4, D0
  425. xvfmadd.d D1, U1, U4, D1
  426. xvld U9, A0, 0x20
  427. xvfmadd.d D4, U0, U5, D4
  428. xvfmadd.d D5, U1, U5, D5
  429. xvldrepl.d U12, B0, 0x00
  430. xvldrepl.d U13, B0, 0x08
  431. addi.d A0, A0, 0x40
  432. addi.d B0, B0, 0x10
  433. .endm
  434. .macro KERNEL2x8x2_END
  435. xvld U0, A0, 0x00
  436. xvfmadd.d D0, U8, U12, D0
  437. xvfmadd.d D1, U9, U12, D1
  438. xvld U1, A0, 0x20
  439. xvfmadd.d D4, U8, U13, D4
  440. xvfmadd.d D5, U9, U13, D5
  441. xvldrepl.d U4, B0, 0x00
  442. xvldrepl.d U5, B0, 0x08
  443. addi.d A0, A0, 0x40
  444. addi.d B0, B0, 0x10
  445. xvfmadd.d D0, U0, U4, D0
  446. xvfmadd.d D1, U1, U4, D1
  447. xvfmadd.d D4, U0, U5, D4
  448. xvfmadd.d D5, U1, U5, D5
  449. .endm
  450. .macro KERNEL8x8x2
  451. .rept 4
  452. KERNEL2x8x2
  453. .endr
  454. .endm
  455. .macro KERNEL8x8x2_END
  456. .rept 3
  457. KERNEL2x8x2
  458. .endr
  459. KERNEL2x8x2_END
  460. .endm
  461. .macro KERNEL2x4x2
  462. xvld U0, A0, 0x00
  463. xvfmadd.d D0, U8, U12, D0
  464. xvfmadd.d D4, U8, U13, D4
  465. xvldrepl.d U4, B0, 0x00
  466. xvldrepl.d U5, B0, 0x08
  467. addi.d A0, A0, 0x20
  468. addi.d B0, B0, 0x10
  469. xvld U8, A0, 0x00
  470. xvfmadd.d D0, U0, U4, D0
  471. xvfmadd.d D4, U0, U5, D4
  472. xvldrepl.d U12, B0, 0x00
  473. xvldrepl.d U13, B0, 0x08
  474. addi.d A0, A0, 0x20
  475. addi.d B0, B0, 0x10
  476. .endm
  477. .macro KERNEL2x4x2_END
  478. xvld U0, A0, 0x00
  479. xvfmadd.d D0, U8, U12, D0
  480. xvfmadd.d D4, U8, U13, D4
  481. xvldrepl.d U4, B0, 0x00
  482. xvldrepl.d U5, B0, 0x08
  483. addi.d A0, A0, 0x20
  484. addi.d B0, B0, 0x10
  485. xvfmadd.d D0, U0, U4, D0
  486. xvfmadd.d D4, U0, U5, D4
  487. .endm
  488. .macro KERNEL8x4x2
  489. .rept 4
  490. KERNEL2x4x2
  491. .endr
  492. .endm
  493. .macro KERNEL8x4x2_END
  494. .rept 3
  495. KERNEL2x4x2
  496. .endr
  497. KERNEL2x4x2_END
  498. .endm
  499. .macro KERNEL2x2x2
  500. xvld U0, A0, 0x00
  501. xvfmadd.d D0, U8, U12, D0
  502. xvfmadd.d D4, U8, U13, D4
  503. xvldrepl.d U4, B0, 0x00
  504. xvldrepl.d U5, B0, 0x08
  505. addi.d A0, A0, 0x10
  506. addi.d B0, B0, 0x10
  507. xvld U8, A0, 0x00
  508. xvfmadd.d D0, U0, U4, D0
  509. xvfmadd.d D4, U0, U5, D4
  510. xvldrepl.d U12, B0, 0x00
  511. xvldrepl.d U13, B0, 0x08
  512. addi.d A0, A0, 0x10
  513. addi.d B0, B0, 0x10
  514. .endm
  515. .macro KERNEL2x2x2_END
  516. xvld U0, A0, 0x00
  517. xvfmadd.d D0, U8, U12, D0
  518. xvfmadd.d D4, U8, U13, D4
  519. xvldrepl.d U4, B0, 0x00
  520. xvldrepl.d U5, B0, 0x08
  521. addi.d A0, A0, 0x10
  522. addi.d B0, B0, 0x10
  523. xvfmadd.d D0, U0, U4, D0
  524. xvfmadd.d D4, U0, U5, D4
  525. .endm
  526. .macro KERNEL8x2x2
  527. .rept 4
  528. KERNEL2x2x2
  529. .endr
  530. .endm
  531. .macro KERNEL8x2x2_END
  532. .rept 3
  533. KERNEL2x2x2
  534. .endr
  535. KERNEL2x2x2_END
  536. .endm
  537. .macro KERNEL2x1x2
  538. xvld U0, A0, 0x00
  539. xvfmadd.d D0, U8, U12, D0
  540. xvfmadd.d D4, U8, U13, D4
  541. xvldrepl.d U4, B0, 0x00
  542. xvldrepl.d U5, B0, 0x08
  543. addi.d A0, A0, 0x08
  544. addi.d B0, B0, 0x10
  545. xvld U8, A0, 0x00
  546. xvfmadd.d D0, U0, U4, D0
  547. xvfmadd.d D4, U0, U5, D4
  548. xvldrepl.d U12, B0, 0x00
  549. xvldrepl.d U13, B0, 0x08
  550. addi.d A0, A0, 0x08
  551. addi.d B0, B0, 0x10
  552. .endm
  553. .macro KERNEL2x1x2_END
  554. xvld U0, A0, 0x00
  555. xvfmadd.d D0, U8, U12, D0
  556. xvfmadd.d D4, U8, U13, D4
  557. xvldrepl.d U4, B0, 0x00
  558. xvldrepl.d U5, B0, 0x08
  559. addi.d A0, A0, 0x08
  560. addi.d B0, B0, 0x10
  561. xvfmadd.d D0, U0, U4, D0
  562. xvfmadd.d D4, U0, U5, D4
  563. .endm
  564. .macro KERNEL8x1x2
  565. .rept 4
  566. KERNEL2x1x2
  567. .endr
  568. .endm
  569. .macro KERNEL8x1x2_END
  570. .rept 3
  571. KERNEL2x1x2
  572. .endr
  573. KERNEL2x1x2_END
  574. .endm
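/* N=1 kernels: a single broadcast B value per K step (U4, resp. U12),
 * B0 advances by 0x08, and only the column-0 accumulators D0-D3 are used. */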
  575. .macro KERNEL2x16x1
  576. xvld U0, A0, 0x00
  577. xvfmadd.d D0, U8, U12, D0
  578. xvfmadd.d D1, U9, U12, D1
  579. xvld U1, A0, 0x20
  580. xvfmadd.d D2, U10, U12, D2
  581. xvfmadd.d D3, U11, U12, D3
  582. xvld U2, A0, 0x40
  583. xvld U3, A0, 0x60
  584. xvldrepl.d U4, B0, 0x00
  585. addi.d A0, A0, 0x80
  586. addi.d B0, B0, 0x08
  587. xvld U8, A0, 0x00
  588. xvfmadd.d D0, U0, U4, D0
  589. xvfmadd.d D1, U1, U4, D1
  590. xvld U9, A0, 0x20
  591. xvfmadd.d D2, U2, U4, D2
  592. xvfmadd.d D3, U3, U4, D3
  593. xvld U10, A0, 0x40
  594. xvld U11, A0, 0x60
  595. xvldrepl.d U12, B0, 0x00
  596. addi.d A0, A0, 0x80
  597. addi.d B0, B0, 0x08
  598. .endm
  599. .macro KERNEL2x16x1_END
  600. xvld U0, A0, 0x00
  601. xvfmadd.d D0, U8, U12, D0
  602. xvfmadd.d D1, U9, U12, D1
  603. xvld U1, A0, 0x20
  604. xvfmadd.d D2, U10, U12, D2
  605. xvfmadd.d D3, U11, U12, D3
  606. xvld U2, A0, 0x40
  607. xvld U3, A0, 0x60
  608. xvldrepl.d U4, B0, 0x00
  609. addi.d A0, A0, 0x80
  610. addi.d B0, B0, 0x08
  611. xvfmadd.d D0, U0, U4, D0
  612. xvfmadd.d D1, U1, U4, D1
  613. xvfmadd.d D2, U2, U4, D2
  614. xvfmadd.d D3, U3, U4, D3
  615. .endm
  616. .macro KERNEL8x16x1
  617. .rept 4
  618. KERNEL2x16x1
  619. .endr
  620. .endm
  621. .macro KERNEL8x16x1_END
  622. .rept 3
  623. KERNEL2x16x1
  624. .endr
  625. KERNEL2x16x1_END
  626. .endm
  627. .macro KERNEL2x8x1
  628. xvld U0, A0, 0x00
  629. xvfmadd.d D0, U8, U12, D0
  630. xvfmadd.d D1, U9, U12, D1
  631. xvld U1, A0, 0x20
  632. xvldrepl.d U4, B0, 0x00
  633. addi.d A0, A0, 0x40
  634. addi.d B0, B0, 0x08
  635. xvld U8, A0, 0x00
  636. xvfmadd.d D0, U0, U4, D0
  637. xvfmadd.d D1, U1, U4, D1
  638. xvld U9, A0, 0x20
  639. xvldrepl.d U12, B0, 0x00
  640. addi.d A0, A0, 0x40
  641. addi.d B0, B0, 0x08
  642. .endm
  643. .macro KERNEL2x8x1_END
  644. xvld U0, A0, 0x00
  645. xvfmadd.d D0, U8, U12, D0
  646. xvfmadd.d D1, U9, U12, D1
  647. xvld U1, A0, 0x20
  648. xvldrepl.d U4, B0, 0x00
  649. addi.d A0, A0, 0x40
  650. addi.d B0, B0, 0x08
  651. xvfmadd.d D0, U0, U4, D0
  652. xvfmadd.d D1, U1, U4, D1
  653. .endm
  654. .macro KERNEL8x8x1
  655. .rept 4
  656. KERNEL2x8x1
  657. .endr
  658. .endm
  659. .macro KERNEL8x8x1_END
  660. .rept 3
  661. KERNEL2x8x1
  662. .endr
  663. KERNEL2x8x1_END
  664. .endm
  665. .macro KERNEL2x4x1
  666. xvld U0, A0, 0x00
  667. xvfmadd.d D0, U8, U12, D0
  668. xvldrepl.d U4, B0, 0x00
  669. addi.d A0, A0, 0x20
  670. addi.d B0, B0, 0x08
  671. xvld U8, A0, 0x00
  672. xvfmadd.d D0, U0, U4, D0
  673. xvldrepl.d U12, B0, 0x00
  674. addi.d A0, A0, 0x20
  675. addi.d B0, B0, 0x08
  676. .endm
  677. .macro KERNEL2x4x1_END
  678. xvld U0, A0, 0x00
  679. xvfmadd.d D0, U8, U12, D0
  680. xvldrepl.d U4, B0, 0x00
  681. addi.d A0, A0, 0x20
  682. addi.d B0, B0, 0x08
  683. xvfmadd.d D0, U0, U4, D0
  684. .endm
  685. .macro KERNEL8x4x1
  686. .rept 4
  687. KERNEL2x4x1
  688. .endr
  689. .endm
  690. .macro KERNEL8x4x1_END
  691. .rept 3
  692. KERNEL2x4x1
  693. .endr
  694. KERNEL2x4x1_END
  695. .endm
  696. .macro KERNEL2x2x1
  697. xvld U0, A0, 0x00
  698. xvfmadd.d D0, U8, U12, D0
  699. xvldrepl.d U4, B0, 0x00
  700. addi.d A0, A0, 0x10
  701. addi.d B0, B0, 0x08
  702. xvld U8, A0, 0x00
  703. xvfmadd.d D0, U0, U4, D0
  704. xvldrepl.d U12, B0, 0x00
  705. addi.d A0, A0, 0x10
  706. addi.d B0, B0, 0x08
  707. .endm
  708. .macro KERNEL2x2x1_END
  709. xvld U0, A0, 0x00
  710. xvfmadd.d D0, U8, U12, D0
  711. xvldrepl.d U4, B0, 0x00
  712. addi.d A0, A0, 0x10
  713. addi.d B0, B0, 0x08
  714. xvfmadd.d D0, U0, U4, D0
  715. .endm
  716. .macro KERNEL8x2x1
  717. .rept 4
  718. KERNEL2x2x1
  719. .endr
  720. .endm
  721. .macro KERNEL8x2x1_END
  722. .rept 3
  723. KERNEL2x2x1
  724. .endr
  725. KERNEL2x2x1_END
  726. .endm
  727. .macro KERNEL2x1x1
  728. xvld U0, A0, 0x00
  729. xvfmadd.d D0, U8, U12, D0
  730. xvldrepl.d U4, B0, 0x00
  731. addi.d A0, A0, 0x08
  732. addi.d B0, B0, 0x08
  733. xvld U8, A0, 0x00
  734. xvfmadd.d D0, U0, U4, D0
  735. xvldrepl.d U12, B0, 0x00
  736. addi.d A0, A0, 0x08
  737. addi.d B0, B0, 0x08
  738. .endm
  739. .macro KERNEL2x1x1_END
  740. xvld U0, A0, 0x00
  741. xvfmadd.d D0, U8, U12, D0
  742. xvldrepl.d U4, B0, 0x00
  743. addi.d A0, A0, 0x08
  744. addi.d B0, B0, 0x08
  745. xvfmadd.d D0, U0, U4, D0
  746. .endm
  747. .macro KERNEL8x1x1
  748. .rept 4
  749. KERNEL2x1x1
  750. .endr
  751. .endm
  752. .macro KERNEL8x1x1_END
  753. .rept 3
  754. KERNEL2x1x1
  755. .endr
  756. KERNEL2x1x1_END
  757. .endm
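/* dgemm_<M>x<N>: a complete K loop (length L) for one M x N block, as
 * used by the triangular-solve kernels that include this file. Each macro
 * computes the first K step with xvfmul to initialise the accumulators,
 * runs ((L-1) >> 3) rounds of eight pipelined K steps via KERNEL8... with
 * the final round using the _END drain variant, handles the remaining
 * (L-1) & 7 steps in a simple tail loop, then loads the C tile and uses
 * GSUB to leave C - A*B in the U registers for the caller. A rough
 * C-style sketch of the arithmetic (illustration only, not from the
 * original sources):
 *   for (k = 0; k < L; k++)
 *     for (j = 0; j < N; j++)
 *       for (i = 0; i < M; i++)
 *         D[j][i] += A0[k * M + i] * B0[k * N + j];
 *   result[j][i] = Cj[i] - D[j][i];   // Cj = pointer to column j of C
 */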
  758. .macro dgemm_16x4
  759. .L_dgemm_16x4: // See dgemm_kernel_16x4.S
  760. xvld U0, A0, 0x00
  761. xvld U1, A0, 0x20
  762. xvld U2, A0, 0x40
  763. xvld U3, A0, 0x60
  764. xvldrepl.d U4, B0, 0x00
  765. /* line 1 */
  766. xvfmul.d D0, U0, U4
  767. xvfmul.d D1, U1, U4
  768. xvfmul.d D2, U2, U4
  769. xvfmul.d D3, U3, U4
  770. xvldrepl.d U5, B0, 0x08
  771. /* line 2 */
  772. xvfmul.d D4, U0, U5
  773. xvfmul.d D5, U1, U5
  774. xvfmul.d D6, U2, U5
  775. xvfmul.d D7, U3, U5
  776. xvldrepl.d U6, B0, 0x10
  777. /* line 3 */
  778. xvfmul.d D8, U0, U6
  779. xvfmul.d D9, U1, U6
  780. xvfmul.d D10, U2, U6
  781. xvfmul.d D11, U3, U6
  782. xvldrepl.d U7, B0, 0x18
  783. /* line 4 */
  784. xvfmul.d D12, U0, U7
  785. xvfmul.d D13, U1, U7
  786. xvfmul.d D14, U2, U7
  787. xvfmul.d D15, U3, U7
  788. /* Add stride for A0 and B0 */
  789. PTR_ADDI A0, A0, 0x80
  790. PTR_ADDI B0, B0, 0x20
  791. /* Reduce L */
  792. PTR_ADDI L, L, -1
  793. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  794. /* if (TL < 1) goto L_L7 */
  795. beq ZERO,TL, .L_dgemm_16x4_L7
  796. xvld U8, A0, 0x00
  797. xvld U9, A0, 0x20
  798. xvld U10, A0, 0x40
  799. xvld U11, A0, 0x60
  800. PTR_ADDI TL, TL, -1
  801. xvldrepl.d U12, B0, 0x00
  802. xvldrepl.d U13, B0, 0x08
  803. xvldrepl.d U14, B0, 0x10
  804. xvldrepl.d U15, B0, 0x18
  805. PTR_ADDI A0, A0, 0x80
  806. PTR_ADDI B0, B0, 0x20
  807. beq ZERO, TL, .L_dgemm_16x4_TL1_END
  808. .align 5
  809. .L_dgemm_16x4_TL1:
  810. KERNEL8x16x4
  811. PTR_ADDI TL, TL, -1
  812. blt ZERO, TL, .L_dgemm_16x4_TL1
  813. .L_dgemm_16x4_TL1_END:
  814. KERNEL8x16x4_END
  815. .L_dgemm_16x4_L7:
  816. andi TL, L, 7
  817. beq TL, ZERO, .L_dgemm_16x4_L0
  818. .align 5
  819. .L_dgemm_16x4_L71:
  820. xvld U0, A0, 0x00
  821. xvld U1, A0, 0x20
  822. xvld U2, A0, 0x40
  823. xvld U3, A0, 0x60
  824. xvldrepl.d U4, B0, 0x00
  825. xvfmadd.d D0, U0, U4, D0
  826. xvfmadd.d D1, U1, U4, D1
  827. xvfmadd.d D2, U2, U4, D2
  828. xvfmadd.d D3, U3, U4, D3
  829. xvldrepl.d U5, B0, 0x08
  830. xvfmadd.d D4, U0, U5, D4
  831. xvfmadd.d D5, U1, U5, D5
  832. xvfmadd.d D6, U2, U5, D6
  833. xvfmadd.d D7, U3, U5, D7
  834. xvldrepl.d U6, B0, 0x10
  835. xvfmadd.d D8, U0, U6, D8
  836. xvfmadd.d D9, U1, U6, D9
  837. xvfmadd.d D10, U2, U6, D10
  838. xvfmadd.d D11, U3, U6, D11
  839. xvldrepl.d U7, B0, 0x18
  840. xvfmadd.d D12, U0, U7, D12
  841. xvfmadd.d D13, U1, U7, D13
  842. xvfmadd.d D14, U2, U7, D14
  843. xvfmadd.d D15, U3, U7, D15
  844. PTR_ADDI A0, A0, 0x80
  845. PTR_ADDI B0, B0, 0x20
  846. PTR_ADDI TL, TL, -1
  847. blt ZERO,TL, .L_dgemm_16x4_L71
  848. .L_dgemm_16x4_L0:
  849. // Load C
  850. GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
  851. GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
  852. GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
  853. GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
  854. GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \
  855. U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7, \
  856. U8, U8, D8, U9, U9, D9, U10, U10, D10, U11, U11, D11, \
  857. U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15
  858. .endm
  859. .macro dgemm_1x4
  860. .L_dgemm_1x4: // See dgemm_kernel_16x4.S
  861. xvldrepl.d U0, A0, 0x00
  862. xvld U4, B0, 0x00
  863. xvfmul.d D0, U0, U4
  864. /* Add stride for A0 and B0 */
  865. PTR_ADDI A0, A0, 0x08
  866. PTR_ADDI B0, B0, 0x20
  867. /* Reduce L */
  868. PTR_ADDI L, L, -1
  869. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  870. /* if (TL < 1) goto L_M1_L7 */
  871. beq ZERO,TL, .L_dgemm_1x4_M1_L7
  872. xvldrepl.d U8, A0, 0x00
  873. PTR_ADDI TL, TL, -1
  874. xvld U12, B0, 0x00
  875. PTR_ADDI A0, A0, 0x08
  876. PTR_ADDI B0, B0, 0x20
  877. beq ZERO, TL, .L_dgemm_1x4_M1_TL1_END
  878. .align 5
  879. .L_dgemm_1x4_M1_TL1:
  880. KERNEL8x1x4
  881. PTR_ADDI TL, TL, -1
  882. blt ZERO,TL, .L_dgemm_1x4_M1_TL1
  883. .L_dgemm_1x4_M1_TL1_END:
  884. KERNEL8x1x4_END
  885. .L_dgemm_1x4_M1_L7:
  886. /* if (!(L & 7)) goto L_M1_L0 */
  887. andi TL, L, 7
  888. beq TL, ZERO,.L_dgemm_1x4_M1_L0
  889. .align 5
  890. .L_dgemm_1x4_M1_L71:
  891. xvldrepl.d U0, A0, 0x00
  892. xvld U4, B0, 0x00
  893. xvfmadd.d D0, U0, U4, D0
  894. /* Add stride for A0, B0 */
  895. PTR_ADDI A0, A0, 0x08
  896. PTR_ADDI B0, B0, 0x20
  897. PTR_ADDI TL, TL, -1
  898. blt ZERO,TL, .L_dgemm_1x4_M1_L71
  899. .L_dgemm_1x4_M1_L0:
  900. // Load C
  901. fld.d $f0, C0, 0x00
  902. fld.d $f1, C1, 0x00
  903. fld.d $f2, C2, 0x00
  904. fld.d $f3, C3, 0x00
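/* The four scalar C elements land in $f0-$f3, which alias the low doubles
 * of $xr0-$xr3; assuming U0-U3 name $xr0-$xr3 in the including kernel
 * file, the xvinsve0 inserts below gather c0..c3 into lanes 0-3 of U0 so
 * that a single GSUB produces {c0,c1,c2,c3} - D0. */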
  905. xvinsve0.d U0, U1, 0x01
  906. xvinsve0.d U0, U2, 0x02
  907. xvinsve0.d U0, U3, 0x03
  908. GSUB xvf, d, U0, U0, D0
  909. .endm
  910. .macro dgemm_2x4
  911. .L_dgemm_2x4:
  912. /* Load 2 * 64 from A0 */
  913. xvldrepl.d U0, A0, 0x00
  914. xvldrepl.d U1, A0, 0x08
  915. xvld U4, B0, 0x00
  916. xvfmul.d D0, U0, U4
  917. xvfmul.d D1, U1, U4
  918. /* Add stride for A0 and B0 */
  919. PTR_ADDI A0, A0, 0x10
  920. PTR_ADDI B0, B0, 0x20
  921. /* Reduce L */
  922. PTR_ADDI L, L, -1
  923. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  924. /* if (TL < 1) goto L_M2_L7 */
  925. beq ZERO,TL, .L_dgemm_2x4_M2_L7
  926. xvldrepl.d U8, A0, 0x00
  927. xvldrepl.d U9, A0, 0x08
  928. PTR_ADDI TL, TL, -1
  929. xvld U12, B0, 0x00
  930. PTR_ADDI A0, A0, 0x10
  931. PTR_ADDI B0, B0, 0x20
  932. beq ZERO, TL, .L_dgemm_2x4_M2_TL1_END
  933. .align 5
  934. .L_dgemm_2x4_M2_TL1:
  935. KERNEL8x2x4
  936. PTR_ADDI TL, TL, -1 /* TL-- */
  937. blt ZERO,TL, .L_dgemm_2x4_M2_TL1
  938. .L_dgemm_2x4_M2_TL1_END:
  939. KERNEL8x2x4_END
  940. .L_dgemm_2x4_M2_L7:
  941. /* if (!(L & 7)) goto L_M2_L0 */
  942. andi TL, L, 7
  943. beq TL, ZERO,.L_dgemm_2x4_M2_L0
  944. .align 5
  945. .L_dgemm_2x4_M2_L71:
  946. xvldrepl.d U0, A0, 0x00
  947. xvldrepl.d U1, A0, 0x08
  948. xvld U4, B0, 0x00
  949. xvfmadd.d D0, U0, U4, D0
  950. xvfmadd.d D1, U1, U4, D1
  951. /* Add stride for A0, B0 */
  952. PTR_ADDI A0, A0, 0x10
  953. PTR_ADDI B0, B0, 0x20
  954. PTR_ADDI TL, TL, -1
  955. blt ZERO,TL, .L_dgemm_2x4_M2_L71
  956. .L_dgemm_2x4_M2_L0:
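/* D0 holds the a0 row {a0*b0, a0*b1, a0*b2, a0*b3} and D1 the a1 row;
 * xvpackev.d and xvpackod.d regroup them into column pairs, D4 = {col0,
 * col2} and D5 = {col1, col3}, matching the xvpermi.q-merged C0/C2 and
 * C1/C3 loads below, so one GSUB per register yields the 2x4 C - A*B
 * block. */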
  957. xvpackev.d D4, D1, D0
  958. xvpackod.d D5, D1, D0
  959. /* Load C0 */
  960. xvld U0, C0, 0x00
  961. /* Load C1 */
  962. xvld U1, C1, 0x00
  963. /* Load C2 */
  964. xvld U2, C2, 0x00
  965. /* Load C3 */
  966. xvld U3, C3, 0x00
  967. xvpermi.q U0, U2, 0x02
  968. xvpermi.q U1, U3, 0x02
  969. GSUB xvf, d, U0, U0, D4, U1, U1, D5
  970. .endm
  971. .macro dgemm_4x4
  972. .L_dgemm_4x4:
  973. /* Load 4 * 64 from A0 */
  974. xvld U0, A0, 0x00
  975. xvldrepl.d U4, B0, 0x00
  976. /* line 1 */
  977. xvfmul.d D0, U0, U4
  978. xvldrepl.d U5, B0, 0x08
  979. /* line 2 */
  980. xvfmul.d D4, U0, U5
  981. xvldrepl.d U6, B0, 0x10
  982. /* line 3 */
  983. xvfmul.d D8, U0, U6
  984. xvldrepl.d U7, B0, 0x18
  985. /* line 4 */
  986. xvfmul.d D12, U0, U7
  987. /* Add stride for A0 and B0 */
  988. PTR_ADDI A0, A0, 0x20
  989. PTR_ADDI B0, B0, 0x20
  990. /* Reduce L */
  991. PTR_ADDI L, L, -1
  992. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  993. /* if (TL < 1) goto L_M4_L7 */
  994. beq ZERO,TL, .L_dgemm_4x4_M4_L7
  995. xvld U8, A0, 0x00
  996. PTR_ADDI TL, TL, -1
  997. xvldrepl.d U12, B0, 0x00
  998. xvldrepl.d U13, B0, 0x08
  999. xvldrepl.d U14, B0, 0x10
  1000. xvldrepl.d U15, B0, 0x18
  1001. PTR_ADDI A0, A0, 0x20
  1002. PTR_ADDI B0, B0, 0x20
  1003. beq ZERO, TL, .L_dgemm_4x4_M4_TL1_END
  1004. .align 5
  1005. .L_dgemm_4x4_M4_TL1: /* TL-- */
  1006. KERNEL8x4x4
  1007. PTR_ADDI TL, TL, -1
  1008. blt ZERO,TL, .L_dgemm_4x4_M4_TL1
  1009. .L_dgemm_4x4_M4_TL1_END:
  1010. KERNEL8x4x4_END
  1011. .L_dgemm_4x4_M4_L7:
  1012. /* if (!(L & 7)) goto L_M4_L0 */
  1013. andi TL, L, 7
  1014. beq TL, ZERO,.L_dgemm_4x4_M4_L0
  1015. .align 5
  1016. .L_dgemm_4x4_M4_L71:
  1017. xvld U0, A0, 0x00
  1018. xvldrepl.d U4, B0, 0x00
  1019. xvfmadd.d D0, U0, U4, D0
  1020. xvldrepl.d U4, B0, 0x08
  1021. xvfmadd.d D4, U0, U4, D4
  1022. xvldrepl.d U4, B0, 0x10
  1023. xvfmadd.d D8, U0, U4, D8
  1024. xvldrepl.d U4, B0, 0x18
  1025. xvfmadd.d D12, U0, U4, D12
  1026. /* Add stride for A0, B0 */
  1027. PTR_ADDI A0, A0, 0x20
  1028. PTR_ADDI B0, B0, 0x20
  1029. PTR_ADDI TL, TL, -1
  1030. blt ZERO,TL, .L_dgemm_4x4_M4_L71
  1031. .L_dgemm_4x4_M4_L0:
  1032. /* Load C0 */
  1033. xvld U0, C0, 0x00
  1034. /* Load C1 */
  1035. xvld U1, C1, 0x00
  1036. /* Load C2 */
  1037. xvld U2, C2, 0x00
  1038. /* Load C3 */
  1039. xvld U3, C3, 0x00
  1040. GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12
  1041. .endm
  1042. .macro dgemm_8x4
  1043. .L_dgemm_8x4:
  1044. /* Load 8 * 64 from A0 */
  1045. xvld U0, A0, 0x00
  1046. xvld U1, A0, 0x20
  1047. xvldrepl.d U4, B0, 0x00
  1048. /* line 1 */
  1049. xvfmul.d D0, U0, U4
  1050. xvfmul.d D1, U1, U4
  1051. xvldrepl.d U5, B0, 0x08
  1052. /* line 2 */
  1053. xvfmul.d D4, U0, U5
  1054. xvfmul.d D5, U1, U5
  1055. xvldrepl.d U6, B0, 0x10
  1056. /* line 3 */
  1057. xvfmul.d D8, U0, U6
  1058. xvfmul.d D9, U1, U6
  1059. xvldrepl.d U7, B0, 0x18
  1060. /* line 4 */
  1061. xvfmul.d D12, U0, U7
  1062. xvfmul.d D13, U1, U7
  1063. /* Add stride for A0 and B0 */
  1064. PTR_ADDI A0, A0, 0x40
  1065. PTR_ADDI B0, B0, 0x20
  1066. /* Reduce L */
  1067. PTR_ADDI L, L, -1
  1068. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  1069. /* if (TL < 1) goto L_M8_L7 */
  1070. beq ZERO,TL, .L_dgemm_8x4_M8_L7
  1071. xvld U8, A0, 0x00
  1072. xvld U9, A0, 0x20
  1073. PTR_ADDI TL, TL, -1
  1074. xvldrepl.d U12, B0, 0x00
  1075. xvldrepl.d U13, B0, 0x08
  1076. xvldrepl.d U14, B0, 0x10
  1077. xvldrepl.d U15, B0, 0x18
  1078. PTR_ADDI A0, A0, 0x40
  1079. PTR_ADDI B0, B0, 0x20
  1080. beq ZERO, TL, .L_dgemm_8x4_M8_TL1_END
  1081. .align 5
  1082. .L_dgemm_8x4_M8_TL1: /* TL-- */
  1083. KERNEL8x8x4
  1084. PTR_ADDI TL, TL, -1 /* TL-- */
  1085. blt ZERO,TL, .L_dgemm_8x4_M8_TL1
  1086. .L_dgemm_8x4_M8_TL1_END:
  1087. KERNEL8x8x4_END
  1088. .L_dgemm_8x4_M8_L7:
  1089. /* if (!(L & 7)) goto L_M8_L0 */
  1090. andi TL, L, 7
  1091. beq TL, ZERO,.L_dgemm_8x4_M8_L0
  1092. .align 5
  1093. .L_dgemm_8x4_M8_L71:
  1094. xvld U0, A0, 0x00
  1095. xvld U1, A0, 0x20
  1096. xvldrepl.d U4, B0, 0x00
  1097. xvfmadd.d D0, U0, U4, D0
  1098. xvfmadd.d D1, U1, U4, D1
  1099. xvldrepl.d U5, B0, 0x08
  1100. xvfmadd.d D4, U0, U5, D4
  1101. xvfmadd.d D5, U1, U5, D5
  1102. xvldrepl.d U6, B0, 0x10
  1103. xvfmadd.d D8, U0, U6, D8
  1104. xvfmadd.d D9, U1, U6, D9
  1105. xvldrepl.d U7, B0, 0x18
  1106. xvfmadd.d D12, U0, U7, D12
  1107. xvfmadd.d D13, U1, U7, D13
  1108. /* Add stride for A0, B0 */
  1109. PTR_ADDI A0, A0, 0x40
  1110. PTR_ADDI B0, B0, 0x20
  1111. PTR_ADDI TL, TL, -1
  1112. blt ZERO,TL, .L_dgemm_8x4_M8_L71
  1113. .L_dgemm_8x4_M8_L0:
  1114. /* Load C0 */
  1115. xvld U0, C0, 0x00
  1116. xvld U1, C0, 0x20
  1117. /* Load C1 */
  1118. xvld U2, C1, 0x00
  1119. xvld U3, C1, 0x20
  1120. /* Load C2 */
  1121. xvld U4, C2, 0x00
  1122. xvld U5, C2, 0x20
  1123. /* Load C3 */
  1124. xvld U6, C3, 0x00
  1125. xvld U7, C3, 0x20
  1126. GSUB xvf, d, U0, U0, D0, U1, U1, D1, \
  1127. U2, U2, D4, U3, U3, D5, \
  1128. U4, U4, D8, U5, U5, D9, \
  1129. U6, U6, D12, U7, U7, D13
  1130. .endm
  1131. .macro dgemm_4x2
  1132. .L_dgemm_4x2:
  1133. /* Load 4 * 64 from A0 */
  1134. xvld U0, A0, 0x00
  1135. xvldrepl.d U4, B0, 0x00
  1136. /* line 1 */
  1137. xvfmul.d D0, U0, U4
  1138. xvldrepl.d U5, B0, 0x08
  1139. /* line 2 */
  1140. xvfmul.d D4, U0, U5
  1141. /* Add stride for A0 and B0 */
  1142. PTR_ADDI A0, A0, 0x20
  1143. PTR_ADDI B0, B0, 0x10
  1144. /* Reduce L */
  1145. PTR_ADDI L, L, -1
  1146. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  1147. /* if (TL < 1) goto L_dgemm_4x2_N3_M4_L7 */
  1148. beq ZERO,TL, .L_dgemm_4x2_N3_M4_L7
  1149. xvld U8, A0, 0x00
  1150. PTR_ADDI TL, TL, -1
  1151. xvldrepl.d U12, B0, 0x00
  1152. xvldrepl.d U13, B0, 0x08
  1153. PTR_ADDI A0, A0, 0x20
  1154. PTR_ADDI B0, B0, 0x10
  1155. beq ZERO, TL, .L_dgemm_4x2_N3_M4_TL1_END
  1156. .align 5
  1157. .L_dgemm_4x2_N3_M4_TL1: /* TL-- */
  1158. KERNEL8x4x2
  1159. PTR_ADDI TL, TL, -1 /* TL-- */
  1160. blt ZERO,TL, .L_dgemm_4x2_N3_M4_TL1
  1161. .L_dgemm_4x2_N3_M4_TL1_END:
  1162. KERNEL8x4x2_END
  1163. .L_dgemm_4x2_N3_M4_L7:
  1164. /* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0 */
  1165. andi TL, L, 7
  1166. beq TL, ZERO,.L_dgemm_4x2_N3_M4_L0
  1167. .align 5
  1168. .L_dgemm_4x2_N3_M4_L71:
  1169. xvld U0, A0, 0x00
  1170. xvldrepl.d U4, B0, 0x00
  1171. xvfmadd.d D0, U0, U4, D0
  1172. xvldrepl.d U5, B0, 0x08
  1173. xvfmadd.d D4, U0, U5, D4
  1174. /* Add stride for A0, B0 */
  1175. PTR_ADDI A0, A0, 0x20
  1176. PTR_ADDI B0, B0, 0x10
  1177. PTR_ADDI TL, TL, -1
  1178. blt ZERO,TL, .L_dgemm_4x2_N3_M4_L71
  1179. .L_dgemm_4x2_N3_M4_L0:
  1180. /* Load C0 */
  1181. xvld U0, C0, 0x00
  1182. /* Load C1 */
  1183. xvld U1, C1, 0x00
  1184. GSUB xvf, d, U0, U0, D0, U1, U1, D4
  1185. .endm
  1186. .macro dgemm_2x2
  1187. .L_dgemm_2x2:
  1188. /* Load 2 * 64 from A0 */
  1189. xvld U0, A0, 0x00
  1190. xvldrepl.d U4, B0, 0x00
  1191. /* line 1 */
  1192. xvfmul.d D0, U0, U4
  1193. xvldrepl.d U4, B0, 0x08
  1194. /* line 2 */
  1195. xvfmul.d D4, U0, U4
  1196. /* Add stride for A0 and B0 */
  1197. PTR_ADDI A0, A0, 0x10
  1198. PTR_ADDI B0, B0, 0x10
  1199. /* Reduce L */
  1200. PTR_ADDI L, L, -1
  1201. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  1202. /* if (TL < 1) goto L_dgemm_2x2_N3_M2_L7 */
  1203. beq ZERO,TL, .L_dgemm_2x2_N3_M2_L7
  1204. xvld U8, A0, 0x00
  1205. PTR_ADDI TL, TL, -1
  1206. xvldrepl.d U12, B0, 0x00
  1207. xvldrepl.d U13, B0, 0x08
  1208. PTR_ADDI A0, A0, 0x10
  1209. PTR_ADDI B0, B0, 0x10
  1210. beq ZERO, TL, .L_dgemm_2x2_N3_M2_TL1_END
  1211. .align 5
  1212. .L_dgemm_2x2_N3_M2_TL1: /* TL-- */
  1213. KERNEL8x2x2
  1214. PTR_ADDI TL, TL, -1 /* TL-- */
  1215. blt ZERO,TL, .L_dgemm_2x2_N3_M2_TL1
  1216. .L_dgemm_2x2_N3_M2_TL1_END:
  1217. KERNEL8x2x2_END
  1218. .L_dgemm_2x2_N3_M2_L7:
  1219. /* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0 */
  1220. andi TL, L, 7
  1221. beq TL, ZERO,.L_dgemm_2x2_N3_M2_L0
  1222. .align 5
  1223. .L_dgemm_2x2_N3_M2_L71:
  1224. xvld U0, A0, 0x00
  1225. xvldrepl.d U4, B0, 0x00
  1226. xvfmadd.d D0, U0, U4, D0
  1227. xvldrepl.d U5, B0, 0x08
  1228. xvfmadd.d D4, U0, U5, D4
  1229. /* Add stride for A0, B0 */
  1230. PTR_ADDI A0, A0, 0x10
  1231. PTR_ADDI B0, B0, 0x10
  1232. PTR_ADDI TL, TL, -1
  1233. blt ZERO,TL, .L_dgemm_2x2_N3_M2_L71
  1234. .L_dgemm_2x2_N3_M2_L0:
  1235. /* Load C0 */
  1236. xvld U0, C0, 0x00
  1237. /* Load C1 */
  1238. xvld U1, C1, 0x00
  1239. GSUB xvf, d, U0, U0, D0, U1, U1, D4
  1240. .endm
  1241. .macro dgemm_8x2
  1242. .L_dgemm_8x2:
  1243. /* Load 8 * 64 from A0 */
  1244. xvld U0, A0, 0x00
  1245. xvld U1, A0, 0x20
  1246. xvldrepl.d U4, B0, 0x00
  1247. /* line 1 */
  1248. xvfmul.d D0, U0, U4
  1249. xvfmul.d D1, U1, U4
  1250. xvldrepl.d U5, B0, 0x08
  1251. /* line 2 */
  1252. xvfmul.d D4, U0, U5
  1253. xvfmul.d D5, U1, U5
  1254. /* Add stride for A0 and B0 */
  1255. PTR_ADDI A0, A0, 0x40
  1256. PTR_ADDI B0, B0, 0x10
  1257. /* Reduce L */
  1258. PTR_ADDI L, L, -1
  1259. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  1260. /* if (TL < 1) goto L_dgemm_8x2_N3_M8_L7 */
  1261. beq ZERO,TL, .L_dgemm_8x2_N3_M8_L7
  1262. xvld U8, A0, 0x00
  1263. xvld U9, A0, 0x20
  1264. PTR_ADDI TL, TL, -1
  1265. xvldrepl.d U12, B0, 0x00
  1266. xvldrepl.d U13, B0, 0x08
  1267. PTR_ADDI A0, A0, 0x40
  1268. PTR_ADDI B0, B0, 0x10
  1269. beq ZERO, TL, .L_dgemm_8x2_N3_M8_TL1_END
  1270. .align 5
  1271. .L_dgemm_8x2_N3_M8_TL1: /* TL-- */
  1272. KERNEL8x8x2
  1273. PTR_ADDI TL, TL, -1 /* TL-- */
  1274. blt ZERO,TL, .L_dgemm_8x2_N3_M8_TL1
  1275. .L_dgemm_8x2_N3_M8_TL1_END:
  1276. KERNEL8x8x2_END
  1277. .L_dgemm_8x2_N3_M8_L7:
  1278. /* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0 */
  1279. andi TL, L, 7
  1280. beq TL, ZERO,.L_dgemm_8x2_N3_M8_L0
  1281. .align 5
  1282. .L_dgemm_8x2_N3_M8_L71:
  1283. xvld U0, A0, 0x00
  1284. xvld U1, A0, 0x20
  1285. xvldrepl.d U4, B0, 0x00
  1286. xvfmadd.d D0, U0, U4, D0
  1287. xvfmadd.d D1, U1, U4, D1
  1288. xvldrepl.d U5, B0, 0x08
  1289. xvfmadd.d D4, U0, U5, D4
  1290. xvfmadd.d D5, U1, U5, D5
  1291. /* Add stride for A0, B0 */
  1292. PTR_ADDI A0, A0, 0x40
  1293. PTR_ADDI B0, B0, 0x10
  1294. PTR_ADDI TL, TL, -1
  1295. blt ZERO,TL, .L_dgemm_8x2_N3_M8_L71
  1296. .L_dgemm_8x2_N3_M8_L0:
  1297. /* Load C0 */
  1298. xvld U0, C0, 0x00
  1299. xvld U1, C0, 0x20
  1300. /* Load C1 */
  1301. xvld U2, C1, 0x00
  1302. xvld U3, C1, 0x20
  1303. GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5
  1304. .endm
  1305. .macro dgemm_16x2
  1306. .L_dgemm_16x2:
  1307. /* Load 16 * 64 from A0
  1308. * U0 = {a3, a2, a1, a0}
  1309. * U1 = {a7, a6, a5, a4}
  1310. * U2 = {a11, a10, a9, a8}
  1311. * U3 = {a15, a14, a13, a12}
  1312. */
  1313. xvld U0, A0, 0x00
  1314. xvld U1, A0, 0x20
  1315. xvld U2, A0, 0x40
  1316. xvld U3, A0, 0x60
  1317. xvldrepl.d U4, B0, 0x00
  1318. /* line 1 */
  1319. xvfmul.d D0, U0, U4
  1320. xvfmul.d D1, U1, U4
  1321. xvfmul.d D2, U2, U4
  1322. xvfmul.d D3, U3, U4
  1323. xvldrepl.d U5, B0, 0x08
  1324. /* line 2 */
  1325. xvfmul.d D4, U0, U5
  1326. xvfmul.d D5, U1, U5
  1327. xvfmul.d D6, U2, U5
  1328. xvfmul.d D7, U3, U5
  1329. /* Add stride for A0 and B0 */
  1330. PTR_ADDI A0, A0, 0x80
  1331. PTR_ADDI B0, B0, 0x10
  1332. /* Reduce L */
  1333. PTR_ADDI L, L, -1
  1334. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  1335. /* if (TL < 1) goto L_N3_L7 */
  1336. beq ZERO,TL, .L_dgemm_16x2_N3_L7
  1337. xvld U8, A0, 0x00
  1338. xvld U9, A0, 0x20
  1339. xvld U10, A0, 0x40
  1340. xvld U11, A0, 0x60
  1341. PTR_ADDI TL, TL, -1
  1342. xvldrepl.d U12, B0, 0x00
  1343. xvldrepl.d U13, B0, 0x08
  1344. PTR_ADDI A0, A0, 0x80
  1345. PTR_ADDI B0, B0, 0x10
  1346. beq ZERO, TL, .L_dgemm_16x2_N3_TL1_END
  1347. .align 5
  1348. .L_dgemm_16x2_N3_TL1: /* TL-- */
  1349. KERNEL8x16x2
  1350. PTR_ADDI TL, TL, -1 /* TL-- */
  1351. blt ZERO,TL, .L_dgemm_16x2_N3_TL1
  1352. .L_dgemm_16x2_N3_TL1_END:
  1353. KERNEL8x16x2_END
  1354. .L_dgemm_16x2_N3_L7:
  1355. /* if (!(L & 7)) goto L_dgemm_16x2_N3_L0 */
  1356. andi TL, L, 7
  1357. beq TL, ZERO,.L_dgemm_16x2_N3_L0
  1358. .align 5
  1359. .L_dgemm_16x2_N3_L71:
  1360. /* Load 16 * 64 from A0 */
  1361. xvld U0, A0, 0x00
  1362. xvld U1, A0, 0x20
  1363. xvld U2, A0, 0x40
  1364. xvld U3, A0, 0x60
  1365. xvldrepl.d U4, B0, 0x00
  1366. xvfmadd.d D0, U0, U4, D0
  1367. xvfmadd.d D1, U1, U4, D1
  1368. xvfmadd.d D2, U2, U4, D2
  1369. xvfmadd.d D3, U3, U4, D3
  1370. xvldrepl.d U5, B0, 0x08
  1371. xvfmadd.d D4, U0, U5, D4
  1372. xvfmadd.d D5, U1, U5, D5
  1373. xvfmadd.d D6, U2, U5, D6
  1374. xvfmadd.d D7, U3, U5, D7
  1375. /* Add stride for A0, B0 */
  1376. PTR_ADDI A0, A0, 0x80
  1377. PTR_ADDI B0, B0, 0x10
  1378. PTR_ADDI TL, TL, -1
  1379. blt ZERO,TL, .L_dgemm_16x2_N3_L71
  1380. .L_dgemm_16x2_N3_L0:
  1381. /* Load C0 */
  1382. xvld U0, C0, 0x00
  1383. xvld U1, C0, 0x20
  1384. xvld U2, C0, 0x40
  1385. xvld U3, C0, 0x60
  1386. /* Load C1 */
  1387. xvld U4, C1, 0x00
  1388. xvld U5, C1, 0x20
  1389. xvld U6, C1, 0x40
  1390. xvld U7, C1, 0x60
  1391. GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \
  1392. U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7
  1393. .endm
  1394. .macro dgemm_2x1
  1395. .L_dgemm_2x1:
  1396. /* Load 2 * 64 from A0 */
  1397. xvld U0, A0, 0x00
  1398. xvldrepl.d U4, B0, 0x00
  1399. /* line 1 */
  1400. xvfmul.d D0, U0, U4
  1401. /* Add stride for A0 and B0 */
  1402. PTR_ADDI A0, A0, 0x10
  1403. PTR_ADDI B0, B0, 0x08
  1404. /* Reduce L */
  1405. PTR_ADDI L, L, -1
  1406. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  1407. /* if (TL < 1) goto L_dgemm_2x1_N1_M2_L7 */
  1408. beq ZERO,TL, .L_dgemm_2x1_N1_M2_L7
  1409. xvld U8, A0, 0x00
  1410. PTR_ADDI TL, TL, -1
  1411. xvldrepl.d U12, B0, 0x00
  1412. PTR_ADDI A0, A0, 0x10
  1413. PTR_ADDI B0, B0, 0x08
  1414. beq ZERO, TL, .L_dgemm_2x1_N1_M2_TL1_END
  1415. .align 5
  1416. .L_dgemm_2x1_N1_M2_TL1: /* TL-- */
  1417. KERNEL8x2x1
  1418. PTR_ADDI TL, TL, -1 /* TL-- */
  1419. blt ZERO,TL, .L_dgemm_2x1_N1_M2_TL1
  1420. .L_dgemm_2x1_N1_M2_TL1_END:
  1421. KERNEL8x2x1_END
  1422. .L_dgemm_2x1_N1_M2_L7:
  1423. /* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0 */
  1424. andi TL, L, 7
  1425. beq TL, ZERO,.L_dgemm_2x1_N1_M2_L0
  1426. .align 5
  1427. .L_dgemm_2x1_N1_M2_L71:
  1428. xvld U0, A0, 0x00
  1429. xvldrepl.d U4, B0, 0x00
  1430. xvfmadd.d D0, U0, U4, D0
  1431. /* Add stride for A0, B0 */
  1432. PTR_ADDI A0, A0, 0x10
  1433. PTR_ADDI B0, B0, 0x08
  1434. PTR_ADDI TL, TL, -1
  1435. blt ZERO,TL, .L_dgemm_2x1_N1_M2_L71
  1436. .L_dgemm_2x1_N1_M2_L0:
  1437. /* Load C0 */
  1438. xvld U0, C0, 0x00
  1439. GSUB xvf, d, U0, U0, D0
  1440. .endm
  1441. .macro dgemm_4x1
  1442. .L_dgemm_4x1:
  1443. /* Load 4 * 64 from A0 */
  1444. xvld U0, A0, 0x00
  1445. xvldrepl.d U4, B0, 0x00
  1446. /* line 1 */
  1447. xvfmul.d D0, U0, U4
  1448. /* Add stride for A0 and B0 */
  1449. PTR_ADDI A0, A0, 0x20
  1450. PTR_ADDI B0, B0, 0x08
  1451. /* Reduce L */
  1452. PTR_ADDI L, L, -1
  1453. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  1454. /* if (TL < 1) goto L_dgemm_4x1_N1_M4_L7 */
  1455. beq ZERO,TL, .L_dgemm_4x1_N1_M4_L7
  1456. xvld U8, A0, 0x00
  1457. PTR_ADDI TL, TL, -1
  1458. xvldrepl.d U12, B0, 0x00
  1459. PTR_ADDI A0, A0, 0x20
  1460. PTR_ADDI B0, B0, 0x08
  1461. beq ZERO, TL, .L_dgemm_4x1_N1_M4_TL1_END
  1462. .align 5
  1463. .L_dgemm_4x1_N1_M4_TL1: /* TL-- */
  1464. KERNEL8x4x1
  1465. PTR_ADDI TL, TL, -1 /* TL-- */
  1466. blt ZERO,TL, .L_dgemm_4x1_N1_M4_TL1
  1467. .L_dgemm_4x1_N1_M4_TL1_END:
  1468. KERNEL8x4x1_END
  1469. .L_dgemm_4x1_N1_M4_L7:
  1470. /* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0 */
  1471. andi TL, L, 7
  1472. beq TL, ZERO,.L_dgemm_4x1_N1_M4_L0
  1473. .align 5
  1474. .L_dgemm_4x1_N1_M4_L71:
  1475. xvld U0, A0, 0x00
  1476. xvldrepl.d U4, B0, 0x00
  1477. xvfmadd.d D0, U0, U4, D0
  1478. /* Add stride for A0, B0 */
  1479. PTR_ADDI A0, A0, 0x20
  1480. PTR_ADDI B0, B0, 0x08
  1481. PTR_ADDI TL, TL, -1
  1482. blt ZERO,TL, .L_dgemm_4x1_N1_M4_L71
  1483. .L_dgemm_4x1_N1_M4_L0:
  1484. /* Load C0 */
  1485. xvld U0, C0, 0x00
  1486. GSUB xvf, d, U0, U0, D0
  1487. .endm
  1488. .macro dgemm_8x1
  1489. .L_dgemm_8x1:
  1490. /* Load 8 * 64 from A0 */
  1491. xvld U0, A0, 0x00
  1492. xvld U1, A0, 0x20
  1493. xvldrepl.d U4, B0, 0x00
  1494. /* line 1 */
  1495. xvfmul.d D0, U0, U4
  1496. xvfmul.d D1, U1, U4
  1497. /* Add stride for A0 and B0 */
  1498. PTR_ADDI A0, A0, 0x40
  1499. PTR_ADDI B0, B0, 0x08
  1500. /* Reduce L */
  1501. PTR_ADDI L, L, -1
  1502. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  1503. /* if (TL < 1) goto L_dgemm_8x1_N1_M8_L7 */
  1504. beq ZERO,TL, .L_dgemm_8x1_N1_M8_L7
  1505. xvld U8, A0, 0x00
  1506. xvld U9, A0, 0x20
  1507. PTR_ADDI TL, TL, -1
  1508. xvldrepl.d U12, B0, 0x00
  1509. PTR_ADDI A0, A0, 0x40
  1510. PTR_ADDI B0, B0, 0x08
  1511. beq ZERO, TL, .L_dgemm_8x1_N1_M8_TL1_END
  1512. .align 5
  1513. .L_dgemm_8x1_N1_M8_TL1: /* TL-- */
  1514. KERNEL8x8x1
  1515. PTR_ADDI TL, TL, -1 /* TL-- */
  1516. blt ZERO,TL, .L_dgemm_8x1_N1_M8_TL1
  1517. .L_dgemm_8x1_N1_M8_TL1_END:
  1518. KERNEL8x8x1_END
  1519. .L_dgemm_8x1_N1_M8_L7:
  1520. /* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0 */
  1521. andi TL, L, 7
  1522. beq TL, ZERO,.L_dgemm_8x1_N1_M8_L0
  1523. .align 5
  1524. .L_dgemm_8x1_N1_M8_L71:
  1525. xvld U0, A0, 0x00
  1526. xvld U1, A0, 0x20
  1527. xvldrepl.d U4, B0, 0x00
  1528. xvfmadd.d D0, U0, U4, D0
  1529. xvfmadd.d D1, U1, U4, D1
  1530. /* Add stride for A0, B0 */
  1531. PTR_ADDI A0, A0, 0x40
  1532. PTR_ADDI B0, B0, 0x08
  1533. PTR_ADDI TL, TL, -1
  1534. blt ZERO,TL, .L_dgemm_8x1_N1_M8_L71
  1535. .L_dgemm_8x1_N1_M8_L0:
  1536. /* Load C0 */
  1537. xvld U0, C0, 0x00
  1538. xvld U1, C0, 0x20
  1539. GSUB xvf, d, U0, U0, D0, U1, U1, D1
  1540. .endm
  1541. .macro dgemm_16x1
  1542. .L_dgemm_16x1:
  1543. /* Load 16 * 64 from A0
  1544. * U0 = {a3, a2, a1, a0}
  1545. * U1 = {a7, a6, a5, a4}
  1546. * U2 = {a11, a10, a9, a8}
  1547. * U3 = {a15, a14, a13, a12}
  1548. */
  1549. xvld U0, A0, 0x00
  1550. xvld U1, A0, 0x20
  1551. xvld U2, A0, 0x40
  1552. xvld U3, A0, 0x60
  1553. xvldrepl.d U4, B0, 0x00
  1554. /* line 1 */
  1555. xvfmul.d D0, U0, U4
  1556. xvfmul.d D1, U1, U4
  1557. xvfmul.d D2, U2, U4
  1558. xvfmul.d D3, U3, U4
  1559. /* Add stride for A0 and B0 */
  1560. PTR_ADDI A0, A0, 0x80
  1561. PTR_ADDI B0, B0, 0x08
  1562. /* Reduce L */
  1563. PTR_ADDI L, L, -1
  1564. PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
  1565. /* if (TL < 1) goto L_dgemm_16x1_N1_L7 */
  1566. beq ZERO,TL, .L_dgemm_16x1_N1_L7
  1567. xvld U8, A0, 0x00
  1568. xvld U9, A0, 0x20
  1569. xvld U10, A0, 0x40
  1570. xvld U11, A0, 0x60
  1571. PTR_ADDI TL, TL, -1
  1572. xvldrepl.d U12, B0, 0x00
  1573. PTR_ADDI A0, A0, 0x80
  1574. PTR_ADDI B0, B0, 0x08
  1575. beq ZERO, TL, .L_dgemm_16x1_N1_TL1_END
  1576. .align 5
  1577. .L_dgemm_16x1_N1_TL1: /* TL-- */
  1578. KERNEL8x16x1
  1579. PTR_ADDI TL, TL, -1 /* TL-- */
  1580. blt ZERO,TL, .L_dgemm_16x1_N1_TL1
  1581. .L_dgemm_16x1_N1_TL1_END:
  1582. KERNEL8x16x1_END
  1583. .L_dgemm_16x1_N1_L7:
  1584. /* if (!(L & 7)) goto L_dgemm_16x1_N1_L0 */
  1585. andi TL, L, 7
  1586. beq TL, ZERO,.L_dgemm_16x1_N1_L0
  1587. .align 5
  1588. .L_dgemm_16x1_N1_L71:
  1589. /* Load 16 * 64 from A0 */
  1590. xvld U0, A0, 0x00
  1591. xvld U1, A0, 0x20
  1592. xvld U2, A0, 0x40
  1593. xvld U3, A0, 0x60
  1594. xvldrepl.d U4, B0, 0x00
  1595. xvfmadd.d D0, U0, U4, D0
  1596. xvfmadd.d D1, U1, U4, D1
  1597. xvfmadd.d D2, U2, U4, D2
  1598. xvfmadd.d D3, U3, U4, D3
  1599. /* Add stride for A0, B0 */
  1600. PTR_ADDI A0, A0, 0x80
  1601. PTR_ADDI B0, B0, 0x08
  1602. PTR_ADDI TL, TL, -1
  1603. blt ZERO,TL, .L_dgemm_16x1_N1_L71
  1604. .L_dgemm_16x1_N1_L0:
  1605. /* Load C0 */
  1606. xvld U0, C0, 0x00
  1607. xvld U1, C0, 0x20
  1608. xvld U2, C0, 0x40
  1609. xvld U3, C0, 0x60
  1610. GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3
  1611. .endm
  1612. .macro dgemm_1x2
  1613. .L_dgemm_1x2: // See dgemm_kernel_16x4.S
  1614. /* Load 1 * 64 from A0 */
  1615. xvld U0, A0, 0x00
  1616. xvldrepl.d U4, B0, 0x00
  1617. /* line 1 */
  1618. xvfmul.d D0, U0, U4
  1619. xvldrepl.d U4, B0, 0x08
  1620. /* line 2 */
  1621. xvfmul.d D4, U0, U4
  1622. /* Add stride for A0 and B0 */
  1623. addi.d A0, A0, 0x08
  1624. addi.d B0, B0, 0x10
  1625. /* Reduce L */
  1626. addi.d L, L, -1
  1627. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1628. /* if (TL < 1) goto L_N3_M1_L7 */
  1629. beq ZERO,TL, .L_dgemm_1x2_N3_M1_L7
  1630. xvld U8, A0, 0x00
  1631. addi.d TL, TL, -1
  1632. xvldrepl.d U12, B0, 0x00
  1633. xvldrepl.d U13, B0, 0x08
  1634. addi.d A0, A0, 0x08
  1635. addi.d B0, B0, 0x10
  1636. beq ZERO, TL, .L_dgemm_1x2_N3_M1_TL1_END
  1637. .L_dgemm_1x2_N3_M1_TL1: /* TL-- */
  1638. KERNEL8x1x2
  1639. addi.d TL, TL, -1 /* TL-- */
  1640. blt ZERO,TL, .L_dgemm_1x2_N3_M1_TL1
  1641. .L_dgemm_1x2_N3_M1_TL1_END:
  1642. KERNEL8x1x2_END
  1643. .L_dgemm_1x2_N3_M1_L7:
  1644. /* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0 */
  1645. andi TL, L, 7
  1646. beq TL, ZERO,.L_dgemm_1x2_N3_M1_L0
  1647. .L_dgemm_1x2_N3_M1_L71:
  1648. xvld U0, A0, 0x00
  1649. xvldrepl.d U4, B0, 0x00
  1650. xvfmadd.d D0, U0, U4, D0
  1651. xvldrepl.d U5, B0, 0x08
  1652. xvfmadd.d D4, U0, U5, D4
  1653. /* Add stride for A0, B0 */
  1654. addi.d A0, A0, 0x08
  1655. addi.d B0, B0, 0x10
  1656. addi.d TL, TL, -1
  1657. blt ZERO,TL, .L_dgemm_1x2_N3_M1_L71
  1658. .L_dgemm_1x2_N3_M1_L0:
  1659. xvld U0, C0, 0x00
  1660. xvld U1, C1, 0x00
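/* c0 and c1 are merged into lanes 0-1 of U0, and the two column sums
 * (lane 0 of D0 and of D4) are merged into lanes 0-1 of D0, so a single
 * GSUB leaves c0 - dot0 and c1 - dot1 in the low two lanes of U0. */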
  1661. xvinsve0.d U0, U1, 0x01
  1662. xvinsve0.d D0, D4, 0x01
  1663. GSUB xvf, d, U0, U0, D0
  1664. .endm
  1665. .macro dgemm_1x1
  1666. .L_dgemm_1x1:
  1667. /* Load 1 * 64 from A0 */
  1668. xvld U0, A0, 0x00
  1669. xvldrepl.d U4, B0, 0x00
  1670. /* line 1 */
  1671. xvfmul.d D0, U0, U4
  1672. /* Add stride for A0 and B0 */
  1673. addi.d A0, A0, 0x08
  1674. addi.d B0, B0, 0x08
  1675. /* Reduce L */
  1676. addi.d L, L, -1
  1677. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1678. /* if (TL < 1) goto L_N1_M1_L7 */
  1679. beq ZERO,TL, .L_N1_M1_L7
  1680. xvld U8, A0, 0x00
  1681. addi.d TL, TL, -1
  1682. xvldrepl.d U12, B0, 0x00
  1683. addi.d A0, A0, 0x08
  1684. addi.d B0, B0, 0x08
  1685. beq ZERO, TL, .L_N1_M1_TL1_END
  1686. .L_N1_M1_TL1: /* TL-- */
  1687. KERNEL8x1x1
  1688. addi.d TL, TL, -1 /* TL-- */
  1689. blt ZERO,TL, .L_N1_M1_TL1
  1690. .L_N1_M1_TL1_END:
  1691. KERNEL8x1x1_END
  1692. .L_N1_M1_L7:
  1693. /* if (!(L & 7)) goto L_N1_M1_L0 */
  1694. andi TL, L, 7
  1695. beq TL, ZERO,.L_N1_M1_L0
  1696. .L_N1_M1_L71:
  1697. xvld U0, A0, 0x00
  1698. xvldrepl.d U4, B0, 0x00
  1699. xvfmadd.d D0, U0, U4, D0
  1700. /* Add stride for A0, B0 */
  1701. addi.d A0, A0, 0x08
  1702. addi.d B0, B0, 0x08
  1703. addi.d TL, TL, -1
  1704. blt ZERO,TL, .L_N1_M1_L71
  1705. .L_N1_M1_L0:
  1706. /* Load C0 */
  1707. xvld U0, C0, 0x00
  1708. GSUB xvf, d, U0, U0, D0
  1709. .endm