You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strmm_kernel_4x4_vfpv3.S 32 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/23 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  /* Symbolic names used throughout this kernel: incoming argument registers,
     fp-relative stack slots, working registers and prefetch distances. */
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Bytes of scratch stack the prologue reserves (sub sp, sp, #STACKSIZE). */
  36. #define STACKSIZE 256
  /* Arguments as they arrive in registers; the prologue copies them to the
     M/N/K/A/ALPHA stack slots because r0-r2 are reused as loop counters. */
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA s0
  42. /******************************************************
  43. * [fp, #-128] - [fp, #-32] is reserved
  44. * for store and restore of floating point
  45. * registers
  46. *******************************************************/
  /* Local variables, addressed relative to fp (the prologue sets fp = sp + 24
     after pushing r4-r9 and fp).
     KK   : running offset, seeded from OFFSET (negated when LEFT is undefined).
     KKK  : inner-loop length K1 as saved by the TRMMKERNEL code paths.
     LDC  : ldc scaled to bytes (ldc * 4).
     M, N, K, A : saved copies of the incoming arguments. */
  47. #define KK [fp, #-244 ]
  48. #define KKK [fp, #-248]
  49. #define LDC [fp, #-252 ]
  50. #define M [fp, #-256 ]
  51. #define N [fp, #-260 ]
  52. #define K [fp, #-264 ]
  53. #define A [fp, #-268 ]
  /* Words the prologue fills with integer 0 (FP_ZERO / FP_ZERO_1); the INIT*
     macros load FP_ZERO as 0.0f. FP_ZERO and FP_ZERO_0 name the same slot. */
  54. #define FP_ZERO [fp, #-240]
  55. #define FP_ZERO_0 [fp, # -240]
  56. #define FP_ZERO_1 [fp, # -236]
  /* Saved alpha scalar, loaded by the SAVE* macros. */
  57. #define ALPHA [fp, #-280]
  /* Stack-passed arguments. Without the hard-float calling convention alpha
     arrives in r3 and A is the first stack argument, so B, C, ldc and offset
     sit one word higher than in the hard-float layout. */
  58. #if !defined(__ARM_PCS_VFP)
  59. #define OLD_ALPHA_SOFTFP r3
  60. #define OLD_A_SOFTFP [fp, #4 ]
  61. #define B [fp, #8 ]
  62. #define C [fp, #12 ]
  63. #define OLD_LDC [fp, #16 ]
  64. #define OFFSET [fp, #20 ]
  65. #else
  66. #define B [fp, #4 ]
  67. #define C [fp, #8 ]
  68. #define OLD_LDC [fp, #12 ]
  69. #define OFFSET [fp, #16 ]
  70. #endif
  /* Loop counters. They alias r0-r2, i.e. OLD_M/OLD_N/OLD_K. */
  71. #define I r0
  72. #define J r1
  73. #define L r2
  /* Working pointers: AO/BO walk the A and B data, CO1/CO2 address the C
     output, K1 holds the inner-loop length, BC is the base that BO is reset
     from at the start of each tile. */
  74. #define AO r5
  75. #define BO r6
  76. #define CO1 r8
  77. #define CO2 r9
  78. #define K1 r7
  79. #define BC r12
  /* Prefetch distances in bytes for the pld instructions.
     NOTE(review): C_PRE is not referenced in the visible part of the file. */
  80. #define A_PRE 96
  81. #define B_PRE 96
  82. #define C_PRE 64
  83. /**************************************************************************************
  84. * Macro definitions
  85. **************************************************************************************/
  /* INIT4x4: clear the sixteen accumulators s16-s31. s16 is loaded from the
     FP_ZERO stack slot (zeroed by the prologue) and copied to s17-s31. */
  86. .macro INIT4x4
  87. flds S16, FP_ZERO
  88. vmov.f32 s17, s16
  89. vmov.f32 s18, s16
  90. vmov.f32 s19, s16
  91. vmov.f32 s20, s16
  92. vmov.f32 s21, s16
  93. vmov.f32 s22, s16
  94. vmov.f32 s23, s16
  95. vmov.f32 s24, s16
  96. vmov.f32 s25, s16
  97. vmov.f32 s26, s16
  98. vmov.f32 s27, s16
  99. vmov.f32 s28, s16
  100. vmov.f32 s29, s16
  101. vmov.f32 s30, s16
  102. vmov.f32 s31, s16
  103. .endm
  /* KERNEL4x4_I: opening step of the software-pipelined 4x4 inner loop.
     - Loads four A values into s0-s3 and four B values into s8-s11 and writes
       their products into s16-s31 with fmuls (overwrite, not accumulate), so
       no INIT4x4 is needed before it.
     - Preloads the operands of the following step into s4-s7 (A) and
       s12-s15 (B); KERNEL4x4_M2 / KERNEL4x4_E consume those.
     - AO and BO each advance by 32 bytes (four post-incrementing 2-float loads).
     Accumulator layout: s(16 + 4*j + i) = A value i * B value j.
     Loads are interleaved with the multiplies on purpose; keep the order. */
  104. .macro KERNEL4x4_I
  105. vldmia.f32 AO!, { s0 - s1 }
  106. pld [ AO , #A_PRE-8 ]
  107. vldmia.f32 BO!, { s8 - s9 }
  108. pld [ BO , #B_PRE-8 ]
  109. fmuls s16 , s0, s8
  110. vldmia.f32 AO!, { s2 - s3 }
  111. fmuls s17 , s1, s8
  112. fmuls s18 , s2, s8
  113. vldmia.f32 BO!, { s10 - s11 }
  114. fmuls s19 , s3, s8
  115. fmuls s20 , s0, s9
  // s4-s7: A operands for the next step
  116. vldmia.f32 AO!, { s4 - s5 }
  117. fmuls s21 , s1, s9
  118. fmuls s22 , s2, s9
  119. vldmia.f32 AO!, { s6 - s7 }
  120. fmuls s23 , s3, s9
  121. fmuls s24 , s0, s10
  // s12-s15: B operands for the next step
  122. vldmia.f32 BO!, { s12 - s13 }
  123. fmuls s25 , s1, s10
  124. fmuls s26 , s2, s10
  125. vldmia.f32 BO!, { s14 - s15 }
  126. fmuls s27 , s3, s10
  127. fmuls s28 , s0, s11
  128. fmuls s29 , s1, s11
  129. fmuls s30 , s2, s11
  130. fmuls s31 , s3, s11
  131. .endm
  /* KERNEL4x4_M2: pipelined middle step, "second half".
     Accumulates (fmacs) the operands already held in s4-s7 (A) and s12-s15 (B)
     into s16-s31 while loading the next step's operands into s0-s3 and s8-s11.
     Issues one prefetch for A and one for B. AO and BO each advance 16 bytes.
     Must follow KERNEL4x4_I or KERNEL4x4_M1, which fill s4-s7 / s12-s15. */
  132. .macro KERNEL4x4_M2
  133. pld [ AO , #A_PRE ]
  134. fmacs s16 , s4, s12
  135. fmacs s17 , s5, s12
  136. vldmia.f32 AO!, { s0 - s1 }
  137. fmacs s18 , s6, s12
  138. pld [ BO , #B_PRE ]
  139. fmacs s19 , s7, s12
  140. fmacs s20 , s4, s13
  141. vldmia.f32 AO!, { s2 - s3 }
  142. fmacs s21 , s5, s13
  143. fmacs s22 , s6, s13
  144. vldmia.f32 BO!, { s8 - s9 }
  145. fmacs s23 , s7, s13
  146. fmacs s24 , s4, s14
  147. vldmia.f32 BO!, { s10 - s11 }
  148. fmacs s25 , s5, s14
  149. fmacs s26 , s6, s14
  150. fmacs s27 , s7, s14
  151. fmacs s28 , s4, s15
  152. fmacs s29 , s5, s15
  153. fmacs s30 , s6, s15
  154. fmacs s31 , s7, s15
  155. .endm
  /* KERNEL4x4_M1: pipelined middle step, "first half".
     Accumulates (fmacs) the operands held in s0-s3 (A) and s8-s11 (B) into
     s16-s31 while loading the next step's operands into s4-s7 and s12-s15.
     AO and BO each advance 16 bytes. Must follow KERNEL4x4_M2, which fills
     s0-s3 / s8-s11. */
  156. .macro KERNEL4x4_M1
  157. fmacs s16 , s0, s8
  158. vldmia.f32 AO!, { s4 - s5 }
  159. fmacs s17 , s1, s8
  160. fmacs s18 , s2, s8
  161. vldmia.f32 AO!, { s6 - s7 }
  162. fmacs s19 , s3, s8
  163. fmacs s20 , s0, s9
  164. vldmia.f32 BO!, { s12 - s13 }
  165. fmacs s21 , s1, s9
  166. fmacs s22 , s2, s9
  167. vldmia.f32 BO!, { s14 - s15 }
  168. fmacs s23 , s3, s9
  169. fmacs s24 , s0, s10
  170. fmacs s25 , s1, s10
  171. fmacs s26 , s2, s10
  172. fmacs s27 , s3, s10
  173. fmacs s28 , s0, s11
  174. fmacs s29 , s1, s11
  175. fmacs s30 , s2, s11
  176. fmacs s31 , s3, s11
  177. .endm
  /* KERNEL4x4_E: closing step of the pipelined 4x4 loop.
     Accumulates the operands preloaded into s4-s7 (A) and s12-s15 (B) into
     s16-s31 and performs no loads, so AO and BO are left unchanged. */
  178. .macro KERNEL4x4_E
  179. fmacs s16 , s4, s12
  180. fmacs s17 , s5, s12
  181. fmacs s18 , s6, s12
  182. fmacs s19 , s7, s12
  183. fmacs s20 , s4, s13
  184. fmacs s21 , s5, s13
  185. fmacs s22 , s6, s13
  186. fmacs s23 , s7, s13
  187. fmacs s24 , s4, s14
  188. fmacs s25 , s5, s14
  189. fmacs s26 , s6, s14
  190. fmacs s27 , s7, s14
  191. fmacs s28 , s4, s15
  192. fmacs s29 , s5, s15
  193. fmacs s30 , s6, s15
  194. fmacs s31 , s7, s15
  195. .endm
  /* KERNEL4x4_SUB: one self-contained (non-pipelined) 4x4 step, used for the
     K % 8 remainder. Loads four A values (s0-s3) and four B values (s8-s11)
     with fixed offsets, accumulates all 16 products into s16-s31, then
     advances AO and BO by 16 bytes each. The accumulators must already hold
     valid values (from INIT4x4 or from the pipelined kernels). */
  196. .macro KERNEL4x4_SUB
  197. flds s8 , [ BO ]
  198. pld [ BO , #B_PRE ]
  199. flds s0 , [ AO ]
  200. pld [ AO , #A_PRE ]
  201. flds s1 , [ AO, #4 ]
  202. fmacs s16 , s0, s8
  203. flds s2 , [ AO, #8 ]
  204. fmacs s17 , s1, s8
  205. flds s3 , [ AO, #12 ]
  206. fmacs s18 , s2, s8
  207. flds s9 , [ BO, #4 ]
  208. fmacs s19 , s3, s8
  209. flds s10, [ BO, #8 ]
  210. fmacs s20 , s0, s9
  211. flds s11, [ BO, #12 ]
  212. fmacs s21 , s1, s9
  213. fmacs s22 , s2, s9
  214. fmacs s23 , s3, s9
  215. fmacs s24 , s0, s10
  216. fmacs s25 , s1, s10
  217. fmacs s26 , s2, s10
  218. fmacs s27 , s3, s10
  219. fmacs s28 , s0, s11
  220. fmacs s29 , s1, s11
  // pointer bumps are interleaved with the last two multiply-accumulates
  221. add AO , AO, #16
  222. fmacs s30 , s2, s11
  223. add BO , BO, #16
  224. fmacs s31 , s3, s11
  225. .endm
  /* SAVE4x4: scale the 4x4 accumulator tile by alpha and store it to C.
     C is only written, never read: each element becomes alpha * accumulator.
     Four groups of four floats are stored at CO1, CO1+LDC, CO1+2*LDC and
     CO1+3*LDC (LDC is the byte stride kept on the stack).
     On exit CO1 has advanced by 16 bytes (four floats).
     Clobbers r3, r4, CO2, s0 and s8-s15. */
  226. .macro SAVE4x4
  227. ldr r3 , LDC
  // CO2 = CO1 + LDC
  228. add CO2 , CO1, r3
  229. flds s0, ALPHA
  // r4 = CO1 + 2*LDC
  230. add r4 , CO2, r3
  231. fmuls s8 , s0 , s16
  232. fmuls s9 , s0 , s17
  233. fmuls s10, s0 , s18
  234. fmuls s11, s0 , s19
  235. fmuls s12, s0 , s20
  236. fsts s8 , [CO1]
  237. fmuls s13, s0 , s21
  238. fsts s9 , [CO1, #4 ]
  239. fmuls s14, s0 , s22
  240. fsts s10, [CO1, #8 ]
  241. fmuls s15, s0 , s23
  242. fsts s11, [CO1, #12 ]
  // s8-s11 are reused for the third group once their stores have been issued
  243. fmuls s8 , s0 , s24
  244. fsts s12, [CO2]
  245. fmuls s9 , s0 , s25
  246. fsts s13, [CO2, #4 ]
  247. fmuls s10, s0 , s26
  248. fsts s14, [CO2, #8 ]
  249. fmuls s11, s0 , s27
  250. fsts s15, [CO2, #12 ]
  // CO2 now = CO1 + 3*LDC
  251. add CO2, r4 , r3
  252. fsts s8 , [r4 ]
  253. fmuls s12, s0 , s28
  254. fsts s9 , [r4 , #4 ]
  255. fmuls s13, s0 , s29
  256. fsts s10, [r4 , #8 ]
  257. fmuls s14, s0 , s30
  258. fsts s11, [r4 , #12 ]
  259. fmuls s15, s0 , s31
  // no writeback on CO2: stores four floats in one instruction
  260. vstmia.f32 CO2, { s12 - s15 }
  261. add CO1, CO1, #16
  262. .endm
  263. /******************************************************************************/
  /* INIT2x4: clear the eight accumulators used by the 2x4 tile
     (s16, s17, s20, s21, s24, s25, s28, s29) using 0.0f from FP_ZERO. */
  264. .macro INIT2x4
  265. flds S16, FP_ZERO
  266. vmov.f32 s17, s16
  267. vmov.f32 s20, s16
  268. vmov.f32 s21, s16
  269. vmov.f32 s24, s16
  270. vmov.f32 s25, s16
  271. vmov.f32 s28, s16
  272. vmov.f32 s29, s16
  273. .endm
  /* KERNEL2x4_SUB: one step of the 2x4 tile. Loads four B values (s8-s11)
     and two A values (s0-s1), accumulates the eight products into
     s16/s17, s20/s21, s24/s25, s28/s29, then advances AO by 8 bytes and BO
     by 16 bytes. */
  274. .macro KERNEL2x4_SUB
  275. flds s8 , [ BO ]
  276. flds s9 , [ BO, #4 ]
  277. flds s10, [ BO, #8 ]
  278. flds s11, [ BO, #12 ]
  279. flds s0 , [ AO ]
  280. flds s1 , [ AO, #4 ]
  281. fmacs s16 , s0, s8
  282. fmacs s17 , s1, s8
  283. fmacs s20 , s0, s9
  284. fmacs s21 , s1, s9
  285. fmacs s24 , s0, s10
  286. fmacs s25 , s1, s10
  287. fmacs s28 , s0, s11
  288. fmacs s29 , s1, s11
  289. add AO , AO, #8
  290. add BO , BO, #16
  291. .endm
  /* SAVE2x4: scale the 2x4 accumulator tile by alpha and store it to C
     (write-only, C is not read). Two floats are stored at each of CO1,
     CO1+LDC, CO1+2*LDC and CO1+3*LDC. CO1 advances by 8 bytes.
     Clobbers r3, r4, CO2, s0, s8, s9, s12, s13. */
  292. .macro SAVE2x4
  293. ldr r3 , LDC
  // CO2 = CO1 + LDC, r4 = CO1 + 2*LDC
  294. add CO2 , CO1, r3
  295. add r4 , CO2, r3
  296. flds s0, ALPHA
  297. fmuls s8 , s0 , s16
  298. fmuls s9 , s0 , s17
  299. fsts s8 , [CO1]
  300. fsts s9 , [CO1, #4 ]
  301. fmuls s12, s0 , s20
  302. fmuls s13, s0 , s21
  303. fsts s12, [CO2]
  304. fsts s13, [CO2, #4 ]
  305. fmuls s8 , s0 , s24
  306. fmuls s9 , s0 , s25
  307. fsts s8 , [r4 ]
  308. fsts s9 , [r4 , #4 ]
  // CO2 = CO1 + 3*LDC
  309. add CO2, r4 , r3
  310. fmuls s12, s0 , s28
  311. fmuls s13, s0 , s29
  312. fsts s12, [CO2]
  313. fsts s13, [CO2, #4 ]
  314. add CO1, CO1, #8
  315. .endm
  316. /******************************************************************************/
  /* INIT1x4: clear the four accumulators used by the 1x4 tile
     (s16, s20, s24, s28) using 0.0f from FP_ZERO. */
  317. .macro INIT1x4
  318. flds S16, FP_ZERO
  319. vmov.f32 s20, s16
  320. vmov.f32 s24, s16
  321. vmov.f32 s28, s16
  322. .endm
  /* KERNEL1x4_SUB: one step of the 1x4 tile. Loads four B values (s8-s11)
     and one A value (s0), accumulates the four products into s16, s20, s24,
     s28, then advances AO by 4 bytes and BO by 16 bytes. */
  323. .macro KERNEL1x4_SUB
  324. flds s8 , [ BO ]
  325. flds s9 , [ BO, #4 ]
  326. flds s10, [ BO, #8 ]
  327. flds s11, [ BO, #12 ]
  328. flds s0 , [ AO ]
  329. fmacs s16 , s0, s8
  330. fmacs s20 , s0, s9
  331. fmacs s24 , s0, s10
  332. fmacs s28 , s0, s11
  333. add AO , AO, #4
  334. add BO , BO, #16
  335. .endm
  /* SAVE1x4: scale the 1x4 accumulator tile by alpha and store it to C
     (write-only). One float is stored at each of CO1, CO1+LDC, CO1+2*LDC and
     CO1+3*LDC. CO1 advances by 4 bytes.
     Clobbers r3, r4, CO2, s0, s8, s12. */
  336. .macro SAVE1x4
  337. ldr r3 , LDC
  // CO2 = CO1 + LDC, r4 = CO1 + 2*LDC
  338. add CO2 , CO1, r3
  339. add r4 , CO2, r3
  340. flds s0, ALPHA
  341. fmuls s8 , s0 , s16
  342. fsts s8 , [CO1]
  343. fmuls s12, s0 , s20
  344. fsts s12, [CO2]
  345. fmuls s8 , s0 , s24
  346. fsts s8 , [r4 ]
  // CO2 = CO1 + 3*LDC
  347. add CO2, r4 , r3
  348. fmuls s12, s0 , s28
  349. fsts s12, [CO2]
  350. add CO1, CO1, #4
  351. .endm
  352. /******************************************************************************/
  353. /******************************************************************************/
  /* INIT4x2: clear the eight accumulators used by the 4x2 tile (s16-s23)
     using 0.0f from FP_ZERO. */
  354. .macro INIT4x2
  355. flds S16, FP_ZERO
  356. vmov.f32 s17, s16
  357. vmov.f32 s18, s16
  358. vmov.f32 s19, s16
  359. vmov.f32 s20, s16
  360. vmov.f32 s21, s16
  361. vmov.f32 s22, s16
  362. vmov.f32 s23, s16
  363. .endm
  /* KERNEL4x2_SUB: one step of the 4x2 tile. Loads two B values (s8-s9) and
     four A values (s0-s3), accumulates the eight products into s16-s19
     (B value 0) and s20-s23 (B value 1), then advances AO by 16 bytes and BO
     by 8 bytes. */
  364. .macro KERNEL4x2_SUB
  365. flds s8 , [ BO ]
  366. flds s9 , [ BO, #4 ]
  367. flds s0 , [ AO ]
  368. flds s1 , [ AO, #4 ]
  369. flds s2 , [ AO, #8 ]
  370. flds s3 , [ AO, #12 ]
  371. fmacs s16 , s0, s8
  372. fmacs s17 , s1, s8
  373. fmacs s18 , s2, s8
  374. fmacs s19 , s3, s8
  375. fmacs s20 , s0, s9
  376. fmacs s21 , s1, s9
  377. fmacs s22 , s2, s9
  378. fmacs s23 , s3, s9
  379. add AO , AO, #16
  380. add BO , BO, #8
  381. .endm
  /* SAVE4x2: scale the 4x2 accumulator tile by alpha and store it to C
     (write-only). Four floats go to CO1 and four to CO1+LDC. CO1 advances by
     16 bytes. Clobbers r3, CO2, s0 and s8-s15. */
  382. .macro SAVE4x2
  383. ldr r3 , LDC
  // CO2 = CO1 + LDC
  384. add CO2 , CO1, r3
  385. flds s0, ALPHA
  386. fmuls s8 , s0 , s16
  387. fmuls s9 , s0 , s17
  388. fmuls s10, s0 , s18
  389. fmuls s11, s0 , s19
  390. fsts s8 , [CO1]
  391. fsts s9 , [CO1, #4 ]
  392. fsts s10, [CO1, #8 ]
  393. fsts s11, [CO1, #12 ]
  394. fmuls s12, s0 , s20
  395. fmuls s13, s0 , s21
  396. fmuls s14, s0 , s22
  397. fmuls s15, s0 , s23
  398. fsts s12, [CO2]
  399. fsts s13, [CO2, #4 ]
  400. fsts s14, [CO2, #8 ]
  401. fsts s15, [CO2, #12 ]
  402. add CO1, CO1, #16
  403. .endm
  404. /******************************************************************************/
  /* INIT2x2: clear the four accumulators used by the 2x2 tile
     (s16, s17, s20, s21) using 0.0f from FP_ZERO. */
  405. .macro INIT2x2
  406. flds S16, FP_ZERO
  407. vmov.f32 s17, s16
  408. vmov.f32 s20, s16
  409. vmov.f32 s21, s16
  410. .endm
  /* KERNEL2x2_SUB: one step of the 2x2 tile. Loads two B values (s8-s9) and
     two A values (s0-s1), accumulates the four products into s16, s17, s20,
     s21, then advances AO and BO by 8 bytes each. */
  411. .macro KERNEL2x2_SUB
  412. flds s8 , [ BO ]
  413. flds s9 , [ BO, #4 ]
  414. flds s0 , [ AO ]
  415. flds s1 , [ AO, #4 ]
  416. fmacs s16 , s0, s8
  417. fmacs s17 , s1, s8
  418. fmacs s20 , s0, s9
  419. fmacs s21 , s1, s9
  420. add AO , AO, #8
  421. add BO , BO, #8
  422. .endm
  /* SAVE2x2: scale the 2x2 accumulator tile by alpha and store it to C
     (write-only). Two floats go to CO1 and two to CO1+LDC. CO1 advances by
     8 bytes. Clobbers r3, CO2, s0, s8, s9, s12, s13. */
  423. .macro SAVE2x2
  424. ldr r3 , LDC
  // CO2 = CO1 + LDC
  425. add CO2 , CO1, r3
  426. flds s0, ALPHA
  427. fmuls s8 , s0 , s16
  428. fmuls s9 , s0 , s17
  429. fsts s8 , [CO1]
  430. fsts s9 , [CO1, #4 ]
  431. fmuls s12, s0 , s20
  432. fmuls s13, s0 , s21
  433. fsts s12, [CO2]
  434. fsts s13, [CO2, #4 ]
  435. add CO1, CO1, #8
  436. .endm
  437. /******************************************************************************/
  /* INIT1x2: clear the two accumulators used by the 1x2 tile (s16, s20)
     using 0.0f from FP_ZERO. */
  438. .macro INIT1x2
  439. flds S16, FP_ZERO
  440. vmov.f32 s20, s16
  441. .endm
  /* KERNEL1x2_SUB: one step of the 1x2 tile. Loads two B values (s8-s9) and
     one A value (s0), accumulates the two products into s16 and s20, then
     advances AO by 4 bytes and BO by 8 bytes. */
  442. .macro KERNEL1x2_SUB
  443. flds s8 , [ BO ]
  444. flds s9 , [ BO, #4 ]
  445. flds s0 , [ AO ]
  446. fmacs s16 , s0, s8
  447. fmacs s20 , s0, s9
  448. add AO , AO, #4
  449. add BO , BO, #8
  450. .endm
  /* SAVE1x2: scale the 1x2 accumulator tile by alpha and store it to C
     (write-only). One float goes to CO1 and one to CO1+LDC. CO1 advances by
     4 bytes. Clobbers r3, CO2, s0, s8, s12. */
  451. .macro SAVE1x2
  452. ldr r3 , LDC
  // CO2 = CO1 + LDC
  453. add CO2 , CO1, r3
  454. flds s0, ALPHA
  455. fmuls s8 , s0 , s16
  456. fsts s8 , [CO1]
  457. fmuls s12, s0 , s20
  458. fsts s12, [CO2]
  459. add CO1, CO1, #4
  460. .endm
  461. /******************************************************************************/
  462. /******************************************************************************/
  /* INIT4x1: clear the four accumulators used by the 4x1 tile (s16-s19)
     using 0.0f from FP_ZERO. */
  463. .macro INIT4x1
  464. flds S16, FP_ZERO
  465. vmov.f32 s17, s16
  466. vmov.f32 s18, s16
  467. vmov.f32 s19, s16
  468. .endm
  /* KERNEL4x1_SUB: one step of the 4x1 tile. Loads one B value (s8) and four
     A values (s0-s3), accumulates the four products into s16-s19, then
     advances AO by 16 bytes and BO by 4 bytes. */
  469. .macro KERNEL4x1_SUB
  470. flds s8 , [ BO ]
  471. flds s0 , [ AO ]
  472. flds s1 , [ AO, #4 ]
  473. flds s2 , [ AO, #8 ]
  474. flds s3 , [ AO, #12 ]
  475. fmacs s16 , s0, s8
  476. fmacs s17 , s1, s8
  477. fmacs s18 , s2, s8
  478. fmacs s19 , s3, s8
  479. add AO , AO, #16
  480. add BO , BO, #4
  481. .endm
  /* SAVE4x1: scale the 4x1 accumulator tile by alpha and store four floats
     at CO1 (write-only). LDC is not needed because only one line of C is
     touched. CO1 advances by 16 bytes. Clobbers s0 and s8-s11. */
  482. .macro SAVE4x1
  483. flds s0, ALPHA
  484. fmuls s8 , s0 , s16
  485. fmuls s9 , s0 , s17
  486. fmuls s10, s0 , s18
  487. fmuls s11, s0 , s19
  488. fsts s8 , [CO1]
  489. fsts s9 , [CO1, #4 ]
  490. fsts s10, [CO1, #8 ]
  491. fsts s11, [CO1, #12 ]
  492. add CO1, CO1, #16
  493. .endm
  494. /******************************************************************************/
  /* INIT2x1: clear the two accumulators used by the 2x1 tile (s16, s17)
     using 0.0f from FP_ZERO. */
  495. .macro INIT2x1
  496. flds S16, FP_ZERO
  497. vmov.f32 s17, s16
  498. .endm
  /* KERNEL2x1_SUB: one step of the 2x1 tile. Loads one B value (s8) and two
     A values (s0-s1), accumulates the two products into s16 and s17, then
     advances AO by 8 bytes and BO by 4 bytes. */
  499. .macro KERNEL2x1_SUB
  500. flds s8 , [ BO ]
  501. flds s0 , [ AO ]
  502. flds s1 , [ AO, #4 ]
  503. fmacs s16 , s0, s8
  504. fmacs s17 , s1, s8
  505. add AO , AO, #8
  506. add BO , BO, #4
  507. .endm
  /* SAVE2x1: scale the 2x1 accumulator tile by alpha and store two floats at
     CO1 (write-only). CO1 advances by 8 bytes. Clobbers s0, s8, s9. */
  508. .macro SAVE2x1
  509. flds s0, ALPHA
  510. fmuls s8 , s0 , s16
  511. fmuls s9 , s0 , s17
  512. fsts s8 , [CO1]
  513. fsts s9 , [CO1, #4 ]
  514. add CO1, CO1, #8
  515. .endm
  516. /******************************************************************************/
  /* INIT1x1: clear the single accumulator s16 by loading 0.0f from FP_ZERO. */
  517. .macro INIT1x1
  518. flds S16, FP_ZERO
  519. .endm
  /* KERNEL1x1_SUB: one step of the 1x1 tile. Loads one B value (s8) and one
     A value (s0), accumulates their product into s16, then advances AO and
     BO by 4 bytes each. */
  520. .macro KERNEL1x1_SUB
  521. flds s8 , [ BO ]
  522. flds s0 , [ AO ]
  523. fmacs s16 , s0, s8
  524. add AO , AO, #4
  525. add BO , BO, #4
  526. .endm
  /* SAVE1x1: scale the single accumulator s16 by alpha and store it at CO1
     (write-only). CO1 advances by 4 bytes. Clobbers s0 and s8. */
  527. .macro SAVE1x1
  528. flds s0, ALPHA
  529. fmuls s8 , s0 , s16
  530. fsts s8 , [CO1]
  531. add CO1, CO1, #4
  532. .endm
  533. /**************************************************************************************
  534. * End of macro definitions
  535. **************************************************************************************/
  536. PROLOGUE
  537. .align 5
  538. push {r4 - r9, fp}
  539. add fp, sp, #24
  540. sub sp, sp, #STACKSIZE // reserve stack
  541. #if !defined(__ARM_PCS_VFP)
  542. vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
  543. ldr OLD_A, OLD_A_SOFTFP
  544. #endif
  545. str OLD_M, M
  546. str OLD_N, N
  547. str OLD_K, K
  548. str OLD_A, A
  549. vstr OLD_ALPHA, ALPHA
  550. sub r3, fp, #128
  551. vstm r3, { s8 - s31} // store floating point registers
  552. movs r4, #0
  553. str r4, FP_ZERO
  554. str r4, FP_ZERO_1
  555. ldr r3, OLD_LDC
  556. lsl r3, r3, #2 // ldc = ldc * 4
  557. str r3, LDC
  558. ldr r3, OFFSET
  559. #ifndef LEFT
  560. neg r3 , r3
  561. #endif
  562. str r3 , KK
  563. ldr BC, B
  564. ldr J, N
  565. asrs J, J, #2 // J = J / 4
  566. ble _L2_BEGIN
  567. _L4_BEGIN:
  568. ldr CO1, C // CO1 = C
  569. ldr r4 , LDC
  570. lsl r4 , r4 , #2 // LDC * 4
  571. add r3 , r4, CO1
  572. str r3 , C // store C
  573. #if defined(LEFT)
  574. ldr r3 , OFFSET
  575. str r3 , KK
  576. #endif
  577. ldr AO, A // AO = A
  578. pld [AO , #A_PRE-64]
  579. pld [AO , #A_PRE-32]
  580. _L4_M4_BEGIN:
  581. ldr I, M
  582. asrs I, I, #2 // I = I / 4
  583. ble _L4_M2_BEGIN
  584. _L4_M4_20:
  585. #if (defined(LEFT) && defined(TRANSA)) || \
  586. (!defined(LEFT) && !defined(TRANSA))
  587. mov BO, BC
  588. #else
  589. mov BO, BC
  590. ldr r3 , KK
  591. lsls r4 , r3 , #4 // 4 float values
  592. add BO , BO , r4
  593. lsls r4 , r3 , #4 // 4 float values
  594. add AO , AO , r4
  595. #endif
  596. #ifndef TRMMKERNEL
  597. ldr K1, K
  598. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  599. ldr K1, K
  600. ldr r3, KK
  601. sub K1, K1, r3
  602. str K1, KKK
  603. #else
  604. ldr K1, KK
  605. #ifdef LEFT
  606. add K1, K1, #4 // number of values in AO
  607. #else
  608. add K1, K1, #4 // number of values in BO
  609. #endif
  610. str K1, KKK
  611. #endif
  612. asrs L , K1, #3 // L = L / 8
  613. cmp L , #3
  614. blt _L4_M4_30
  615. .align 5
  616. KERNEL4x4_I
  617. KERNEL4x4_M2
  618. KERNEL4x4_M1
  619. KERNEL4x4_M2
  620. KERNEL4x4_M1
  621. KERNEL4x4_M2
  622. KERNEL4x4_M1
  623. KERNEL4x4_M2
  624. sub L, L, #2
  625. _L4_M4_22:
  626. KERNEL4x4_M1
  627. KERNEL4x4_M2
  628. KERNEL4x4_M1
  629. KERNEL4x4_M2
  630. KERNEL4x4_M1
  631. KERNEL4x4_M2
  632. KERNEL4x4_M1
  633. KERNEL4x4_M2
  634. subs L, L, #1
  635. bgt _L4_M4_22
  636. KERNEL4x4_M1
  637. KERNEL4x4_M2
  638. KERNEL4x4_M1
  639. KERNEL4x4_M2
  640. KERNEL4x4_M1
  641. KERNEL4x4_M2
  642. KERNEL4x4_M1
  643. KERNEL4x4_E
  644. b _L4_M4_44
  645. _L4_M4_30:
  646. tst L, #3
  647. ble _L4_M4_40
  648. tst L, #2
  649. ble _L4_M4_32
  650. KERNEL4x4_I
  651. KERNEL4x4_M2
  652. KERNEL4x4_M1
  653. KERNEL4x4_M2
  654. KERNEL4x4_M1
  655. KERNEL4x4_M2
  656. KERNEL4x4_M1
  657. KERNEL4x4_M2
  658. KERNEL4x4_M1
  659. KERNEL4x4_M2
  660. KERNEL4x4_M1
  661. KERNEL4x4_M2
  662. KERNEL4x4_M1
  663. KERNEL4x4_M2
  664. KERNEL4x4_M1
  665. KERNEL4x4_E
  666. b _L4_M4_44
  667. _L4_M4_32:
  668. tst L, #1
  669. ble _L4_M4_40
  670. KERNEL4x4_I
  671. KERNEL4x4_M2
  672. KERNEL4x4_M1
  673. KERNEL4x4_M2
  674. KERNEL4x4_M1
  675. KERNEL4x4_M2
  676. KERNEL4x4_M1
  677. KERNEL4x4_E
  678. b _L4_M4_44
  679. _L4_M4_40:
  680. INIT4x4
  681. _L4_M4_44:
  682. ands L , K1, #7 // L = L % 8
  683. ble _L4_M4_100
  684. _L4_M4_46:
  685. KERNEL4x4_SUB
  686. subs L, L, #1
  687. bne _L4_M4_46
  688. _L4_M4_100:
  689. SAVE4x4
  690. #if (defined(LEFT) && defined(TRANSA)) || \
  691. (!defined(LEFT) && !defined(TRANSA))
  692. ldr r3 , K
  693. ldr r4 , KKK
  694. sub r3 , r3 , r4
  695. lsls r4 , r3 , #4 // 4 float values
  696. add BO , BO , r4
  697. lsls r4 , r3 , #4 // 4 float values
  698. add AO , AO , r4
  699. #endif
  700. #if defined(LEFT)
  701. ldr r3 , KK
  702. add r3 , r3 , #4 // number of values in AO
  703. str r3 , KK
  704. #endif
  705. _L4_M4_END:
  706. subs I, I, #1
  707. bne _L4_M4_20
  708. _L4_M2_BEGIN:
  709. ldr I, M
  710. tst I , #3
  711. ble _L4_END
  712. tst I, #2 // I = I / 2
  713. ble _L4_M1_BEGIN
  714. _L4_M2_20:
  715. INIT2x4
  716. #if (defined(LEFT) && defined(TRANSA)) || \
  717. (!defined(LEFT) && !defined(TRANSA))
  718. mov BO, BC
  719. #else
  720. mov BO, BC
  721. ldr r3 , KK
  722. lsls r4 , r3 , #4 // 4 float values
  723. add BO , BO , r4
  724. lsls r4 , r3 , #3 // 2 float values
  725. add AO , AO , r4
  726. #endif
  727. #ifndef TRMMKERNEL
  728. ldr K1, K
  729. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  730. ldr K1, K
  731. ldr r3, KK
  732. sub K1, K1, r3
  733. str K1, KKK
  734. #else
  735. ldr K1, KK
  736. #ifdef LEFT
  737. add K1, K1, #2 // number of values in AO
  738. #else
  739. add K1, K1, #4 // number of values in BO
  740. #endif
  741. str K1, KKK
  742. #endif
  743. asrs L , K1, #3 // L = L / 8
  744. ble _L4_M2_40
  745. _L4_M2_22:
  746. KERNEL2x4_SUB
  747. KERNEL2x4_SUB
  748. KERNEL2x4_SUB
  749. KERNEL2x4_SUB
  750. KERNEL2x4_SUB
  751. KERNEL2x4_SUB
  752. KERNEL2x4_SUB
  753. KERNEL2x4_SUB
  754. subs L, L, #1
  755. bgt _L4_M2_22
  756. _L4_M2_40:
  757. ands L , K1, #7 // L = L % 8
  758. ble _L4_M2_100
  759. _L4_M2_42:
  760. KERNEL2x4_SUB
  761. subs L, L, #1
  762. bgt _L4_M2_42
  763. _L4_M2_100:
  764. SAVE2x4
  765. #if (defined(LEFT) && defined(TRANSA)) || \
  766. (!defined(LEFT) && !defined(TRANSA))
  767. ldr r3 , K
  768. ldr r4 , KKK
  769. sub r3 , r3 , r4
  770. lsls r4 , r3 , #4 // 4 float values
  771. add BO , BO , r4
  772. lsls r4 , r3 , #3 // 2 float values
  773. add AO , AO , r4
  774. #endif
  775. #if defined(LEFT)
  776. ldr r3 , KK
  777. add r3 , r3 , #2 // number of values in AO
  778. str r3 , KK
  779. #endif
  780. _L4_M2_END:
  781. _L4_M1_BEGIN:
  782. tst I, #1 // I = I % 2
  783. ble _L4_END
  784. _L4_M1_20:
  785. INIT1x4
  786. #if (defined(LEFT) && defined(TRANSA)) || \
  787. (!defined(LEFT) && !defined(TRANSA))
  788. mov BO, BC
  789. #else
  790. mov BO, BC
  791. ldr r3 , KK
  792. lsls r4 , r3 , #4 // 4 float values
  793. add BO , BO , r4
  794. lsls r4 , r3 , #2 // 1 float value
  795. add AO , AO , r4
  796. #endif
  797. #ifndef TRMMKERNEL
  798. ldr K1, K
  799. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  800. ldr K1, K
  801. ldr r3, KK
  802. sub K1, K1, r3
  803. str K1, KKK
  804. #else
  805. ldr K1, KK
  806. #ifdef LEFT
  807. add K1, K1, #1 // number of values in AO
  808. #else
  809. add K1, K1, #4 // number of values in BO
  810. #endif
  811. str K1, KKK
  812. #endif
  813. asrs L , K1, #3 // L = L / 8
  814. ble _L4_M1_40
  815. _L4_M1_22:
  816. KERNEL1x4_SUB
  817. KERNEL1x4_SUB
  818. KERNEL1x4_SUB
  819. KERNEL1x4_SUB
  820. KERNEL1x4_SUB
  821. KERNEL1x4_SUB
  822. KERNEL1x4_SUB
  823. KERNEL1x4_SUB
  824. subs L, L, #1
  825. bgt _L4_M1_22
  826. _L4_M1_40:
  827. ands L , K1, #7 // L = L % 8
  828. ble _L4_M1_100
  829. _L4_M1_42:
  830. KERNEL1x4_SUB
  831. subs L, L, #1
  832. bgt _L4_M1_42
  833. _L4_M1_100:
  834. SAVE1x4
  835. #if (defined(LEFT) && defined(TRANSA)) || \
  836. (!defined(LEFT) && !defined(TRANSA))
  837. ldr r3 , K
  838. ldr r4 , KKK
  839. sub r3 , r3 , r4
  840. lsls r4 , r3 , #4 // 4 float values
  841. add BO , BO , r4
  842. lsls r4 , r3 , #2 // 1 float value
  843. add AO , AO , r4
  844. #endif
  845. #if defined(LEFT)
  846. ldr r3 , KK
  847. add r3 , r3 , #1 // number of values in AO
  848. str r3 , KK
  849. #endif
  850. _L4_END:
  851. mov r3, BC
  852. ldr r4, K
  853. lsl r4, r4, #4 // k * 4 * 4
  854. add r3, r3, r4 // B = B + K * 4 * 4
  855. mov BC, r3
  856. #if !defined(LEFT)
  857. ldr r3 , KK
  858. add r3 , r3 , #4 // number of values in BO
  859. str r3 , KK
  860. #endif
  861. subs J , #1 // j--
  862. bgt _L4_BEGIN
  863. /*********************************************************************************************/
  864. _L2_BEGIN:
  865. ldr J , N
  866. tst J , #3
  867. ble _L999
  868. tst J , #2
  869. ble _L1_BEGIN
  870. ldr CO1, C // CO1 = C
  871. ldr r4 , LDC
  872. lsl r4 , r4 , #1 // LDC * 2
  873. add r3 , r4, CO1
  874. str r3 , C // store C
  875. #if defined(LEFT)
  876. ldr r3 , OFFSET
  877. str r3 , KK
  878. #endif
  879. ldr AO, A // AO = A
  880. //pld [AO , #A_PRE-96]
  881. //pld [AO , #A_PRE-64]
  882. //pld [AO , #A_PRE-32]
  // 4x2 tiles (per the macro names): repeat M / 4 times
  //   INIT4x2, K1 inner steps of KERNEL4x2_SUB (unrolled by 8 plus a
  //   remainder loop), SAVE4x2, then the TRMM pointer / KK bookkeeping.
  //
  // Flag note: ASRS and ANDS update N and Z but leave V unchanged, while
  // BLE tests (Z == 1 || N != V). To avoid depending on a stale V flag,
  // the shift is followed by an explicit CMP (which defines V) and the
  // ANDS result, which cannot be negative with mask #7, is tested with BEQ.
  _L2_M4_BEGIN:
      ldr     I, M
      asr     I, I, #2                // I = M / 4
      cmp     I, #0
      ble     _L2_M2_BEGIN            // no full group of 4 rows

  _L2_M4_20:
      INIT4x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      mov     BO, BC
  #else
      mov     BO, BC
      ldr     r3, KK
      lsls    r4, r3, #3              // 2 float values
      add     BO, BO, r4              // BO = BC + KK * 8
      lsls    r4, r3, #4              // 4 float values
      add     AO, AO, r4              // AO += KK * 16
  #endif

      // K1 = number of inner steps for this tile; also kept in KKK for TRMM.
  #ifndef TRMMKERNEL
      ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
      ldr     K1, K
      ldr     r3, KK
      sub     K1, K1, r3              // K1 = K - KK
      str     K1, KKK
  #else
      ldr     K1, KK
  #ifdef LEFT
      add     K1, K1, #4              // number of values in AO
  #else
      add     K1, K1, #2              // number of values in BO
  #endif
      str     K1, KKK
  #endif

      asr     L, K1, #3               // L = K1 / 8
      cmp     L, #0
      ble     _L2_M4_40               // fewer than 8 steps: remainder only

      .align 5
  _L2_M4_22:
      KERNEL4x2_SUB
      KERNEL4x2_SUB
      KERNEL4x2_SUB
      KERNEL4x2_SUB
      KERNEL4x2_SUB
      KERNEL4x2_SUB
      KERNEL4x2_SUB
      KERNEL4x2_SUB

      subs    L, L, #1
      bgt     _L2_M4_22

  _L2_M4_40:
      ands    L, K1, #7               // L = K1 % 8
      beq     _L2_M4_100              // no remainder steps

  _L2_M4_42:
      KERNEL4x2_SUB

      subs    L, L, #1
      bgt     _L2_M4_42

  _L2_M4_100:
      SAVE4x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      ldr     r3, K
      ldr     r4, KKK
      sub     r3, r3, r4              // r3 = K - KKK
      lsls    r4, r3, #3              // 2 float values
      add     BO, BO, r4
      lsls    r4, r3, #4              // 4 float values
      add     AO, AO, r4
  #endif

  #if defined(LEFT)
      ldr     r3, KK
      add     r3, r3, #4              // number of values in AO
      str     r3, KK
  #endif

  _L2_M4_END:
      subs    I, I, #1
      bgt     _L2_M4_20
  // 2x2 tile (per the macro names): executed once when bit 1 of M is set.
  // Leaves to _L2_END when (M & 3) == 0 and to _L2_M1_BEGIN when bit 1 is
  // clear. I is reloaded with M here; the bit tests do not divide it.
  //
  // Flag note: TST, ANDS and ASRS update N and Z but leave V unchanged,
  // while BLE tests (Z == 1 || N != V). The bit tests therefore use BEQ,
  // and the shifted loop count is checked with an explicit CMP.
  _L2_M2_BEGIN:
      ldr     I, M
      tst     I, #3
      beq     _L2_END                 // (M & 3) == 0: no leftover rows

      tst     I, #2                   // 2-row tile present?
      beq     _L2_M1_BEGIN

  _L2_M2_20:
      INIT2x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      mov     BO, BC
  #else
      mov     BO, BC
      ldr     r3, KK
      lsls    r4, r3, #3              // 2 float values
      add     BO, BO, r4              // BO = BC + KK * 8
      lsls    r4, r3, #3              // 2 float values
      add     AO, AO, r4              // AO += KK * 8
  #endif

      // K1 = number of inner steps for this tile; also kept in KKK for TRMM.
  #ifndef TRMMKERNEL
      ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
      ldr     K1, K
      ldr     r3, KK
      sub     K1, K1, r3              // K1 = K - KK
      str     K1, KKK
  #else
      ldr     K1, KK
  #ifdef LEFT
      add     K1, K1, #2              // number of values in AO
  #else
      add     K1, K1, #2              // number of values in BO
  #endif
      str     K1, KKK
  #endif

      asr     L, K1, #3               // L = K1 / 8
      cmp     L, #0
      ble     _L2_M2_40               // fewer than 8 steps: remainder only

  _L2_M2_22:
      KERNEL2x2_SUB
      KERNEL2x2_SUB
      KERNEL2x2_SUB
      KERNEL2x2_SUB
      KERNEL2x2_SUB
      KERNEL2x2_SUB
      KERNEL2x2_SUB
      KERNEL2x2_SUB

      subs    L, L, #1
      bgt     _L2_M2_22

  _L2_M2_40:
      ands    L, K1, #7               // L = K1 % 8
      beq     _L2_M2_100              // no remainder steps

  _L2_M2_42:
      KERNEL2x2_SUB

      subs    L, L, #1
      bgt     _L2_M2_42

  _L2_M2_100:
      SAVE2x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      ldr     r3, K
      ldr     r4, KKK
      sub     r3, r3, r4              // r3 = K - KKK
      lsls    r4, r3, #3              // 2 float values
      add     BO, BO, r4
      lsls    r4, r3, #3              // 2 float values
      add     AO, AO, r4
  #endif

  #if defined(LEFT)
      ldr     r3, KK
      add     r3, r3, #2              // number of values in AO
      str     r3, KK
  #endif

  _L2_M2_END:
  // 1x2 tile (per the macro names): executed once when bit 0 of I is set.
  // NOTE(review): I is expected to still hold M as loaded at _L2_M2_BEGIN;
  // confirm that the INIT/KERNEL/SAVE macros do not modify it.
  //
  // Flag note: TST, ANDS and ASRS update N and Z but leave V unchanged,
  // while BLE tests (Z == 1 || N != V). The bit tests therefore use BEQ,
  // and the shifted loop count is checked with an explicit CMP.
  _L2_M1_BEGIN:
      tst     I, #1                   // odd row left over?
      beq     _L2_END

  _L2_M1_20:
      INIT1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      mov     BO, BC
  #else
      mov     BO, BC
      ldr     r3, KK
      lsls    r4, r3, #3              // 2 float values
      add     BO, BO, r4              // BO = BC + KK * 8
      lsls    r4, r3, #2              // 1 float value
      add     AO, AO, r4              // AO += KK * 4
  #endif

      // K1 = number of inner steps for this tile; also kept in KKK for TRMM.
  #ifndef TRMMKERNEL
      ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
      ldr     K1, K
      ldr     r3, KK
      sub     K1, K1, r3              // K1 = K - KK
      str     K1, KKK
  #else
      ldr     K1, KK
  #ifdef LEFT
      add     K1, K1, #1              // number of values in AO
  #else
      add     K1, K1, #2              // number of values in BO
  #endif
      str     K1, KKK
  #endif

      asr     L, K1, #3               // L = K1 / 8
      cmp     L, #0
      ble     _L2_M1_40               // fewer than 8 steps: remainder only

  _L2_M1_22:
      KERNEL1x2_SUB
      KERNEL1x2_SUB
      KERNEL1x2_SUB
      KERNEL1x2_SUB
      KERNEL1x2_SUB
      KERNEL1x2_SUB
      KERNEL1x2_SUB
      KERNEL1x2_SUB

      subs    L, L, #1
      bgt     _L2_M1_22

  _L2_M1_40:
      ands    L, K1, #7               // L = K1 % 8
      beq     _L2_M1_100              // no remainder steps

  _L2_M1_42:
      KERNEL1x2_SUB

      subs    L, L, #1
      bgt     _L2_M1_42

  _L2_M1_100:
      SAVE1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      ldr     r3, K
      ldr     r4, KKK
      sub     r3, r3, r4              // r3 = K - KKK
      lsls    r4, r3, #3              // 2 float values
      add     BO, BO, r4
      lsls    r4, r3, #2              // 1 float value
      add     AO, AO, r4
  #endif

  #if defined(LEFT)
      ldr     r3, KK
      add     r3, r3, #1              // number of values in AO
      str     r3, KK
  #endif
  // End of the 2-column panel: move the B cursor past it and, when LEFT is
  // not defined, add the panel width to KK. Falls through afterwards.
  _L2_END:
      ldr     r4, K
      lsl     r4, r4, #3              // r4 = K * 2 * 4
      mov     r3, BC
      add     r3, r3, r4              // B = B + K * 2 * 4
      mov     BC, r3

  #if !defined(LEFT)
      ldr     r3, KK
      add     r3, r3, #2              // KK += 2 (number of values in BO)
      str     r3, KK
  #endif
  1110. /*********************************************************************************************/
  // Entry for a final single column: taken only when bit 0 of N is set,
  // otherwise control goes to _L999. Sets up CO1 / C / KK / AO.
  //
  // Flag note: TST updates N and Z but leaves V unchanged. BLE tests
  // (Z == 1 || N != V), so it would depend on a stale V flag. The intent
  // here is a plain zero test, so BEQ is used.
  _L1_BEGIN:
      ldr     J, N
      tst     J, #1
      beq     _L999                   // (N & 1) == 0: no single column left

      ldr     CO1, C                  // CO1 = C
      ldr     r4, LDC
      add     r3, r4, CO1
      str     r3, C                   // store C = CO1 + LDC

  #if defined(LEFT)
      ldr     r3, OFFSET
      str     r3, KK                  // KK = OFFSET
  #endif

      ldr     AO, A                   // AO = A
      //pld [AO , #A_PRE-96]
      //pld [AO , #A_PRE-64]
      //pld [AO , #A_PRE-32]
  // 4x1 tiles (per the macro names): repeat M / 4 times
  //   INIT4x1, K1 inner steps of KERNEL4x1_SUB (unrolled by 8 plus a
  //   remainder loop), SAVE4x1, then the TRMM pointer / KK bookkeeping.
  //
  // Flag note: ASRS and ANDS update N and Z but leave V unchanged, while
  // BLE tests (Z == 1 || N != V). To avoid depending on a stale V flag,
  // the shift is followed by an explicit CMP (which defines V) and the
  // ANDS result, which cannot be negative with mask #7, is tested with BEQ.
  _L1_M4_BEGIN:
      ldr     I, M
      asr     I, I, #2                // I = M / 4
      cmp     I, #0
      ble     _L1_M2_BEGIN            // no full group of 4 rows

  _L1_M4_20:
      INIT4x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      mov     BO, BC
  #else
      mov     BO, BC
      ldr     r3, KK
      lsls    r4, r3, #2              // 1 float value
      add     BO, BO, r4              // BO = BC + KK * 4
      lsls    r4, r3, #4              // 4 float values
      add     AO, AO, r4              // AO += KK * 16
  #endif

      // K1 = number of inner steps for this tile; also kept in KKK for TRMM.
  #ifndef TRMMKERNEL
      ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
      ldr     K1, K
      ldr     r3, KK
      sub     K1, K1, r3              // K1 = K - KK
      str     K1, KKK
  #else
      ldr     K1, KK
  #ifdef LEFT
      add     K1, K1, #4              // number of values in AO
  #else
      add     K1, K1, #1              // number of values in BO
  #endif
      str     K1, KKK
  #endif

      asr     L, K1, #3               // L = K1 / 8
      cmp     L, #0
      ble     _L1_M4_40               // fewer than 8 steps: remainder only

      .align 5
  _L1_M4_22:
      KERNEL4x1_SUB
      KERNEL4x1_SUB
      KERNEL4x1_SUB
      KERNEL4x1_SUB
      KERNEL4x1_SUB
      KERNEL4x1_SUB
      KERNEL4x1_SUB
      KERNEL4x1_SUB

      subs    L, L, #1
      bgt     _L1_M4_22

  _L1_M4_40:
      ands    L, K1, #7               // L = K1 % 8
      beq     _L1_M4_100              // no remainder steps

  _L1_M4_42:
      KERNEL4x1_SUB

      subs    L, L, #1
      bgt     _L1_M4_42

  _L1_M4_100:
      SAVE4x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      ldr     r3, K
      ldr     r4, KKK
      sub     r3, r3, r4              // r3 = K - KKK
      lsls    r4, r3, #2              // 1 float value
      add     BO, BO, r4
      lsls    r4, r3, #4              // 4 float values
      add     AO, AO, r4
  #endif

  #if defined(LEFT)
      ldr     r3, KK
      add     r3, r3, #4              // number of values in AO
      str     r3, KK
  #endif

  _L1_M4_END:
      subs    I, I, #1
      bgt     _L1_M4_20
  // 2x1 tile (per the macro names): executed once when bit 1 of M is set.
  // Leaves to _L1_END when (M & 3) == 0 and to _L1_M1_BEGIN when bit 1 is
  // clear. I is reloaded with M here; the bit tests do not divide it.
  //
  // Flag note: TST, ANDS and ASRS update N and Z but leave V unchanged,
  // while BLE tests (Z == 1 || N != V). The bit tests therefore use BEQ,
  // and the shifted loop count is checked with an explicit CMP.
  _L1_M2_BEGIN:
      ldr     I, M
      tst     I, #3
      beq     _L1_END                 // (M & 3) == 0: no leftover rows

      tst     I, #2                   // 2-row tile present?
      beq     _L1_M1_BEGIN

  _L1_M2_20:
      INIT2x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      mov     BO, BC
  #else
      mov     BO, BC
      ldr     r3, KK
      lsls    r4, r3, #2              // 1 float value
      add     BO, BO, r4              // BO = BC + KK * 4
      lsls    r4, r3, #3              // 2 float values
      add     AO, AO, r4              // AO += KK * 8
  #endif

      // K1 = number of inner steps for this tile; also kept in KKK for TRMM.
  #ifndef TRMMKERNEL
      ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
      ldr     K1, K
      ldr     r3, KK
      sub     K1, K1, r3              // K1 = K - KK
      str     K1, KKK
  #else
      ldr     K1, KK
  #ifdef LEFT
      add     K1, K1, #2              // number of values in AO
  #else
      add     K1, K1, #1              // number of values in BO
  #endif
      str     K1, KKK
  #endif

      asr     L, K1, #3               // L = K1 / 8
      cmp     L, #0
      ble     _L1_M2_40               // fewer than 8 steps: remainder only

  _L1_M2_22:
      KERNEL2x1_SUB
      KERNEL2x1_SUB
      KERNEL2x1_SUB
      KERNEL2x1_SUB
      KERNEL2x1_SUB
      KERNEL2x1_SUB
      KERNEL2x1_SUB
      KERNEL2x1_SUB

      subs    L, L, #1
      bgt     _L1_M2_22

  _L1_M2_40:
      ands    L, K1, #7               // L = K1 % 8
      beq     _L1_M2_100              // no remainder steps

  _L1_M2_42:
      KERNEL2x1_SUB

      subs    L, L, #1
      bgt     _L1_M2_42

  _L1_M2_100:
      SAVE2x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      ldr     r3, K
      ldr     r4, KKK
      sub     r3, r3, r4              // r3 = K - KKK
      lsls    r4, r3, #2              // 1 float value
      add     BO, BO, r4
      lsls    r4, r3, #3              // 2 float values
      add     AO, AO, r4
  #endif

  #if defined(LEFT)
      ldr     r3, KK
      add     r3, r3, #2              // number of values in AO
      str     r3, KK
  #endif

  _L1_M2_END:
  // 1x1 tile (per the macro names): executed once when bit 0 of I is set.
  // No pointer / KK bookkeeping follows SAVE1x1 in this block.
  // NOTE(review): I is expected to still hold M as loaded at _L1_M2_BEGIN;
  // confirm that the INIT/KERNEL/SAVE macros do not modify it.
  //
  // Flag note: TST, ANDS and ASRS update N and Z but leave V unchanged,
  // while BLE tests (Z == 1 || N != V). The bit tests therefore use BEQ,
  // and the shifted loop count is checked with an explicit CMP.
  _L1_M1_BEGIN:
      tst     I, #1                   // odd row left over?
      beq     _L1_END

  _L1_M1_20:
      INIT1x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
      mov     BO, BC
  #else
      mov     BO, BC
      ldr     r3, KK
      lsls    r4, r3, #2              // 1 float value
      add     BO, BO, r4              // BO = BC + KK * 4
      lsls    r4, r3, #2              // 1 float value
      add     AO, AO, r4              // AO += KK * 4
  #endif

      // K1 = number of inner steps for this tile; also kept in KKK for TRMM.
  #ifndef TRMMKERNEL
      ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
      ldr     K1, K
      ldr     r3, KK
      sub     K1, K1, r3              // K1 = K - KK
      str     K1, KKK
  #else
      ldr     K1, KK
  #ifdef LEFT
      add     K1, K1, #1              // number of values in AO
  #else
      add     K1, K1, #1              // number of values in BO
  #endif
      str     K1, KKK
  #endif

      asr     L, K1, #3               // L = K1 / 8
      cmp     L, #0
      ble     _L1_M1_40               // fewer than 8 steps: remainder only

  _L1_M1_22:
      KERNEL1x1_SUB
      KERNEL1x1_SUB
      KERNEL1x1_SUB
      KERNEL1x1_SUB
      KERNEL1x1_SUB
      KERNEL1x1_SUB
      KERNEL1x1_SUB
      KERNEL1x1_SUB

      subs    L, L, #1
      bgt     _L1_M1_22

  _L1_M1_40:
      ands    L, K1, #7               // L = K1 % 8
      beq     _L1_M1_100              // no remainder steps

  _L1_M1_42:
      KERNEL1x1_SUB

      subs    L, L, #1
      bgt     _L1_M1_42

  _L1_M1_100:
      SAVE1x1
  // Common exit: reload s8-s31 from the area at fp - 128, unwind the stack
  // to fp - 24, pop r4-r9 and fp, and return 0 in r0.
  _L1_END:
  _L999:
      sub     r3, fp, #128
      vldm    r3, {s8 - s31}          // restore floating point registers

      sub     sp, fp, #24
      pop     {r4 - r9, fp}
      movs    r0, #0                  // set return value
      bx      lr

      EPILOGUE