You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dtrmm_kernel_4x4_vfpv3.S 34 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/23 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Stack frame layout, argument locations and register aliases for this kernel. */
  /* STACKSIZE is the byte count the prologue subtracts from sp after pushing r4-r9 and fp. */
  36. #define STACKSIZE 252
  /* Registers read on entry; the prologue spills them to the M, N, K, A and ALPHA slots. */
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA d0
  42. /******************************************************
  43. * [fp, #-128] - [fp, #-64] is reserved
  44. * for store and restore of floating point
  45. * registers
  46. *******************************************************/
  /* Local variable slots, addressed relative to fp (fp = sp + 24 right after the push). */
  /* KK is seeded from OFFSET by the prologue; KKK receives the inner loop length of a tile. */
  47. #define KK [fp, #-240 ]
  48. #define KKK [fp, #-244]
  49. #define C [fp, #-248 ]
  /* LDC holds the stride already scaled to bytes (the prologue stores ldc * 8). */
  50. #define LDC [fp, #-252 ]
  51. #define M [fp, #-256 ]
  52. #define N [fp, #-260 ]
  53. #define K [fp, #-264 ]
  54. #define A [fp, #-268 ]
  /* Eight byte slot that the prologue fills with two zero words, so fldd from it yields 0.0. */
  /* FP_ZERO and FP_ZERO_0 name the same low word; FP_ZERO_1 is the high word. */
  55. #define FP_ZERO [fp, #-236]
  56. #define FP_ZERO_0 [fp, #-236]
  57. #define FP_ZERO_1 [fp, #-232]
  /* With 28 bytes pushed and STACKSIZE 252, sp equals fp - 276, so ALPHA sits at the frame bottom. */
  58. #define ALPHA [fp, #-276 ]
  /* Caller stack arguments. [fp, #4] is the first word above the seven pushed registers. */
  /* Without the VFP calling convention the prologue also reloads alpha and A from the stack. */
  59. #if !defined(__ARM_PCS_VFP)
  60. #define OLD_ALPHA_SOFTFP [fp, #4]
  61. #define OLD_A_SOFTFP [fp, #12 ]
  62. #define B [fp, #16 ]
  63. #define OLD_C [fp, #20 ]
  64. #define OLD_LDC [fp, #24 ]
  65. #define OFFSET [fp, #28 ]
  66. #else
  67. #define B [fp, #4 ]
  68. #define OLD_C [fp, #8 ]
  69. #define OLD_LDC [fp, #12 ]
  70. #define OFFSET [fp, #16 ]
  71. #endif
  /* Loop counters. I, J and L reuse r0-r2, which is safe only after OLD_M, OLD_N, OLD_K are spilled. */
  72. #define I r0
  73. #define J r1
  74. #define L r2
  /* Working pointers: AO and BO are the read cursors of the kernels, CO1 and CO2 the store cursors. */
  75. #define AO r5
  76. #define BO r6
  77. #define CO1 r8
  78. #define CO2 r9
  /* K1 keeps the loop length so the remainder can be derived after L was shifted. */
  79. #define K1 r7
  /* BC holds the base of the current B block; BO is reset from it for every tile. */
  80. #define BC r12
  /* Byte offsets added to AO and BO in the pld prefetch hints. */
  /* C_PRE is not referenced in the part of the file visible here. */
  81. #define A_PRE 64
  82. #define B_PRE 64
  83. #define C_PRE 64
  84. /**************************************************************************************
  85. * Macro definitions
  86. **************************************************************************************/
  /* INIT4x4: clear the sixteen 4x4 accumulators d16-d31. */
  /* Loads 0.0 from the FP_ZERO stack slot into d16 and copies it to d17-d31. */
  /* Invoked at _L4_M4_40, the path taken when the unrolled main loop is skipped. */
  87. .macro INIT4x4
  88. fldd d16, FP_ZERO // d16 = 0.0, source for all copies below
  89. vmov.f64 d17, d16
  90. vmov.f64 d18, d16
  91. vmov.f64 d19, d16
  92. vmov.f64 d20, d16
  93. vmov.f64 d21, d16
  94. vmov.f64 d22, d16
  95. vmov.f64 d23, d16
  96. vmov.f64 d24, d16
  97. vmov.f64 d25, d16
  98. vmov.f64 d26, d16
  99. vmov.f64 d27, d16
  100. vmov.f64 d28, d16
  101. vmov.f64 d29, d16
  102. vmov.f64 d30, d16
  103. vmov.f64 d31, d16
  104. .endm
  /* KERNEL4x4_I: first step of the software pipelined 4x4 loop. */
  /* Loads d0-d3 from AO and d8-d11 from BO and writes the 16 products into d16-d31 */
  /* with fmuld, which overwrites the accumulators, so no INIT4x4 is needed beforehand. */
  /* It also preloads d4-d7 (AO) and d12-d15 (BO) for the following KERNEL4x4_M1. */
  /* Net effect on pointers: AO advances 64 bytes, BO advances 64 bytes. */
  /* Accumulator layout: d16-d19 use d8, d20-d23 use d9, d24-d27 use d10, d28-d31 use d11. */
  105. .macro KERNEL4x4_I
  106. pld [ BO , #B_PRE ]
  107. fldd d8 , [ BO ]
  108. pld [ AO , #A_PRE ]
  109. vldmia.f64 AO!, { d0 - d1}
  110. fmuld d16 , d0, d8
  111. vldmia.f64 AO!, { d2 - d3}
  112. fmuld d17 , d1, d8
  113. fldd d9 , [ BO, #8 ]
  114. fmuld d18 , d2, d8
  115. fldd d10, [ BO, #16 ]
  116. fmuld d19 , d3, d8
  117. fldd d11, [ BO, #24 ]
  118. fmuld d20 , d0, d9
  119. fmuld d21 , d1, d9
  120. add BO , BO, #32 // BO now points at the operands of the next step
  121. fmuld d22 , d2, d9
  122. fldd d12, [ BO ] // start preloading the next step (d12-d15, d4-d7)
  123. fmuld d23 , d3, d9
  124. fmuld d24 , d0, d10
  125. vldmia.f64 AO!, { d4 - d5 }
  126. fmuld d25 , d1, d10
  127. fmuld d26 , d2, d10
  128. vldmia.f64 AO!, { d6 - d7 }
  129. fmuld d27 , d3, d10
  130. fldd d13, [ BO, #8 ]
  131. fmuld d28 , d0, d11
  132. fldd d14, [ BO, #16 ]
  133. fmuld d29 , d1, d11
  134. fldd d15, [ BO, #24 ]
  135. fmuld d30 , d2, d11
  136. fmuld d31 , d3, d11
  137. add BO , BO, #32
  138. .endm
  /* KERNEL4x4_S: pipeline start for every pass of the unrolled loop after the first. */
  /* Same load pattern as KERNEL4x4_I, but uses fmacd so the products are added to the */
  /* values already held in d16-d31 instead of replacing them. */
  /* Loads d0-d3 and d8-d11 for this step and preloads d4-d7 and d12-d15 for the next. */
  /* Net effect on pointers: AO advances 64 bytes, BO advances 64 bytes. */
  139. .macro KERNEL4x4_S
  140. pld [ BO , #B_PRE ]
  141. fldd d8 , [ BO ]
  142. pld [ AO , #A_PRE ]
  143. vldmia.f64 AO!, { d0 - d1}
  144. fmacd d16 , d0, d8
  145. vldmia.f64 AO!, { d2 - d3}
  146. fmacd d17 , d1, d8
  147. fldd d9 , [ BO, #8 ]
  148. fmacd d18 , d2, d8
  149. fldd d10, [ BO, #16 ]
  150. fmacd d19 , d3, d8
  151. fldd d11, [ BO, #24 ]
  152. fmacd d20 , d0, d9
  153. fmacd d21 , d1, d9
  154. add BO , BO, #32 // BO now points at the operands of the next step
  155. fmacd d22 , d2, d9
  156. fldd d12, [ BO ] // start preloading the next step (d12-d15, d4-d7)
  157. fmacd d23 , d3, d9
  158. fmacd d24 , d0, d10
  159. vldmia.f64 AO!, { d4 - d5 }
  160. fmacd d25 , d1, d10
  161. fmacd d26 , d2, d10
  162. vldmia.f64 AO!, { d6 - d7 }
  163. fmacd d27 , d3, d10
  164. fldd d13, [ BO, #8 ]
  165. fmacd d28 , d0, d11
  166. fldd d14, [ BO, #16 ]
  167. fmacd d29 , d1, d11
  168. fldd d15, [ BO, #24 ]
  169. fmacd d30 , d2, d11
  170. fmacd d31 , d3, d11
  171. add BO , BO, #32
  172. .endm
  /* KERNEL4x4_M1: middle pipeline step, odd phase. */
  /* Accumulates the 16 products of the preloaded set d4-d7 (from AO) and d12-d15 (from BO) */
  /* while loading the other set, d0-d3 and d8-d11, which KERNEL4x4_M2 consumes. */
  /* Net effect on pointers: AO advances 32 bytes, BO advances 32 bytes. */
  173. .macro KERNEL4x4_M1
  174. fmacd d16 , d4, d12
  175. pld [ AO , #A_PRE ]
  176. fmacd d17 , d5, d12
  177. fmacd d18 , d6, d12
  178. pld [ BO , #B_PRE ]
  179. fmacd d19 , d7, d12
  180. fmacd d20 , d4, d13
  181. fldd d8 , [ BO ]
  182. fmacd d21 , d5, d13
  183. fmacd d22 , d6, d13
  184. vldmia.f64 AO!, { d0 - d1 }
  185. fmacd d23 , d7, d13
  186. fmacd d24 , d4, d14
  187. vldmia.f64 AO!, { d2 - d3 }
  188. fmacd d25 , d5, d14
  189. fldd d9 , [ BO, #8 ]
  190. fmacd d26 , d6, d14
  191. fldd d10, [ BO, #16 ]
  192. fmacd d27 , d7, d14
  193. fldd d11, [ BO, #24 ]
  194. fmacd d28 , d4, d15
  195. fmacd d29 , d5, d15
  196. fmacd d30 , d6, d15
  197. add BO , BO, #32
  198. fmacd d31 , d7, d15
  199. .endm
  /* KERNEL4x4_M2: middle pipeline step, even phase. */
  /* Accumulates the 16 products of d0-d3 (from AO) and d8-d11 (from BO) while loading */
  /* the other set, d4-d7 and d12-d15, which KERNEL4x4_M1 or KERNEL4x4_E consumes. */
  /* Net effect on pointers: AO advances 32 bytes, BO advances 32 bytes. */
  200. .macro KERNEL4x4_M2
  201. fmacd d16 , d0, d8
  202. pld [ AO , #A_PRE ]
  203. fmacd d17 , d1, d8
  204. pld [ BO , #B_PRE ]
  205. fmacd d18 , d2, d8
  206. fldd d12, [ BO ]
  207. fmacd d19 , d3, d8
  208. fmacd d20 , d0, d9
  209. vldmia.f64 AO!, { d4 - d5 }
  210. fmacd d21 , d1, d9
  211. fmacd d22 , d2, d9
  212. vldmia.f64 AO!, { d6 - d7 }
  213. fmacd d23 , d3, d9
  214. fmacd d24 , d0, d10
  215. fmacd d25 , d1, d10
  216. fmacd d26 , d2, d10
  217. fmacd d27 , d3, d10
  218. fldd d13, [ BO, #8 ]
  219. fmacd d28 , d0, d11
  220. fldd d14, [ BO, #16 ]
  221. fmacd d29 , d1, d11
  222. fldd d15, [ BO, #24 ]
  223. fmacd d30 , d2, d11
  224. fmacd d31 , d3, d11
  225. add BO , BO, #32
  226. .endm
  /* KERNEL4x4_E: last step of a pipelined pass; drains the preloaded operands. */
  /* Accumulates the 16 products of d4-d7 and d12-d15 into d16-d31. */
  /* Performs no loads and leaves AO and BO unchanged; only the pld hints touch memory. */
  227. .macro KERNEL4x4_E
  228. fmacd d16 , d4, d12
  229. pld [ AO , #A_PRE ]
  230. fmacd d17 , d5, d12
  231. fmacd d18 , d6, d12
  232. pld [ BO , #B_PRE ]
  233. fmacd d19 , d7, d12
  234. fmacd d20 , d4, d13
  235. fmacd d21 , d5, d13
  236. fmacd d22 , d6, d13
  237. fmacd d23 , d7, d13
  238. fmacd d24 , d4, d14
  239. fmacd d25 , d5, d14
  240. fmacd d26 , d6, d14
  241. fmacd d27 , d7, d14
  242. fmacd d28 , d4, d15
  243. fmacd d29 , d5, d15
  244. fmacd d30 , d6, d15
  245. fmacd d31 , d7, d15
  246. .endm
  /* KERNEL4x4_SUB: one self contained 4x4 step, without pipelining. */
  /* Loads four doubles from AO (d0-d3) and four from BO (d8-d11), then adds the 16 */
  /* products to d16-d31. Used by the remainder loop at _L4_M4_42. */
  /* Net effect on pointers: AO advances 32 bytes, BO advances 32 bytes. */
  247. .macro KERNEL4x4_SUB
  248. pld [ BO , #B_PRE ]
  249. pld [ AO , #A_PRE ]
  250. fldd d8 , [ BO ]
  251. fldd d0 , [ AO ]
  252. fldd d1 , [ AO, #8 ]
  253. fldd d2 , [ AO, #16 ]
  254. fldd d3 , [ AO, #24 ]
  255. fmacd d16 , d0, d8
  256. fldd d9 , [ BO, #8 ]
  257. fmacd d17 , d1, d8
  258. fldd d10, [ BO, #16 ]
  259. fmacd d18 , d2, d8
  260. fldd d11, [ BO, #24 ]
  261. fmacd d19 , d3, d8
  262. fmacd d20 , d0, d9
  263. fmacd d21 , d1, d9
  264. fmacd d22 , d2, d9
  265. fmacd d23 , d3, d9
  266. fmacd d24 , d0, d10
  267. fmacd d25 , d1, d10
  268. fmacd d26 , d2, d10
  269. fmacd d27 , d3, d10
  270. fmacd d28 , d0, d11
  271. fmacd d29 , d1, d11
  272. add AO , AO, #32
  273. fmacd d30 , d2, d11
  274. add BO , BO, #32
  275. fmacd d31 , d3, d11
  276. .endm
  /* SAVE4x4: scale the 4x4 accumulators by alpha and store them through CO1. */
  /* d16-d19 go to CO1, d20-d23 to CO1 + LDC, d24-d27 to CO1 + 2*LDC and d28-d31 to */
  /* CO1 + 3*LDC, where LDC is the byte stride read from the stack slot. */
  /* The destination is never loaded: the scaled products overwrite it. */
  /* Clobbers r3, r4, CO2, d0 and d8-d15. On exit CO1 has advanced by 32 bytes. */
  277. .macro SAVE4x4
  278. ldr r3 , LDC // r3 = stride in bytes
  279. add CO2 , CO1, r3
  280. fldd d0, ALPHA // d0 = alpha
  281. add r4 , CO2, r3 // r4 = CO1 + 2 * stride
  282. fmuld d8 , d0 , d16
  283. fmuld d9 , d0 , d17
  284. fmuld d10, d0 , d18
  285. fmuld d11, d0 , d19
  286. fmuld d12, d0 , d20
  287. fstd d8 , [CO1]
  288. fmuld d13, d0 , d21
  289. fstd d9 , [CO1, #8 ]
  290. fmuld d14, d0 , d22
  291. fstd d10, [CO1, #16 ]
  292. fmuld d15, d0 , d23
  293. fstd d11, [CO1, #24 ]
  294. fmuld d8 , d0 , d24
  295. fstd d12, [CO2]
  296. fmuld d9 , d0 , d25
  297. fstd d13, [CO2, #8 ]
  298. fmuld d10, d0 , d26
  299. fstd d14, [CO2, #16 ]
  300. fmuld d11, d0 , d27
  301. fstd d15, [CO2, #24 ]
  302. add CO2, r4 , r3 // CO2 = CO1 + 3 * stride
  303. fstd d8 , [r4 ]
  304. fmuld d12, d0 , d28
  305. fstd d9 , [r4 , #8 ]
  306. fmuld d13, d0 , d29
  307. fstd d10, [r4 , #16 ]
  308. fmuld d14, d0 , d30
  309. fstd d11, [r4 , #24 ]
  310. fmuld d15, d0 , d31
  311. vstmia.f64 CO2, { d12 - d15 } // four doubles stored at CO2, no writeback
  312. add CO1, CO1, #32
  313. .endm
  314. /******************************************************************************/
  /* INIT2x4: clear the eight accumulators of the 2x4 tile. */
  /* Loads 0.0 from FP_ZERO into d16 and copies it to d17, d20, d21, d24, d25, d28, d29. */
  315. .macro INIT2x4
  316. fldd d16, FP_ZERO
  317. vmov.f64 d17, d16
  318. vmov.f64 d20, d16
  319. vmov.f64 d21, d16
  320. vmov.f64 d24, d16
  321. vmov.f64 d25, d16
  322. vmov.f64 d28, d16
  323. vmov.f64 d29, d16
  324. .endm
  /* KERNEL2x4_SUB: one accumulation step of the 2x4 tile. */
  /* Loads four doubles from BO (d8-d11) and two from AO (d0-d1), then adds the eight */
  /* products to d16/d17, d20/d21, d24/d25 and d28/d29. */
  /* Net effect on pointers: AO advances 16 bytes, BO advances 32 bytes. */
  325. .macro KERNEL2x4_SUB
  326. fldd d8 , [ BO ]
  327. fldd d9 , [ BO, #8 ]
  328. fldd d10, [ BO, #16 ]
  329. fldd d11, [ BO, #24 ]
  330. fldd d0 , [ AO ]
  331. fldd d1 , [ AO, #8 ]
  332. fmacd d16 , d0, d8
  333. fmacd d17 , d1, d8
  334. fmacd d20 , d0, d9
  335. fmacd d21 , d1, d9
  336. fmacd d24 , d0, d10
  337. fmacd d25 , d1, d10
  338. fmacd d28 , d0, d11
  339. fmacd d29 , d1, d11
  340. add AO , AO, #16
  341. add BO , BO, #32
  342. .endm
  /* SAVE2x4: scale the 2x4 accumulators by alpha and store them through CO1. */
  /* d16/d17 go to CO1, d20/d21 to CO1 + LDC, d24/d25 to CO1 + 2*LDC and d28/d29 to */
  /* CO1 + 3*LDC, where LDC is the byte stride read from the stack slot. */
  /* The destination is overwritten without being loaded. */
  /* Clobbers r3, r4, CO2, d0, d8, d9, d12, d13. On exit CO1 has advanced by 16 bytes. */
  343. .macro SAVE2x4
  344. ldr r3 , LDC
  345. add CO2 , CO1, r3
  346. add r4 , CO2, r3 // r4 = CO1 + 2 * stride
  347. fldd d0, ALPHA
  348. fmuld d8 , d0 , d16
  349. fmuld d9 , d0 , d17
  350. fstd d8 , [CO1]
  351. fstd d9 , [CO1, #8 ]
  352. fmuld d12, d0 , d20
  353. fmuld d13, d0 , d21
  354. fstd d12, [CO2]
  355. fstd d13, [CO2, #8 ]
  356. fmuld d8 , d0 , d24
  357. fmuld d9 , d0 , d25
  358. fstd d8 , [r4 ]
  359. fstd d9 , [r4 , #8 ]
  360. add CO2, r4 , r3 // CO2 = CO1 + 3 * stride
  361. fmuld d12, d0 , d28
  362. fmuld d13, d0 , d29
  363. fstd d12, [CO2]
  364. fstd d13, [CO2, #8 ]
  365. add CO1, CO1, #16
  366. .endm
  367. /******************************************************************************/
  /* INIT1x4: clear the four accumulators of the 1x4 tile (d16, d20, d24, d28). */
  368. .macro INIT1x4
  369. fldd d16, FP_ZERO
  370. vmov.f64 d20, d16
  371. vmov.f64 d24, d16
  372. vmov.f64 d28, d16
  373. .endm
  /* KERNEL1x4_SUB: one accumulation step of the 1x4 tile. */
  /* Loads four doubles from BO (d8-d11) and one from AO (d0), then adds the four */
  /* products to d16, d20, d24 and d28. */
  /* Net effect on pointers: AO advances 8 bytes, BO advances 32 bytes. */
  374. .macro KERNEL1x4_SUB
  375. fldd d8 , [ BO ]
  376. fldd d9 , [ BO, #8 ]
  377. fldd d10, [ BO, #16 ]
  378. fldd d11, [ BO, #24 ]
  379. fldd d0 , [ AO ]
  380. fmacd d16 , d0, d8
  381. fmacd d20 , d0, d9
  382. fmacd d24 , d0, d10
  383. fmacd d28 , d0, d11
  384. add AO , AO, #8
  385. add BO , BO, #32
  386. .endm
  /* SAVE1x4: scale the 1x4 accumulators by alpha and store them through CO1. */
  /* d16 goes to CO1, d20 to CO1 + LDC, d24 to CO1 + 2*LDC and d28 to CO1 + 3*LDC, */
  /* where LDC is the byte stride read from the stack slot. */
  /* The destination is overwritten without being loaded. */
  /* Clobbers r3, r4, CO2, d0, d8, d12. On exit CO1 has advanced by 8 bytes. */
  387. .macro SAVE1x4
  388. ldr r3 , LDC
  389. add CO2 , CO1, r3
  390. add r4 , CO2, r3 // r4 = CO1 + 2 * stride
  391. fldd d0, ALPHA
  392. fmuld d8 , d0 , d16
  393. fstd d8 , [CO1]
  394. fmuld d12, d0 , d20
  395. fstd d12, [CO2]
  396. fmuld d8 , d0 , d24
  397. fstd d8 , [r4 ]
  398. add CO2, r4 , r3 // CO2 = CO1 + 3 * stride
  399. fmuld d12, d0 , d28
  400. fstd d12, [CO2]
  401. add CO1, CO1, #8
  402. .endm
  403. /******************************************************************************/
  404. /******************************************************************************/
  /* INIT4x2: clear the eight accumulators of the 4x2 tile (d16-d23). */
  /* Loads 0.0 from FP_ZERO into d16 and copies it to d17-d23. */
  405. .macro INIT4x2
  406. fldd d16, FP_ZERO
  407. vmov.f64 d17, d16
  408. vmov.f64 d18, d16
  409. vmov.f64 d19, d16
  410. vmov.f64 d20, d16
  411. vmov.f64 d21, d16
  412. vmov.f64 d22, d16
  413. vmov.f64 d23, d16
  414. .endm
  /* KERNEL4x2_SUB: one accumulation step of the 4x2 tile. */
  /* Loads two doubles from BO (d8, d9) and four from AO (d0-d3), then adds the eight */
  /* products to d16-d19 (using d8) and d20-d23 (using d9). */
  /* Net effect on pointers: AO advances 32 bytes, BO advances 16 bytes. */
  415. .macro KERNEL4x2_SUB
  416. fldd d8 , [ BO ]
  417. fldd d9 , [ BO, #8 ]
  418. fldd d0 , [ AO ]
  419. fldd d1 , [ AO, #8 ]
  420. fldd d2 , [ AO, #16 ]
  421. fldd d3 , [ AO, #24 ]
  422. fmacd d16 , d0, d8
  423. fmacd d17 , d1, d8
  424. fmacd d18 , d2, d8
  425. fmacd d19 , d3, d8
  426. fmacd d20 , d0, d9
  427. fmacd d21 , d1, d9
  428. fmacd d22 , d2, d9
  429. fmacd d23 , d3, d9
  430. add AO , AO, #32
  431. add BO , BO, #16
  432. .endm
  /* SAVE4x2: scale the 4x2 accumulators by alpha and store them through CO1. */
  /* d16-d19 go to CO1 and d20-d23 to CO1 + LDC, where LDC is the byte stride read */
  /* from the stack slot. The destination is overwritten without being loaded. */
  /* Clobbers r3, CO2, d0 and d8-d15. On exit CO1 has advanced by 32 bytes. */
  433. .macro SAVE4x2
  434. ldr r3 , LDC
  435. add CO2 , CO1, r3
  436. fldd d0, ALPHA
  437. fmuld d8 , d0 , d16
  438. fmuld d9 , d0 , d17
  439. fmuld d10, d0 , d18
  440. fmuld d11, d0 , d19
  441. fstd d8 , [CO1]
  442. fstd d9 , [CO1, #8 ]
  443. fstd d10, [CO1, #16 ]
  444. fstd d11, [CO1, #24 ]
  445. fmuld d12, d0 , d20
  446. fmuld d13, d0 , d21
  447. fmuld d14, d0 , d22
  448. fmuld d15, d0 , d23
  449. fstd d12, [CO2]
  450. fstd d13, [CO2, #8 ]
  451. fstd d14, [CO2, #16 ]
  452. fstd d15, [CO2, #24 ]
  453. add CO1, CO1, #32
  454. .endm
  455. /******************************************************************************/
  /* INIT2x2: clear the four accumulators of the 2x2 tile (d16, d17, d20, d21). */
  456. .macro INIT2x2
  457. fldd d16, FP_ZERO
  458. vmov.f64 d17, d16
  459. vmov.f64 d20, d16
  460. vmov.f64 d21, d16
  461. .endm
  /* KERNEL2x2_SUB: one accumulation step of the 2x2 tile. */
  /* Loads two doubles from BO (d8, d9) and two from AO (d0, d1), then adds the four */
  /* products to d16, d17 (using d8) and d20, d21 (using d9). */
  /* Net effect on pointers: AO advances 16 bytes, BO advances 16 bytes. */
  462. .macro KERNEL2x2_SUB
  463. fldd d8 , [ BO ]
  464. fldd d9 , [ BO, #8 ]
  465. fldd d0 , [ AO ]
  466. fldd d1 , [ AO, #8 ]
  467. fmacd d16 , d0, d8
  468. fmacd d17 , d1, d8
  469. fmacd d20 , d0, d9
  470. fmacd d21 , d1, d9
  471. add AO , AO, #16
  472. add BO , BO, #16
  473. .endm
  /* SAVE2x2: scale the 2x2 accumulators by alpha and store them through CO1. */
  /* d16, d17 go to CO1 and d20, d21 to CO1 + LDC, where LDC is the byte stride read */
  /* from the stack slot. The destination is overwritten without being loaded. */
  /* Clobbers r3, CO2, d0, d8, d9, d12, d13. On exit CO1 has advanced by 16 bytes. */
  474. .macro SAVE2x2
  475. ldr r3 , LDC
  476. add CO2 , CO1, r3
  477. fldd d0, ALPHA
  478. fmuld d8 , d0 , d16
  479. fmuld d9 , d0 , d17
  480. fstd d8 , [CO1]
  481. fstd d9 , [CO1, #8 ]
  482. fmuld d12, d0 , d20
  483. fmuld d13, d0 , d21
  484. fstd d12, [CO2]
  485. fstd d13, [CO2, #8 ]
  486. add CO1, CO1, #16
  487. .endm
  488. /******************************************************************************/
  /* INIT1x2: clear the two accumulators of the 1x2 tile (d16, d20). */
  489. .macro INIT1x2
  490. fldd d16, FP_ZERO
  491. vmov.f64 d20, d16
  492. .endm
  /* KERNEL1x2_SUB: one accumulation step of the 1x2 tile. */
  /* Loads two doubles from BO (d8, d9) and one from AO (d0), then adds d0*d8 to d16 */
  /* and d0*d9 to d20. */
  /* Net effect on pointers: AO advances 8 bytes, BO advances 16 bytes. */
  493. .macro KERNEL1x2_SUB
  494. fldd d8 , [ BO ]
  495. fldd d9 , [ BO, #8 ]
  496. fldd d0 , [ AO ]
  497. fmacd d16 , d0, d8
  498. fmacd d20 , d0, d9
  499. add AO , AO, #8
  500. add BO , BO, #16
  501. .endm
  /* SAVE1x2: scale the 1x2 accumulators by alpha and store them through CO1. */
  /* d16 goes to CO1 and d20 to CO1 + LDC, where LDC is the byte stride read from the */
  /* stack slot. The destination is overwritten without being loaded. */
  /* Clobbers r3, CO2, d0, d8, d12. On exit CO1 has advanced by 8 bytes. */
  502. .macro SAVE1x2
  503. ldr r3 , LDC
  504. add CO2 , CO1, r3
  505. fldd d0, ALPHA
  506. fmuld d8 , d0 , d16
  507. fstd d8 , [CO1]
  508. fmuld d12, d0 , d20
  509. fstd d12, [CO2]
  510. add CO1, CO1, #8
  511. .endm
  512. /******************************************************************************/
  513. /******************************************************************************/
  /* INIT4x1: clear the four accumulators of the 4x1 tile (d16-d19). */
  514. .macro INIT4x1
  515. fldd d16, FP_ZERO
  516. vmov.f64 d17, d16
  517. vmov.f64 d18, d16
  518. vmov.f64 d19, d16
  519. .endm
  /* KERNEL4x1_SUB: one accumulation step of the 4x1 tile. */
  /* Loads one double from BO (d8) and four from AO (d0-d3), then adds the four */
  /* products to d16-d19. */
  /* Net effect on pointers: AO advances 32 bytes, BO advances 8 bytes. */
  520. .macro KERNEL4x1_SUB
  521. fldd d8 , [ BO ]
  522. fldd d0 , [ AO ]
  523. fldd d1 , [ AO, #8 ]
  524. fldd d2 , [ AO, #16 ]
  525. fldd d3 , [ AO, #24 ]
  526. fmacd d16 , d0, d8
  527. fmacd d17 , d1, d8
  528. fmacd d18 , d2, d8
  529. fmacd d19 , d3, d8
  530. add AO , AO, #32
  531. add BO , BO, #8
  532. .endm
  /* SAVE4x1: scale d16-d19 by alpha and store them as four consecutive doubles at CO1. */
  /* The destination is overwritten without being loaded; LDC is not needed here. */
  /* Clobbers d0 and d8-d11. On exit CO1 has advanced by 32 bytes. */
  533. .macro SAVE4x1
  534. fldd d0, ALPHA
  535. fmuld d8 , d0 , d16
  536. fmuld d9 , d0 , d17
  537. fmuld d10, d0 , d18
  538. fmuld d11, d0 , d19
  539. fstd d8 , [CO1]
  540. fstd d9 , [CO1, #8 ]
  541. fstd d10, [CO1, #16 ]
  542. fstd d11, [CO1, #24 ]
  543. add CO1, CO1, #32
  544. .endm
  545. /******************************************************************************/
  /* INIT2x1: clear the two accumulators of the 2x1 tile (d16, d17). */
  546. .macro INIT2x1
  547. fldd d16, FP_ZERO
  548. vmov.f64 d17, d16
  549. .endm
  /* KERNEL2x1_SUB: one accumulation step of the 2x1 tile. */
  /* Loads one double from BO (d8) and two from AO (d0, d1), then adds d0*d8 to d16 */
  /* and d1*d8 to d17. */
  /* Net effect on pointers: AO advances 16 bytes, BO advances 8 bytes. */
  550. .macro KERNEL2x1_SUB
  551. fldd d8 , [ BO ]
  552. fldd d0 , [ AO ]
  553. fldd d1 , [ AO, #8 ]
  554. fmacd d16 , d0, d8
  555. fmacd d17 , d1, d8
  556. add AO , AO, #16
  557. add BO , BO, #8
  558. .endm
  /* SAVE2x1: scale d16 and d17 by alpha and store them as two consecutive doubles at CO1. */
  /* The destination is overwritten without being loaded. */
  /* Clobbers d0, d8, d9. On exit CO1 has advanced by 16 bytes. */
  559. .macro SAVE2x1
  560. fldd d0, ALPHA
  561. fmuld d8 , d0 , d16
  562. fmuld d9 , d0 , d17
  563. fstd d8 , [CO1]
  564. fstd d9 , [CO1, #8 ]
  565. add CO1, CO1, #16
  566. .endm
  567. /******************************************************************************/
  /* INIT1x1: clear the single accumulator d16 by loading 0.0 from FP_ZERO. */
  568. .macro INIT1x1
  569. fldd d16, FP_ZERO
  570. .endm
  /* KERNEL1x1_SUB: one accumulation step of the 1x1 tile. */
  /* Loads one double from BO (d8) and one from AO (d0) and adds their product to d16. */
  /* Net effect on pointers: AO advances 8 bytes, BO advances 8 bytes. */
  571. .macro KERNEL1x1_SUB
  572. fldd d8 , [ BO ]
  573. fldd d0 , [ AO ]
  574. fmacd d16 , d0, d8
  575. add AO , AO, #8
  576. add BO , BO, #8
  577. .endm
  /* SAVE1x1: scale d16 by alpha and store the result as one double at CO1. */
  /* The destination is overwritten without being loaded. */
  /* Clobbers d0 and d8. On exit CO1 has advanced by 8 bytes. */
  578. .macro SAVE1x1
  579. fldd d0, ALPHA
  580. fmuld d8 , d0 , d16
  581. fstd d8 , [CO1]
  582. add CO1, CO1, #8
  583. .endm
  584. /**************************************************************************************
  585. * End of macro definitions
  586. **************************************************************************************/
  587. PROLOGUE
  588. .align 5
  589. push {r4 - r9, fp}
  590. add fp, sp, #24
  591. sub sp, sp, #STACKSIZE // reserve stack
  592. #if !defined(__ARM_PCS_VFP)
  593. vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
  594. ldr OLD_A, OLD_A_SOFTFP
  595. #endif
  596. str OLD_M, M
  597. str OLD_N, N
  598. str OLD_K, K
  599. str OLD_A, A
  600. vstr OLD_ALPHA, ALPHA
  601. sub r3, fp, #128
  602. vstm r3, { d8 - d15} // store floating point registers
  603. movs r4, #0
  604. str r4, FP_ZERO
  605. str r4, FP_ZERO_1
  606. ldr r3, OLD_LDC
  607. lsl r3, r3, #3 // ldc = ldc * 8
  608. str r3, LDC
  609. ldr r3, OLD_C
  610. str r3, C
  611. ldr BC, B
  612. ldr r3, OFFSET
  613. #ifndef LEFT
  614. neg r3 , r3
  615. #endif
  616. str r3 , KK
  617. ldr J, N
  618. asrs J, J, #2 // J = J / 4
  619. ble _L2_BEGIN
  620. _L4_BEGIN:
  621. ldr CO1, C // CO1 = C
  622. ldr r4 , LDC
  623. lsl r4 , r4 , #2 // LDC * 4
  624. add r3 , r4, CO1
  625. str r3 , C // store C
  626. #if defined(LEFT)
  627. ldr r3 , OFFSET
  628. str r3 , KK
  629. #endif
  630. ldr AO, A // AO = A
  631. pld [AO , #A_PRE-64]
  632. pld [AO , #A_PRE-32]
  633. _L4_M4_BEGIN:
  634. ldr I, M
  635. asrs I, I, #2 // I = I / 4
  636. ble _L4_M2_BEGIN
  637. _L4_M4_20:
  638. #if (defined(LEFT) && defined(TRANSA)) || \
  639. (!defined(LEFT) && !defined(TRANSA))
  640. mov BO, BC
  641. #else
  642. mov BO, BC
  643. ldr r3 , KK
  644. lsls r4 , r3 , #5 // 4 double values
  645. add BO , BO , r4
  646. lsls r4 , r3 , #5 // 4 double values
  647. add AO , AO , r4
  648. #endif
  649. #ifndef TRMMKERNEL
  650. ldr L , K
  651. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  652. ldr L , K
  653. ldr r3, KK
  654. sub L , L, r3
  655. str L , KKK
  656. #else
  657. ldr L , KK
  658. #ifdef LEFT
  659. add L , L , #4 // number of values in AO
  660. #else
  661. add L , L , #4 // number of values in BO
  662. #endif
  663. str L , KKK
  664. #endif
  665. mov K1, L
  666. asrs L , K1, #5 // L = L / 8
  667. ble _L4_M4_40
  668. .align 5
  669. KERNEL4x4_I
  670. KERNEL4x4_M1
  671. KERNEL4x4_M2
  672. KERNEL4x4_M1
  673. KERNEL4x4_M2
  674. KERNEL4x4_M1
  675. KERNEL4x4_M2
  676. KERNEL4x4_M1
  677. KERNEL4x4_M2
  678. KERNEL4x4_M1
  679. KERNEL4x4_M2
  680. KERNEL4x4_M1
  681. KERNEL4x4_M2
  682. KERNEL4x4_M1
  683. KERNEL4x4_M2
  684. KERNEL4x4_M1
  685. KERNEL4x4_M2
  686. KERNEL4x4_M1
  687. KERNEL4x4_M2
  688. KERNEL4x4_M1
  689. KERNEL4x4_M2
  690. KERNEL4x4_M1
  691. KERNEL4x4_M2
  692. KERNEL4x4_M1
  693. KERNEL4x4_M2
  694. KERNEL4x4_M1
  695. KERNEL4x4_M2
  696. KERNEL4x4_M1
  697. KERNEL4x4_M2
  698. KERNEL4x4_M1
  699. KERNEL4x4_M2
  700. KERNEL4x4_E
  701. subs L, L, #1
  702. ble _L4_M4_41
  703. _L4_M4_22:
  704. KERNEL4x4_S
  705. KERNEL4x4_M1
  706. KERNEL4x4_M2
  707. KERNEL4x4_M1
  708. KERNEL4x4_M2
  709. KERNEL4x4_M1
  710. KERNEL4x4_M2
  711. KERNEL4x4_M1
  712. KERNEL4x4_M2
  713. KERNEL4x4_M1
  714. KERNEL4x4_M2
  715. KERNEL4x4_M1
  716. KERNEL4x4_M2
  717. KERNEL4x4_M1
  718. KERNEL4x4_M2
  719. KERNEL4x4_M1
  720. KERNEL4x4_M2
  721. KERNEL4x4_M1
  722. KERNEL4x4_M2
  723. KERNEL4x4_M1
  724. KERNEL4x4_M2
  725. KERNEL4x4_M1
  726. KERNEL4x4_M2
  727. KERNEL4x4_M1
  728. KERNEL4x4_M2
  729. KERNEL4x4_M1
  730. KERNEL4x4_M2
  731. KERNEL4x4_M1
  732. KERNEL4x4_M2
  733. KERNEL4x4_M1
  734. KERNEL4x4_M2
  735. KERNEL4x4_E
  736. subs L, L, #1
  737. ble _L4_M4_41
  738. b _L4_M4_22
  739. _L4_M4_40:
  740. INIT4x4
  741. _L4_M4_41:
  742. ands L , K1, #31 // L = L % 8
  743. ble _L4_M4_100
  744. _L4_M4_42:
  745. KERNEL4x4_SUB
  746. subs L, L, #1
  747. bgt _L4_M4_42
  748. _L4_M4_100:
  749. SAVE4x4
  750. #if (defined(LEFT) && defined(TRANSA)) || \
  751. (!defined(LEFT) && !defined(TRANSA))
  752. ldr r3 , K
  753. ldr r4 , KKK
  754. sub r3 , r3 , r4
  755. lsls r4 , r3 , #5 // 4 double values
  756. add BO , BO , r4
  757. lsls r4 , r3 , #5 // 4 double values
  758. add AO , AO , r4
  759. #endif
  760. #if defined(LEFT)
  761. ldr r3 , KK
  762. add r3 , r3 , #4 // number of values in AO
  763. str r3 , KK
  764. #endif
  765. _L4_M4_END:
  766. subs I, I, #1
  767. bgt _L4_M4_20
  768. _L4_M2_BEGIN:
  769. ldr I, M
  770. tst I , #3
  771. ble _L4_END
  772. tst I, #2 // I = I / 2
  773. ble _L4_M1_BEGIN
  774. _L4_M2_20:
  775. INIT2x4
  776. #if (defined(LEFT) && defined(TRANSA)) || \
  777. (!defined(LEFT) && !defined(TRANSA))
  778. mov BO, BC
  779. #else
  780. mov BO, BC
  781. ldr r3 , KK
  782. lsls r4 , r3 , #5 // 4 double values
  783. add BO , BO , r4
  784. lsls r4 , r3 , #4 // 2 double values
  785. add AO , AO , r4
  786. #endif
  787. #ifndef TRMMKERNEL
  788. ldr L , K
  789. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  790. ldr L , K
  791. ldr r3, KK
  792. sub L , L, r3
  793. str L , KKK
  794. #else
  795. ldr L , KK
  796. #ifdef LEFT
  797. add L , L , #2 // number of values in AO
  798. #else
  799. add L , L , #4 // number of values in BO
  800. #endif
  801. str L , KKK
  802. #endif
  803. mov K1, L
  804. asrs L , K1, #3 // L = L / 8
  805. ble _L4_M2_40
  806. _L4_M2_22:
  807. KERNEL2x4_SUB
  808. KERNEL2x4_SUB
  809. KERNEL2x4_SUB
  810. KERNEL2x4_SUB
  811. KERNEL2x4_SUB
  812. KERNEL2x4_SUB
  813. KERNEL2x4_SUB
  814. KERNEL2x4_SUB
  815. subs L, L, #1
  816. bgt _L4_M2_22
  817. _L4_M2_40:
  818. ands L , K1, #7 // L = L % 8
  819. ble _L4_M2_100
  820. _L4_M2_42:
  821. KERNEL2x4_SUB
  822. subs L, L, #1
  823. bgt _L4_M2_42
  824. _L4_M2_100:
  825. SAVE2x4
  826. #if (defined(LEFT) && defined(TRANSA)) || \
  827. (!defined(LEFT) && !defined(TRANSA))
  828. ldr r3 , K
  829. ldr r4 , KKK
  830. sub r3 , r3 , r4
  831. lsls r4 , r3 , #5 // 4 double values
  832. add BO , BO , r4
  833. lsls r4 , r3 , #4 // 2 double values
  834. add AO , AO , r4
  835. #endif
  836. #if defined(LEFT)
  837. ldr r3 , KK
  838. add r3 , r3 , #2 // number of values in AO
  839. str r3 , KK
  840. #endif
  841. _L4_M2_END:
  842. _L4_M1_BEGIN:
  843. tst I, #1 // I = I % 2
  844. ble _L4_END
  845. _L4_M1_20:
  846. INIT1x4
  847. #if (defined(LEFT) && defined(TRANSA)) || \
  848. (!defined(LEFT) && !defined(TRANSA))
  849. mov BO, BC
  850. #else
  851. mov BO, BC
  852. ldr r3 , KK
  853. lsls r4 , r3 , #5 // 4 double values
  854. add BO , BO , r4
  855. lsls r4 , r3 , #3 // 1 double value
  856. add AO , AO , r4
  857. #endif
  858. #ifndef TRMMKERNEL
  859. ldr L , K
  860. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  861. ldr L , K
  862. ldr r3, KK
  863. sub L , L, r3
  864. str L , KKK
  865. #else
  866. ldr L , KK
  867. #ifdef LEFT
  868. add L , L , #1 // number of values in AO
  869. #else
  870. add L , L , #4 // number of values in BO
  871. #endif
  872. str L , KKK
  873. #endif
  874. mov K1, L
  875. asrs L , K1, #3 // L = L / 8
  876. ble _L4_M1_40
  877. _L4_M1_22:
  878. KERNEL1x4_SUB
  879. KERNEL1x4_SUB
  880. KERNEL1x4_SUB
  881. KERNEL1x4_SUB
  882. KERNEL1x4_SUB
  883. KERNEL1x4_SUB
  884. KERNEL1x4_SUB
  885. KERNEL1x4_SUB
  886. subs L, L, #1
  887. bgt _L4_M1_22
  888. _L4_M1_40:
  889. ands L , K1, #7 // L = L % 8
  890. ble _L4_M1_100
  891. _L4_M1_42:
  892. KERNEL1x4_SUB
  893. subs L, L, #1
  894. bgt _L4_M1_42
  895. _L4_M1_100:
  896. SAVE1x4
  897. #if (defined(LEFT) && defined(TRANSA)) || \
  898. (!defined(LEFT) && !defined(TRANSA))
  899. ldr r3 , K
  900. ldr r4 , KKK
  901. sub r3 , r3 , r4
  902. lsls r4 , r3 , #5 // 4 double values
  903. add BO , BO , r4
  904. lsls r4 , r3 , #3 // 1 double value
  905. add AO , AO , r4
  906. #endif
  907. #if defined(LEFT)
  908. ldr r3 , KK
  909. add r3 , r3 , #1 // number of values in AO
  910. str r3 , KK
  911. #endif
  // End of one 4-column panel: advance BC past the K*4 doubles just consumed,
  // add 4 to KK in the non-LEFT variant, then loop back while J stays positive.
  // BC, K, KK and J are aliases defined outside this section.
  912. _L4_END:
  913. mov r3, BC
  914. ldr r4, K
  915. lsl r4, r4, #5 // r4 = K * 4 columns * 8 bytes per double
  916. add r3, r3, r4 // B = B + K * 4 * 8
  917. mov BC, r3
  918. #if !defined(LEFT)
  919. ldr r3 , KK
  920. add r3 , r3 , #4 // number of values in BO
  921. str r3 , KK
  922. #endif
  923. subs J , #1 // j--; subs writes N, Z, C and V, so the bgt below is well defined
  924. bgt _L4_BEGIN
  925. /*********************************************************************************************/
  _L2_BEGIN:
  /*
   * Entry to the column remainder handling (N % 4 columns).
   * Leaves via _L999 when no columns remain, via _L1_BEGIN when bit 1 of N is
   * clear; otherwise sets up a 2-column panel: CO1 = C, the C slot is advanced
   * by two column strides, KK is reloaded from OFFSET for LEFT, and AO = A.
   * J, CO1, AO and the memory operands N, C, LDC, OFFSET, KK, A are defined
   * outside this section.
   *
   * The zero tests use beq rather than ble: tst does not write the V flag and
   * ble also tests V, so ble would depend on a stale flag.
   */
        ldr     J , N
        tst     J , #3                          // Z = 1 when N is a multiple of 4
        beq     _L999                           // no remainder columns at all

        tst     J , #2                          // Z = 1 when bit 1 of N is clear
        beq     _L1_BEGIN                       // no 2-column panel; try the 1-column one

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        lsl     r4 , r4 , #1                    // LDC * 2
        add     r3 , r4, CO1
        str     r3 , C                          // store C, now two column strides further on

  #if defined(LEFT)
        ldr     r3 , OFFSET
        str     r3 , KK                         // KK = OFFSET at the start of the panel
  #endif

        ldr     AO, A                           // AO = A

        //pld [AO , #A_PRE-96]
        //pld [AO , #A_PRE-64]
        //pld [AO , #A_PRE-32]
  _L2_M4_BEGIN:
  /*
   * 4x2 tiles of the 2-column panel: one pass of the _L2_M4_20 loop per group
   * of four rows (I = M / 4 passes).
   * Per pass: INIT4x2 -> position AO/BO -> derive the trip count L ->
   * KERNEL4x2_SUB L times (unrolled by 8, then a remainder loop) -> SAVE4x2 ->
   * step AO/BO/KK past the tile.
   * The macros and the names I, L, K1, AO, BO, BC, M, K, KK, KKK are defined
   * outside this section.
   *
   * Flag note: asr(s) and ands never write the V flag, while ble tests
   * (Z == 1 || N != V).  Signed "<= 0" tests are therefore preceded by cmp and
   * the zero test after ands uses beq.
   */
        ldr     I, M
        asr     I, I, #2                        // I = M / 4
        cmp     I, #0                           // set N, Z and V consistently for ble
        ble     _L2_M2_BEGIN

  _L2_M4_20:
        INIT4x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC                          // BO = start of the current B panel
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #4                    // KK * 16 bytes: 2 double values per k step
        add     BO , BO , r4
        lsls    r4 , r3 , #5                    // KK * 32 bytes: 4 double values per k step
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     L , K                           // TRMMKERNEL not defined: all K steps
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // L = K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #4                      // number of values in AO
  #else
        add     L , L , #2                      // number of values in BO
  #endif
        str     L , KKK                         // KKK = trip count used for this tile
  #endif

        mov     K1, L                           // keep the full count for the remainder loop
        asr     L , K1, #3                      // L = K1 / 8
        cmp     L , #0                          // set N, Z and V consistently for ble
        ble     _L2_M4_40

        .align 5

  _L2_M4_22:                                    // 8 k steps per pass
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        subs    L, L, #1
        bgt     _L2_M4_22

  _L2_M4_40:
        ands    L , K1, #7                      // L = K1 & 7 (left-over k steps)
        beq     _L2_M4_100                      // nothing left over

  _L2_M4_42:                                    // one k step per pass
        KERNEL4x2_SUB
        subs    L, L, #1
        bgt     _L2_M4_42

  _L2_M4_100:
        SAVE4x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4                    // r3 = K - KKK: k steps not consumed above
        lsls    r4 , r3 , #4                    // 2 double values
        add     BO , BO , r4
        lsls    r4 , r3 , #5                    // 4 double values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #4                    // number of values in AO
        str     r3 , KK
  #endif

  _L2_M4_END:
        subs    I, I, #1                        // next group of four rows
        bgt     _L2_M4_20
  _L2_M2_BEGIN:
  /*
   * 2x2 tail tile of the 2-column panel; runs only when bit 1 of M is set.
   * Leaves via _L2_END when M % 4 == 0 and via _L2_M1_BEGIN when bit 1 is clear.
   * Flow: INIT2x2 -> position AO/BO -> derive the trip count L ->
   * KERNEL2x2_SUB L times (unrolled by 8, then a remainder loop) -> SAVE2x2 ->
   * step AO/BO/KK past the tile.
   * I keeps the value of M on exit; the following _L2_M1_BEGIN test reads it.
   * The macros and the names I, L, K1, AO, BO, BC, M, K, KK, KKK are defined
   * outside this section.
   *
   * Flag note: tst, ands and asr(s) never write the V flag, while ble tests
   * (Z == 1 || N != V).  The zero tests therefore use beq, and the signed
   * "count <= 0" test is preceded by cmp.
   */
        ldr     I, M
        tst     I , #3                          // Z = 1 when M is a multiple of 4
        beq     _L2_END                         // no row remainder

        tst     I, #2                           // Z = 1 when bit 1 of M is clear
        beq     _L2_M1_BEGIN                    // no 2-row tile; try the 1-row one

  _L2_M2_20:
        INIT2x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC                          // BO = start of the current B panel
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #4                    // KK * 16 bytes: 2 double values per k step
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // KK * 16 bytes: 2 double values per k step
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     L , K                           // TRMMKERNEL not defined: all K steps
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // L = K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #2                      // number of values in AO
  #else
        add     L , L , #2                      // number of values in BO
  #endif
        str     L , KKK                         // KKK = trip count used for this tile
  #endif

        mov     K1, L                           // keep the full count for the remainder loop
        asr     L , K1, #3                      // L = K1 / 8
        cmp     L , #0                          // set N, Z and V consistently for ble
        ble     _L2_M2_40

  _L2_M2_22:                                    // 8 k steps per pass
        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB
        subs    L, L, #1
        bgt     _L2_M2_22

  _L2_M2_40:
        ands    L , K1, #7                      // L = K1 & 7 (left-over k steps)
        beq     _L2_M2_100                      // nothing left over

  _L2_M2_42:                                    // one k step per pass
        KERNEL2x2_SUB
        subs    L, L, #1
        bgt     _L2_M2_42

  _L2_M2_100:
        SAVE2x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4                    // r3 = K - KKK: k steps not consumed above
        lsls    r4 , r3 , #4                    // 2 double values
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 double values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in AO
        str     r3 , KK
  #endif

  _L2_M2_END:
  _L2_M1_BEGIN:
  /*
   * 1x2 tail tile of the 2-column panel; runs only when bit 0 of I is set
   * (I still holds the value loaded from M at _L2_M2_BEGIN).
   * Flow: INIT1x2 -> position AO/BO -> derive the trip count L ->
   * KERNEL1x2_SUB L times (unrolled by 8, then a remainder loop) -> SAVE1x2 ->
   * step AO/BO/KK past the tile.
   * The macros and the names I, L, K1, AO, BO, BC, K, KK, KKK are defined
   * outside this section.
   *
   * Flag note: tst, ands and asr(s) never write the V flag, while ble tests
   * (Z == 1 || N != V).  The zero tests therefore use beq, and the signed
   * "count <= 0" test is preceded by cmp.
   */
        tst     I, #1                           // Z = 1 when bit 0 of I is clear
        beq     _L2_END                         // no odd row left

  _L2_M1_20:
        INIT1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC                          // BO = start of the current B panel
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #4                    // KK * 16 bytes: 2 double values per k step
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // KK * 8 bytes: 1 double value per k step
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     L , K                           // TRMMKERNEL not defined: all K steps
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // L = K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #1                      // number of values in AO
  #else
        add     L , L , #2                      // number of values in BO
  #endif
        str     L , KKK                         // KKK = trip count used for this tile
  #endif

        mov     K1, L                           // keep the full count for the remainder loop
        asr     L , K1, #3                      // L = K1 / 8
        cmp     L , #0                          // set N, Z and V consistently for ble
        ble     _L2_M1_40

  _L2_M1_22:                                    // 8 k steps per pass
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        subs    L, L, #1
        bgt     _L2_M1_22

  _L2_M1_40:
        ands    L , K1, #7                      // L = K1 & 7 (left-over k steps)
        beq     _L2_M1_100                      // nothing left over

  _L2_M1_42:                                    // one k step per pass
        KERNEL1x2_SUB
        subs    L, L, #1
        bgt     _L2_M1_42

  _L2_M1_100:
        SAVE1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4                    // r3 = K - KKK: k steps not consumed above
        lsls    r4 , r3 , #4                    // 2 double values
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 1 double value
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #1                    // number of values in AO
        str     r3 , KK
  #endif
  // End of the 2-column panel: advance BC past the K*2 doubles just consumed
  // and add 2 to KK in the non-LEFT variant. Execution then falls through to
  // the 1-column handling. BC, K and KK are aliases defined outside this section.
  1164. _L2_END:
  1165. mov r3, BC
  1166. ldr r4, K
  1167. lsl r4, r4, #4 // r4 = K * 2 columns * 8 bytes per double
  1168. add r3, r3, r4 // B = B + K * 2 * 8
  1169. mov BC, r3
  1170. #if !defined(LEFT)
  1171. ldr r3 , KK
  1172. add r3 , r3 , #2 // number of values in BO
  1173. str r3 , KK
  1174. #endif
  1175. /*********************************************************************************************/
  _L1_BEGIN:
  /*
   * Entry to the final single-column panel; runs only when bit 0 of N is set.
   * Sets CO1 = C, advances the C slot by one column stride, reloads KK from
   * OFFSET for LEFT, and sets AO = A.
   * J, CO1, AO and the memory operands N, C, LDC, OFFSET, KK, A are defined
   * outside this section.
   *
   * The zero test uses beq rather than ble: tst does not write the V flag and
   * ble also tests V, so ble would depend on a stale flag.
   */
        ldr     J , N
        tst     J , #1                          // Z = 1 when N is even
        beq     _L999                           // no single column left

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        add     r3 , r4, CO1
        str     r3 , C                          // store C, now one column stride further on

  #if defined(LEFT)
        ldr     r3 , OFFSET
        str     r3 , KK                         // KK = OFFSET at the start of the panel
  #endif

        ldr     AO, A                           // AO = A

        //pld [AO , #A_PRE-96]
        //pld [AO , #A_PRE-64]
        //pld [AO , #A_PRE-32]
  _L1_M4_BEGIN:
  /*
   * 4x1 tiles of the single-column panel: one pass of the _L1_M4_20 loop per
   * group of four rows (I = M / 4 passes).
   * Per pass: INIT4x1 -> position AO/BO -> derive the trip count L ->
   * KERNEL4x1_SUB L times (unrolled by 8, then a remainder loop) -> SAVE4x1 ->
   * step AO/BO/KK past the tile.
   * The macros and the names I, L, K1, AO, BO, BC, M, K, KK, KKK are defined
   * outside this section.
   *
   * Flag note: asr(s) and ands never write the V flag, while ble tests
   * (Z == 1 || N != V).  Signed "<= 0" tests are therefore preceded by cmp and
   * the zero test after ands uses beq.
   */
        ldr     I, M
        asr     I, I, #2                        // I = M / 4
        cmp     I, #0                           // set N, Z and V consistently for ble
        ble     _L1_M2_BEGIN

  _L1_M4_20:
        INIT4x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC                          // BO = start of the current B panel
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // KK * 8 bytes: 1 double value per k step
        add     BO , BO , r4
        lsls    r4 , r3 , #5                    // KK * 32 bytes: 4 double values per k step
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     L , K                           // TRMMKERNEL not defined: all K steps
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // L = K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #4                      // number of values in AO
  #else
        add     L , L , #1                      // number of values in BO
  #endif
        str     L , KKK                         // KKK = trip count used for this tile
  #endif

        mov     K1, L                           // keep the full count for the remainder loop
        asr     L , K1, #3                      // L = K1 / 8
        cmp     L , #0                          // set N, Z and V consistently for ble
        ble     _L1_M4_40

        .align 5

  _L1_M4_22:                                    // 8 k steps per pass
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        subs    L, L, #1
        bgt     _L1_M4_22

  _L1_M4_40:
        ands    L , K1, #7                      // L = K1 & 7 (left-over k steps)
        beq     _L1_M4_100                      // nothing left over

  _L1_M4_42:                                    // one k step per pass
        KERNEL4x1_SUB
        subs    L, L, #1
        bgt     _L1_M4_42

  _L1_M4_100:
        SAVE4x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4                    // r3 = K - KKK: k steps not consumed above
        lsls    r4 , r3 , #3                    // 1 double value
        add     BO , BO , r4
        lsls    r4 , r3 , #5                    // 4 double values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #4                    // number of values in AO
        str     r3 , KK
  #endif

  _L1_M4_END:
        subs    I, I, #1                        // next group of four rows
        bgt     _L1_M4_20
  _L1_M2_BEGIN:
  /*
   * 2x1 tail tile of the single-column panel; runs only when bit 1 of M is set.
   * Leaves via _L1_END when M % 4 == 0 and via _L1_M1_BEGIN when bit 1 is clear.
   * Flow: INIT2x1 -> position AO/BO -> derive the trip count L ->
   * KERNEL2x1_SUB L times (unrolled by 8, then a remainder loop) -> SAVE2x1 ->
   * step AO/BO/KK past the tile.
   * I keeps the value of M on exit; the following _L1_M1_BEGIN test reads it.
   * The macros and the names I, L, K1, AO, BO, BC, M, K, KK, KKK are defined
   * outside this section.
   *
   * Flag note: tst, ands and asr(s) never write the V flag, while ble tests
   * (Z == 1 || N != V).  The zero tests therefore use beq, and the signed
   * "count <= 0" test is preceded by cmp.
   */
        ldr     I, M
        tst     I , #3                          // Z = 1 when M is a multiple of 4
        beq     _L1_END                         // no row remainder

        tst     I, #2                           // Z = 1 when bit 1 of M is clear
        beq     _L1_M1_BEGIN                    // no 2-row tile; try the 1-row one

  _L1_M2_20:
        INIT2x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC                          // BO = start of the current B panel
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // KK * 8 bytes: 1 double value per k step
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // KK * 16 bytes: 2 double values per k step
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     L , K                           // TRMMKERNEL not defined: all K steps
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // L = K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #2                      // number of values in AO
  #else
        add     L , L , #1                      // number of values in BO
  #endif
        str     L , KKK                         // KKK = trip count used for this tile
  #endif

        mov     K1, L                           // keep the full count for the remainder loop
        asr     L , K1, #3                      // L = K1 / 8
        cmp     L , #0                          // set N, Z and V consistently for ble
        ble     _L1_M2_40

  _L1_M2_22:                                    // 8 k steps per pass
        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB
        subs    L, L, #1
        bgt     _L1_M2_22

  _L1_M2_40:
        ands    L , K1, #7                      // L = K1 & 7 (left-over k steps)
        beq     _L1_M2_100                      // nothing left over

  _L1_M2_42:                                    // one k step per pass
        KERNEL2x1_SUB
        subs    L, L, #1
        bgt     _L1_M2_42

  _L1_M2_100:
        SAVE2x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4                    // r3 = K - KKK: k steps not consumed above
        lsls    r4 , r3 , #3                    // 1 double value
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 double values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in AO
        str     r3 , KK
  #endif

  _L1_M2_END:
  _L1_M1_BEGIN:
  /*
   * 1x1 tail tile of the single-column panel; runs only when bit 0 of I is set
   * (I still holds the value loaded from M at _L1_M2_BEGIN).
   * Flow: INIT1x1 -> position AO/BO -> derive the trip count L ->
   * KERNEL1x1_SUB L times (unrolled by 8, then a remainder loop) -> SAVE1x1.
   * Unlike the other tiles, no AO/BO/KK stepping follows SAVE1x1; execution
   * falls straight through to _L1_END.
   * The macros and the names I, L, K1, AO, BO, BC, K, KK, KKK are defined
   * outside this section.
   *
   * Flag note: tst, ands and asr(s) never write the V flag, while ble tests
   * (Z == 1 || N != V).  The zero tests therefore use beq, and the signed
   * "count <= 0" test is preceded by cmp.
   */
        tst     I, #1                           // Z = 1 when bit 0 of I is clear
        beq     _L1_END                         // no odd row left

  _L1_M1_20:
        INIT1x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC                          // BO = start of the current B panel
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // KK * 8 bytes: 1 double value per k step
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // KK * 8 bytes: 1 double value per k step
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     L , K                           // TRMMKERNEL not defined: all K steps
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // L = K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #1                      // number of values in AO
  #else
        add     L , L , #1                      // number of values in BO
  #endif
        str     L , KKK                         // KKK = trip count used for this tile
  #endif

        mov     K1, L                           // keep the full count for the remainder loop
        asr     L , K1, #3                      // L = K1 / 8
        cmp     L , #0                          // set N, Z and V consistently for ble
        ble     _L1_M1_40

  _L1_M1_22:                                    // 8 k steps per pass
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        subs    L, L, #1
        bgt     _L1_M1_22

  _L1_M1_40:
        ands    L , K1, #7                      // L = K1 & 7 (left-over k steps)
        beq     _L1_M1_100                      // nothing left over

  _L1_M1_42:                                    // one k step per pass
        KERNEL1x1_SUB
        subs    L, L, #1
        bgt     _L1_M1_42

  _L1_M1_100:
        SAVE1x1
  // Common exit path: reload d8-d15 from the 64-byte area at fp-128, return 0
  // in r0, rewind sp to fp-24 and pop the seven saved core registers.
  // NOTE(review): the matching saves are in the prologue, which is outside this
  // section -- confirm the fp-128 and fp-24 offsets against it.
  1396. _L1_END:
  1397. _L999:
  1398. sub r3, fp, #128 // r3 = address of the d8-d15 save area
  1399. vldm r3, { d8 - d15} // restore floating point registers
  1400. movs r0, #0 // set return value
  1401. sub sp, fp, #24 // sp = fp - 24, where the pop below expects r4
  1402. pop {r4 - r9, fp} // 7 words; leaves sp at fp + 4 (old fp value)
  1403. bx lr
  1404. EPILOGUE