You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ctrmm_kernel_2x2_vfp.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/10/16 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  /*
   * Build switches, stack-frame layout and register aliases used by the
   * single-precision complex kernel below. All frame slots are addressed
   * relative to fp, which the prologue sets to (sp after push) + 24.
   */
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Number of bytes the prologue subtracts from sp for local storage. */
  36. #define STACKSIZE 256
  /* Incoming argument registers; the prologue copies them into the frame slots below. */
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA_R s0
  42. #define OLD_ALPHA_I s1
  43. /******************************************************
  44. * [fp, #-128] - [fp, #-64] is reserved
  45. * for store and restore of floating point
  46. * registers
  47. *******************************************************/
  /* NOTE(review): the prologue in this file stores only s8-s15 (eight 32-bit words) starting at fp-128. */
  /*
   * Local frame slots. After the prologue sp == fp - 24 - STACKSIZE == fp - 280,
   * so ALPHA_R (fp-280) is the lowest slot in the frame.
   */
  48. #define KKK [fp, #-240]
  49. #define KK [fp, #-244 ]
  50. #define A [fp, #-248 ]
  51. #define LDC [fp, #-252 ]
  52. #define M [fp, #-256 ]
  53. #define N [fp, #-260 ]
  54. #define K [fp, #-264 ]
  /* FP_ZERO and FP_ZERO_0 name the same word; the prologue writes 0 to FP_ZERO and FP_ZERO_1. */
  55. #define FP_ZERO [fp, #-232]
  56. #define FP_ZERO_0 [fp, #-232]
  57. #define FP_ZERO_1 [fp, #-228]
  58. #define ALPHA_I [fp, #-272]
  59. #define ALPHA_R [fp, #-280]
  /*
   * Caller-side argument locations. Without __ARM_PCS_VFP the real part of
   * alpha is taken from r3 and alpha_i / A are taken from the stack, which
   * shifts B, C, ldc and offset two words further up than in the VFP case.
   */
  60. #if !defined(__ARM_PCS_VFP)
  61. #define OLD_ALPHAR_SOFTFP r3
  62. #define OLD_ALPHAI_SOFTFP [fp, #4]
  63. #define OLD_A_SOFTFP [fp, #8 ]
  64. #define B [fp, #12 ]
  65. #define C [fp, #16 ]
  66. #define OLD_LDC [fp, #20 ]
  67. #define OFFSET [fp, #24 ]
  68. #else
  69. #define B [fp, #4 ]
  70. #define C [fp, #8 ]
  71. #define OLD_LDC [fp, #12 ]
  72. #define OFFSET [fp, #16 ]
  73. #endif
  /*
   * Loop counters. I, J and L reuse r0, r1 and r2, i.e. the same registers as
   * OLD_M, OLD_N and OLD_K, so M/N/K are only valid from their frame slots
   * once the counters are in use.
   */
  74. #define I r0
  75. #define J r1
  76. #define L r2
  /* Working pointers: AO/BO walk the A and B panels, CO1/CO2 address the two output columns. */
  77. #define AO r5
  78. #define BO r6
  79. #define CO1 r8
  80. #define CO2 r9
  /* K1 holds the k trip count of the current tile; BC is the base of the current B panel. */
  81. #define K1 r7
  82. #define BC r12
  /* Prefetch distances in bytes used by the pld instructions (C_PRE is not referenced in the code shown here). */
  83. #define A_PRE 96
  84. #define B_PRE 96
  85. #define C_PRE 64
  86. /**************************************************************************************
  87. * Macro definitions
  88. **************************************************************************************/
  /*
   * Choose, per build variant, whether each cross term is added or subtracted.
   *   fmacs    : Sd = Sd + Sn * Sm
   *   vmls.f32 : Sd = Sd - Sn * Sm
   * In the KERNEL macros KMAC_R is applied to a real accumulator with the
   * product a_im * b_im, and KMAC_I to an imaginary accumulator with a_im * b_re.
   * In the SAVE macros FMAC_R1, FMAC_R2, FMAC_I1 and FMAC_I2 are applied with
   * alpha_r*acc_re, alpha_i*acc_im, alpha_r*acc_im and alpha_i*acc_re.
   * NN, NT, CN, ... are variant flags defined outside this file (presumably
   * the transpose/conjugate combinations - confirm against the build rules).
   */
  /* acc_re -= a_im*b_im, acc_im += a_im*b_re; out_re -= alpha_i*acc_im, out_im += alpha_r*acc_im. */
  89. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  90. #define KMAC_R vmls.f32
  91. #define KMAC_I fmacs
  92. #define FMAC_R1 fmacs
  93. #define FMAC_R2 vmls.f32
  94. #define FMAC_I1 fmacs
  95. #define FMAC_I2 fmacs
  /* acc_re += a_im*b_im, acc_im -= a_im*b_re; SAVE signs are the same as in the first case. */
  96. #elif defined(CN) || defined(CT)
  97. #define KMAC_R fmacs
  98. #define KMAC_I vmls.f32
  99. #define FMAC_R1 fmacs
  100. #define FMAC_R2 vmls.f32
  101. #define FMAC_I1 fmacs
  102. #define FMAC_I2 fmacs
  /* Accumulation signs as in the previous case; out_re += alpha_i*acc_im, out_im -= alpha_r*acc_im. */
  103. #elif defined(NC) || defined(TC)
  104. #define KMAC_R fmacs
  105. #define KMAC_I vmls.f32
  106. #define FMAC_R1 fmacs
  107. #define FMAC_R2 fmacs
  108. #define FMAC_I1 vmls.f32
  109. #define FMAC_I2 fmacs
  /* Accumulation signs as in the first case; SAVE signs as in the previous case. */
  110. #else
  111. #define KMAC_R vmls.f32
  112. #define KMAC_I fmacs
  113. #define FMAC_R1 fmacs
  114. #define FMAC_R2 fmacs
  115. #define FMAC_I1 vmls.f32
  116. #define FMAC_I2 fmacs
  117. #endif
  /*
   * INIT2x2: set the eight accumulators s8-s15 to the value stored in the
   * FP_ZERO frame slot (written as integer 0 by the prologue). s8 is loaded
   * from memory and then copied to the other seven registers.
   */
  118. .macro INIT2x2
  119. flds s8 , FP_ZERO
  120. vmov.f32 s9 , s8
  121. vmov.f32 s10, s8
  122. vmov.f32 s11, s8
  123. vmov.f32 s12, s8
  124. vmov.f32 s13, s8
  125. vmov.f32 s14, s8
  126. vmov.f32 s15, s8
  127. .endm
  /*
   * KERNEL2x2_I: first k-step of a 2x2 tile. Loads four floats from AO into
   * s0-s3 and four from BO into s4-s7 (both pointers post-incremented by the
   * load), prefetching ahead of each. The accumulators s8-s15 are written
   * with fmuls rather than accumulated, so no prior INIT2x2 is needed.
   * The arithmetic treats (s0,s1), (s2,s3) as two A values and (s4,s5),
   * (s6,s7) as two B values, each a (re, im) pair:
   *   s8 /s9  = A0*B0   s10/s11 = A1*B0
   *   s12/s13 = A0*B1   s14/s15 = A1*B1
   * with the sign of the a_im terms chosen by KMAC_R / KMAC_I.
   */
  128. .macro KERNEL2x2_I
  129. pld [ AO, #A_PRE ]
  130. vldmia.f32 AO!, { s0 - s3 }
  131. pld [ BO, #B_PRE ]
  132. vldmia.f32 BO!, { s4 - s7 }
  /* a_re * b terms for the first B value: these overwrite the accumulators */
  133. fmuls s8 , s0, s4
  134. fmuls s9 , s0, s5
  135. fmuls s10 , s2, s4
  136. fmuls s11 , s2, s5
  /* a_im * b terms for the first B value */
  137. KMAC_R s8 , s1, s5
  138. KMAC_I s9 , s1, s4
  139. KMAC_R s10 , s3, s5
  140. KMAC_I s11 , s3, s4
  /* same two steps for the second B value */
  141. fmuls s12 , s0, s6
  142. fmuls s13 , s0, s7
  143. fmuls s14 , s2, s6
  144. fmuls s15 , s2, s7
  145. KMAC_R s12 , s1, s7
  146. KMAC_I s13 , s1, s6
  147. KMAC_R s14 , s3, s7
  148. KMAC_I s15 , s3, s6
  149. .endm
  /*
   * KERNEL2x2_M1: one accumulating k-step of a 2x2 tile, with prefetch.
   * Same loads and register roles as KERNEL2x2_I, but every product is added
   * to (fmacs) or combined by KMAC_* with the existing s8-s15 contents.
   */
  150. .macro KERNEL2x2_M1
  151. pld [ AO, #A_PRE ]
  152. vldmia.f32 AO!, { s0 - s3 }
  153. pld [ BO, #B_PRE ]
  154. vldmia.f32 BO!, { s4 - s7 }
  155. fmacs s8 , s0, s4
  156. fmacs s9 , s0, s5
  157. fmacs s10 , s2, s4
  158. fmacs s11 , s2, s5
  159. KMAC_R s8 , s1, s5
  160. KMAC_I s9 , s1, s4
  161. KMAC_R s10 , s3, s5
  162. KMAC_I s11 , s3, s4
  163. fmacs s12 , s0, s6
  164. fmacs s13 , s0, s7
  165. fmacs s14 , s2, s6
  166. fmacs s15 , s2, s7
  167. KMAC_R s12 , s1, s7
  168. KMAC_I s13 , s1, s6
  169. KMAC_R s14 , s3, s7
  170. KMAC_I s15 , s3, s6
  171. .endm
  /*
   * KERNEL2x2_M2: one accumulating k-step of a 2x2 tile without prefetch.
   * Loads s0-s3 from AO and s4-s7 from BO (post-increment) and accumulates
   * the four complex products into s8-s15.
   */
  172. .macro KERNEL2x2_M2
  173. vldmia.f32 AO!, { s0 - s3 }
  174. vldmia.f32 BO!, { s4 - s7 }
  175. fmacs s8 , s0, s4
  176. fmacs s9 , s0, s5
  177. fmacs s10 , s2, s4
  178. fmacs s11 , s2, s5
  179. KMAC_R s8 , s1, s5
  180. KMAC_I s9 , s1, s4
  181. KMAC_R s10 , s3, s5
  182. KMAC_I s11 , s3, s4
  183. fmacs s12 , s0, s6
  184. fmacs s13 , s0, s7
  185. fmacs s14 , s2, s6
  186. fmacs s15 , s2, s7
  187. KMAC_R s12 , s1, s7
  188. KMAC_I s13 , s1, s6
  189. KMAC_R s14 , s3, s7
  190. KMAC_I s15 , s3, s6
  191. .endm
  /*
   * KERNEL2x2_E: k-step used as the last step of each unrolled group in the
   * main routine. Its instruction sequence is the same as KERNEL2x2_M2:
   * load s0-s3 / s4-s7 with post-increment and accumulate into s8-s15.
   */
  192. .macro KERNEL2x2_E
  193. vldmia.f32 AO!, { s0 - s3 }
  194. vldmia.f32 BO!, { s4 - s7 }
  195. fmacs s8 , s0, s4
  196. fmacs s9 , s0, s5
  197. fmacs s10 , s2, s4
  198. fmacs s11 , s2, s5
  199. KMAC_R s8 , s1, s5
  200. KMAC_I s9 , s1, s4
  201. KMAC_R s10 , s3, s5
  202. KMAC_I s11 , s3, s4
  203. fmacs s12 , s0, s6
  204. fmacs s13 , s0, s7
  205. fmacs s14 , s2, s6
  206. fmacs s15 , s2, s7
  207. KMAC_R s12 , s1, s7
  208. KMAC_I s13 , s1, s6
  209. KMAC_R s14 , s3, s7
  210. KMAC_I s15 , s3, s6
  211. .endm
  /*
   * KERNEL2x2_SUB: single accumulating k-step used by the remainder loop
   * (K1 & 7 iterations) of the 2x2 tile. It only accumulates, so s8-s15
   * must already hold valid partial sums or zeros when it runs.
   */
  212. .macro KERNEL2x2_SUB
  213. vldmia.f32 AO!, { s0 - s3 }
  214. vldmia.f32 BO!, { s4 - s7 }
  215. fmacs s8 , s0, s4
  216. fmacs s9 , s0, s5
  217. fmacs s10 , s2, s4
  218. fmacs s11 , s2, s5
  219. KMAC_R s8 , s1, s5
  220. KMAC_I s9 , s1, s4
  221. KMAC_R s10 , s3, s5
  222. KMAC_I s11 , s3, s4
  223. fmacs s12 , s0, s6
  224. fmacs s13 , s0, s7
  225. fmacs s14 , s2, s6
  226. fmacs s15 , s2, s7
  227. KMAC_R s12 , s1, s7
  228. KMAC_I s13 , s1, s6
  229. KMAC_R s14 , s3, s7
  230. KMAC_I s15 , s3, s6
  231. .endm
  /*
   * SAVE2x2: scale the 2x2 accumulators by alpha and store them.
   * CO2 is set to CO1 + LDC (LDC is kept in bytes). For each column the
   * result registers s4-s7 start from zero and receive alpha * accumulator
   * via FMAC_*, so the previous contents of C are not read. Four floats are
   * stored at CO1 and four at CO2, then CO1 advances by 16 bytes.
   * Clobbers r3, s0, s1 and s4-s7.
   */
  232. .macro SAVE2x2
  233. ldr r3 , LDC
  234. add CO2 , CO1, r3
  235. flds s0, ALPHA_R
  236. flds s1, ALPHA_I
  /* first column: accumulators s8-s11 */
  237. flds s4, FP_ZERO
  238. vmov.f32 s5, s4
  239. vmov.f32 s6, s4
  240. vmov.f32 s7, s4
  241. FMAC_R1 s4 , s0 , s8
  242. FMAC_I1 s5 , s0 , s9
  243. FMAC_R2 s4 , s1 , s9
  244. FMAC_I2 s5 , s1 , s8
  245. FMAC_R1 s6 , s0 , s10
  246. FMAC_I1 s7 , s0 , s11
  247. FMAC_R2 s6 , s1 , s11
  248. FMAC_I2 s7 , s1 , s10
  249. vstmia.f32 CO1, { s4 - s7 }
  /* second column: accumulators s12-s15 */
  250. flds s4, FP_ZERO
  251. vmov.f32 s5, s4
  252. vmov.f32 s6, s4
  253. vmov.f32 s7, s4
  254. FMAC_R1 s4 , s0 , s12
  255. FMAC_I1 s5 , s0 , s13
  256. FMAC_R2 s4 , s1 , s13
  257. FMAC_I2 s5 , s1 , s12
  258. FMAC_R1 s6 , s0 , s14
  259. FMAC_I1 s7 , s0 , s15
  260. FMAC_R2 s6 , s1 , s15
  261. FMAC_I2 s7 , s1 , s14
  262. vstmia.f32 CO2, { s4 - s7 }
  263. add CO1, CO1, #16
  264. .endm
  265. /******************************************************************************/
  /*
   * INIT1x2: load the FP_ZERO slot into s8 and copy it to s9, s12 and s13,
   * the four accumulators used by the 1x2 tile.
   */
  266. .macro INIT1x2
  267. flds s8 , FP_ZERO
  268. vmov.f32 s9 , s8
  269. vmov.f32 s12, s8
  270. vmov.f32 s13, s8
  271. .endm
  /*
   * KERNEL1x2_I: initialising k-step for a 1x2 tile. Loads two floats from
   * AO (s0, s1) and four from BO (s4-s7), writes s8/s9 and s12/s13 with
   * fmuls + KMAC_*, then advances BO by 16 bytes and AO by 8 bytes.
   * Not invoked by the main routine in this file, which uses KERNEL1x2_SUB.
   */
  272. .macro KERNEL1x2_I
  273. flds s0 , [ AO ]
  274. flds s1 , [ AO, #4 ]
  275. flds s4 , [ BO ]
  276. flds s5 , [ BO, #4 ]
  277. flds s6 , [ BO, #8 ]
  278. flds s7 , [ BO, #12 ]
  279. fmuls s8 , s0, s4
  280. KMAC_R s8 , s1, s5
  281. fmuls s9 , s0, s5
  282. KMAC_I s9 , s1, s4
  283. fmuls s12 , s0, s6
  284. KMAC_R s12 , s1, s7
  285. fmuls s13 , s0, s7
  286. KMAC_I s13 , s1, s6
  287. add BO , BO, #16
  288. add AO , AO, #8
  289. .endm
  /*
   * KERNEL1x2_M1: accumulating k-step for a 1x2 tile (s8/s9 and s12/s13),
   * consuming 8 bytes from AO and 16 bytes from BO.
   * Not invoked by the main routine in this file.
   */
  290. .macro KERNEL1x2_M1
  291. flds s0 , [ AO ]
  292. flds s1 , [ AO, #4 ]
  293. flds s4 , [ BO ]
  294. flds s5 , [ BO, #4 ]
  295. flds s6 , [ BO, #8 ]
  296. flds s7 , [ BO, #12 ]
  297. fmacs s8 , s0, s4
  298. KMAC_R s8 , s1, s5
  299. fmacs s9 , s0, s5
  300. KMAC_I s9 , s1, s4
  301. fmacs s12 , s0, s6
  302. KMAC_R s12 , s1, s7
  303. fmacs s13 , s0, s7
  304. KMAC_I s13 , s1, s6
  305. add BO , BO, #16
  306. add AO , AO, #8
  307. .endm
  /*
   * KERNEL1x2_M2: accumulating k-step for a 1x2 tile; the instruction
   * sequence is the same as KERNEL1x2_M1.
   * Not invoked by the main routine in this file.
   */
  308. .macro KERNEL1x2_M2
  309. flds s0 , [ AO ]
  310. flds s1 , [ AO, #4 ]
  311. flds s4 , [ BO ]
  312. flds s5 , [ BO, #4 ]
  313. flds s6 , [ BO, #8 ]
  314. flds s7 , [ BO, #12 ]
  315. fmacs s8 , s0, s4
  316. KMAC_R s8 , s1, s5
  317. fmacs s9 , s0, s5
  318. KMAC_I s9 , s1, s4
  319. fmacs s12 , s0, s6
  320. KMAC_R s12 , s1, s7
  321. fmacs s13 , s0, s7
  322. KMAC_I s13 , s1, s6
  323. add BO , BO, #16
  324. add AO , AO, #8
  325. .endm
  /*
   * KERNEL1x2_E: accumulating k-step for a 1x2 tile; the instruction
   * sequence is the same as KERNEL1x2_M1.
   * Not invoked by the main routine in this file.
   */
  326. .macro KERNEL1x2_E
  327. flds s0 , [ AO ]
  328. flds s1 , [ AO, #4 ]
  329. flds s4 , [ BO ]
  330. flds s5 , [ BO, #4 ]
  331. flds s6 , [ BO, #8 ]
  332. flds s7 , [ BO, #12 ]
  333. fmacs s8 , s0, s4
  334. KMAC_R s8 , s1, s5
  335. fmacs s9 , s0, s5
  336. KMAC_I s9 , s1, s4
  337. fmacs s12 , s0, s6
  338. KMAC_R s12 , s1, s7
  339. fmacs s13 , s0, s7
  340. KMAC_I s13 , s1, s6
  341. add BO , BO, #16
  342. add AO , AO, #8
  343. .endm
  /*
   * KERNEL1x2_SUB: the k-step the main routine uses for every iteration of
   * the 1x2 tile. Loads one A value (s0, s1) and two B values (s4-s7),
   * accumulates A*B0 into s8/s9 and A*B1 into s12/s13, then advances BO by
   * 16 bytes and AO by 8 bytes. Requires INIT1x2 to have run first.
   */
  344. .macro KERNEL1x2_SUB
  345. flds s0 , [ AO ]
  346. flds s1 , [ AO, #4 ]
  347. flds s4 , [ BO ]
  348. flds s5 , [ BO, #4 ]
  349. flds s6 , [ BO, #8 ]
  350. flds s7 , [ BO, #12 ]
  351. fmacs s8 , s0, s4
  352. KMAC_R s8 , s1, s5
  353. fmacs s9 , s0, s5
  354. KMAC_I s9 , s1, s4
  355. fmacs s12 , s0, s6
  356. KMAC_R s12 , s1, s7
  357. fmacs s13 , s0, s7
  358. KMAC_I s13 , s1, s6
  359. add BO , BO, #16
  360. add AO , AO, #8
  361. .endm
  /*
   * SAVE1x2: scale the 1x2 accumulators by alpha and store them.
   * CO2 = CO1 + LDC. s8/s9 go to CO1 and s12/s13 go to CO2, each as two
   * floats built up from zero in s4/s5 (C is not read). CO1 then advances
   * by 8 bytes. Clobbers r3, s0, s1, s4 and s5.
   */
  362. .macro SAVE1x2
  363. ldr r3 , LDC
  364. add CO2 , CO1, r3
  365. flds s0, ALPHA_R
  366. flds s1, ALPHA_I
  367. flds s4, FP_ZERO
  368. vmov.f32 s5, s4
  369. FMAC_R1 s4 , s0 , s8
  370. FMAC_I1 s5 , s0 , s9
  371. FMAC_R2 s4 , s1 , s9
  372. FMAC_I2 s5 , s1 , s8
  373. vstmia.f32 CO1, { s4 - s5 }
  374. flds s4, FP_ZERO
  375. vmov.f32 s5, s4
  376. FMAC_R1 s4 , s0 , s12
  377. FMAC_I1 s5 , s0 , s13
  378. FMAC_R2 s4 , s1 , s13
  379. FMAC_I2 s5 , s1 , s12
  380. vstmia.f32 CO2, { s4 - s5 }
  381. add CO1, CO1, #8
  382. .endm
  383. /******************************************************************************/
  /*
   * INIT2x1: load the FP_ZERO slot into s8 and copy it to s9-s11, the four
   * accumulators used by the 2x1 tile.
   */
  384. .macro INIT2x1
  385. flds s8 , FP_ZERO
  386. vmov.f32 s9 , s8
  387. vmov.f32 s10, s8
  388. vmov.f32 s11, s8
  389. .endm
  /*
   * KERNEL2x1_I: first k-step of a 2x1 tile. Loads two A values (s0-s3) and
   * one B value (s4, s5), writes s8/s9 = A0*B and s10/s11 = A1*B using
   * fmuls + KMAC_* (so no INIT2x1 is needed beforehand), then advances BO
   * by 8 bytes and AO by 16 bytes.
   */
  390. .macro KERNEL2x1_I
  391. flds s0 , [ AO ]
  392. flds s1 , [ AO, #4 ]
  393. flds s2 , [ AO, #8 ]
  394. flds s3 , [ AO, #12 ]
  395. flds s4 , [ BO ]
  396. flds s5 , [ BO, #4 ]
  397. fmuls s8 , s0, s4
  398. KMAC_R s8 , s1, s5
  399. fmuls s9 , s0, s5
  400. KMAC_I s9 , s1, s4
  401. fmuls s10 , s2, s4
  402. KMAC_R s10 , s3, s5
  403. fmuls s11 , s2, s5
  404. KMAC_I s11 , s3, s4
  405. add BO , BO, #8
  406. add AO , AO, #16
  407. .endm
  /*
   * KERNEL2x1_M1: accumulating k-step of a 2x1 tile. Same loads and pointer
   * updates as KERNEL2x1_I, but the products are added into s8-s11.
   */
  408. .macro KERNEL2x1_M1
  409. flds s0 , [ AO ]
  410. flds s1 , [ AO, #4 ]
  411. flds s2 , [ AO, #8 ]
  412. flds s3 , [ AO, #12 ]
  413. flds s4 , [ BO ]
  414. flds s5 , [ BO, #4 ]
  415. fmacs s8 , s0, s4
  416. KMAC_R s8 , s1, s5
  417. fmacs s9 , s0, s5
  418. KMAC_I s9 , s1, s4
  419. fmacs s10 , s2, s4
  420. KMAC_R s10 , s3, s5
  421. fmacs s11 , s2, s5
  422. KMAC_I s11 , s3, s4
  423. add BO , BO, #8
  424. add AO , AO, #16
  425. .endm
  /*
   * KERNEL2x1_M2: accumulating k-step of a 2x1 tile; the instruction
   * sequence is the same as KERNEL2x1_M1.
   */
  426. .macro KERNEL2x1_M2
  427. flds s0 , [ AO ]
  428. flds s1 , [ AO, #4 ]
  429. flds s2 , [ AO, #8 ]
  430. flds s3 , [ AO, #12 ]
  431. flds s4 , [ BO ]
  432. flds s5 , [ BO, #4 ]
  433. fmacs s8 , s0, s4
  434. KMAC_R s8 , s1, s5
  435. fmacs s9 , s0, s5
  436. KMAC_I s9 , s1, s4
  437. fmacs s10 , s2, s4
  438. KMAC_R s10 , s3, s5
  439. fmacs s11 , s2, s5
  440. KMAC_I s11 , s3, s4
  441. add BO , BO, #8
  442. add AO , AO, #16
  443. .endm
  /*
   * KERNEL2x1_E: k-step used as the last step of each unrolled 2x1 group in
   * the main routine; the instruction sequence is the same as KERNEL2x1_M1.
   */
  444. .macro KERNEL2x1_E
  445. flds s0 , [ AO ]
  446. flds s1 , [ AO, #4 ]
  447. flds s2 , [ AO, #8 ]
  448. flds s3 , [ AO, #12 ]
  449. flds s4 , [ BO ]
  450. flds s5 , [ BO, #4 ]
  451. fmacs s8 , s0, s4
  452. KMAC_R s8 , s1, s5
  453. fmacs s9 , s0, s5
  454. KMAC_I s9 , s1, s4
  455. fmacs s10 , s2, s4
  456. KMAC_R s10 , s3, s5
  457. fmacs s11 , s2, s5
  458. KMAC_I s11 , s3, s4
  459. add BO , BO, #8
  460. add AO , AO, #16
  461. .endm
  /*
   * KERNEL2x1_SUB: single accumulating k-step used by the remainder loop
   * (K1 & 7 iterations) of the 2x1 tile. It only accumulates, so s8-s11
   * must already hold valid partial sums or zeros when it runs.
   */
  462. .macro KERNEL2x1_SUB
  463. flds s0 , [ AO ]
  464. flds s1 , [ AO, #4 ]
  465. flds s2 , [ AO, #8 ]
  466. flds s3 , [ AO, #12 ]
  467. flds s4 , [ BO ]
  468. flds s5 , [ BO, #4 ]
  469. fmacs s8 , s0, s4
  470. KMAC_R s8 , s1, s5
  471. fmacs s9 , s0, s5
  472. KMAC_I s9 , s1, s4
  473. fmacs s10 , s2, s4
  474. KMAC_R s10 , s3, s5
  475. fmacs s11 , s2, s5
  476. KMAC_I s11 , s3, s4
  477. add BO , BO, #8
  478. add AO , AO, #16
  479. .endm
  /*
   * SAVE2x1: scale the 2x1 accumulators s8-s11 by alpha and store the four
   * resulting floats at CO1. The result registers s4-s7 start from zero, so
   * C is not read. CO1 then advances by 16 bytes. CO2 and LDC are not used.
   * Clobbers s0, s1 and s4-s7.
   */
  480. .macro SAVE2x1
  481. flds s0, ALPHA_R
  482. flds s1, ALPHA_I
  483. flds s4, FP_ZERO
  484. vmov.f32 s5, s4
  485. vmov.f32 s6, s4
  486. vmov.f32 s7, s4
  487. FMAC_R1 s4 , s0 , s8
  488. FMAC_I1 s5 , s0 , s9
  489. FMAC_R2 s4 , s1 , s9
  490. FMAC_I2 s5 , s1 , s8
  491. FMAC_R1 s6 , s0 , s10
  492. FMAC_I1 s7 , s0 , s11
  493. FMAC_R2 s6 , s1 , s11
  494. FMAC_I2 s7 , s1 , s10
  495. vstmia.f32 CO1, { s4 - s7 }
  496. add CO1, CO1, #16
  497. .endm
  498. /******************************************************************************/
  /* INIT1x1: load the FP_ZERO slot into s8 and copy it to s9, the two accumulators of the 1x1 tile. */
  499. .macro INIT1x1
  500. flds s8 , FP_ZERO
  501. vmov.f32 s9 , s8
  502. .endm
  /*
   * KERNEL1x1_I: initialising k-step for a 1x1 tile. Loads one A value
   * (s0, s1) and one B value (s4, s5), writes s8/s9 with fmuls + KMAC_*,
   * then advances both BO and AO by 8 bytes.
   * Not invoked by the main routine in this file, which uses KERNEL1x1_SUB.
   */
  503. .macro KERNEL1x1_I
  504. flds s0 , [ AO ]
  505. flds s1 , [ AO, #4 ]
  506. flds s4 , [ BO ]
  507. flds s5 , [ BO, #4 ]
  508. fmuls s8 , s0, s4
  509. KMAC_R s8 , s1, s5
  510. fmuls s9 , s0, s5
  511. KMAC_I s9 , s1, s4
  512. add BO , BO, #8
  513. add AO , AO, #8
  514. .endm
  /*
   * KERNEL1x1_M1: accumulating k-step for a 1x1 tile (s8/s9), consuming
   * 8 bytes from each of AO and BO.
   * Not invoked by the main routine in this file.
   */
  515. .macro KERNEL1x1_M1
  516. flds s0 , [ AO ]
  517. flds s1 , [ AO, #4 ]
  518. flds s4 , [ BO ]
  519. flds s5 , [ BO, #4 ]
  520. fmacs s8 , s0, s4
  521. KMAC_R s8 , s1, s5
  522. fmacs s9 , s0, s5
  523. KMAC_I s9 , s1, s4
  524. add BO , BO, #8
  525. add AO , AO, #8
  526. .endm
  /*
   * KERNEL1x1_M2: accumulating k-step for a 1x1 tile; the instruction
   * sequence is the same as KERNEL1x1_M1.
   * Not invoked by the main routine in this file.
   */
  527. .macro KERNEL1x1_M2
  528. flds s0 , [ AO ]
  529. flds s1 , [ AO, #4 ]
  530. flds s4 , [ BO ]
  531. flds s5 , [ BO, #4 ]
  532. fmacs s8 , s0, s4
  533. KMAC_R s8 , s1, s5
  534. fmacs s9 , s0, s5
  535. KMAC_I s9 , s1, s4
  536. add BO , BO, #8
  537. add AO , AO, #8
  538. .endm
  /*
   * KERNEL1x1_E: accumulating k-step for a 1x1 tile; the instruction
   * sequence is the same as KERNEL1x1_M1.
   * Not invoked by the main routine in this file.
   */
  539. .macro KERNEL1x1_E
  540. flds s0 , [ AO ]
  541. flds s1 , [ AO, #4 ]
  542. flds s4 , [ BO ]
  543. flds s5 , [ BO, #4 ]
  544. fmacs s8 , s0, s4
  545. KMAC_R s8 , s1, s5
  546. fmacs s9 , s0, s5
  547. KMAC_I s9 , s1, s4
  548. add BO , BO, #8
  549. add AO , AO, #8
  550. .endm
  /*
   * KERNEL1x1_SUB: the k-step the main routine uses for every iteration of
   * the 1x1 tile. Accumulates one complex product A*B into s8/s9 and
   * advances AO and BO by 8 bytes each. Requires INIT1x1 to have run first.
   */
  551. .macro KERNEL1x1_SUB
  552. flds s0 , [ AO ]
  553. flds s1 , [ AO, #4 ]
  554. flds s4 , [ BO ]
  555. flds s5 , [ BO, #4 ]
  556. fmacs s8 , s0, s4
  557. KMAC_R s8 , s1, s5
  558. fmacs s9 , s0, s5
  559. KMAC_I s9 , s1, s4
  560. add BO , BO, #8
  561. add AO , AO, #8
  562. .endm
  /*
   * SAVE1x1: scale the accumulator pair s8/s9 by alpha and store the two
   * resulting floats at CO1. s4/s5 start from zero, so C is not read.
   * CO1 then advances by 8 bytes. Clobbers s0, s1, s4 and s5.
   */
  563. .macro SAVE1x1
  564. flds s0, ALPHA_R
  565. flds s1, ALPHA_I
  566. flds s4, FP_ZERO
  567. vmov.f32 s5, s4
  568. FMAC_R1 s4 , s0 , s8
  569. FMAC_I1 s5 , s0 , s9
  570. FMAC_R2 s4 , s1 , s9
  571. FMAC_I2 s5 , s1 , s8
  572. vstmia.f32 CO1, { s4 - s5 }
  573. add CO1, CO1, #8
  574. .endm
  575. /**************************************************************************************
  576. * End of macro definitions
  577. **************************************************************************************/
  /*
   * Function entry. Saves r4-r9 and fp, points fp at the saved-fp slot
   * (sp + 24) and reserves STACKSIZE bytes of locals. The register
   * arguments are then spilled to their frame slots because r0-r3 are
   * reused as loop counters and scratch. Also derives LDC in bytes, the
   * initial KK, the B panel base (BC) and the column-pair count J.
   */
  578. PROLOGUE
  579. .align 5
  580. push {r4 - r9, fp}
  581. add fp, sp, #24
  582. sub sp, sp, #STACKSIZE // reserve stack
  /* Soft-float argument passing: fetch alpha from r3 / the stack and A from the stack. */
  583. #if !defined(__ARM_PCS_VFP)
  584. vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
  585. vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
  586. ldr OLD_A, OLD_A_SOFTFP
  587. #endif
  588. str OLD_M, M
  589. str OLD_N, N
  590. str OLD_K, K
  591. str OLD_A, A
  592. vstr OLD_ALPHA_R, ALPHA_R
  593. vstr OLD_ALPHA_I, ALPHA_I
  /* s8-s15 are used as accumulators; their entry values are kept at fp-128 and reloaded on exit. */
  594. sub r3, fp, #128
  595. vstm r3, { s8 - s15} // store floating point registers
  /* Integer 0 has the same bit pattern as +0.0f; the INIT/SAVE macros load it through FP_ZERO. */
  596. movs r4, #0
  597. str r4, FP_ZERO
  598. str r4, FP_ZERO_1
  599. ldr r3, OLD_LDC
  600. lsl r3, r3, #3 // ldc = ldc * 4 * 2
  601. str r3, LDC
  /* KK starts as offset, negated when LEFT is not defined. */
  602. ldr r3, OFFSET
  603. #ifndef LEFT
  604. neg r3 , r3
  605. #endif
  606. str r3 , KK
  607. ldr BC, B
  608. ldr J, N
  609. asrs J, J, #1 // J = J / 2
  /*
   * asrs leaves the V flag untouched and nothing above writes it, so V still
   * holds whatever the caller left behind. ble tests (Z == 1 || N != V), so
   * compare explicitly to get a well-defined "J <= 0" test. cmp against #0
   * also clears V, which the later asrs/tst/ands + ble sequences rely on.
   */
  cmp J, #0
  610. ble _L1_BEGIN
  /*
   * Top of the loop over pairs of output columns (J iterations).
   * CO1 takes the current C pointer and the stored C is advanced by 2*LDC
   * bytes for the next pass. With LEFT defined, KK restarts from OFFSET for
   * every column pair. AO restarts at the beginning of A.
   */
  611. _L2_BEGIN:
  612. ldr CO1, C // CO1 = C
  613. ldr r4 , LDC
  614. lsl r4 , r4 , #1 // LDC * 2
  615. add r3 , r4, CO1
  616. str r3 , C // store C
  617. #if defined(LEFT)
  618. ldr r3 , OFFSET
  619. str r3 , KK
  620. #endif
  621. ldr AO, A // AO = A
  /* Prefetch 32 and 64 bytes ahead of AO (A_PRE is 96). */
  622. pld [AO , #A_PRE-64]
  623. pld [AO , #A_PRE-32]
  /*
   * Row-pair loop for the current column pair: I = M / 2 tiles of 2x2.
   * NOTE(review): asrs does not update V, so the following ble depends on V
   * having been left clear by an earlier instruction.
   */
  624. _L2_M2_BEGIN:
  625. ldr I, M
  626. asrs I, I, #1 // I = I / 2
  627. ble _L2_M1_BEGIN
  /*
   * Per-tile setup. BO restarts at the B panel base; in the #else variant
   * both BO and AO are additionally moved forward by KK k-steps
   * (16 bytes per k-step for each of them in a 2x2 tile).
   */
  628. _L2_M2_20:
  629. #if (defined(LEFT) && defined(TRANSA)) || \
  630. (!defined(LEFT) && !defined(TRANSA))
  631. mov BO, BC
  632. #else
  633. mov BO, BC
  634. ldr r3 , KK
  635. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  636. add BO , BO , r4
  637. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  638. add AO , AO , r4
  639. #endif
  /*
   * K1 = number of k-steps for this tile: K, K - KK, or KK + 2 depending on
   * the variant. In the TRMMKERNEL variants it is also saved in KKK for the
   * pointer fix-up after SAVE2x2.
   */
  640. #ifndef TRMMKERNEL
  641. ldr K1, K
  642. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  643. ldr K1, K
  644. ldr r3, KK
  645. sub K1, K1, r3
  646. str K1, KKK
  647. #else
  648. ldr K1, KK
  649. #ifdef LEFT
  650. add K1, K1, #2 // number of values in AO
  651. #else
  652. add K1, K1, #2 // number of values in BO
  653. #endif
  654. str K1, KKK
  655. #endif
  /* L = K1 / 8 groups of eight k-steps; fewer than three groups are handled at _L2_M2_30. */
  656. asrs L , K1, #3 // L = L / 8
  657. cmp L , #3
  658. blt _L2_M2_30
  /*
   * k loop of the 2x2 tile, unrolled by eight.
   * L >= 3: a first group of eight steps starting with KERNEL2x2_I (which
   * initialises the accumulators), L - 2 passes through _L2_M2_22, and a
   * final group of eight ending with KERNEL2x2_E: 8 * L steps in total.
   */
  659. .align 5
  660. KERNEL2x2_I
  661. KERNEL2x2_M2
  662. KERNEL2x2_M1
  663. KERNEL2x2_M2
  664. KERNEL2x2_M1
  665. KERNEL2x2_M2
  666. KERNEL2x2_M1
  667. KERNEL2x2_M2
  668. sub L, L, #2
  669. _L2_M2_22:
  670. KERNEL2x2_M1
  671. KERNEL2x2_M2
  672. KERNEL2x2_M1
  673. KERNEL2x2_M2
  674. KERNEL2x2_M1
  675. KERNEL2x2_M2
  676. KERNEL2x2_M1
  677. KERNEL2x2_M2
  678. subs L, L, #1
  679. bgt _L2_M2_22
  680. KERNEL2x2_M1
  681. KERNEL2x2_M2
  682. KERNEL2x2_M1
  683. KERNEL2x2_M2
  684. KERNEL2x2_M1
  685. KERNEL2x2_M2
  686. KERNEL2x2_M1
  687. KERNEL2x2_E
  688. b _L2_M2_44
  /*
   * L < 3. tst only updates N and Z, so each ble below also depends on the V
   * flag left by the earlier "cmp L , #3".
   * L == 0 goes to _L2_M2_40, bit 1 clear goes to _L2_M2_32, otherwise
   * sixteen steps are executed here.
   */
  689. _L2_M2_30:
  690. tst L, #3
  691. ble _L2_M2_40
  692. tst L, #2
  693. ble _L2_M2_32
  694. KERNEL2x2_I
  695. KERNEL2x2_M2
  696. KERNEL2x2_M1
  697. KERNEL2x2_M2
  698. KERNEL2x2_M1
  699. KERNEL2x2_M2
  700. KERNEL2x2_M1
  701. KERNEL2x2_M2
  702. KERNEL2x2_M1
  703. KERNEL2x2_M2
  704. KERNEL2x2_M1
  705. KERNEL2x2_M2
  706. KERNEL2x2_M1
  707. KERNEL2x2_M2
  708. KERNEL2x2_M1
  709. KERNEL2x2_E
  710. b _L2_M2_44
  /* Bit 0 of L set: one group of eight steps. */
  711. _L2_M2_32:
  712. tst L, #1
  713. ble _L2_M2_40
  714. KERNEL2x2_I
  715. KERNEL2x2_M2
  716. KERNEL2x2_M1
  717. KERNEL2x2_M2
  718. KERNEL2x2_M1
  719. KERNEL2x2_M2
  720. KERNEL2x2_M1
  721. KERNEL2x2_E
  722. b _L2_M2_44
  /* No full group was executed, so the accumulators have to be cleared explicitly. */
  723. _L2_M2_40:
  724. INIT2x2
  /* Remaining K1 & 7 steps, one at a time. */
  725. _L2_M2_44:
  726. ands L , K1, #7 // L = L % 8
  727. ble _L2_M2_100
  728. _L2_M2_46:
  729. KERNEL2x2_SUB
  730. subs L, L, #1
  731. bne _L2_M2_46
  /*
   * Store the finished 2x2 tile and move on to the next row pair.
   * In the first #if variant AO and BO are pushed past the K - KKK k-steps
   * this tile did not consume (16 bytes per step each). With LEFT defined,
   * KK grows by 2 for the next tile.
   */
  732. _L2_M2_100:
  733. SAVE2x2
  734. #if (defined(LEFT) && defined(TRANSA)) || \
  735. (!defined(LEFT) && !defined(TRANSA))
  736. ldr r3 , K
  737. ldr r4 , KKK
  738. sub r3 , r3 , r4
  739. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  740. add BO , BO , r4
  741. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  742. add AO , AO , r4
  743. #endif
  744. #if defined(LEFT)
  745. ldr r3 , KK
  746. add r3 , r3 , #2 // number of values in AO
  747. str r3 , KK
  748. #endif
  749. _L2_M2_END:
  750. subs I, I, #1
  751. bne _L2_M2_20
  /*
   * Odd-row tail for the current column pair: runs once when bit 0 of M is
   * set and computes a single 1x2 tile.
   * NOTE(review): tst does not update V, so the ble depends on V having been
   * left clear by an earlier instruction.
   */
  752. _L2_M1_BEGIN:
  753. ldr I, M
  754. tst I, #1 // I = I % 2
  755. ble _L2_END
  /* Only accumulating kernels are used below, so clear the accumulators first. */
  756. _L2_M1_20:
  757. INIT1x2
  /* BO restarts at the B panel base; the #else variant skips KK k-steps (16 bytes in B, 8 bytes in A). */
  758. #if (defined(LEFT) && defined(TRANSA)) || \
  759. (!defined(LEFT) && !defined(TRANSA))
  760. mov BO, BC
  761. #else
  762. mov BO, BC
  763. ldr r3 , KK
  764. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  765. add BO , BO , r4
  766. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  767. add AO , AO , r4
  768. #endif
  /* K1 = k-steps for this tile: K, K - KK, or KK + 1 (LEFT) / KK + 2 (not LEFT). */
  769. #ifndef TRMMKERNEL
  770. ldr K1, K
  771. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  772. ldr K1, K
  773. ldr r3, KK
  774. sub K1, K1, r3
  775. str K1, KKK
  776. #else
  777. ldr K1, KK
  778. #ifdef LEFT
  779. add K1, K1, #1 // number of values in AO
  780. #else
  781. add K1, K1, #2 // number of values in BO
  782. #endif
  783. str K1, KKK
  784. #endif
  /* K1 / 8 passes of eight k-steps each. */
  785. asrs L , K1, #3 // L = L / 8
  786. ble _L2_M1_40
  787. _L2_M1_22:
  788. KERNEL1x2_SUB
  789. KERNEL1x2_SUB
  790. KERNEL1x2_SUB
  791. KERNEL1x2_SUB
  792. KERNEL1x2_SUB
  793. KERNEL1x2_SUB
  794. KERNEL1x2_SUB
  795. KERNEL1x2_SUB
  796. subs L, L, #1
  797. bgt _L2_M1_22
  /* Remaining K1 & 7 steps. */
  798. _L2_M1_40:
  799. ands L , K1, #7 // L = L % 8
  800. ble _L2_M1_100
  801. _L2_M1_42:
  802. KERNEL1x2_SUB
  803. subs L, L, #1
  804. bgt _L2_M1_42
  /* Store the tile, skip the K - KKK unused k-steps in the first variant, and bump KK by 1 when LEFT. */
  805. _L2_M1_100:
  806. SAVE1x2
  807. #if (defined(LEFT) && defined(TRANSA)) || \
  808. (!defined(LEFT) && !defined(TRANSA))
  809. ldr r3 , K
  810. ldr r4 , KKK
  811. sub r3 , r3 , r4
  812. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  813. add BO , BO , r4
  814. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  815. add AO , AO , r4
  816. #endif
  817. #if defined(LEFT)
  818. ldr r3 , KK
  819. add r3 , r3 , #1 // number of values in AO
  820. str r3 , KK
  821. #endif
  /*
   * End of one column pair: advance the B panel base BC by K * 16 bytes,
   * add 2 to KK when LEFT is not defined, and repeat while J > 0.
   */
  822. _L2_END:
  823. mov r3, BC
  824. ldr r4, K
  825. lsl r4, r4, #4 // k * 2 * 4 * 2
  826. add r3, r3, r4 // B = B + K * 2 * 8
  827. mov BC, r3
  828. #if !defined(LEFT)
  829. ldr r3 , KK
  830. add r3 , r3 , #2 // number of values in BO
  831. str r3 , KK
  832. #endif
  833. subs J , #1 // j--
  834. bgt _L2_BEGIN
  835. /*********************************************************************************************/
  /*
   * Single remaining column: executed only when bit 0 of N is set.
   * CO1 takes the current C pointer and the stored C is advanced by one LDC.
   * With LEFT defined, KK restarts from OFFSET. AO restarts at the
   * beginning of A.
   * NOTE(review): tst does not update V, so the ble depends on V having been
   * left clear by an earlier instruction.
   */
  836. _L1_BEGIN:
  837. ldr J , N
  838. tst J , #1
  839. ble _L999
  840. ldr CO1, C // CO1 = C
  841. ldr r4 , LDC
  842. add r3 , r4, CO1
  843. str r3 , C // store C
  844. #if defined(LEFT)
  845. ldr r3 , OFFSET
  846. str r3 , KK
  847. #endif
  848. ldr AO, A // AO = A
  /* Row-pair loop for the single remaining column: I = M / 2 tiles of 2x1. */
  849. _L1_M2_BEGIN:
  850. ldr I, M
  851. asrs I, I, #1 // I = I / 2
  852. ble _L1_M1_BEGIN
  /*
   * Per-tile setup. BO restarts at the B panel base; the #else variant skips
   * KK k-steps (8 bytes per step in B, 16 bytes per step in A).
   */
  853. _L1_M2_20:
  854. #if (defined(LEFT) && defined(TRANSA)) || \
  855. (!defined(LEFT) && !defined(TRANSA))
  856. mov BO, BC
  857. #else
  858. mov BO, BC
  859. ldr r3 , KK
  860. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  861. add BO , BO , r4
  862. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  863. add AO , AO , r4
  864. #endif
  /* K1 = k-steps for this tile: K, K - KK, or KK + 2 (LEFT) / KK + 1 (not LEFT); saved in KKK for the fix-up after SAVE2x1. */
  865. #ifndef TRMMKERNEL
  866. ldr K1, K
  867. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  868. ldr K1, K
  869. ldr r3, KK
  870. sub K1, K1, r3
  871. str K1, KKK
  872. #else
  873. ldr K1, KK
  874. #ifdef LEFT
  875. add K1, K1, #2 // number of values in AO
  876. #else
  877. add K1, K1, #1 // number of values in BO
  878. #endif
  879. str K1, KKK
  880. #endif
  /* L = K1 / 8 groups of eight k-steps; fewer than three groups are handled at _L1_M2_30. */
  881. asrs L , K1, #3 // L = L / 8
  882. cmp L , #3
  883. blt _L1_M2_30
  /*
   * k loop of the 2x1 tile, unrolled by eight.
   * L >= 3: a first group of eight steps starting with KERNEL2x1_I (which
   * initialises the accumulators), L - 2 passes through _L1_M2_22, and a
   * final group of eight ending with KERNEL2x1_E: 8 * L steps in total.
   */
  884. .align 5
  885. KERNEL2x1_I
  886. KERNEL2x1_M2
  887. KERNEL2x1_M1
  888. KERNEL2x1_M2
  889. KERNEL2x1_M1
  890. KERNEL2x1_M2
  891. KERNEL2x1_M1
  892. KERNEL2x1_M2
  893. sub L, L, #2
  894. _L1_M2_22:
  895. KERNEL2x1_M1
  896. KERNEL2x1_M2
  897. KERNEL2x1_M1
  898. KERNEL2x1_M2
  899. KERNEL2x1_M1
  900. KERNEL2x1_M2
  901. KERNEL2x1_M1
  902. KERNEL2x1_M2
  903. subs L, L, #1
  904. bgt _L1_M2_22
  905. KERNEL2x1_M1
  906. KERNEL2x1_M2
  907. KERNEL2x1_M1
  908. KERNEL2x1_M2
  909. KERNEL2x1_M1
  910. KERNEL2x1_M2
  911. KERNEL2x1_M1
  912. KERNEL2x1_E
  913. b _L1_M2_44
  /*
   * L < 3. tst only updates N and Z, so each ble below also depends on the V
   * flag left by the earlier "cmp L , #3".
   * L == 0 goes to _L1_M2_40, bit 1 clear goes to _L1_M2_32, otherwise
   * sixteen steps are executed here.
   */
  914. _L1_M2_30:
  915. tst L, #3
  916. ble _L1_M2_40
  917. tst L, #2
  918. ble _L1_M2_32
  919. KERNEL2x1_I
  920. KERNEL2x1_M2
  921. KERNEL2x1_M1
  922. KERNEL2x1_M2
  923. KERNEL2x1_M1
  924. KERNEL2x1_M2
  925. KERNEL2x1_M1
  926. KERNEL2x1_M2
  927. KERNEL2x1_M1
  928. KERNEL2x1_M2
  929. KERNEL2x1_M1
  930. KERNEL2x1_M2
  931. KERNEL2x1_M1
  932. KERNEL2x1_M2
  933. KERNEL2x1_M1
  934. KERNEL2x1_E
  935. b _L1_M2_44
  /* Bit 0 of L set: one group of eight steps. */
  936. _L1_M2_32:
  937. tst L, #1
  938. ble _L1_M2_40
  939. KERNEL2x1_I
  940. KERNEL2x1_M2
  941. KERNEL2x1_M1
  942. KERNEL2x1_M2
  943. KERNEL2x1_M1
  944. KERNEL2x1_M2
  945. KERNEL2x1_M1
  946. KERNEL2x1_E
  947. b _L1_M2_44
  /* No full group was executed, so the accumulators have to be cleared explicitly. */
  948. _L1_M2_40:
  949. INIT2x1
  /* Remaining K1 & 7 steps, one at a time. */
  950. _L1_M2_44:
  951. ands L , K1, #7 // L = L % 8
  952. ble _L1_M2_100
  953. _L1_M2_46:
  954. KERNEL2x1_SUB
  955. subs L, L, #1
  956. bne _L1_M2_46
  /*
   * Store the finished 2x1 tile and move on to the next row pair.
   * In the first #if variant BO and AO are pushed past the K - KKK k-steps
   * this tile did not consume (8 bytes per step in B, 16 in A). With LEFT
   * defined, KK grows by 2 for the next tile.
   */
  957. _L1_M2_100:
  958. SAVE2x1
  959. #if (defined(LEFT) && defined(TRANSA)) || \
  960. (!defined(LEFT) && !defined(TRANSA))
  961. ldr r3 , K
  962. ldr r4 , KKK
  963. sub r3 , r3 , r4
  964. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  965. add BO , BO , r4
  966. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  967. add AO , AO , r4
  968. #endif
  969. #if defined(LEFT)
  970. ldr r3 , KK
  971. add r3 , r3 , #2 // number of values in AO
  972. str r3 , KK
  973. #endif
  974. _L1_M2_END:
  975. subs I, I, #1
  976. bne _L1_M2_20
  /*
   * Final 1x1 tile: executed only when bit 0 of M is set (and, since this
   * code is reached through _L1_BEGIN, bit 0 of N as well). No pointer or
   * KK fix-up follows SAVE1x1 because control falls straight through to
   * the function exit.
   */
  977. _L1_M1_BEGIN:
  978. ldr I, M
  979. tst I, #1 // I = I % 2
  980. ble _L1_END
  /* Only accumulating kernels are used below, so clear the accumulators first. */
  981. _L1_M1_20:
  982. INIT1x1
  /* BO restarts at the B panel base; the #else variant skips KK k-steps (8 bytes each in B and A). */
  983. #if (defined(LEFT) && defined(TRANSA)) || \
  984. (!defined(LEFT) && !defined(TRANSA))
  985. mov BO, BC
  986. #else
  987. mov BO, BC
  988. ldr r3 , KK
  989. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  990. add BO , BO , r4
  991. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  992. add AO , AO , r4
  993. #endif
  /* K1 = k-steps for this tile: K, K - KK, or KK + 1. */
  994. #ifndef TRMMKERNEL
  995. ldr K1, K
  996. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  997. ldr K1, K
  998. ldr r3, KK
  999. sub K1, K1, r3
  1000. str K1, KKK
  1001. #else
  1002. ldr K1, KK
  1003. #ifdef LEFT
  1004. add K1, K1, #1 // number of values in AO
  1005. #else
  1006. add K1, K1, #1 // number of values in BO
  1007. #endif
  1008. str K1, KKK
  1009. #endif
  /* K1 / 8 passes of eight k-steps each. */
  1010. asrs L , K1, #3 // L = L / 8
  1011. ble _L1_M1_40
  1012. _L1_M1_22:
  1013. KERNEL1x1_SUB
  1014. KERNEL1x1_SUB
  1015. KERNEL1x1_SUB
  1016. KERNEL1x1_SUB
  1017. KERNEL1x1_SUB
  1018. KERNEL1x1_SUB
  1019. KERNEL1x1_SUB
  1020. KERNEL1x1_SUB
  1021. subs L, L, #1
  1022. bgt _L1_M1_22
  /* Remaining K1 & 7 steps. */
  1023. _L1_M1_40:
  1024. ands L , K1, #7 // L = L % 8
  1025. ble _L1_M1_100
  1026. _L1_M1_42:
  1027. KERNEL1x1_SUB
  1028. subs L, L, #1
  1029. bgt _L1_M1_42
  1030. _L1_M1_100:
  1031. SAVE1x1
  /*
   * Function exit: reload s8-s15 from the save area at fp-128, return 0 in
   * r0, drop the local frame by resetting sp to fp - 24 (the address of the
   * saved r4), restore r4-r9 and fp, and return to the caller.
   */
  1032. _L1_END:
  1033. _L999:
  1034. sub r3, fp, #128
  1035. vldm r3, { s8 - s15} // restore floating point registers
  1036. movs r0, #0 // set return value
  1037. sub sp, fp, #24
  1038. pop {r4 - r9, fp}
  1039. bx lr
  1040. EPILOGUE