You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ztrmm_kernel_2x2_vfp.S 26 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably the header keys off it -- confirm. */
  34. #define ASSEMBLER
  35. #include "common.h"
  36. #define STACKSIZE 256 // bytes subtracted from sp in the function prologue
  /* Names under which the prologue reads its incoming register arguments. */
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA_R d0
  42. #define OLD_ALPHA_I d1
  43. /******************************************************
  44. * [fp, #-128] - [fp, #-64] is reserved
  45. * for store and restore of floating point
  46. * registers
  47. *******************************************************/
  /* Local variables kept in the stack frame, addressed at negative offsets from fp. */
  48. #define KKK [fp, #-240]
  49. #define KK [fp, #-244 ]
  50. #define A [fp, #-248 ]
  51. #define LDC [fp, #-252 ]
  52. #define M [fp, #-256 ]
  53. #define N [fp, #-260 ]
  54. #define K [fp, #-264 ]
  /* FP_ZERO and FP_ZERO_0 name the same slot; FP_ZERO_1 is the word 4 bytes above it.
     The prologue stores 0 to both words so the pair can be loaded as a double 0.0. */
  55. #define FP_ZERO [fp, #-232]
  56. #define FP_ZERO_0 [fp, #-232]
  57. #define FP_ZERO_1 [fp, #-228]
  58. #define ALPHA_I [fp, #-272]
  59. #define ALPHA_R [fp, #-280]
  /* Arguments read at positive offsets from fp. When __ARM_PCS_VFP is not defined,
     alpha (real, imaginary) and A are also read from there and the remaining
     offsets shift up accordingly. */
  60. #if !defined(__ARM_PCS_VFP)
  61. #define OLD_ALPHAR_SOFTFP [fp, #4]
  62. #define OLD_ALPHAI_SOFTFP [fp, #12]
  63. #define OLD_A_SOFTFP [fp, #20 ]
  64. #define B [fp, #24 ]
  65. #define C [fp, #28 ]
  66. #define OLD_LDC [fp, #32 ]
  67. #define OFFSET [fp, #36 ]
  68. #else
  69. #define B [fp, #4 ]
  70. #define C [fp, #8 ]
  71. #define OLD_LDC [fp, #12 ]
  72. #define OFFSET [fp, #16 ]
  73. #endif
  /* Working registers. I, J and L reuse r0-r2, the registers that carry
     OLD_M, OLD_N and OLD_K on entry, so the prologue spills those first. */
  74. #define I r0
  75. #define J r1
  76. #define L r2
  77. #define AO r5 // running pointer into A
  78. #define BO r6 // running pointer into B
  79. #define CO1 r8 // output pointer, first column of the tile
  80. #define CO2 r9 // output pointer, second column (CO1 + LDC)
  81. #define K1 r7 // trip count of the inner loop for the current tile
  82. #define BC r12 // base of the B panel for the current column block
  /* Byte offsets used by the pld prefetch hints. C_PRE is not referenced in this file section. */
  83. #define A_PRE 96
  84. #define B_PRE 96
  85. #define C_PRE 64
  86. /**************************************************************************************
  87. * Multiply-accumulate opcode selection for each build variant (NN, NT, ..., CN, NC, ...)
  88. **************************************************************************************/
  /*
   * Pick, per build variant, whether each cross term is added (fmacd) or
   * subtracted (vmls.f64).
   *   KMAC_R / KMAC_I : second term of the first / second accumulator of a
   *                     pair inside the KERNEL* macros.
   *   FMAC_R1, FMAC_R2, FMAC_I1, FMAC_I2 : the four terms of the alpha
   *                     scaling inside the SAVE* macros.
   * Presumably the variant names encode which operand is conjugated; that
   * mapping is not spelled out in this file -- confirm against the build.
   */
  89. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  90. #define KMAC_R vmls.f64
  91. #define KMAC_I fmacd
  92. #define FMAC_R1 fmacd
  93. #define FMAC_R2 vmls.f64
  94. #define FMAC_I1 fmacd
  95. #define FMAC_I2 fmacd
  96. #elif defined(CN) || defined(CT)
  97. #define KMAC_R fmacd
  98. #define KMAC_I vmls.f64
  99. #define FMAC_R1 fmacd
  100. #define FMAC_R2 vmls.f64
  101. #define FMAC_I1 fmacd
  102. #define FMAC_I2 fmacd
  103. #elif defined(NC) || defined(TC)
  104. #define KMAC_R fmacd
  105. #define KMAC_I vmls.f64
  106. #define FMAC_R1 fmacd
  107. #define FMAC_R2 fmacd
  108. #define FMAC_I1 vmls.f64
  109. #define FMAC_I2 fmacd
  /* Fallback when none of the variant macros above is defined. */
  110. #else
  111. #define KMAC_R vmls.f64
  112. #define KMAC_I fmacd
  113. #define FMAC_R1 fmacd
  114. #define FMAC_R2 fmacd
  115. #define FMAC_I1 vmls.f64
  116. #define FMAC_I2 fmacd
  117. #endif
  118. /**************************************************************************************
  119. * Macro definitions
  120. **************************************************************************************/
  /* INIT2x2: set the eight 2x2-tile accumulators d8-d15 to the value held in
     the FP_ZERO stack slot (both words of that slot are zeroed in the prologue). */
  121. .macro INIT2x2
  122. fldd d8 , FP_ZERO // one load from memory; the rest are register copies
  123. vmov.f64 d9 , d8
  124. vmov.f64 d10, d8
  125. vmov.f64 d11, d8
  126. vmov.f64 d12, d8
  127. vmov.f64 d13, d8
  128. vmov.f64 d14, d8
  129. vmov.f64 d15, d8
  130. .endm
  /*
   * KERNEL2x2_I: first step of a 2x2 tile. Loads four doubles from AO
   * (d0-d3) and four from BO (d4-d7), then writes d8-d15 with fmuld so the
   * accumulators need no prior INIT2x2. Each accumulator pair combines one
   * A pair with one B pair (presumably (real, imag) values -- confirm):
   *   d8/d9   <- (d0,d1) x (d4,d5)      d10/d11 <- (d2,d3) x (d4,d5)
   *   d12/d13 <- (d0,d1) x (d6,d7)      d14/d15 <- (d2,d3) x (d6,d7)
   * Advances AO and BO by 32 bytes. Clobbers d0-d7.
   */
  131. .macro KERNEL2x2_I
  132. pld [ AO, #A_PRE ]
  133. pld [ BO, #B_PRE ]
  134. fldd d0 , [ AO ]
  135. fldd d1 , [ AO, #8 ]
  136. fldd d2 , [ AO, #16 ]
  137. fldd d3 , [ AO, #24 ]
  138. fldd d4 , [ BO ]
  139. fldd d5 , [ BO, #8 ]
  140. fldd d6 , [ BO, #16 ]
  141. fldd d7 , [ BO, #24 ]
  142. fmuld d8 , d0, d4 // fmuld overwrites: this is what initialises the accumulator
  143. KMAC_R d8 , d1, d5 // cross term, added or subtracted per build variant
  144. fmuld d9 , d0, d5
  145. KMAC_I d9 , d1, d4
  146. fmuld d10 , d2, d4
  147. KMAC_R d10 , d3, d5
  148. fmuld d11 , d2, d5
  149. KMAC_I d11 , d3, d4
  150. fmuld d12 , d0, d6
  151. KMAC_R d12 , d1, d7
  152. fmuld d13 , d0, d7
  153. KMAC_I d13 , d1, d6
  154. fmuld d14 , d2, d6
  155. KMAC_R d14 , d3, d7
  156. fmuld d15 , d2, d7
  157. KMAC_I d15 , d3, d6
  158. add BO , BO, #32
  159. add AO , AO, #32
  160. .endm
  /*
   * KERNEL2x2_M1: one accumulating step of the 2x2 tile. Same products as
   * KERNEL2x2_I but every term is accumulated into d8-d15 (fmacd / KMAC_*),
   * with the loads and prefetches interleaved between the arithmetic.
   * Advances AO and BO by 32 bytes. Clobbers d0-d7.
   * The instruction sequence is identical to KERNEL2x2_M2 and KERNEL2x2_SUB.
   */
  161. .macro KERNEL2x2_M1
  162. fldd d0 , [ AO ]
  163. fldd d4 , [ BO ]
  164. fldd d5 , [ BO, #8 ]
  165. fmacd d8 , d0, d4
  166. fldd d1 , [ AO, #8 ]
  167. fmacd d9 , d0, d5
  168. fldd d2 , [ AO, #16 ]
  169. KMAC_R d8 , d1, d5
  170. fldd d3 , [ AO, #24 ]
  171. KMAC_I d9 , d1, d4
  172. fldd d6 , [ BO, #16 ]
  173. fmacd d10 , d2, d4
  174. fldd d7 , [ BO, #24 ]
  175. fmacd d11 , d2, d5
  176. KMAC_R d10 , d3, d5
  177. pld [ AO, #A_PRE ]
  178. KMAC_I d11 , d3, d4
  179. pld [ BO, #B_PRE ]
  180. fmacd d12 , d0, d6
  181. fmacd d13 , d0, d7
  182. KMAC_R d12 , d1, d7
  183. KMAC_I d13 , d1, d6
  184. fmacd d14 , d2, d6
  185. fmacd d15 , d2, d7
  186. add BO , BO, #32
  187. KMAC_R d14 , d3, d7
  188. add AO , AO, #32
  189. KMAC_I d15 , d3, d6
  190. .endm
  /*
   * KERNEL2x2_M2: one accumulating step of the 2x2 tile; the instruction
   * sequence is identical to KERNEL2x2_M1. The main loop alternates the two
   * names. Accumulates into d8-d15, advances AO and BO by 32 bytes,
   * clobbers d0-d7.
   */
  191. .macro KERNEL2x2_M2
  192. fldd d0 , [ AO ]
  193. fldd d4 , [ BO ]
  194. fldd d5 , [ BO, #8 ]
  195. fmacd d8 , d0, d4
  196. fldd d1 , [ AO, #8 ]
  197. fmacd d9 , d0, d5
  198. fldd d2 , [ AO, #16 ]
  199. KMAC_R d8 , d1, d5
  200. fldd d3 , [ AO, #24 ]
  201. KMAC_I d9 , d1, d4
  202. fldd d6 , [ BO, #16 ]
  203. fmacd d10 , d2, d4
  204. fldd d7 , [ BO, #24 ]
  205. fmacd d11 , d2, d5
  206. KMAC_R d10 , d3, d5
  207. pld [ AO, #A_PRE ]
  208. KMAC_I d11 , d3, d4
  209. pld [ BO, #B_PRE ]
  210. fmacd d12 , d0, d6
  211. fmacd d13 , d0, d7
  212. KMAC_R d12 , d1, d7
  213. KMAC_I d13 , d1, d6
  214. fmacd d14 , d2, d6
  215. fmacd d15 , d2, d7
  216. add BO , BO, #32
  217. KMAC_R d14 , d3, d7
  218. add AO , AO, #32
  219. KMAC_I d15 , d3, d6
  220. .endm
  /*
   * KERNEL2x2_E: closing step of an unrolled 2x2 group. Performs the same
   * accumulation as KERNEL2x2_M1, but with all eight loads issued up front
   * and without the pld prefetches. Advances AO and BO by 32 bytes.
   * Clobbers d0-d7.
   */
  221. .macro KERNEL2x2_E
  222. fldd d0 , [ AO ]
  223. fldd d1 , [ AO, #8 ]
  224. fldd d2 , [ AO, #16 ]
  225. fldd d3 , [ AO, #24 ]
  226. fldd d4 , [ BO ]
  227. fldd d5 , [ BO, #8 ]
  228. fldd d6 , [ BO, #16 ]
  229. fldd d7 , [ BO, #24 ]
  230. fmacd d8 , d0, d4
  231. KMAC_R d8 , d1, d5
  232. fmacd d9 , d0, d5
  233. KMAC_I d9 , d1, d4
  234. fmacd d10 , d2, d4
  235. KMAC_R d10 , d3, d5
  236. fmacd d11 , d2, d5
  237. KMAC_I d11 , d3, d4
  238. fmacd d12 , d0, d6
  239. KMAC_R d12 , d1, d7
  240. fmacd d13 , d0, d7
  241. KMAC_I d13 , d1, d6
  242. fmacd d14 , d2, d6
  243. KMAC_R d14 , d3, d7
  244. fmacd d15 , d2, d7
  245. KMAC_I d15 , d3, d6
  246. add BO , BO, #32
  247. add AO , AO, #32
  248. .endm
  /*
   * KERNEL2x2_SUB: single accumulating step used by the remainder loop
   * (K1 mod 8 iterations) of the 2x2 tile. Instruction sequence is identical
   * to KERNEL2x2_M1: accumulates into d8-d15, advances AO and BO by
   * 32 bytes, clobbers d0-d7. Requires initialised accumulators.
   */
  249. .macro KERNEL2x2_SUB
  250. fldd d0 , [ AO ]
  251. fldd d4 , [ BO ]
  252. fldd d5 , [ BO, #8 ]
  253. fmacd d8 , d0, d4
  254. fldd d1 , [ AO, #8 ]
  255. fmacd d9 , d0, d5
  256. fldd d2 , [ AO, #16 ]
  257. KMAC_R d8 , d1, d5
  258. fldd d3 , [ AO, #24 ]
  259. KMAC_I d9 , d1, d4
  260. fldd d6 , [ BO, #16 ]
  261. fmacd d10 , d2, d4
  262. fldd d7 , [ BO, #24 ]
  263. fmacd d11 , d2, d5
  264. KMAC_R d10 , d3, d5
  265. pld [ AO, #A_PRE ]
  266. KMAC_I d11 , d3, d4
  267. pld [ BO, #B_PRE ]
  268. fmacd d12 , d0, d6
  269. fmacd d13 , d0, d7
  270. KMAC_R d12 , d1, d7
  271. KMAC_I d13 , d1, d6
  272. fmacd d14 , d2, d6
  273. fmacd d15 , d2, d7
  274. add BO , BO, #32
  275. KMAC_R d14 , d3, d7
  276. add AO , AO, #32
  277. KMAC_I d15 , d3, d6
  278. .endm
  /*
   * SAVE2x2: scale the 2x2 accumulators by alpha and store them.
   * d4-d7 are started from the FP_ZERO constant, so the stored value is
   * alpha * accumulator only; the previous contents of C are not read.
   * d8-d11 go to CO1 and d12-d15 go to CO2 = CO1 + LDC (LDC is a byte
   * stride prepared in the prologue). CO1 is then advanced by 32 bytes.
   * Clobbers r3, d0, d1, d4-d7.
   */
  279. .macro SAVE2x2
  280. ldr r3 , LDC
  281. add CO2 , CO1, r3
  282. fldd d0, ALPHA_R
  283. fldd d1, ALPHA_I
  284. fldd d4 , FP_ZERO
  285. vmov.f64 d5 , d4
  286. vmov.f64 d6 , d4
  287. vmov.f64 d7 , d4
  288. FMAC_R1 d4 , d0 , d8 // d4/d5 = alpha x (d8,d9); signs chosen by the FMAC_* defines
  289. FMAC_I1 d5 , d0 , d9
  290. FMAC_R2 d4 , d1 , d9
  291. FMAC_I2 d5 , d1 , d8
  292. FMAC_R1 d6 , d0 , d10
  293. FMAC_I1 d7 , d0 , d11
  294. FMAC_R2 d6 , d1 , d11
  295. FMAC_I2 d7 , d1 , d10
  296. vstmia.f64 CO1, { d4 - d7 } // no writeback: CO1 is bumped explicitly at the end
  297. fldd d4 , FP_ZERO
  298. vmov.f64 d5 , d4
  299. vmov.f64 d6 , d4
  300. vmov.f64 d7 , d4
  301. FMAC_R1 d4 , d0 , d12
  302. FMAC_I1 d5 , d0 , d13
  303. FMAC_R2 d4 , d1 , d13
  304. FMAC_I2 d5 , d1 , d12
  305. FMAC_R1 d6 , d0 , d14
  306. FMAC_I1 d7 , d0 , d15
  307. FMAC_R2 d6 , d1 , d15
  308. FMAC_I2 d7 , d1 , d14
  309. vstmia.f64 CO2, { d4 - d7 }
  310. add CO1, CO1, #32
  311. .endm
  312. /******************************************************************************/
  /* INIT1x2: set the four accumulators used by the 1x2 tile (d8, d9, d12, d13)
     to the value held in the FP_ZERO stack slot. */
  313. .macro INIT1x2
  314. fldd d8 , FP_ZERO
  315. vmov.f64 d9 , d8
  316. vmov.f64 d12, d8
  317. vmov.f64 d13, d8
  318. .endm
  /*
   * KERNEL1x2_I: first step of a 1x2 tile. Loads two doubles from AO (d0,d1)
   * and four from BO (d4-d7); fmuld initialises d8/d9 and d12/d13.
   * Advances BO by 32 bytes and AO by 16 bytes. Clobbers d0, d1, d4-d7.
   * Not invoked by the function body in this file.
   */
  319. .macro KERNEL1x2_I
  320. fldd d0 , [ AO ]
  321. fldd d1 , [ AO, #8 ]
  322. fldd d4 , [ BO ]
  323. fldd d5 , [ BO, #8 ]
  324. fldd d6 , [ BO, #16 ]
  325. fldd d7 , [ BO, #24 ]
  326. fmuld d8 , d0, d4
  327. KMAC_R d8 , d1, d5
  328. fmuld d9 , d0, d5
  329. KMAC_I d9 , d1, d4
  330. fmuld d12 , d0, d6
  331. KMAC_R d12 , d1, d7
  332. fmuld d13 , d0, d7
  333. KMAC_I d13 , d1, d6
  334. add BO , BO, #32
  335. add AO , AO, #16
  336. .endm
  /*
   * KERNEL1x2_M1: accumulating step of the 1x2 tile: d8/d9 += (d0,d1) x (d4,d5),
   * d12/d13 += (d0,d1) x (d6,d7), with cross-term signs from KMAC_R / KMAC_I.
   * Advances BO by 32 bytes and AO by 16 bytes. Clobbers d0, d1, d4-d7.
   * Same instruction sequence as KERNEL1x2_M2, _E and _SUB.
   */
  337. .macro KERNEL1x2_M1
  338. fldd d0 , [ AO ]
  339. fldd d1 , [ AO, #8 ]
  340. fldd d4 , [ BO ]
  341. fldd d5 , [ BO, #8 ]
  342. fldd d6 , [ BO, #16 ]
  343. fldd d7 , [ BO, #24 ]
  344. fmacd d8 , d0, d4
  345. KMAC_R d8 , d1, d5
  346. fmacd d9 , d0, d5
  347. KMAC_I d9 , d1, d4
  348. fmacd d12 , d0, d6
  349. KMAC_R d12 , d1, d7
  350. fmacd d13 , d0, d7
  351. KMAC_I d13 , d1, d6
  352. add BO , BO, #32
  353. add AO , AO, #16
  354. .endm
  /*
   * KERNEL1x2_M2: accumulating step of the 1x2 tile; instruction sequence is
   * identical to KERNEL1x2_M1. Accumulates into d8, d9, d12, d13; advances
   * BO by 32 bytes and AO by 16 bytes. Clobbers d0, d1, d4-d7.
   */
  355. .macro KERNEL1x2_M2
  356. fldd d0 , [ AO ]
  357. fldd d1 , [ AO, #8 ]
  358. fldd d4 , [ BO ]
  359. fldd d5 , [ BO, #8 ]
  360. fldd d6 , [ BO, #16 ]
  361. fldd d7 , [ BO, #24 ]
  362. fmacd d8 , d0, d4
  363. KMAC_R d8 , d1, d5
  364. fmacd d9 , d0, d5
  365. KMAC_I d9 , d1, d4
  366. fmacd d12 , d0, d6
  367. KMAC_R d12 , d1, d7
  368. fmacd d13 , d0, d7
  369. KMAC_I d13 , d1, d6
  370. add BO , BO, #32
  371. add AO , AO, #16
  372. .endm
  /*
   * KERNEL1x2_E: accumulating step of the 1x2 tile; instruction sequence is
   * identical to KERNEL1x2_M1. Accumulates into d8, d9, d12, d13; advances
   * BO by 32 bytes and AO by 16 bytes. Clobbers d0, d1, d4-d7.
   */
  373. .macro KERNEL1x2_E
  374. fldd d0 , [ AO ]
  375. fldd d1 , [ AO, #8 ]
  376. fldd d4 , [ BO ]
  377. fldd d5 , [ BO, #8 ]
  378. fldd d6 , [ BO, #16 ]
  379. fldd d7 , [ BO, #24 ]
  380. fmacd d8 , d0, d4
  381. KMAC_R d8 , d1, d5
  382. fmacd d9 , d0, d5
  383. KMAC_I d9 , d1, d4
  384. fmacd d12 , d0, d6
  385. KMAC_R d12 , d1, d7
  386. fmacd d13 , d0, d7
  387. KMAC_I d13 , d1, d6
  388. add BO , BO, #32
  389. add AO , AO, #16
  390. .endm
  /*
   * KERNEL1x2_SUB: the 1x2 step actually used by the function body, both in
   * the 8x unrolled loop and in the remainder loop. Accumulates
   * d8/d9 += (d0,d1) x (d4,d5) and d12/d13 += (d0,d1) x (d6,d7); requires
   * accumulators initialised by INIT1x2. Advances BO by 32 bytes and AO by
   * 16 bytes. Clobbers d0, d1, d4-d7.
   */
  391. .macro KERNEL1x2_SUB
  392. fldd d0 , [ AO ]
  393. fldd d1 , [ AO, #8 ]
  394. fldd d4 , [ BO ]
  395. fldd d5 , [ BO, #8 ]
  396. fldd d6 , [ BO, #16 ]
  397. fldd d7 , [ BO, #24 ]
  398. fmacd d8 , d0, d4
  399. KMAC_R d8 , d1, d5
  400. fmacd d9 , d0, d5
  401. KMAC_I d9 , d1, d4
  402. fmacd d12 , d0, d6
  403. KMAC_R d12 , d1, d7
  404. fmacd d13 , d0, d7
  405. KMAC_I d13 , d1, d6
  406. add BO , BO, #32
  407. add AO , AO, #16
  408. .endm
  /*
   * SAVE1x2: scale the 1x2 accumulators by alpha and store them.
   * alpha x (d8,d9) is written to CO1 and alpha x (d12,d13) to
   * CO2 = CO1 + LDC; d4/d5 start from the FP_ZERO constant, so C is
   * overwritten, not accumulated. CO1 is advanced by 16 bytes.
   * Clobbers r3, d0, d1, d4, d5.
   */
  409. .macro SAVE1x2
  410. ldr r3 , LDC
  411. add CO2 , CO1, r3
  412. fldd d0, ALPHA_R
  413. fldd d1, ALPHA_I
  414. fldd d4 , FP_ZERO
  415. vmov.f64 d5 , d4
  416. FMAC_R1 d4 , d0 , d8
  417. FMAC_I1 d5 , d0 , d9
  418. FMAC_R2 d4 , d1 , d9
  419. FMAC_I2 d5 , d1 , d8
  420. vstmia.f64 CO1, { d4 - d5 }
  421. fldd d4 , FP_ZERO
  422. vmov.f64 d5 , d4
  423. FMAC_R1 d4 , d0 , d12
  424. FMAC_I1 d5 , d0 , d13
  425. FMAC_R2 d4 , d1 , d13
  426. FMAC_I2 d5 , d1 , d12
  427. vstmia.f64 CO2, { d4 - d5 }
  428. add CO1, CO1, #16
  429. .endm
  430. /******************************************************************************/
  /* INIT2x1: set the four accumulators used by the 2x1 tile (d8-d11)
     to the value held in the FP_ZERO stack slot. */
  431. .macro INIT2x1
  432. fldd d8 , FP_ZERO
  433. vmov.f64 d9 , d8
  434. vmov.f64 d10, d8
  435. vmov.f64 d11, d8
  436. .endm
  /*
   * KERNEL2x1_I: first step of a 2x1 tile. Loads four doubles from AO
   * (d0-d3) and two from BO (d4,d5); fmuld initialises d8/d9 from (d0,d1)
   * and d10/d11 from (d2,d3), so no INIT2x1 is needed beforehand.
   * Advances BO by 16 bytes and AO by 32 bytes. Clobbers d0-d5.
   */
  437. .macro KERNEL2x1_I
  438. fldd d0 , [ AO ]
  439. fldd d1 , [ AO, #8 ]
  440. fldd d2 , [ AO, #16 ]
  441. fldd d3 , [ AO, #24 ]
  442. fldd d4 , [ BO ]
  443. fldd d5 , [ BO, #8 ]
  444. fmuld d8 , d0, d4
  445. KMAC_R d8 , d1, d5
  446. fmuld d9 , d0, d5
  447. KMAC_I d9 , d1, d4
  448. fmuld d10 , d2, d4
  449. KMAC_R d10 , d3, d5
  450. fmuld d11 , d2, d5
  451. KMAC_I d11 , d3, d4
  452. add BO , BO, #16
  453. add AO , AO, #32
  454. .endm
  /*
   * KERNEL2x1_M1: accumulating step of the 2x1 tile:
   * d8/d9 += (d0,d1) x (d4,d5), d10/d11 += (d2,d3) x (d4,d5), with cross-term
   * signs from KMAC_R / KMAC_I. Advances BO by 16 bytes and AO by 32 bytes.
   * Clobbers d0-d5. Same instruction sequence as KERNEL2x1_M2, _E and _SUB.
   */
  455. .macro KERNEL2x1_M1
  456. fldd d0 , [ AO ]
  457. fldd d1 , [ AO, #8 ]
  458. fldd d2 , [ AO, #16 ]
  459. fldd d3 , [ AO, #24 ]
  460. fldd d4 , [ BO ]
  461. fldd d5 , [ BO, #8 ]
  462. fmacd d8 , d0, d4
  463. KMAC_R d8 , d1, d5
  464. fmacd d9 , d0, d5
  465. KMAC_I d9 , d1, d4
  466. fmacd d10 , d2, d4
  467. KMAC_R d10 , d3, d5
  468. fmacd d11 , d2, d5
  469. KMAC_I d11 , d3, d4
  470. add BO , BO, #16
  471. add AO , AO, #32
  472. .endm
  /*
   * KERNEL2x1_M2: accumulating step of the 2x1 tile; instruction sequence is
   * identical to KERNEL2x1_M1 (the main loop alternates the two names).
   * Accumulates into d8-d11; advances BO by 16 bytes and AO by 32 bytes.
   * Clobbers d0-d5.
   */
  473. .macro KERNEL2x1_M2
  474. fldd d0 , [ AO ]
  475. fldd d1 , [ AO, #8 ]
  476. fldd d2 , [ AO, #16 ]
  477. fldd d3 , [ AO, #24 ]
  478. fldd d4 , [ BO ]
  479. fldd d5 , [ BO, #8 ]
  480. fmacd d8 , d0, d4
  481. KMAC_R d8 , d1, d5
  482. fmacd d9 , d0, d5
  483. KMAC_I d9 , d1, d4
  484. fmacd d10 , d2, d4
  485. KMAC_R d10 , d3, d5
  486. fmacd d11 , d2, d5
  487. KMAC_I d11 , d3, d4
  488. add BO , BO, #16
  489. add AO , AO, #32
  490. .endm
  /*
   * KERNEL2x1_E: closing step of an unrolled 2x1 group; instruction sequence
   * is identical to KERNEL2x1_M1. Accumulates into d8-d11; advances BO by
   * 16 bytes and AO by 32 bytes. Clobbers d0-d5.
   */
  491. .macro KERNEL2x1_E
  492. fldd d0 , [ AO ]
  493. fldd d1 , [ AO, #8 ]
  494. fldd d2 , [ AO, #16 ]
  495. fldd d3 , [ AO, #24 ]
  496. fldd d4 , [ BO ]
  497. fldd d5 , [ BO, #8 ]
  498. fmacd d8 , d0, d4
  499. KMAC_R d8 , d1, d5
  500. fmacd d9 , d0, d5
  501. KMAC_I d9 , d1, d4
  502. fmacd d10 , d2, d4
  503. KMAC_R d10 , d3, d5
  504. fmacd d11 , d2, d5
  505. KMAC_I d11 , d3, d4
  506. add BO , BO, #16
  507. add AO , AO, #32
  508. .endm
  /*
   * KERNEL2x1_SUB: single accumulating step used by the remainder loop of the
   * 2x1 tile; instruction sequence is identical to KERNEL2x1_M1. Requires
   * initialised accumulators d8-d11. Advances BO by 16 bytes and AO by
   * 32 bytes. Clobbers d0-d5.
   */
  509. .macro KERNEL2x1_SUB
  510. fldd d0 , [ AO ]
  511. fldd d1 , [ AO, #8 ]
  512. fldd d2 , [ AO, #16 ]
  513. fldd d3 , [ AO, #24 ]
  514. fldd d4 , [ BO ]
  515. fldd d5 , [ BO, #8 ]
  516. fmacd d8 , d0, d4
  517. KMAC_R d8 , d1, d5
  518. fmacd d9 , d0, d5
  519. KMAC_I d9 , d1, d4
  520. fmacd d10 , d2, d4
  521. KMAC_R d10 , d3, d5
  522. fmacd d11 , d2, d5
  523. KMAC_I d11 , d3, d4
  524. add BO , BO, #16
  525. add AO , AO, #32
  526. .endm
  /*
   * SAVE2x1: scale the 2x1 accumulators d8-d11 by alpha and store the four
   * resulting doubles at CO1. d4-d7 start from the FP_ZERO constant, so C is
   * overwritten, not accumulated. CO1 is advanced by 32 bytes.
   * Clobbers d0, d1, d4-d7. CO2 and LDC are not used here.
   */
  527. .macro SAVE2x1
  528. fldd d0, ALPHA_R
  529. fldd d1, ALPHA_I
  530. fldd d4 , FP_ZERO
  531. vmov.f64 d5 , d4
  532. vmov.f64 d6 , d4
  533. vmov.f64 d7 , d4
  534. FMAC_R1 d4 , d0 , d8
  535. FMAC_I1 d5 , d0 , d9
  536. FMAC_R2 d4 , d1 , d9
  537. FMAC_I2 d5 , d1 , d8
  538. FMAC_R1 d6 , d0 , d10
  539. FMAC_I1 d7 , d0 , d11
  540. FMAC_R2 d6 , d1 , d11
  541. FMAC_I2 d7 , d1 , d10
  542. vstmia.f64 CO1, { d4 - d7 }
  543. add CO1, CO1, #32
  544. .endm
  545. /******************************************************************************/
  /* INIT1x1: set the two accumulators used by the 1x1 tile (d8, d9)
     to the value held in the FP_ZERO stack slot. */
  546. .macro INIT1x1
  547. fldd d8 , FP_ZERO
  548. vmov.f64 d9 , d8
  549. .endm
  /*
   * KERNEL1x1_I: first step of a 1x1 tile. Loads (d0,d1) from AO and (d4,d5)
   * from BO; fmuld initialises d8/d9. Advances AO and BO by 16 bytes.
   * Clobbers d0, d1, d4, d5. Not invoked by the function body in this file.
   */
  550. .macro KERNEL1x1_I
  551. fldd d0 , [ AO ]
  552. fldd d1 , [ AO, #8 ]
  553. fldd d4 , [ BO ]
  554. fldd d5 , [ BO, #8 ]
  555. fmuld d8 , d0, d4
  556. KMAC_R d8 , d1, d5
  557. fmuld d9 , d0, d5
  558. KMAC_I d9 , d1, d4
  559. add BO , BO, #16
  560. add AO , AO, #16
  561. .endm
  /*
   * KERNEL1x1_M1: accumulating step of the 1x1 tile:
   * d8/d9 += (d0,d1) x (d4,d5) with cross-term signs from KMAC_R / KMAC_I.
   * Advances AO and BO by 16 bytes. Clobbers d0, d1, d4, d5.
   * Same instruction sequence as KERNEL1x1_M2, _E and _SUB.
   */
  562. .macro KERNEL1x1_M1
  563. fldd d0 , [ AO ]
  564. fldd d1 , [ AO, #8 ]
  565. fldd d4 , [ BO ]
  566. fldd d5 , [ BO, #8 ]
  567. fmacd d8 , d0, d4
  568. KMAC_R d8 , d1, d5
  569. fmacd d9 , d0, d5
  570. KMAC_I d9 , d1, d4
  571. add BO , BO, #16
  572. add AO , AO, #16
  573. .endm
  /*
   * KERNEL1x1_M2: accumulating step of the 1x1 tile; instruction sequence is
   * identical to KERNEL1x1_M1. Accumulates into d8/d9; advances AO and BO by
   * 16 bytes. Clobbers d0, d1, d4, d5.
   */
  574. .macro KERNEL1x1_M2
  575. fldd d0 , [ AO ]
  576. fldd d1 , [ AO, #8 ]
  577. fldd d4 , [ BO ]
  578. fldd d5 , [ BO, #8 ]
  579. fmacd d8 , d0, d4
  580. KMAC_R d8 , d1, d5
  581. fmacd d9 , d0, d5
  582. KMAC_I d9 , d1, d4
  583. add BO , BO, #16
  584. add AO , AO, #16
  585. .endm
  /*
   * KERNEL1x1_E: accumulating step of the 1x1 tile; instruction sequence is
   * identical to KERNEL1x1_M1. Accumulates into d8/d9; advances AO and BO by
   * 16 bytes. Clobbers d0, d1, d4, d5.
   */
  586. .macro KERNEL1x1_E
  587. fldd d0 , [ AO ]
  588. fldd d1 , [ AO, #8 ]
  589. fldd d4 , [ BO ]
  590. fldd d5 , [ BO, #8 ]
  591. fmacd d8 , d0, d4
  592. KMAC_R d8 , d1, d5
  593. fmacd d9 , d0, d5
  594. KMAC_I d9 , d1, d4
  595. add BO , BO, #16
  596. add AO , AO, #16
  597. .endm
  /*
   * KERNEL1x1_SUB: the 1x1 step actually used by the function body, both in
   * the 8x unrolled loop and in the remainder loop. Accumulates
   * d8/d9 += (d0,d1) x (d4,d5); requires accumulators initialised by
   * INIT1x1. Advances AO and BO by 16 bytes. Clobbers d0, d1, d4, d5.
   */
  598. .macro KERNEL1x1_SUB
  599. fldd d0 , [ AO ]
  600. fldd d1 , [ AO, #8 ]
  601. fldd d4 , [ BO ]
  602. fldd d5 , [ BO, #8 ]
  603. fmacd d8 , d0, d4
  604. KMAC_R d8 , d1, d5
  605. fmacd d9 , d0, d5
  606. KMAC_I d9 , d1, d4
  607. add BO , BO, #16
  608. add AO , AO, #16
  609. .endm
  /*
   * SAVE1x1: scale the 1x1 accumulator (d8,d9) by alpha and store the two
   * resulting doubles at CO1. d4/d5 start from the FP_ZERO constant, so C is
   * overwritten, not accumulated. CO1 is advanced by 16 bytes.
   * Clobbers d0, d1, d4, d5.
   */
  610. .macro SAVE1x1
  611. fldd d0, ALPHA_R
  612. fldd d1, ALPHA_I
  613. fldd d4 , FP_ZERO
  614. vmov.f64 d5 , d4
  615. FMAC_R1 d4 , d0 , d8
  616. FMAC_I1 d5 , d0 , d9
  617. FMAC_R2 d4 , d1 , d9
  618. FMAC_I2 d5 , d1 , d8
  619. vstmia.f64 CO1, { d4 - d5 }
  620. add CO1, CO1, #16
  621. .endm
  622. /**************************************************************************************
  623. * End of macro definitions
  624. **************************************************************************************/
  /*
   * Kernel entry point. The symbol and section directives come from the
   * PROLOGUE / EPILOGUE macros, which are not defined in this file.
   *
   * In:   OLD_M (r0), OLD_N (r1), OLD_K (r2); OLD_A and alpha (real, imag)
   *       arrive in r3 / d0 / d1 when __ARM_PCS_VFP is defined, otherwise
   *       they are loaded from the caller's stack; B, C, OLD_LDC and OFFSET
   *       are always read from the caller's stack (positive fp offsets).
   * Out:  r0 = 0.
   * Saves and restores r4-r9, fp and d8-d15. Uses r12 (BC) and d0-d7 as
   * scratch. Reserves STACKSIZE bytes of frame below the saved registers.
   *
   * Structure: the output is walked in blocks of two columns (J = N / 2
   * iterations) followed by one single column when N is odd. Within each
   * column block, rows are walked two at a time (I = M / 2) plus one odd
   * row. Each tile runs K1 inner steps: the full K when TRMMKERNEL is not
   * defined, otherwise a count derived from K and the running offset KK
   * (also stored in KKK for the pointer fix-up after the tile is saved).
   *
   * Flag discipline: ASRS, TST and ANDS update N and Z but leave V
   * untouched, while BLE tests (Z == 1 || N != V). Branches that follow a
   * small-mask TST/ANDS therefore use BEQ (N is always 0 there), and
   * "count <= 0" tests after ASRS use an explicit CMP against #0, so no
   * branch depends on a V flag inherited from the caller or from an
   * unrelated earlier instruction.
   */
  625. PROLOGUE
  626. .align 5
  627. push {r4 - r9, fp}
  628. add fp, sp, #24 // fp -> saved fp slot; caller's stack arguments start at [fp, #4]
  629. sub sp, sp, #STACKSIZE // reserve stack
  630. #if !defined(__ARM_PCS_VFP)
  631. vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
  632. vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
  633. ldr OLD_A, OLD_A_SOFTFP
  634. #endif
  /* Spill the arguments: r0-r2 are reused as the I, J, L loop counters. */
  635. str OLD_M, M
  636. str OLD_N, N
  637. str OLD_K, K
  638. str OLD_A, A
  639. vstr OLD_ALPHA_R, ALPHA_R
  640. vstr OLD_ALPHA_I, ALPHA_I
  641. sub r3, fp, #128
  642. vstm r3, { d8 - d15} // store floating point registers
  /* Build a 64-bit 0.0 in the FP_ZERO slot (two zero words). */
  643. movs r4, #0
  644. str r4, FP_ZERO
  645. str r4, FP_ZERO_1
  646. ldr r3, OLD_LDC
  647. lsl r3, r3, #4 // ldc = ldc * 8 * 2
  648. str r3, LDC
  /* KK starts as OFFSET, negated when LEFT is not defined. */
  649. ldr r3, OFFSET
  650. #ifndef LEFT
  651. neg r3 , r3
  652. #endif
  653. str r3 , KK
  654. ldr BC, B
  655. ldr J, N
  656. asrs J, J, #1 // J = J / 2
  cmp J, #0 // define V before the signed test (ASRS leaves V unchanged)
  657. ble _L1_BEGIN
  /* ---- Two-column blocks: one iteration per pair of columns ---- */
  658. _L2_BEGIN:
  659. ldr CO1, C // CO1 = C
  660. ldr r4 , LDC
  661. lsl r4 , r4 , #1 // LDC * 2
  662. add r3 , r4, CO1
  663. str r3 , C // store C
  664. #if defined(LEFT)
  665. ldr r3 , OFFSET
  666. str r3 , KK
  667. #endif
  668. ldr AO, A // AO = A
  669. pld [AO , #A_PRE-64]
  670. pld [AO , #A_PRE-32]
  671. _L2_M2_BEGIN:
  672. ldr I, M
  673. asrs I, I, #1 // I = I / 2
  cmp I, #0 // define V before the signed test
  674. ble _L2_M1_BEGIN
  /* 2x2 tile: position BO/AO, derive the trip count K1. */
  675. _L2_M2_20:
  676. #if (defined(LEFT) && defined(TRANSA)) || \
  677. (!defined(LEFT) && !defined(TRANSA))
  678. mov BO, BC
  679. #else
  680. mov BO, BC
  681. ldr r3 , KK
  682. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  683. add BO , BO , r4
  684. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  685. add AO , AO , r4
  686. #endif
  687. #ifndef TRMMKERNEL
  688. ldr K1, K
  689. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  690. ldr K1, K
  691. ldr r3, KK
  692. sub K1, K1, r3
  693. str K1, KKK
  694. #else
  695. ldr K1, KK
  696. #ifdef LEFT
  697. add K1, K1, #2 // number of values in AO
  698. #else
  699. add K1, K1, #2 // number of values in BO
  700. #endif
  701. str K1, KKK
  702. #endif
  703. asrs L , K1, #3 // L = L / 8
  704. cmp L , #3
  705. blt _L2_M2_30
  /* L >= 3: 8 steps of lead-in, (L - 2) * 8 steps in the loop, 8 steps of tail. */
  706. .align 5
  707. KERNEL2x2_I
  708. KERNEL2x2_M2
  709. KERNEL2x2_M1
  710. KERNEL2x2_M2
  711. KERNEL2x2_M1
  712. KERNEL2x2_M2
  713. KERNEL2x2_M1
  714. KERNEL2x2_M2
  715. sub L, L, #2
  716. _L2_M2_22:
  717. KERNEL2x2_M1
  718. KERNEL2x2_M2
  719. KERNEL2x2_M1
  720. KERNEL2x2_M2
  721. KERNEL2x2_M1
  722. KERNEL2x2_M2
  723. KERNEL2x2_M1
  724. KERNEL2x2_M2
  725. subs L, L, #1
  726. bgt _L2_M2_22
  727. KERNEL2x2_M1
  728. KERNEL2x2_M2
  729. KERNEL2x2_M1
  730. KERNEL2x2_M2
  731. KERNEL2x2_M1
  732. KERNEL2x2_M2
  733. KERNEL2x2_M1
  734. KERNEL2x2_E
  735. b _L2_M2_44
  /* L < 3: bit 1 of L selects a 16-step run, bit 0 an 8-step run, 0 just clears. */
  736. _L2_M2_30:
  737. tst L, #3
  738. beq _L2_M2_40 // was ble: TST does not write V
  739. tst L, #2
  740. beq _L2_M2_32 // was ble
  741. KERNEL2x2_I
  742. KERNEL2x2_M2
  743. KERNEL2x2_M1
  744. KERNEL2x2_M2
  745. KERNEL2x2_M1
  746. KERNEL2x2_M2
  747. KERNEL2x2_M1
  748. KERNEL2x2_M2
  749. KERNEL2x2_M1
  750. KERNEL2x2_M2
  751. KERNEL2x2_M1
  752. KERNEL2x2_M2
  753. KERNEL2x2_M1
  754. KERNEL2x2_M2
  755. KERNEL2x2_M1
  756. KERNEL2x2_E
  757. b _L2_M2_44
  758. _L2_M2_32:
  759. tst L, #1
  760. beq _L2_M2_40 // was ble
  761. KERNEL2x2_I
  762. KERNEL2x2_M2
  763. KERNEL2x2_M1
  764. KERNEL2x2_M2
  765. KERNEL2x2_M1
  766. KERNEL2x2_M2
  767. KERNEL2x2_M1
  768. KERNEL2x2_E
  769. b _L2_M2_44
  770. _L2_M2_40:
  771. INIT2x2
  /* Remainder: K1 mod 8 single steps. */
  772. _L2_M2_44:
  773. ands L , K1, #7 // L = L % 8
  774. beq _L2_M2_100 // was ble: ANDS with #7 can only give 0..7 and does not write V
  775. _L2_M2_46:
  776. KERNEL2x2_SUB
  777. subs L, L, #1
  778. bne _L2_M2_46
  779. _L2_M2_100:
  780. SAVE2x2
  /* Skip the part of the A/B panels this tile did not consume (K - KKK steps). */
  781. #if (defined(LEFT) && defined(TRANSA)) || \
  782. (!defined(LEFT) && !defined(TRANSA))
  783. ldr r3 , K
  784. ldr r4 , KKK
  785. sub r3 , r3 , r4
  786. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  787. add BO , BO , r4
  788. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  789. add AO , AO , r4
  790. #endif
  791. #if defined(LEFT)
  792. ldr r3 , KK
  793. add r3 , r3 , #2 // number of values in AO
  794. str r3 , KK
  795. #endif
  796. _L2_M2_END:
  797. subs I, I, #1
  798. bne _L2_M2_20
  /* Odd row of the two-column block (1x2 tile). */
  799. _L2_M1_BEGIN:
  800. ldr I, M
  801. tst I, #1 // I = I % 2
  802. beq _L2_END // was ble: V could still be the caller's value here
  803. _L2_M1_20:
  804. INIT1x2
  805. #if (defined(LEFT) && defined(TRANSA)) || \
  806. (!defined(LEFT) && !defined(TRANSA))
  807. mov BO, BC
  808. #else
  809. mov BO, BC
  810. ldr r3 , KK
  811. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  812. add BO , BO , r4
  813. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  814. add AO , AO , r4
  815. #endif
  816. #ifndef TRMMKERNEL
  817. ldr K1, K
  818. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  819. ldr K1, K
  820. ldr r3, KK
  821. sub K1, K1, r3
  822. str K1, KKK
  823. #else
  824. ldr K1, KK
  825. #ifdef LEFT
  826. add K1, K1, #1 // number of values in AO
  827. #else
  828. add K1, K1, #2 // number of values in BO
  829. #endif
  830. str K1, KKK
  831. #endif
  832. asrs L , K1, #3 // L = L / 8
  cmp L, #0 // define V before the signed test
  833. ble _L2_M1_40
  834. _L2_M1_22:
  835. KERNEL1x2_SUB
  836. KERNEL1x2_SUB
  837. KERNEL1x2_SUB
  838. KERNEL1x2_SUB
  839. KERNEL1x2_SUB
  840. KERNEL1x2_SUB
  841. KERNEL1x2_SUB
  842. KERNEL1x2_SUB
  843. subs L, L, #1
  844. bgt _L2_M1_22
  845. _L2_M1_40:
  846. ands L , K1, #7 // L = L % 8
  847. beq _L2_M1_100 // was ble
  848. _L2_M1_42:
  849. KERNEL1x2_SUB
  850. subs L, L, #1
  851. bgt _L2_M1_42
  852. _L2_M1_100:
  853. SAVE1x2
  854. #if (defined(LEFT) && defined(TRANSA)) || \
  855. (!defined(LEFT) && !defined(TRANSA))
  856. ldr r3 , K
  857. ldr r4 , KKK
  858. sub r3 , r3 , r4
  859. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  860. add BO , BO , r4
  861. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  862. add AO , AO , r4
  863. #endif
  864. #if defined(LEFT)
  865. ldr r3 , KK
  866. add r3 , r3 , #1 // number of values in AO
  867. str r3 , KK
  868. #endif
  /* End of a two-column block: advance the B panel base by K * 32 bytes. */
  869. _L2_END:
  870. mov r3, BC
  871. ldr r4, K
  872. lsl r4, r4, #5 // k * 2 * 8 * 2
  873. add r3, r3, r4 // B = B + K * 4 * 8
  874. mov BC, r3
  875. #if !defined(LEFT)
  876. ldr r3 , KK
  877. add r3 , r3 , #2 // number of values in BO
  878. str r3 , KK
  879. #endif
  880. subs J , #1 // j--
  881. bgt _L2_BEGIN
  882. /*********************************************************************************************/
  /* ---- Single trailing column, only when N is odd ---- */
  883. _L1_BEGIN:
  884. ldr J , N
  885. tst J , #1
  886. beq _L999 // was ble: V could still be the caller's value here
  887. ldr CO1, C // CO1 = C
  888. ldr r4 , LDC
  889. add r3 , r4, CO1
  890. str r3 , C // store C
  891. #if defined(LEFT)
  892. ldr r3 , OFFSET
  893. str r3 , KK
  894. #endif
  895. ldr AO, A // AO = A
  896. _L1_M2_BEGIN:
  897. ldr I, M
  898. asrs I, I, #1 // I = I / 2
  cmp I, #0 // define V before the signed test
  899. ble _L1_M1_BEGIN
  /* 2x1 tile. */
  900. _L1_M2_20:
  901. #if (defined(LEFT) && defined(TRANSA)) || \
  902. (!defined(LEFT) && !defined(TRANSA))
  903. mov BO, BC
  904. #else
  905. mov BO, BC
  906. ldr r3 , KK
  907. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  908. add BO , BO , r4
  909. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  910. add AO , AO , r4
  911. #endif
  912. #ifndef TRMMKERNEL
  913. ldr K1, K
  914. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  915. ldr K1, K
  916. ldr r3, KK
  917. sub K1, K1, r3
  918. str K1, KKK
  919. #else
  920. ldr K1, KK
  921. #ifdef LEFT
  922. add K1, K1, #2 // number of values in AO
  923. #else
  924. add K1, K1, #1 // number of values in BO
  925. #endif
  926. str K1, KKK
  927. #endif
  928. asrs L , K1, #3 // L = L / 8
  929. cmp L , #3
  930. blt _L1_M2_30
  931. .align 5
  932. KERNEL2x1_I
  933. KERNEL2x1_M2
  934. KERNEL2x1_M1
  935. KERNEL2x1_M2
  936. KERNEL2x1_M1
  937. KERNEL2x1_M2
  938. KERNEL2x1_M1
  939. KERNEL2x1_M2
  940. sub L, L, #2
  941. _L1_M2_22:
  942. KERNEL2x1_M1
  943. KERNEL2x1_M2
  944. KERNEL2x1_M1
  945. KERNEL2x1_M2
  946. KERNEL2x1_M1
  947. KERNEL2x1_M2
  948. KERNEL2x1_M1
  949. KERNEL2x1_M2
  950. subs L, L, #1
  951. bgt _L1_M2_22
  952. KERNEL2x1_M1
  953. KERNEL2x1_M2
  954. KERNEL2x1_M1
  955. KERNEL2x1_M2
  956. KERNEL2x1_M1
  957. KERNEL2x1_M2
  958. KERNEL2x1_M1
  959. KERNEL2x1_E
  960. b _L1_M2_44
  961. _L1_M2_30:
  962. tst L, #3
  963. beq _L1_M2_40 // was ble
  964. tst L, #2
  965. beq _L1_M2_32 // was ble
  966. KERNEL2x1_I
  967. KERNEL2x1_M2
  968. KERNEL2x1_M1
  969. KERNEL2x1_M2
  970. KERNEL2x1_M1
  971. KERNEL2x1_M2
  972. KERNEL2x1_M1
  973. KERNEL2x1_M2
  974. KERNEL2x1_M1
  975. KERNEL2x1_M2
  976. KERNEL2x1_M1
  977. KERNEL2x1_M2
  978. KERNEL2x1_M1
  979. KERNEL2x1_M2
  980. KERNEL2x1_M1
  981. KERNEL2x1_E
  982. b _L1_M2_44
  983. _L1_M2_32:
  984. tst L, #1
  985. beq _L1_M2_40 // was ble
  986. KERNEL2x1_I
  987. KERNEL2x1_M2
  988. KERNEL2x1_M1
  989. KERNEL2x1_M2
  990. KERNEL2x1_M1
  991. KERNEL2x1_M2
  992. KERNEL2x1_M1
  993. KERNEL2x1_E
  994. b _L1_M2_44
  995. _L1_M2_40:
  996. INIT2x1
  997. _L1_M2_44:
  998. ands L , K1, #7 // L = L % 8
  999. beq _L1_M2_100 // was ble
  1000. _L1_M2_46:
  1001. KERNEL2x1_SUB
  1002. subs L, L, #1
  1003. bne _L1_M2_46
  1004. _L1_M2_100:
  1005. SAVE2x1
  1006. #if (defined(LEFT) && defined(TRANSA)) || \
  1007. (!defined(LEFT) && !defined(TRANSA))
  1008. ldr r3 , K
  1009. ldr r4 , KKK
  1010. sub r3 , r3 , r4
  1011. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  1012. add BO , BO , r4
  1013. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  1014. add AO , AO , r4
  1015. #endif
  1016. #if defined(LEFT)
  1017. ldr r3 , KK
  1018. add r3 , r3 , #2 // number of values in AO
  1019. str r3 , KK
  1020. #endif
  1021. _L1_M2_END:
  1022. subs I, I, #1
  1023. bne _L1_M2_20
  /* Odd row of the single column (1x1 tile). */
  1024. _L1_M1_BEGIN:
  1025. ldr I, M
  1026. tst I, #1 // I = I % 2
  1027. beq _L1_END // was ble
  1028. _L1_M1_20:
  1029. INIT1x1
  1030. #if (defined(LEFT) && defined(TRANSA)) || \
  1031. (!defined(LEFT) && !defined(TRANSA))
  1032. mov BO, BC
  1033. #else
  1034. mov BO, BC
  1035. ldr r3 , KK
  1036. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  1037. add BO , BO , r4
  1038. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  1039. add AO , AO , r4
  1040. #endif
  1041. #ifndef TRMMKERNEL
  1042. ldr K1, K
  1043. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1044. ldr K1, K
  1045. ldr r3, KK
  1046. sub K1, K1, r3
  1047. str K1, KKK
  1048. #else
  1049. ldr K1, KK
  1050. #ifdef LEFT
  1051. add K1, K1, #1 // number of values in AO
  1052. #else
  1053. add K1, K1, #1 // number of values in BO
  1054. #endif
  1055. str K1, KKK
  1056. #endif
  1057. asrs L , K1, #3 // L = L / 8
  cmp L, #0 // define V before the signed test
  1058. ble _L1_M1_40
  1059. _L1_M1_22:
  1060. KERNEL1x1_SUB
  1061. KERNEL1x1_SUB
  1062. KERNEL1x1_SUB
  1063. KERNEL1x1_SUB
  1064. KERNEL1x1_SUB
  1065. KERNEL1x1_SUB
  1066. KERNEL1x1_SUB
  1067. KERNEL1x1_SUB
  1068. subs L, L, #1
  1069. bgt _L1_M1_22
  1070. _L1_M1_40:
  1071. ands L , K1, #7 // L = L % 8
  1072. beq _L1_M1_100 // was ble
  1073. _L1_M1_42:
  1074. KERNEL1x1_SUB
  1075. subs L, L, #1
  1076. bgt _L1_M1_42
  1077. _L1_M1_100:
  1078. SAVE1x1
  1079. #if (defined(LEFT) && defined(TRANSA)) || \
  1080. (!defined(LEFT) && !defined(TRANSA))
  1081. ldr r3 , K
  1082. ldr r4 , KKK
  1083. sub r3 , r3 , r4
  1084. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  1085. add BO , BO , r4
  1086. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  1087. add AO , AO , r4
  1088. #endif
  1089. #if defined(LEFT)
  1090. ldr r3 , KK
  1091. add r3 , r3 , #1 // number of values in AO
  1092. str r3 , KK
  1093. #endif
  1094. _L1_END:
  /* Common exit: restore d8-d15 from [fp, #-128], drop the frame, return 0. */
  1095. _L999:
  1096. sub r3, fp, #128
  1097. vldm r3, { d8 - d15} // restore floating point registers
  1098. movs r0, #0 // set return value
  1099. sub sp, fp, #24
  1100. pop {r4 - r9, fp}
  1101. bx lr
  1102. EPILOGUE