You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_kernel_2x2_vfpv3.S 22 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/05 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. * 2013/11/01 Saar
  34. * UNROLL_N 2
  35. * UNROLL_M 2
  36. * CGEMM_P 96
  37. * CGEMM_Q 120
  38. * CGEMM_R 4096
  39. * A_PRE 96
  40. * B_PRE 96
  41. * C_PRE 64
  42. *
  43. * Performance on Odroid U2:
  44. *
  45. * 1 Core: 2.59 GFLOPS ATLAS: 2.37 GFLOPS
  46. * 2 Cores: 5.17 GFLOPS ATLAS: 4.46 GFLOPS
  47. * 3 Cores: 7.69 GFLOPS ATLAS: 6.50 GFLOPS
  48. * 4 Cores: 10.22 GFLOPS ATLAS: 8.18 GFLOPS
  49. **************************************************************************************/
  50. #define ASSEMBLER
  51. #include "common.h"
  /* Scratch bytes the prologue reserves below the pushed integer registers. */
  #define STACKSIZE       256

  /* Registers read once at function entry and spilled to the stack slots below. */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA_R     s0
  #define OLD_ALPHA_I     s1

  /*
   * Frame layout, relative to fp.
   *
   * The prologue stores s8-s31 (24 single registers, 96 bytes) starting at
   * [fp, #-128], so the bytes from [fp, #-128] up to, but not including,
   * [fp, #-32] are reserved for saving and restoring floating point
   * registers.  No slot below may overlap that range.
   *
   * ALPHA_R is the lowest slot; at #-280 it coincides with sp once the
   * prologue has pushed seven registers, set fp = sp + 24 and subtracted
   * STACKSIZE.
   */
  #define FP_ZERO         [fp, #-240]
  #define FP_ZERO_0       [fp, #-240]
  #define FP_ZERO_1       [fp, #-236]
  #define A               [fp, #-248]
  #define LDC             [fp, #-252]
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]
  #define ALPHA_I         [fp, #-272]
  #define ALPHA_R         [fp, #-280]

  /* Words located above fp, i.e. in the stack area that existed before the push. */
  #define B               [fp, #4]
  #define C               [fp, #8]
  #define OLD_LDC         [fp, #12]

  /* Integer register roles inside the kernel loops. */
  #define I               r0      /* row-tile counter */
  #define J               r1      /* column-tile counter */
  #define L               r2      /* inner (K) loop counter */
  #define AO              r5      /* running pointer into A */
  #define BO              r6      /* running pointer into B */
  #define K1              r7      /* K kept in a register */
  #define CO1             r8      /* pointer to the first C column of the tile */
  #define CO2             r9      /* pointer to the second C column (CO1 + LDC) */
  #define BC              r12     /* start of the current B panel */

  /* Prefetch distances in bytes used by the pld instructions. */
  #define A_PRE           96
  #define B_PRE           96
  #define C_PRE           64

  /*
   * Operation aliases used by the SAVE macros.  The build variant selects
   * whether the two partial sums are added or subtracted (FADD_R / FADD_I)
   * and whether each alpha product is accumulated into or subtracted from C
   * (FMAC_xx).  The variant names presumably encode the transpose/conjugate
   * combination of the operands -- confirm against the build system.
   * UAL spellings are used; they encode the same instructions as the
   * pre-UAL fadds/fsubs/fmacs/fnmacs.
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vmls.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmla.f32
  #define FMAC_I2         vmls.f32

  #elif defined(CN) || defined(CT)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmla.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vmls.f32
  #define FMAC_I2         vmla.f32

  #elif defined(NC) || defined(TC)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmla.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmla.f32
  #define FMAC_I2         vmla.f32

  #else

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vmls.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vmls.f32
  #define FMAC_I2         vmls.f32

  #endif
  118. /**************************************************************************************
  119. * Macro definitions
  120. **************************************************************************************/
  /*
   * INIT2x2: clear the sixteen accumulators s16-s31.
   * s16 is loaded from the zero word at FP_ZERO and then copied to s17-s31.
   */
  .macro INIT2x2
          vldr            s16, FP_ZERO
          vmov.f32        s17, s16
          vmov.f32        s18, s16
          vmov.f32        s19, s16
          vmov.f32        s20, s16
          vmov.f32        s21, s16
          vmov.f32        s22, s16
          vmov.f32        s23, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
          vmov.f32        s26, s16
          vmov.f32        s27, s16
          vmov.f32        s28, s16
          vmov.f32        s29, s16
          vmov.f32        s30, s16
          vmov.f32        s31, s16
  .endm
  /*
   * KERNEL2x2_I: first step of the pipelined 2x2 loop.
   * Loads s0-s3 from AO and s8-s11 from BO, writes their sixteen products
   * into s16-s31 with plain multiplies (so no INIT is needed beforehand),
   * and also preloads s4-s7 / s12-s15 for the following KERNEL2x2_M2 or
   * KERNEL2x2_E.  AO and BO each advance by 32 bytes.
   * Loads are interleaved with the arithmetic; the order is kept as written.
   */
  .macro KERNEL2x2_I
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldmia          AO!, {s0 - s1}
          vldmia          BO!, {s8 - s9}

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vldmia          AO!, {s2 - s3}
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8

          vldmia          BO!, {s10 - s11}
          vmul.f32        s18, s2, s8
          vmul.f32        s26, s3, s9
          vldmia          AO!, {s4 - s5}          // start preloading the next step
          vmul.f32        s19, s2, s9
          vmul.f32        s27, s3, s8

          vldmia          BO!, {s12 - s13}
          vmul.f32        s20, s0, s10
          vmul.f32        s28, s1, s11
          vldmia          AO!, {s6 - s7}
          vmul.f32        s21, s0, s11
          vmul.f32        s29, s1, s10

          vldmia          BO!, {s14 - s15}
          vmul.f32        s22, s2, s10
          vmul.f32        s30, s3, s11
          vmul.f32        s23, s2, s11
          vmul.f32        s31, s3, s10
  .endm
  /*
   * KERNEL2x2_M1: pipelined step that consumes s0-s3 / s8-s11 and, while
   * accumulating into s16-s31, loads s4-s7 from AO and s12-s15 from BO for
   * the step that follows.  AO and BO each advance by 16 bytes.
   */
  .macro KERNEL2x2_M1
          vmla.f32        s16, s0, s8
          vldmia          AO!, {s4 - s5}
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vldmia          BO!, {s12 - s13}
          vmla.f32        s25, s1, s8

          vmla.f32        s18, s2, s8
          vldmia          AO!, {s6 - s7}
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vldmia          BO!, {s14 - s15}
          vmla.f32        s27, s3, s8

          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          vmla.f32        s22, s2, s10
          vmla.f32        s30, s3, s11
          vmla.f32        s23, s2, s11
          vmla.f32        s31, s3, s10
  .endm
  /*
   * KERNEL2x2_M2: pipelined step that consumes s4-s7 / s12-s15 and, while
   * accumulating into s16-s31, reloads s0-s3 from AO and s8-s11 from BO for
   * the next KERNEL2x2_M1.  Issues the A and B prefetches.  AO and BO each
   * advance by 16 bytes.
   */
  .macro KERNEL2x2_M2
          pld             [AO, #A_PRE]
          vmla.f32        s16, s4, s12
          pld             [BO, #B_PRE]
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vldmia          AO!, {s0 - s1}
          vmla.f32        s25, s5, s12

          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vldmia          BO!, {s8 - s9}
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12

          vldmia          AO!, {s2 - s3}
          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vldmia          BO!, {s10 - s11}
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14

          vmla.f32        s22, s6, s14
          vmla.f32        s30, s7, s15
          vmla.f32        s23, s6, s15
          vmla.f32        s31, s7, s14
  .endm
  /*
   * KERNEL2x2_E: last step of the pipelined 2x2 loop.
   * Accumulates the products of s4-s7 with s12-s15 into s16-s31 and performs
   * no loads, so AO and BO are left untouched.
   */
  .macro KERNEL2x2_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12

          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14

          vmla.f32        s22, s6, s14
          vmla.f32        s30, s7, s15
          vmla.f32        s23, s6, s15
          vmla.f32        s31, s7, s14
  .endm
  /*
   * KERNEL2x2_SUB: one self-contained 2x2 step (no software pipelining).
   * Loads s0-s3 from AO and s8-s11 from BO and accumulates their sixteen
   * products into s16-s31.  AO and BO each advance by 16 bytes.
   * The accumulators must already hold valid values on entry.
   */
  .macro KERNEL2x2_SUB
          vldmia          AO!, {s0 - s1}
          vldmia          BO!, {s8 - s9}

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vldmia          AO!, {s2 - s3}
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vldmia          BO!, {s10 - s11}
          vmla.f32        s18, s2, s8
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vmla.f32        s27, s3, s8

          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          vmla.f32        s22, s2, s10
          vmla.f32        s30, s3, s11
          vmla.f32        s23, s2, s11
          vmla.f32        s31, s3, s10
  .endm
  /*
   * SAVE2x2: fold the 2x2 accumulators into C.
   *   - CO2 = CO1 + LDC (second column of the tile); r3 is used as scratch.
   *   - s0 / s1 are reloaded with ALPHA_R / ALPHA_I.
   *   - FADD_R / FADD_I combine each accumulator pair (s16..s23 with s24..s31)
   *     using the add or subtract selected by the build-variant defines.
   *   - FMAC_xx apply the alpha products to the four floats read from each
   *     of CO1 and CO2, which are then written back.
   * CO1 advances by 16 bytes.  Clobbers r3, s0, s1, s4-s11 and s16-s23.
   */
  .macro SAVE2x2
          pld             [CO1, #C_PRE]

          ldr             r3, LDC
          add             CO2, CO1, r3

          vldr            s0, ALPHA_R
          vldr            s1, ALPHA_I

          vldmia          CO1, {s4 - s7}
          vldmia          CO2, {s8 - s11}

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17
          FADD_R          s18, s26, s18
          FADD_I          s19, s27, s19
          FADD_R          s20, s28, s20
          FADD_I          s21, s29, s21
          FADD_R          s22, s30, s22
          FADD_I          s23, s31, s23

          FMAC_R1         s4, s0, s16
          FMAC_I1         s5, s0, s17
          FMAC_R2         s4, s1, s17
          FMAC_I2         s5, s1, s16

          FMAC_R1         s6, s0, s18
          FMAC_I1         s7, s0, s19
          FMAC_R2         s6, s1, s19
          FMAC_I2         s7, s1, s18

          FMAC_R1         s8, s0, s20
          FMAC_I1         s9, s0, s21
          FMAC_R2         s8, s1, s21
          FMAC_I2         s9, s1, s20

          FMAC_R1         s10, s0, s22
          FMAC_I1         s11, s0, s23
          FMAC_R2         s10, s1, s23
          FMAC_I2         s11, s1, s22

          vstmia          CO1, {s4 - s7}
          vstmia          CO2, {s8 - s11}

          add             CO1, CO1, #16
  .endm
  289. /******************************************************************************/
  /*
   * INIT1x2: clear the eight accumulators used by the 1x2 tile
   * (s16, s17, s20, s21, s24, s25, s28, s29) from the zero word at FP_ZERO.
   */
  .macro INIT1x2
          vldr            s16, FP_ZERO
          vmov.f32        s17, s16
          vmov.f32        s20, s16
          vmov.f32        s21, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
          vmov.f32        s28, s16
          vmov.f32        s29, s16
  .endm
  /*
   * KERNEL1x2_I: first step of a pipelined 1x2 loop.
   * Multiplies s0-s1 (from AO) by s8-s11 (from BO) into the eight 1x2
   * accumulators with plain multiplies, then preloads s4-s5 and s12-s15 for
   * the following step.  In total AO advances by 16 bytes and BO by 32.
   */
  .macro KERNEL1x2_I
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0, [AO]
          vldr            s1, [AO, #4]
          vldr            s8, [BO]
          vldr            s9, [BO, #4]
          vldr            s10, [BO, #8]
          vldr            s11, [BO, #12]

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8

          vmul.f32        s20, s0, s10
          vmul.f32        s28, s1, s11
          vmul.f32        s21, s0, s11
          vmul.f32        s29, s1, s10

          add             BO, BO, #16
          add             AO, AO, #8

          // preload the operands of the next step
          pld             [BO, #B_PRE]
          vldr            s4, [AO]
          vldr            s5, [AO, #4]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]
          vldr            s14, [BO, #8]
          vldr            s15, [BO, #12]

          add             BO, BO, #16
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x2_M1: pipelined 1x2 step that accumulates the products of s0-s1
   * with s8-s11, then loads s4-s5 from AO and s12-s15 from BO for the next
   * step.  AO advances by 8 bytes and BO by 16.  Only B is prefetched here.
   */
  .macro KERNEL1x2_M1
          pld             [BO, #B_PRE]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          vldr            s4, [AO]
          vldr            s5, [AO, #4]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]
          vldr            s14, [BO, #8]
          vldr            s15, [BO, #12]

          add             BO, BO, #16
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x2_M2: pipelined 1x2 step that accumulates the products of s4-s5
   * with s12-s15, then reloads s0-s1 from AO and s8-s11 from BO for the next
   * step.  AO advances by 8 bytes and BO by 16.
   */
  .macro KERNEL1x2_M2
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]

          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14

          vldr            s0, [AO]
          vldr            s1, [AO, #4]
          vldr            s8, [BO]
          vldr            s9, [BO, #4]
          vldr            s10, [BO, #8]
          vldr            s11, [BO, #12]

          add             BO, BO, #16
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x2_E: final pipelined 1x2 step.  Accumulates the products of
   * s4-s5 with s12-s15; performs no loads and leaves AO / BO unchanged.
   */
  .macro KERNEL1x2_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14
  .endm
  /*
   * KERNEL1x2_SUB: one self-contained 1x2 step.
   * Loads s0-s1 from AO and s8-s11 from BO, accumulates the eight products
   * into the 1x2 accumulators, then advances AO by 8 bytes and BO by 16.
   */
  .macro KERNEL1x2_SUB
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0, [AO]
          vldr            s1, [AO, #4]
          vldr            s8, [BO]
          vldr            s9, [BO, #4]
          vldr            s10, [BO, #8]
          vldr            s11, [BO, #12]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          add             BO, BO, #16
          add             AO, AO, #8
  .endm
  /*
   * SAVE1x2: fold the 1x2 accumulators into C.
   * CO2 = CO1 + LDC; two floats are read from each of CO1 and CO2, updated
   * with the alpha products selected by the FADD_x / FMAC_xx aliases, and
   * written back.  CO1 advances by 8 bytes.
   * Clobbers r3, s0, s1, s4, s5, s8, s9, s16, s17, s20 and s21.
   */
  .macro SAVE1x2
          pld             [CO1, #C_PRE]

          ldr             r3, LDC
          add             CO2, CO1, r3

          vldr            s0, ALPHA_R
          vldr            s1, ALPHA_I

          vldmia          CO1, {s4 - s5}
          vldmia          CO2, {s8 - s9}

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17
          FADD_R          s20, s28, s20
          FADD_I          s21, s29, s21

          FMAC_R1         s4, s0, s16
          FMAC_I1         s5, s0, s17
          FMAC_R2         s4, s1, s17
          FMAC_I2         s5, s1, s16

          FMAC_R1         s8, s0, s20
          FMAC_I1         s9, s0, s21
          FMAC_R2         s8, s1, s21
          FMAC_I2         s9, s1, s20

          vstmia          CO1, {s4 - s5}
          vstmia          CO2, {s8 - s9}

          add             CO1, CO1, #8
  .endm
  422. /******************************************************************************/
  /*
   * INIT2x1: clear the eight accumulators used by the 2x1 tile
   * (s16-s19 and s24-s27) from the zero word at FP_ZERO.
   */
  .macro INIT2x1
          vldr            s16, FP_ZERO
          vmov.f32        s17, s16
          vmov.f32        s18, s16
          vmov.f32        s19, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
          vmov.f32        s26, s16
          vmov.f32        s27, s16
  .endm
  /*
   * KERNEL2x1_I: first step of the pipelined 2x1 loop.
   * Multiplies s0-s3 (from AO) by s8-s9 (from BO) into s16-s19 / s24-s27
   * with plain multiplies, then preloads s4-s7 and s12-s13 for the next
   * step.  In total AO advances by 32 bytes and BO by 16.
   */
  .macro KERNEL2x1_I
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0, [AO]
          vldr            s1, [AO, #4]
          vldr            s2, [AO, #8]
          vldr            s3, [AO, #12]
          vldr            s8, [BO]
          vldr            s9, [BO, #4]

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8

          vmul.f32        s18, s2, s8
          vmul.f32        s26, s3, s9
          vmul.f32        s19, s2, s9
          vmul.f32        s27, s3, s8

          add             BO, BO, #8
          add             AO, AO, #16

          // preload the operands of the next step
          pld             [BO, #B_PRE]
          pld             [AO, #A_PRE]
          vldr            s4, [AO]
          vldr            s5, [AO, #4]
          vldr            s6, [AO, #8]
          vldr            s7, [AO, #12]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]

          add             BO, BO, #8
          add             AO, AO, #16
  .endm
  /*
   * KERNEL2x1_M1: pipelined 2x1 step that accumulates the products of s0-s3
   * with s8-s9, then loads s4-s7 from AO and s12-s13 from BO for the next
   * step.  AO advances by 16 bytes and BO by 8.
   */
  .macro KERNEL2x1_M1
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vmla.f32        s18, s2, s8
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vmla.f32        s27, s3, s8

          vldr            s4, [AO]
          vldr            s5, [AO, #4]
          vldr            s6, [AO, #8]
          vldr            s7, [AO, #12]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]

          add             BO, BO, #8
          add             AO, AO, #16
  .endm
  /*
   * KERNEL2x1_M2: pipelined 2x1 step that accumulates the products of s4-s7
   * with s12-s13, then reloads s0-s3 from AO and s8-s9 from BO for the next
   * step.  AO advances by 16 bytes and BO by 8.
   */
  .macro KERNEL2x1_M2
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]

          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12

          vldr            s0, [AO]
          vldr            s1, [AO, #4]
          vldr            s2, [AO, #8]
          vldr            s3, [AO, #12]
          vldr            s8, [BO]
          vldr            s9, [BO, #4]

          add             BO, BO, #8
          add             AO, AO, #16
  .endm
  /*
   * KERNEL2x1_E: final pipelined 2x1 step.  Accumulates the products of
   * s4-s7 with s12-s13; performs no loads and leaves AO / BO unchanged.
   */
  .macro KERNEL2x1_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12
  .endm
  /*
   * KERNEL2x1_SUB: one self-contained 2x1 step.
   * Loads s0-s3 from AO and s8-s9 from BO, accumulates the eight products
   * into s16-s19 / s24-s27, then advances AO by 16 bytes and BO by 8.
   */
  .macro KERNEL2x1_SUB
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0, [AO]
          vldr            s1, [AO, #4]
          vldr            s2, [AO, #8]
          vldr            s3, [AO, #12]
          vldr            s8, [BO]
          vldr            s9, [BO, #4]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vmla.f32        s18, s2, s8
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vmla.f32        s27, s3, s8

          add             BO, BO, #8
          add             AO, AO, #16
  .endm
  /*
   * SAVE2x1: fold the 2x1 accumulators into C.
   * Four floats are read from CO1, updated with the alpha products selected
   * by the FADD_x / FMAC_xx aliases, and written back.  CO1 advances by
   * 16 bytes.  Clobbers s0, s1, s4-s7 and s16-s19.
   */
  .macro SAVE2x1
          pld             [CO1, #C_PRE]

          vldr            s0, ALPHA_R
          vldr            s1, ALPHA_I

          vldmia          CO1, {s4 - s7}

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17
          FADD_R          s18, s26, s18
          FADD_I          s19, s27, s19

          FMAC_R1         s4, s0, s16
          FMAC_I1         s5, s0, s17
          FMAC_R2         s4, s1, s17
          FMAC_I2         s5, s1, s16

          FMAC_R1         s6, s0, s18
          FMAC_I1         s7, s0, s19
          FMAC_R2         s6, s1, s19
          FMAC_I2         s7, s1, s18

          vstmia          CO1, {s4 - s7}

          add             CO1, CO1, #16
  .endm
  553. /******************************************************************************/
  /*
   * INIT1x1: clear the four accumulators used by the 1x1 tile
   * (s16, s17, s24, s25) from the zero word at FP_ZERO.
   */
  .macro INIT1x1
          vldr            s16, FP_ZERO
          vmov.f32        s17, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
  .endm
  /*
   * KERNEL1x1_I: first step of a pipelined 1x1 loop.
   * Multiplies s0-s1 (from AO) by s8-s9 (from BO) into s16, s17, s24, s25
   * with plain multiplies, then preloads s4-s5 and s12-s13 for the next
   * step.  In total AO and BO each advance by 16 bytes.
   */
  .macro KERNEL1x1_I
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0, [AO]
          vldr            s1, [AO, #4]
          vldr            s8, [BO]
          vldr            s9, [BO, #4]

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8

          add             BO, BO, #8
          add             AO, AO, #8

          // preload the operands of the next step
          pld             [BO, #B_PRE]
          pld             [AO, #A_PRE]
          vldr            s4, [AO]
          vldr            s5, [AO, #4]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]

          add             BO, BO, #8
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x1_M1: pipelined 1x1 step that accumulates the products of s0-s1
   * with s8-s9, then loads s4-s5 from AO and s12-s13 from BO for the next
   * step.  AO and BO each advance by 8 bytes.
   */
  .macro KERNEL1x1_M1
          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vldr            s4, [AO]
          vldr            s5, [AO, #4]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]

          add             BO, BO, #8
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x1_M2: pipelined 1x1 step that accumulates the products of s4-s5
   * with s12-s13, then reloads s0-s1 from AO and s8-s9 from BO for the next
   * step.  AO and BO each advance by 8 bytes.
   */
  .macro KERNEL1x1_M2
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vldr            s0, [AO]
          vldr            s1, [AO, #4]
          vldr            s8, [BO]
          vldr            s9, [BO, #4]

          add             BO, BO, #8
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x1_E: final pipelined 1x1 step.  Accumulates the products of
   * s4-s5 with s12-s13; performs no loads and leaves AO / BO unchanged.
   */
  .macro KERNEL1x1_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12
  .endm
  /*
   * KERNEL1x1_SUB: one self-contained 1x1 step.
   * Loads s0-s1 from AO and s8-s9 from BO, accumulates the four products
   * into s16, s24, s17, s25, then advances AO and BO by 8 bytes each.
   */
  .macro KERNEL1x1_SUB
          vldr            s0, [AO]
          vldr            s1, [AO, #4]
          vldr            s8, [BO]
          vldr            s9, [BO, #4]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          add             BO, BO, #8
          add             AO, AO, #8
  .endm
  /*
   * SAVE1x1: fold the 1x1 accumulators into C.
   * Two floats are read from CO1, updated with the alpha products selected
   * by the FADD_x / FMAC_xx aliases, and written back.  CO1 advances by
   * 8 bytes.  Clobbers s0, s1, s4, s5, s16 and s17.
   */
  .macro SAVE1x1
          pld             [CO1, #C_PRE]

          vldr            s0, ALPHA_R
          vldr            s1, ALPHA_I

          vldmia          CO1, {s4 - s5}

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17

          FMAC_R1         s4, s0, s16
          FMAC_I1         s5, s0, s17
          FMAC_R2         s4, s1, s17
          FMAC_I2         s5, s1, s16

          vstmia          CO1, {s4 - s5}

          add             CO1, CO1, #8
  .endm
  638. /******************************************************************************/
  639. /**************************************************************************************
  640. * End of macro definitions
  641. **************************************************************************************/
  /*
   * Kernel entry point (symbol supplied by the PROLOGUE macro from common.h).
   *
   * Inputs as read by this code:
   *   r0 = M, r1 = N, r2 = K, r3 = A, s0 = alpha (real), s1 = alpha (imag),
   *   and B, C, ldc as three words on the caller stack ([fp, #4..#12] after
   *   the prologue).  ldc is scaled by 8 bytes per element on entry.
   * Returns 0 in r0.  r4-r9, fp and s8-s31 are saved and restored.
   *
   * Structure: columns of C are processed two at a time (J loop), then a
   * single remaining column if N is odd.  Within each, rows are processed two
   * at a time (I loop), then a single remaining row if M is odd.  The K
   * dimension is consumed in software-pipelined groups of 8 plus a K % 8 tail.
   *
   * Flag handling: asr/tst/ands do not write the V flag, while "ble" tests
   * Z == 1 or N != V.  Every signed "<= 0" test below is therefore made with
   * an explicit cmp, and every "is this masked value zero" test uses beq, so
   * no branch depends on a V flag left over from earlier code or the caller.
   */
  PROLOGUE

  .align 5

          push            {r4 - r9, fp}
          add             fp, sp, #24
          sub             sp, sp, #STACKSIZE              // reserve stack

          str             OLD_M, M
          str             OLD_N, N
          str             OLD_K, K
          str             OLD_A, A
          vstr            OLD_ALPHA_R, ALPHA_R
          vstr            OLD_ALPHA_I, ALPHA_I

          sub             r3, fp, #128
          vstm            r3, {s8 - s31}                  // store floating point registers

          movs            r4, #0
          str             r4, FP_ZERO                     // 0.0f source for the INIT macros
          str             r4, FP_ZERO_1

          ldr             r3, OLD_LDC
          lsl             r3, r3, #3                      // ldc = ldc * 4 * 2
          str             r3, LDC

          ldr             K1, K
          ldr             BC, B

          ldr             J, N
          asr             J, J, #1                        // J = N / 2
          cmp             J, #0
          ble             cgemm_kernel_L1_BEGIN

  /************************ two columns of C per pass ************************/

  cgemm_kernel_L2_BEGIN:

          ldr             CO1, C                          // CO1 = C
          ldr             r4, LDC
          lsl             r4, r4, #1                      // LDC * 2
          add             r3, r4, CO1
          str             r3, C                           // store C for the next pass

          ldr             AO, A                           // AO = A
          pld             [AO, #A_PRE-64]
          pld             [AO, #A_PRE-32]

  cgemm_kernel_L2_M2_BEGIN:

          ldr             I, M
          asr             I, I, #1                        // I = M / 2
          cmp             I, #0
          ble             cgemm_kernel_L2_M1_BEGIN

  cgemm_kernel_L2_M2_20:

          mov             BO, BC
          asr             L, K1, #3                       // L = K / 8
          cmp             L, #3
          blt             cgemm_kernel_L2_M2_30
  .align 5

          // L >= 3: opening group of 8 steps
          KERNEL2x2_I
          KERNEL2x2_M2
          .rept 3
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr

          sub             L, L, #2                        // opening and closing groups are outside the loop

  cgemm_kernel_L2_M2_22:

          .rept 4
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr

          subs            L, L, #1
          bgt             cgemm_kernel_L2_M2_22

          // closing group of 8 steps
          .rept 3
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr
          KERNEL2x2_M1
          KERNEL2x2_E

          b               cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_30:                                  // here L is 0, 1 or 2 for non-negative K
          tst             L, #3
          beq             cgemm_kernel_L2_M2_40

          tst             L, #2
          beq             cgemm_kernel_L2_M2_32

          // L == 2: 16 steps
          KERNEL2x2_I
          KERNEL2x2_M2
          .rept 6
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr
          KERNEL2x2_M1
          KERNEL2x2_E

          b               cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_32:

          tst             L, #1
          beq             cgemm_kernel_L2_M2_40

          // L == 1: 8 steps
          KERNEL2x2_I
          KERNEL2x2_M2
          .rept 2
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr
          KERNEL2x2_M1
          KERNEL2x2_E

          b               cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_40:

          INIT2x2                                         // no pipelined group ran: zero the accumulators

  cgemm_kernel_L2_M2_44:

          ands            L, K1, #7                       // L = K % 8
          beq             cgemm_kernel_L2_M2_100

  cgemm_kernel_L2_M2_46:

          KERNEL2x2_SUB

          subs            L, L, #1
          bne             cgemm_kernel_L2_M2_46

  cgemm_kernel_L2_M2_100:

          SAVE2x2

  cgemm_kernel_L2_M2_END:

          subs            I, I, #1
          bne             cgemm_kernel_L2_M2_20

  cgemm_kernel_L2_M1_BEGIN:

          ldr             I, M
          tst             I, #1                           // odd M: one row left
          beq             cgemm_kernel_L2_END

  cgemm_kernel_L2_M1_20:

          INIT1x2

          mov             BO, BC
          asr             L, K1, #3                       // L = K / 8
          cmp             L, #0
          ble             cgemm_kernel_L2_M1_40

  cgemm_kernel_L2_M1_22:

          .rept 8
          KERNEL1x2_SUB
          .endr

          subs            L, L, #1
          bgt             cgemm_kernel_L2_M1_22

  cgemm_kernel_L2_M1_40:

          ands            L, K1, #7                       // L = K % 8
          beq             cgemm_kernel_L2_M1_100

  cgemm_kernel_L2_M1_42:

          KERNEL1x2_SUB

          subs            L, L, #1
          bgt             cgemm_kernel_L2_M1_42

  cgemm_kernel_L2_M1_100:

          SAVE1x2

  cgemm_kernel_L2_END:

          mov             r3, BC
          mov             r4, K1
          lsl             r4, r4, #4                      // k * 2 * 4 * 2
          add             r3, r3, r4                      // B = B + K * 2 * 8
          mov             BC, r3

          subs            J, J, #1                        // j--
          bgt             cgemm_kernel_L2_BEGIN

  /*********************** single remaining column of C **********************/

  cgemm_kernel_L1_BEGIN:

          ldr             J, N
          tst             J, #1                           // odd N: one column left
          beq             cgemm_kernel_L999

          ldr             CO1, C                          // CO1 = C
          ldr             r4, LDC
          add             r3, r4, CO1
          str             r3, C                           // store C

          ldr             AO, A                           // AO = A

  cgemm_kernel_L1_M2_BEGIN:

          ldr             I, M
          asr             I, I, #1                        // I = M / 2
          cmp             I, #0
          ble             cgemm_kernel_L1_M1_BEGIN

  cgemm_kernel_L1_M2_20:

          mov             BO, BC
          asr             L, K1, #3                       // L = K / 8
          cmp             L, #3
          blt             cgemm_kernel_L1_M2_30
  .align 5

          // L >= 3: opening group of 8 steps
          KERNEL2x1_I
          KERNEL2x1_M2
          .rept 3
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr

          sub             L, L, #2

  cgemm_kernel_L1_M2_22:

          .rept 4
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr

          subs            L, L, #1
          bgt             cgemm_kernel_L1_M2_22

          // closing group of 8 steps
          .rept 3
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr
          KERNEL2x1_M1
          KERNEL2x1_E

          b               cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_30:                                  // here L is 0, 1 or 2 for non-negative K
          tst             L, #3
          beq             cgemm_kernel_L1_M2_40

          tst             L, #2
          beq             cgemm_kernel_L1_M2_32

          // L == 2: 16 steps
          KERNEL2x1_I
          KERNEL2x1_M2
          .rept 6
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr
          KERNEL2x1_M1
          KERNEL2x1_E

          b               cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_32:

          tst             L, #1
          beq             cgemm_kernel_L1_M2_40

          // L == 1: 8 steps
          KERNEL2x1_I
          KERNEL2x1_M2
          .rept 2
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr
          KERNEL2x1_M1
          KERNEL2x1_E

          b               cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_40:

          INIT2x1                                         // no pipelined group ran: zero the accumulators

  cgemm_kernel_L1_M2_44:

          ands            L, K1, #7                       // L = K % 8
          beq             cgemm_kernel_L1_M2_100

  cgemm_kernel_L1_M2_46:

          KERNEL2x1_SUB

          subs            L, L, #1
          bne             cgemm_kernel_L1_M2_46

  cgemm_kernel_L1_M2_100:

          SAVE2x1

  cgemm_kernel_L1_M2_END:

          subs            I, I, #1
          bne             cgemm_kernel_L1_M2_20

  cgemm_kernel_L1_M1_BEGIN:

          ldr             I, M
          tst             I, #1                           // odd M: one row left
          beq             cgemm_kernel_L1_END

  cgemm_kernel_L1_M1_20:

          INIT1x1

          mov             BO, BC
          asr             L, K1, #3                       // L = K / 8
          cmp             L, #0
          ble             cgemm_kernel_L1_M1_40

  cgemm_kernel_L1_M1_22:

          .rept 8
          KERNEL1x1_SUB
          .endr

          subs            L, L, #1
          bgt             cgemm_kernel_L1_M1_22

  cgemm_kernel_L1_M1_40:

          ands            L, K1, #7                       // L = K % 8
          beq             cgemm_kernel_L1_M1_100

  cgemm_kernel_L1_M1_42:

          KERNEL1x1_SUB

          subs            L, L, #1
          bgt             cgemm_kernel_L1_M1_42

  cgemm_kernel_L1_M1_100:

          SAVE1x1

  cgemm_kernel_L1_END:

  cgemm_kernel_L999:

          sub             r3, fp, #128
          vldm            r3, {s8 - s31}                  // restore floating point registers

          movs            r0, #0                          // set return value
          sub             sp, fp, #24
          pop             {r4 - r9, fp}
          bx              lr

  EPILOGUE