You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel_2x2_vfp.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. ***************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Size in bytes of the scratch area the prologue carves out below the
   * pushed core registers. */
  #define STACKSIZE       256

  /* Registers that carry the incoming arguments; the prologue spills each
   * of them into the frame slots defined further down. */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA_R     d0
  #define OLD_ALPHA_I     d1

  /******************************************************
  * [fp, #-128] - [fp, #-64] is reserved for saving and
  * restoring the floating point registers d8 - d15
  *******************************************************/

  /* Frame-local slots, listed from the lowest address upwards.
   * FP_ZERO_0 / FP_ZERO_1 are the two 32-bit halves of the 64-bit FP_ZERO. */
  #define ALPHA_R         [fp, #-280]
  #define ALPHA_I         [fp, #-272]
  #define K               [fp, #-264]
  #define N               [fp, #-260]
  #define M               [fp, #-256]
  #define LDC             [fp, #-252]
  #define A               [fp, #-248]
  #define FP_ZERO         [fp, #-240]
  #define FP_ZERO_0       [fp, #-240]
  #define FP_ZERO_1       [fp, #-236]

  /* Arguments that arrive on the caller stack (positive offsets from fp).
   * Without the VFP calling convention alpha and A are on the stack too,
   * which shifts B, C and ldc further up. */
  #if !defined(__ARM_PCS_VFP)
  #define OLD_ALPHAR_SOFTFP [fp, #4]
  #define OLD_ALPHAI_SOFTFP [fp, #12]
  #define OLD_A_SOFTFP    [fp, #20]
  #define B               [fp, #24]
  #define C               [fp, #28]
  #define OLD_LDC         [fp, #32]
  #else
  #define B               [fp, #4]
  #define C               [fp, #8]
  #define OLD_LDC         [fp, #12]
  #endif

  /* Loop counters (they reuse the argument registers once those are spilled). */
  #define I               r0
  #define J               r1
  #define L               r2

  /* Working pointers and cached values. */
  #define AO              r5
  #define BO              r6
  #define K1              r7
  #define CO1             r8
  #define CO2             r9
  #define BC              r12

  /* Prefetch distances in bytes. */
  #define A_PRE           96
  #define B_PRE           96
  #define C_PRE           64
  /*
   * Per-variant choice of accumulate (vmla.f64) versus subtract-accumulate
   * (vmls.f64) for the cross terms.
   *   KMAC_R / KMAC_I  : second product folded into the even / odd
   *                      accumulator inside the KERNEL macros.
   *   FMAC_R1 / FMAC_R2: ALPHA_R and ALPHA_I contributions to the even C value
   *                      inside the SAVE macros.
   *   FMAC_I1 / FMAC_I2: ALPHA_R and ALPHA_I contributions to the odd C value.
   * vmla.f64 is the UAL spelling of fmacd.
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

  #define KMAC_R          vmls.f64
  #define KMAC_I          vmla.f64

  #define FMAC_R1         vmla.f64
  #define FMAC_R2         vmls.f64
  #define FMAC_I1         vmla.f64
  #define FMAC_I2         vmla.f64

  #elif defined(CN) || defined(CT)

  #define KMAC_R          vmla.f64
  #define KMAC_I          vmls.f64

  #define FMAC_R1         vmla.f64
  #define FMAC_R2         vmls.f64
  #define FMAC_I1         vmla.f64
  #define FMAC_I2         vmla.f64

  #elif defined(NC) || defined(TC)

  #define KMAC_R          vmla.f64
  #define KMAC_I          vmls.f64

  #define FMAC_R1         vmla.f64
  #define FMAC_R2         vmla.f64
  #define FMAC_I1         vmls.f64
  #define FMAC_I2         vmla.f64

  #else

  #define KMAC_R          vmls.f64
  #define KMAC_I          vmla.f64

  #define FMAC_R1         vmla.f64
  #define FMAC_R2         vmla.f64
  #define FMAC_I1         vmls.f64
  #define FMAC_I2         vmla.f64

  #endif
  111. /**************************************************************************************
  112. * Macro definitions
  113. **************************************************************************************/
  /* Clear the eight 2x2 accumulators d8 - d15: load the zero double stored
   * at FP_ZERO once, then copy it register to register. */
  .macro INIT2x2
        vldr        d8 , FP_ZERO
        vmov.f64    d9 , d8
        vmov.f64    d10, d8
        vmov.f64    d11, d8
        vmov.f64    d12, d8
        vmov.f64    d13, d8
        vmov.f64    d14, d8
        vmov.f64    d15, d8
  .endm
  /*
   * First step of a 2x2 tile: reads 32 bytes from AO (d0 - d3) and 32 bytes
   * from BO (d4 - d7). Every accumulator d8 - d15 is written with a plain
   * multiply first, so no INIT2x2 is required before this macro.
   * Advances AO and BO by 32 bytes.
   */
  .macro KERNEL2x2_I
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]

        vldr        d0 , [AO]
        vldr        d1 , [AO, #8]
        vldr        d2 , [AO, #16]
        vldr        d3 , [AO, #24]

        vldr        d4 , [BO]
        vldr        d5 , [BO, #8]
        vldr        d6 , [BO, #16]
        vldr        d7 , [BO, #24]

        vmul.f64    d8 , d0, d4
        KMAC_R      d8 , d1, d5
        vmul.f64    d9 , d0, d5
        KMAC_I      d9 , d1, d4

        vmul.f64    d10, d2, d4
        KMAC_R      d10, d3, d5
        vmul.f64    d11, d2, d5
        KMAC_I      d11, d3, d4

        vmul.f64    d12, d0, d6
        KMAC_R      d12, d1, d7
        vmul.f64    d13, d0, d7
        KMAC_I      d13, d1, d6

        vmul.f64    d14, d2, d6
        KMAC_R      d14, d3, d7
        vmul.f64    d15, d2, d7
        KMAC_I      d15, d3, d6

        add         BO, BO, #32
        add         AO, AO, #32
  .endm
  /*
   * Accumulating step of a 2x2 tile. Loads are interleaved with the
   * multiply-accumulates; the instruction order is kept exactly as scheduled.
   * Reads 32 bytes from AO and BO, updates d8 - d15, prefetches ahead on
   * both streams and advances AO and BO by 32 bytes.
   */
  .macro KERNEL2x2_M1
        vldr        d0 , [AO]
        vldr        d4 , [BO]
        vldr        d5 , [BO, #8]

        vmla.f64    d8 , d0, d4
        vldr        d1 , [AO, #8]
        vmla.f64    d9 , d0, d5
        vldr        d2 , [AO, #16]
        KMAC_R      d8 , d1, d5
        vldr        d3 , [AO, #24]
        KMAC_I      d9 , d1, d4

        vldr        d6 , [BO, #16]
        vmla.f64    d10, d2, d4
        vldr        d7 , [BO, #24]
        vmla.f64    d11, d2, d5
        KMAC_R      d10, d3, d5
        pld         [AO, #A_PRE]
        KMAC_I      d11, d3, d4

        pld         [BO, #B_PRE]
        vmla.f64    d12, d0, d6
        vmla.f64    d13, d0, d7
        KMAC_R      d12, d1, d7
        KMAC_I      d13, d1, d6

        vmla.f64    d14, d2, d6
        vmla.f64    d15, d2, d7
        add         BO, BO, #32
        KMAC_R      d14, d3, d7
        add         AO, AO, #32
        KMAC_I      d15, d3, d6
  .endm
  /* Second accumulating step of a 2x2 tile. Its instruction stream is the
   * same as KERNEL2x2_M1, so it simply expands that macro. */
  .macro KERNEL2x2_M2
        KERNEL2x2_M1
  .endm
  /*
   * Last step of an unrolled 2x2 block: all eight operands are loaded up
   * front, then accumulated into d8 - d15. No prefetch is issued.
   * Advances AO and BO by 32 bytes.
   */
  .macro KERNEL2x2_E
        vldr        d0 , [AO]
        vldr        d1 , [AO, #8]
        vldr        d2 , [AO, #16]
        vldr        d3 , [AO, #24]

        vldr        d4 , [BO]
        vldr        d5 , [BO, #8]
        vldr        d6 , [BO, #16]
        vldr        d7 , [BO, #24]

        vmla.f64    d8 , d0, d4
        KMAC_R      d8 , d1, d5
        vmla.f64    d9 , d0, d5
        KMAC_I      d9 , d1, d4

        vmla.f64    d10, d2, d4
        KMAC_R      d10, d3, d5
        vmla.f64    d11, d2, d5
        KMAC_I      d11, d3, d4

        vmla.f64    d12, d0, d6
        KMAC_R      d12, d1, d7
        vmla.f64    d13, d0, d7
        KMAC_I      d13, d1, d6

        vmla.f64    d14, d2, d6
        KMAC_R      d14, d3, d7
        vmla.f64    d15, d2, d7
        KMAC_I      d15, d3, d6

        add         BO, BO, #32
        add         AO, AO, #32
  .endm
  /* Single 2x2 step used for the K % 8 remainder. Its instruction stream is
   * the same as KERNEL2x2_M1, so it simply expands that macro. */
  .macro KERNEL2x2_SUB
        KERNEL2x2_M1
  .endm
  /*
   * Fold the 2x2 accumulators into C, scaled by ALPHA_R / ALPHA_I.
   * CO2 is set to CO1 + LDC; 32 bytes at CO1 are updated from d8 - d11 and
   * 32 bytes at CO2 from d12 - d15. The FMAC_* macros choose add or subtract
   * per term. Clobbers r3 and d0 - d7; advances CO1 by 32 bytes.
   */
  .macro SAVE2x2
        ldr         r3 , LDC
        add         CO2, CO1, r3

        vldr        d0 , ALPHA_R
        vldr        d1 , ALPHA_I

        vldm        CO1, { d4 - d7 }

        FMAC_R1     d4 , d0 , d8
        FMAC_I1     d5 , d0 , d9
        FMAC_R2     d4 , d1 , d9
        FMAC_I2     d5 , d1 , d8

        FMAC_R1     d6 , d0 , d10
        FMAC_I1     d7 , d0 , d11
        FMAC_R2     d6 , d1 , d11
        FMAC_I2     d7 , d1 , d10

        vstm        CO1, { d4 - d7 }

        vldm        CO2, { d4 - d7 }

        FMAC_R1     d4 , d0 , d12
        FMAC_I1     d5 , d0 , d13
        FMAC_R2     d4 , d1 , d13
        FMAC_I2     d5 , d1 , d12

        FMAC_R1     d6 , d0 , d14
        FMAC_I1     d7 , d0 , d15
        FMAC_R2     d6 , d1 , d15
        FMAC_I2     d7 , d1 , d14

        vstm        CO2, { d4 - d7 }

        add         CO1, CO1, #32
  .endm
  299. /******************************************************************************/
  /* Clear the four 1x2 accumulators d8, d9, d12 and d13 from the zero
   * double stored at FP_ZERO. */
  .macro INIT1x2
        vldr        d8 , FP_ZERO
        vmov.f64    d9 , d8
        vmov.f64    d12, d8
        vmov.f64    d13, d8
  .endm
  /*
   * First step of a 1x2 tile: reads 16 bytes from AO (d0, d1) and 32 bytes
   * from BO (d4 - d7). The accumulators d8, d9, d12, d13 are written with a
   * plain multiply first, so they need no prior clearing.
   * Advances AO by 16 bytes and BO by 32 bytes.
   */
  .macro KERNEL1x2_I
        vldr        d0 , [AO]
        vldr        d1 , [AO, #8]

        vldr        d4 , [BO]
        vldr        d5 , [BO, #8]
        vldr        d6 , [BO, #16]
        vldr        d7 , [BO, #24]

        vmul.f64    d8 , d0, d4
        KMAC_R      d8 , d1, d5
        vmul.f64    d9 , d0, d5
        KMAC_I      d9 , d1, d4

        vmul.f64    d12, d0, d6
        KMAC_R      d12, d1, d7
        vmul.f64    d13, d0, d7
        KMAC_I      d13, d1, d6

        add         BO, BO, #32
        add         AO, AO, #16
  .endm
  /*
   * Accumulating step of a 1x2 tile: reads 16 bytes from AO and 32 bytes
   * from BO and adds the products into d8, d9, d12, d13.
   * Advances AO by 16 bytes and BO by 32 bytes.
   */
  .macro KERNEL1x2_M1
        vldr        d0 , [AO]
        vldr        d1 , [AO, #8]

        vldr        d4 , [BO]
        vldr        d5 , [BO, #8]
        vldr        d6 , [BO, #16]
        vldr        d7 , [BO, #24]

        vmla.f64    d8 , d0, d4
        KMAC_R      d8 , d1, d5
        vmla.f64    d9 , d0, d5
        KMAC_I      d9 , d1, d4

        vmla.f64    d12, d0, d6
        KMAC_R      d12, d1, d7
        vmla.f64    d13, d0, d7
        KMAC_I      d13, d1, d6

        add         BO, BO, #32
        add         AO, AO, #16
  .endm
  /* Second accumulating step of a 1x2 tile; same instruction stream as
   * KERNEL1x2_M1, so it expands that macro. */
  .macro KERNEL1x2_M2
        KERNEL1x2_M1
  .endm
  /* Closing step of a 1x2 tile; same instruction stream as KERNEL1x2_M1,
   * so it expands that macro. */
  .macro KERNEL1x2_E
        KERNEL1x2_M1
  .endm
  /* Single 1x2 step used by both the unrolled loop and the remainder loop;
   * same instruction stream as KERNEL1x2_M1, so it expands that macro. */
  .macro KERNEL1x2_SUB
        KERNEL1x2_M1
  .endm
  /*
   * Fold the 1x2 accumulators into C, scaled by ALPHA_R / ALPHA_I.
   * CO2 is set to CO1 + LDC; 16 bytes at CO1 are updated from d8, d9 and
   * 16 bytes at CO2 from d12, d13. Clobbers r3, d0, d1, d4, d5;
   * advances CO1 by 16 bytes.
   */
  .macro SAVE1x2
        ldr         r3 , LDC
        add         CO2, CO1, r3

        vldr        d0 , ALPHA_R
        vldr        d1 , ALPHA_I

        vldm        CO1, { d4 - d5 }

        FMAC_R1     d4 , d0 , d8
        FMAC_I1     d5 , d0 , d9
        FMAC_R2     d4 , d1 , d9
        FMAC_I2     d5 , d1 , d8

        vstm        CO1, { d4 - d5 }

        vldm        CO2, { d4 - d5 }

        FMAC_R1     d4 , d0 , d12
        FMAC_I1     d5 , d0 , d13
        FMAC_R2     d4 , d1 , d13
        FMAC_I2     d5 , d1 , d12

        vstm        CO2, { d4 - d5 }

        add         CO1, CO1, #16
  .endm
  415. /******************************************************************************/
  /* Clear the four 2x1 accumulators d8 - d11 from the zero double stored
   * at FP_ZERO. */
  .macro INIT2x1
        vldr        d8 , FP_ZERO
        vmov.f64    d9 , d8
        vmov.f64    d10, d8
        vmov.f64    d11, d8
  .endm
  /*
   * First step of a 2x1 tile: reads 32 bytes from AO (d0 - d3) and 16 bytes
   * from BO (d4, d5). The accumulators d8 - d11 are written with a plain
   * multiply first, so they need no prior clearing.
   * Advances AO by 32 bytes and BO by 16 bytes.
   */
  .macro KERNEL2x1_I
        vldr        d0 , [AO]
        vldr        d1 , [AO, #8]
        vldr        d2 , [AO, #16]
        vldr        d3 , [AO, #24]

        vldr        d4 , [BO]
        vldr        d5 , [BO, #8]

        vmul.f64    d8 , d0, d4
        KMAC_R      d8 , d1, d5
        vmul.f64    d9 , d0, d5
        KMAC_I      d9 , d1, d4

        vmul.f64    d10, d2, d4
        KMAC_R      d10, d3, d5
        vmul.f64    d11, d2, d5
        KMAC_I      d11, d3, d4

        add         BO, BO, #16
        add         AO, AO, #32
  .endm
  /*
   * Accumulating step of a 2x1 tile: reads 32 bytes from AO and 16 bytes
   * from BO and adds the products into d8 - d11.
   * Advances AO by 32 bytes and BO by 16 bytes.
   */
  .macro KERNEL2x1_M1
        vldr        d0 , [AO]
        vldr        d1 , [AO, #8]
        vldr        d2 , [AO, #16]
        vldr        d3 , [AO, #24]

        vldr        d4 , [BO]
        vldr        d5 , [BO, #8]

        vmla.f64    d8 , d0, d4
        KMAC_R      d8 , d1, d5
        vmla.f64    d9 , d0, d5
        KMAC_I      d9 , d1, d4

        vmla.f64    d10, d2, d4
        KMAC_R      d10, d3, d5
        vmla.f64    d11, d2, d5
        KMAC_I      d11, d3, d4

        add         BO, BO, #16
        add         AO, AO, #32
  .endm
  /* Second accumulating step of a 2x1 tile; same instruction stream as
   * KERNEL2x1_M1, so it expands that macro. */
  .macro KERNEL2x1_M2
        KERNEL2x1_M1
  .endm
  /* Closing step of an unrolled 2x1 block; same instruction stream as
   * KERNEL2x1_M1, so it expands that macro. */
  .macro KERNEL2x1_E
        KERNEL2x1_M1
  .endm
  /* Single 2x1 step used for the K % 8 remainder; same instruction stream
   * as KERNEL2x1_M1, so it expands that macro. */
  .macro KERNEL2x1_SUB
        KERNEL2x1_M1
  .endm
  /*
   * Fold the 2x1 accumulators d8 - d11 into the 32 bytes at CO1, scaled by
   * ALPHA_R / ALPHA_I. Clobbers d0, d1, d4 - d7; advances CO1 by 32 bytes.
   */
  .macro SAVE2x1
        vldr        d0 , ALPHA_R
        vldr        d1 , ALPHA_I

        vldm        CO1, { d4 - d7 }

        FMAC_R1     d4 , d0 , d8
        FMAC_I1     d5 , d0 , d9
        FMAC_R2     d4 , d1 , d9
        FMAC_I2     d5 , d1 , d8

        FMAC_R1     d6 , d0 , d10
        FMAC_I1     d7 , d0 , d11
        FMAC_R2     d6 , d1 , d11
        FMAC_I2     d7 , d1 , d10

        vstm        CO1, { d4 - d7 }

        add         CO1, CO1, #32
  .endm
  527. /******************************************************************************/
  /* Clear the two 1x1 accumulators d8 and d9 from the zero double stored
   * at FP_ZERO. */
  .macro INIT1x1
        vldr        d8 , FP_ZERO
        vmov.f64    d9 , d8
  .endm
  /*
   * First step of a 1x1 tile: reads 16 bytes from AO (d0, d1) and 16 bytes
   * from BO (d4, d5). d8 and d9 are written with a plain multiply first, so
   * they need no prior clearing. Advances AO and BO by 16 bytes.
   */
  .macro KERNEL1x1_I
        vldr        d0 , [AO]
        vldr        d1 , [AO, #8]

        vldr        d4 , [BO]
        vldr        d5 , [BO, #8]

        vmul.f64    d8 , d0, d4
        KMAC_R      d8 , d1, d5
        vmul.f64    d9 , d0, d5
        KMAC_I      d9 , d1, d4

        add         BO, BO, #16
        add         AO, AO, #16
  .endm
  /*
   * Accumulating step of a 1x1 tile: reads 16 bytes from AO and from BO and
   * adds the products into d8 and d9. Advances AO and BO by 16 bytes.
   */
  .macro KERNEL1x1_M1
        vldr        d0 , [AO]
        vldr        d1 , [AO, #8]

        vldr        d4 , [BO]
        vldr        d5 , [BO, #8]

        vmla.f64    d8 , d0, d4
        KMAC_R      d8 , d1, d5
        vmla.f64    d9 , d0, d5
        KMAC_I      d9 , d1, d4

        add         BO, BO, #16
        add         AO, AO, #16
  .endm
  /* Second accumulating step of a 1x1 tile; same instruction stream as
   * KERNEL1x1_M1, so it expands that macro. */
  .macro KERNEL1x1_M2
        KERNEL1x1_M1
  .endm
  /* Closing step of a 1x1 tile; same instruction stream as KERNEL1x1_M1,
   * so it expands that macro. */
  .macro KERNEL1x1_E
        KERNEL1x1_M1
  .endm
  /* Single 1x1 step used by both the unrolled loop and the remainder loop;
   * same instruction stream as KERNEL1x1_M1, so it expands that macro. */
  .macro KERNEL1x1_SUB
        KERNEL1x1_M1
  .endm
  /*
   * Fold the 1x1 accumulators d8, d9 into the 16 bytes at CO1, scaled by
   * ALPHA_R / ALPHA_I. Clobbers d0, d1, d4, d5; advances CO1 by 16 bytes.
   */
  .macro SAVE1x1
        vldr        d0 , ALPHA_R
        vldr        d1 , ALPHA_I

        vldm        CO1, { d4 - d5 }

        FMAC_R1     d4 , d0 , d8
        FMAC_I1     d5 , d0 , d9
        FMAC_R2     d4 , d1 , d9
        FMAC_I2     d5 , d1 , d8

        vstm        CO1, { d4 - d5 }

        add         CO1, CO1, #16
  .endm
  603. /******************************************************************************/
  604. /**************************************************************************************
  605. * End of macro definitions
  606. **************************************************************************************/
  /*
   * Kernel entry point.
   *
   * In (hard-float): r0 = M, r1 = N, r2 = K, d0/d1 = alpha, r3 = A,
   *                  B, C and ldc on the caller stack.
   * In (soft-float): r0 = M, r1 = N, r2 = K, everything else on the stack.
   * Out:             r0 = 0.
   * Saved/restored:  r4 - r9, fp, d8 - d15.
   *
   * Loop nest: J over pairs of C columns (then one odd column), I over pairs
   * of rows (then one odd row), L over K in unrolled groups of 8 plus a
   * K % 8 remainder.
   *
   * Flag handling: tst, ands and asrs do not write the V flag, and ble
   * tests (Z set) or (N != V). V is unspecified on entry, so every
   * "count is zero" decision below uses beq, and every "count <= 0"
   * decision after a shift uses an explicit cmp against #0 (which clears V)
   * before ble. With V clear the branches behave exactly as before.
   */
  PROLOGUE

  .align 5

        push    {r4 - r9, fp}
        add     fp, sp, #24                     // fp -> saved fp slot; caller stack args start at fp + 4
        sub     sp, sp, #STACKSIZE              // reserve stack

  #if !defined(__ARM_PCS_VFP)
        vldr    OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
        vldr    OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
        ldr     OLD_A, OLD_A_SOFTFP
  #endif

        str     OLD_M, M
        str     OLD_N, N
        str     OLD_K, K
        str     OLD_A, A
        vstr    OLD_ALPHA_R, ALPHA_R
        vstr    OLD_ALPHA_I, ALPHA_I

        sub     r3, fp, #128
        vstm    r3, { d8 - d15 }                // store floating point registers

        movs    r4, #0
        str     r4, FP_ZERO                     // low word of the 64-bit zero
        str     r4, FP_ZERO_1                   // high word of the 64-bit zero

        ldr     r3, OLD_LDC
        lsl     r3, r3, #4                      // ldc = ldc * 8 * 2
        str     r3, LDC

        ldr     K1, K
        ldr     BC, B

        ldr     J, N
        asr     J, J, #1                        // J = N / 2
        cmp     J, #0
        ble     zgemm_kernel_L1_BEGIN

  zgemm_kernel_L2_BEGIN:

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        lsl     r4 , r4 , #1                    // LDC * 2
        add     r3 , r4, CO1
        str     r3 , C                          // store C

        ldr     AO, A                           // AO = A
        pld     [AO , #A_PRE-64]
        pld     [AO , #A_PRE-32]

  zgemm_kernel_L2_M2_BEGIN:

        ldr     I, M
        asr     I, I, #1                        // I = M / 2
        cmp     I, #0
        ble     zgemm_kernel_L2_M1_BEGIN

  zgemm_kernel_L2_M2_20:

        mov     BO, BC
        asrs    L , K1, #3                      // L = K / 8
        cmp     L , #3
        blt     zgemm_kernel_L2_M2_30
        .align 5

        KERNEL2x2_I
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        sub     L, L, #2                        // first and last group are peeled

  zgemm_kernel_L2_M2_22:

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        subs    L, L, #1
        bgt     zgemm_kernel_L2_M2_22

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_E

        b       zgemm_kernel_L2_M2_44

  zgemm_kernel_L2_M2_30:                        // fewer than three groups of 8
        tst     L, #3
        beq     zgemm_kernel_L2_M2_40

        tst     L, #2
        beq     zgemm_kernel_L2_M2_32

        KERNEL2x2_I                             // two groups: 16 steps
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_E

        b       zgemm_kernel_L2_M2_44

  zgemm_kernel_L2_M2_32:

        tst     L, #1
        beq     zgemm_kernel_L2_M2_40

        KERNEL2x2_I                             // one group: 8 steps
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_E

        b       zgemm_kernel_L2_M2_44

  zgemm_kernel_L2_M2_40:

        INIT2x2                                 // no full group ran: clear accumulators

  zgemm_kernel_L2_M2_44:

        ands    L , K1, #7                      // L = K % 8
        beq     zgemm_kernel_L2_M2_100

  zgemm_kernel_L2_M2_46:

        KERNEL2x2_SUB

        subs    L, L, #1
        bne     zgemm_kernel_L2_M2_46

  zgemm_kernel_L2_M2_100:

        SAVE2x2

  zgemm_kernel_L2_M2_END:

        subs    I, I, #1
        bne     zgemm_kernel_L2_M2_20

  zgemm_kernel_L2_M1_BEGIN:

        ldr     I, M
        tst     I, #1                           // odd row left over?
        beq     zgemm_kernel_L2_END

  zgemm_kernel_L2_M1_20:

        INIT1x2

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     zgemm_kernel_L2_M1_40

  zgemm_kernel_L2_M1_22:

        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB

        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB

        subs    L, L, #1
        bgt     zgemm_kernel_L2_M1_22

  zgemm_kernel_L2_M1_40:

        ands    L , K1, #7                      // L = K % 8
        beq     zgemm_kernel_L2_M1_100

  zgemm_kernel_L2_M1_42:

        KERNEL1x2_SUB

        subs    L, L, #1
        bgt     zgemm_kernel_L2_M1_42

  zgemm_kernel_L2_M1_100:

        SAVE1x2

  zgemm_kernel_L2_END:

        mov     r3, BC
        mov     r4, K1
        lsl     r4, r4, #5                      // k * 2 * 8 * 2
        add     r3, r3, r4                      // B = B + K * 4 * 8
        mov     BC, r3

        subs    J , J, #1                       // j--
        bgt     zgemm_kernel_L2_BEGIN

  /*********************************************************************************************/

  zgemm_kernel_L1_BEGIN:

        ldr     J , N
        tst     J , #1                          // odd column left over?
        beq     zgemm_kernel_L999

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        add     r3 , r4, CO1
        str     r3 , C                          // store C

        ldr     AO, A                           // AO = A

  zgemm_kernel_L1_M2_BEGIN:

        ldr     I, M
        asr     I, I, #1                        // I = M / 2
        cmp     I, #0
        ble     zgemm_kernel_L1_M1_BEGIN

  zgemm_kernel_L1_M2_20:

        mov     BO, BC
        asrs    L , K1, #3                      // L = K / 8
        cmp     L , #3
        blt     zgemm_kernel_L1_M2_30
        .align 5

        KERNEL2x1_I
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        sub     L, L, #2                        // first and last group are peeled

  zgemm_kernel_L1_M2_22:

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        subs    L, L, #1
        bgt     zgemm_kernel_L1_M2_22

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_E

        b       zgemm_kernel_L1_M2_44

  zgemm_kernel_L1_M2_30:                        // fewer than three groups of 8
        tst     L, #3
        beq     zgemm_kernel_L1_M2_40

        tst     L, #2
        beq     zgemm_kernel_L1_M2_32

        KERNEL2x1_I                             // two groups: 16 steps
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_E

        b       zgemm_kernel_L1_M2_44

  zgemm_kernel_L1_M2_32:

        tst     L, #1
        beq     zgemm_kernel_L1_M2_40

        KERNEL2x1_I                             // one group: 8 steps
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_E

        b       zgemm_kernel_L1_M2_44

  zgemm_kernel_L1_M2_40:

        INIT2x1                                 // no full group ran: clear accumulators

  zgemm_kernel_L1_M2_44:

        ands    L , K1, #7                      // L = K % 8
        beq     zgemm_kernel_L1_M2_100

  zgemm_kernel_L1_M2_46:

        KERNEL2x1_SUB

        subs    L, L, #1
        bne     zgemm_kernel_L1_M2_46

  zgemm_kernel_L1_M2_100:

        SAVE2x1

  zgemm_kernel_L1_M2_END:

        subs    I, I, #1
        bne     zgemm_kernel_L1_M2_20

  zgemm_kernel_L1_M1_BEGIN:

        ldr     I, M
        tst     I, #1                           // odd row left over?
        beq     zgemm_kernel_L1_END

  zgemm_kernel_L1_M1_20:

        INIT1x1

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     zgemm_kernel_L1_M1_40

  zgemm_kernel_L1_M1_22:

        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB

        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB

        subs    L, L, #1
        bgt     zgemm_kernel_L1_M1_22

  zgemm_kernel_L1_M1_40:

        ands    L , K1, #7                      // L = K % 8
        beq     zgemm_kernel_L1_M1_100

  zgemm_kernel_L1_M1_42:

        KERNEL1x1_SUB

        subs    L, L, #1
        bgt     zgemm_kernel_L1_M1_42

  zgemm_kernel_L1_M1_100:

        SAVE1x1

  zgemm_kernel_L1_END:

  zgemm_kernel_L999:

        sub     r3, fp, #128
        vldm    r3, { d8 - d15 }                // restore floating point registers

        movs    r0, #0                          // set return value
        sub     sp, fp, #24
        pop     {r4 - r9, fp}
        bx      lr

  EPILOGUE