You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_2x2_vfp.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /*
   * Frame layout and register roles used by the kernel in this file.
   *
   * The prologue pushes r4-r9 and fp, sets fp = sp + 24 and then lowers sp
   * by STACKSIZE bytes. Locals are addressed at negative offsets from fp,
   * stacked arguments at positive offsets.
   */
  #define STACKSIZE           256

  /* Arguments as they arrive in registers. */
  #define OLD_M               r0
  #define OLD_N               r1
  #define OLD_K               r2
  #define OLD_A               r3
  #define OLD_ALPHA_R         s0
  #define OLD_ALPHA_I         s1

  /*
   * Local slots, lowest address first. The prologue additionally stores
   * s8-s15 in the 32 bytes starting at [fp, #-128], so no slot may be
   * placed in that range.
   */
  #define ALPHA_R             [fp, #-280]
  #define ALPHA_I             [fp, #-272]
  #define K                   [fp, #-264]
  #define N                   [fp, #-260]
  #define M                   [fp, #-256]
  #define LDC                 [fp, #-252]
  #define A                   [fp, #-248]

  /* Two words that the prologue fills with integer 0 (FP_ZERO == FP_ZERO_0). */
  #define FP_ZERO             [fp, #-240]
  #define FP_ZERO_0           [fp, #-240]
  #define FP_ZERO_1           [fp, #-236]

  /*
   * Stacked arguments. Without the hard-float calling convention alpha and
   * A are not in s0/s1/r3: the prologue fetches them from r3, [fp, #4] and
   * [fp, #8], which moves the remaining arguments up by two words.
   */
  #if defined(__ARM_PCS_VFP)
  #define B                   [fp, #4]
  #define C                   [fp, #8]
  #define OLD_LDC             [fp, #12]
  #else
  #define OLD_ALPHAR_SOFTFP   r3
  #define OLD_ALPHAI_SOFTFP   [fp, #4]
  #define OLD_A_SOFTFP        [fp, #8]
  #define B                   [fp, #12]
  #define C                   [fp, #16]
  #define OLD_LDC             [fp, #20]
  #endif

  /*
   * Working registers. I, J and L reuse r0-r2, which is why the prologue
   * copies the incoming M, N and K to their stack slots first.
   */
  #define I                   r0      /* tile counter along M             */
  #define J                   r1      /* tile counter along N             */
  #define L                   r2      /* K-loop counter                   */
  #define AO                  r5      /* running pointer into A           */
  #define BO                  r6      /* running pointer into B           */
  #define K1                  r7      /* K kept in a register             */
  #define CO1                 r8      /* running pointer into C           */
  #define CO2                 r9      /* CO1 + LDC                        */
  #define BC                  r12     /* start of the current B panel     */

  /* Prefetch distances in bytes. */
  #define A_PRE               96
  #define B_PRE               96
  #define C_PRE               64
  82. /**************************************************************************************
  83. * Macro definitions
  84. **************************************************************************************/
  /*
   * Multiply-accumulate selection per build variant.
   *
   * KMAC_R / KMAC_I are the second update applied to each even / odd
   * accumulator in the KERNEL macros. FMAC_R1, FMAC_I1, FMAC_R2 and
   * FMAC_I2 are the four updates applied per element pair in the SAVE
   * macros. vmla.f32 adds the product to the destination, vmls.f32
   * subtracts it.
   *
   * Selection rules:
   *   KMAC_*            : NN/NT/TN/TT and the fallback share one setting,
   *                       CN/CT/NC/TC share the opposite one.
   *   FMAC_R2, FMAC_I1  : NN/NT/TN/TT/CN/CT share one setting, everything
   *                       else the opposite one.
   *   FMAC_R1, FMAC_I2  : always accumulate.
   */
  #define FMAC_R1     vmla.f32
  #define FMAC_I2     vmla.f32

  #if !(defined(NN) || defined(NT) || defined(TN) || defined(TT)) && \
       (defined(CN) || defined(CT) || defined(NC) || defined(TC))
  #define KMAC_R      vmla.f32
  #define KMAC_I      vmls.f32
  #else
  #define KMAC_R      vmls.f32
  #define KMAC_I      vmla.f32
  #endif

  #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(CN) || defined(CT)
  #define FMAC_R2     vmls.f32
  #define FMAC_I1     vmla.f32
  #else
  #define FMAC_R2     vmla.f32
  #define FMAC_I1     vmls.f32
  #endif
  114. /**************************************************************************************
  115. * Macro definitions
  116. **************************************************************************************/
  /*
   * INIT2x2: load the FP_ZERO stack word (written as 0 by the prologue)
   * into s8 and copy it into s9-s15, clearing all eight 2x2 accumulators.
   */
  .macro INIT2x2
      vldr        s8 , FP_ZERO
      vmov.f32    s9 , s8
      vmov.f32    s10, s8
      vmov.f32    s11, s8
      vmov.f32    s12, s8
      vmov.f32    s13, s8
      vmov.f32    s14, s8
      vmov.f32    s15, s8
  .endm
  /*
   * KERNEL2x2_I: first K step of a 2x2 tile.
   * Prefetches ahead of AO and BO, loads four floats from each with
   * post-increment (s0-s3 from AO, s4-s7 from BO) and initialises the
   * accumulators s8-s15 with a plain multiply followed by the KMAC update,
   * so no prior INIT2x2 is needed.
   */
  .macro KERNEL2x2_I
      pld         [ AO, #A_PRE ]
      vldmia      AO!, { s0 - s3 }
      pld         [ BO, #B_PRE ]
      vldmia      BO!, { s4 - s7 }

      /* products against s4/s5 */
      vmul.f32    s8 , s0, s4
      vmul.f32    s9 , s0, s5
      vmul.f32    s10, s2, s4
      vmul.f32    s11, s2, s5
      KMAC_R      s8 , s1, s5
      KMAC_I      s9 , s1, s4
      KMAC_R      s10, s3, s5
      KMAC_I      s11, s3, s4

      /* products against s6/s7 */
      vmul.f32    s12, s0, s6
      vmul.f32    s13, s0, s7
      vmul.f32    s14, s2, s6
      vmul.f32    s15, s2, s7
      KMAC_R      s12, s1, s7
      KMAC_I      s13, s1, s6
      KMAC_R      s14, s3, s7
      KMAC_I      s15, s3, s6
  .endm
  /*
   * KERNEL2x2_M1: one accumulating K step of a 2x2 tile, with prefetch.
   * Same loads and updates as KERNEL2x2_SUB, preceded by a pld ahead of
   * each of AO and BO.
   */
  .macro KERNEL2x2_M1
      pld         [ AO, #A_PRE ]
      vldmia      AO!, { s0 - s3 }
      pld         [ BO, #B_PRE ]
      vldmia      BO!, { s4 - s7 }

      /* products against s4/s5 */
      vmla.f32    s8 , s0, s4
      vmla.f32    s9 , s0, s5
      vmla.f32    s10, s2, s4
      vmla.f32    s11, s2, s5
      KMAC_R      s8 , s1, s5
      KMAC_I      s9 , s1, s4
      KMAC_R      s10, s3, s5
      KMAC_I      s11, s3, s4

      /* products against s6/s7 */
      vmla.f32    s12, s0, s6
      vmla.f32    s13, s0, s7
      vmla.f32    s14, s2, s6
      vmla.f32    s15, s2, s7
      KMAC_R      s12, s1, s7
      KMAC_I      s13, s1, s6
      KMAC_R      s14, s3, s7
      KMAC_I      s15, s3, s6
  .endm
  /*
   * KERNEL2x2_M2: one accumulating K step of a 2x2 tile, no prefetch.
   * The instruction sequence is exactly that of KERNEL2x2_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL2x2_M2
      KERNEL2x2_SUB
  .endm
  /*
   * KERNEL2x2_E: last K step of an unrolled 2x2 sequence.
   * The instruction sequence is exactly that of KERNEL2x2_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL2x2_E
      KERNEL2x2_SUB
  .endm
  /*
   * KERNEL2x2_SUB: one accumulating K step of a 2x2 tile, no prefetch.
   * Loads s0-s3 from AO and s4-s7 from BO (both post-incremented by 16
   * bytes) and updates the accumulators s8-s15 in place.
   */
  .macro KERNEL2x2_SUB
      vldmia      AO!, { s0 - s3 }
      vldmia      BO!, { s4 - s7 }

      /* products against s4/s5 */
      vmla.f32    s8 , s0, s4
      vmla.f32    s9 , s0, s5
      vmla.f32    s10, s2, s4
      vmla.f32    s11, s2, s5
      KMAC_R      s8 , s1, s5
      KMAC_I      s9 , s1, s4
      KMAC_R      s10, s3, s5
      KMAC_I      s11, s3, s4

      /* products against s6/s7 */
      vmla.f32    s12, s0, s6
      vmla.f32    s13, s0, s7
      vmla.f32    s14, s2, s6
      vmla.f32    s15, s2, s7
      KMAC_R      s12, s1, s7
      KMAC_I      s13, s1, s6
      KMAC_R      s14, s3, s7
      KMAC_I      s15, s3, s6
  .endm
  /*
   * SAVE2x2: fold the 2x2 accumulators into C.
   * Sets CO2 = CO1 + LDC, loads alpha into s0/s1, then for each of CO1
   * (accumulators s8-s11) and CO2 (accumulators s12-s15) reads four
   * floats, applies the FMAC_* updates and writes them back.
   * CO1 is advanced by 16 bytes afterwards. Overwrites r3, s0, s1, s4-s7.
   */
  .macro SAVE2x2
      ldr         r3 , LDC
      add         CO2, CO1, r3

      vldr        s0 , ALPHA_R
      vldr        s1 , ALPHA_I

      /* four floats at CO1 */
      vldmia      CO1, { s4 - s7 }
      FMAC_R1     s4 , s0 , s8
      FMAC_I1     s5 , s0 , s9
      FMAC_R2     s4 , s1 , s9
      FMAC_I2     s5 , s1 , s8
      FMAC_R1     s6 , s0 , s10
      FMAC_I1     s7 , s0 , s11
      FMAC_R2     s6 , s1 , s11
      FMAC_I2     s7 , s1 , s10
      vstmia      CO1, { s4 - s7 }

      /* four floats at CO2 */
      vldmia      CO2, { s4 - s7 }
      FMAC_R1     s4 , s0 , s12
      FMAC_I1     s5 , s0 , s13
      FMAC_R2     s4 , s1 , s13
      FMAC_I2     s5 , s1 , s12
      FMAC_R1     s6 , s0 , s14
      FMAC_I1     s7 , s0 , s15
      FMAC_R2     s6 , s1 , s15
      FMAC_I2     s7 , s1 , s14
      vstmia      CO2, { s4 - s7 }

      add         CO1, CO1, #16
  .endm
  258. /******************************************************************************/
  /*
   * INIT1x2: clear the four 1x2 accumulators s8, s9, s12 and s13 from the
   * FP_ZERO stack word (written as 0 by the prologue).
   */
  .macro INIT1x2
      vldr        s8 , FP_ZERO
      vmov.f32    s9 , s8
      vmov.f32    s12, s8
      vmov.f32    s13, s8
  .endm
  /*
   * KERNEL1x2_I: first K step of a 1x2 tile.
   * Loads two floats from AO and four from BO, initialises s8, s9, s12
   * and s13 with a multiply followed by the KMAC update, then advances
   * BO by 16 bytes and AO by 8 bytes.
   */
  .macro KERNEL1x2_I
      vldr        s0 , [ AO ]
      vldr        s1 , [ AO, #4 ]
      vldr        s4 , [ BO ]
      vldr        s5 , [ BO, #4 ]
      vldr        s6 , [ BO, #8 ]
      vldr        s7 , [ BO, #12 ]

      vmul.f32    s8 , s0, s4
      KMAC_R      s8 , s1, s5
      vmul.f32    s9 , s0, s5
      KMAC_I      s9 , s1, s4

      vmul.f32    s12, s0, s6
      KMAC_R      s12, s1, s7
      vmul.f32    s13, s0, s7
      KMAC_I      s13, s1, s6

      add         BO , BO, #16
      add         AO , AO, #8
  .endm
  /*
   * KERNEL1x2_M1: one accumulating K step of a 1x2 tile.
   * The instruction sequence is exactly that of KERNEL1x2_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL1x2_M1
      KERNEL1x2_SUB
  .endm
  /*
   * KERNEL1x2_M2: one accumulating K step of a 1x2 tile.
   * The instruction sequence is exactly that of KERNEL1x2_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL1x2_M2
      KERNEL1x2_SUB
  .endm
  /*
   * KERNEL1x2_E: last K step of an unrolled 1x2 sequence.
   * The instruction sequence is exactly that of KERNEL1x2_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL1x2_E
      KERNEL1x2_SUB
  .endm
  /*
   * KERNEL1x2_SUB: one accumulating K step of a 1x2 tile.
   * Loads two floats from AO and four from BO, updates the accumulators
   * s8, s9, s12 and s13 in place, then advances BO by 16 bytes and AO by
   * 8 bytes.
   */
  .macro KERNEL1x2_SUB
      vldr        s0 , [ AO ]
      vldr        s1 , [ AO, #4 ]
      vldr        s4 , [ BO ]
      vldr        s5 , [ BO, #4 ]
      vldr        s6 , [ BO, #8 ]
      vldr        s7 , [ BO, #12 ]

      vmla.f32    s8 , s0, s4
      KMAC_R      s8 , s1, s5
      vmla.f32    s9 , s0, s5
      KMAC_I      s9 , s1, s4

      vmla.f32    s12, s0, s6
      KMAC_R      s12, s1, s7
      vmla.f32    s13, s0, s7
      KMAC_I      s13, s1, s6

      add         BO , BO, #16
      add         AO , AO, #8
  .endm
  /*
   * SAVE1x2: fold the 1x2 accumulators into C.
   * Sets CO2 = CO1 + LDC, loads alpha into s0/s1, then updates two floats
   * at CO1 from s8/s9 and two floats at CO2 from s12/s13 using the FMAC_*
   * updates. CO1 is advanced by 8 bytes afterwards.
   * Overwrites r3, s0, s1, s4, s5.
   */
  .macro SAVE1x2
      ldr         r3 , LDC
      add         CO2, CO1, r3

      vldr        s0 , ALPHA_R
      vldr        s1 , ALPHA_I

      /* two floats at CO1 */
      vldmia      CO1, { s4 - s5 }
      FMAC_R1     s4 , s0 , s8
      FMAC_I1     s5 , s0 , s9
      FMAC_R2     s4 , s1 , s9
      FMAC_I2     s5 , s1 , s8
      vstmia      CO1, { s4 - s5 }

      /* two floats at CO2 */
      vldmia      CO2, { s4 - s5 }
      FMAC_R1     s4 , s0 , s12
      FMAC_I1     s5 , s0 , s13
      FMAC_R2     s4 , s1 , s13
      FMAC_I2     s5 , s1 , s12
      vstmia      CO2, { s4 - s5 }

      add         CO1, CO1, #8
  .endm
  374. /******************************************************************************/
  /*
   * INIT2x1: clear the four 2x1 accumulators s8-s11 from the FP_ZERO stack
   * word (written as 0 by the prologue).
   */
  .macro INIT2x1
      vldr        s8 , FP_ZERO
      vmov.f32    s9 , s8
      vmov.f32    s10, s8
      vmov.f32    s11, s8
  .endm
  /*
   * KERNEL2x1_I: first K step of a 2x1 tile.
   * Loads four floats from AO and two from BO, initialises s8-s11 with a
   * multiply followed by the KMAC update, then advances BO by 8 bytes and
   * AO by 16 bytes.
   */
  .macro KERNEL2x1_I
      vldr        s0 , [ AO ]
      vldr        s1 , [ AO, #4 ]
      vldr        s2 , [ AO, #8 ]
      vldr        s3 , [ AO, #12 ]
      vldr        s4 , [ BO ]
      vldr        s5 , [ BO, #4 ]

      vmul.f32    s8 , s0, s4
      KMAC_R      s8 , s1, s5
      vmul.f32    s9 , s0, s5
      KMAC_I      s9 , s1, s4

      vmul.f32    s10, s2, s4
      KMAC_R      s10, s3, s5
      vmul.f32    s11, s2, s5
      KMAC_I      s11, s3, s4

      add         BO , BO, #8
      add         AO , AO, #16
  .endm
  /*
   * KERNEL2x1_M1: one accumulating K step of a 2x1 tile.
   * The instruction sequence is exactly that of KERNEL2x1_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL2x1_M1
      KERNEL2x1_SUB
  .endm
  /*
   * KERNEL2x1_M2: one accumulating K step of a 2x1 tile.
   * The instruction sequence is exactly that of KERNEL2x1_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL2x1_M2
      KERNEL2x1_SUB
  .endm
  /*
   * KERNEL2x1_E: last K step of an unrolled 2x1 sequence.
   * The instruction sequence is exactly that of KERNEL2x1_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL2x1_E
      KERNEL2x1_SUB
  .endm
  /*
   * KERNEL2x1_SUB: one accumulating K step of a 2x1 tile.
   * Loads four floats from AO and two from BO, updates the accumulators
   * s8-s11 in place, then advances BO by 8 bytes and AO by 16 bytes.
   */
  .macro KERNEL2x1_SUB
      vldr        s0 , [ AO ]
      vldr        s1 , [ AO, #4 ]
      vldr        s2 , [ AO, #8 ]
      vldr        s3 , [ AO, #12 ]
      vldr        s4 , [ BO ]
      vldr        s5 , [ BO, #4 ]

      vmla.f32    s8 , s0, s4
      KMAC_R      s8 , s1, s5
      vmla.f32    s9 , s0, s5
      KMAC_I      s9 , s1, s4

      vmla.f32    s10, s2, s4
      KMAC_R      s10, s3, s5
      vmla.f32    s11, s2, s5
      KMAC_I      s11, s3, s4

      add         BO , BO, #8
      add         AO , AO, #16
  .endm
  /*
   * SAVE2x1: fold the 2x1 accumulators into C.
   * Loads alpha into s0/s1, reads four floats at CO1, applies the FMAC_*
   * updates with accumulators s8-s11 and writes them back.
   * CO1 is advanced by 16 bytes afterwards. Overwrites s0, s1, s4-s7.
   */
  .macro SAVE2x1
      vldr        s0 , ALPHA_R
      vldr        s1 , ALPHA_I

      vldmia      CO1, { s4 - s7 }
      FMAC_R1     s4 , s0 , s8
      FMAC_I1     s5 , s0 , s9
      FMAC_R2     s4 , s1 , s9
      FMAC_I2     s5 , s1 , s8
      FMAC_R1     s6 , s0 , s10
      FMAC_I1     s7 , s0 , s11
      FMAC_R2     s6 , s1 , s11
      FMAC_I2     s7 , s1 , s10
      vstmia      CO1, { s4 - s7 }

      add         CO1, CO1, #16
  .endm
  486. /******************************************************************************/
  /*
   * INIT1x1: clear the two 1x1 accumulators s8 and s9 from the FP_ZERO
   * stack word (written as 0 by the prologue).
   */
  .macro INIT1x1
      vldr        s8 , FP_ZERO
      vmov.f32    s9 , s8
  .endm
  /*
   * KERNEL1x1_I: first K step of a 1x1 tile.
   * Loads two floats from each of AO and BO, initialises s8 and s9 with a
   * multiply followed by the KMAC update, then advances both pointers by
   * 8 bytes.
   */
  .macro KERNEL1x1_I
      vldr        s0 , [ AO ]
      vldr        s1 , [ AO, #4 ]
      vldr        s4 , [ BO ]
      vldr        s5 , [ BO, #4 ]

      vmul.f32    s8 , s0, s4
      KMAC_R      s8 , s1, s5
      vmul.f32    s9 , s0, s5
      KMAC_I      s9 , s1, s4

      add         BO , BO, #8
      add         AO , AO, #8
  .endm
  /*
   * KERNEL1x1_M1: one accumulating K step of a 1x1 tile.
   * The instruction sequence is exactly that of KERNEL1x1_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL1x1_M1
      KERNEL1x1_SUB
  .endm
  /*
   * KERNEL1x1_M2: one accumulating K step of a 1x1 tile.
   * The instruction sequence is exactly that of KERNEL1x1_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL1x1_M2
      KERNEL1x1_SUB
  .endm
  /*
   * KERNEL1x1_E: last K step of an unrolled 1x1 sequence.
   * The instruction sequence is exactly that of KERNEL1x1_SUB, so it is
   * expanded from there instead of being repeated.
   */
  .macro KERNEL1x1_E
      KERNEL1x1_SUB
  .endm
  /*
   * KERNEL1x1_SUB: one accumulating K step of a 1x1 tile.
   * Loads two floats from each of AO and BO, updates the accumulators s8
   * and s9 in place, then advances both pointers by 8 bytes.
   */
  .macro KERNEL1x1_SUB
      vldr        s0 , [ AO ]
      vldr        s1 , [ AO, #4 ]
      vldr        s4 , [ BO ]
      vldr        s5 , [ BO, #4 ]

      vmla.f32    s8 , s0, s4
      KMAC_R      s8 , s1, s5
      vmla.f32    s9 , s0, s5
      KMAC_I      s9 , s1, s4

      add         BO , BO, #8
      add         AO , AO, #8
  .endm
  /*
   * SAVE1x1: fold the 1x1 accumulators into C.
   * Loads alpha into s0/s1, reads two floats at CO1, applies the FMAC_*
   * updates with accumulators s8/s9 and writes them back.
   * CO1 is advanced by 8 bytes afterwards. Overwrites s0, s1, s4, s5.
   */
  .macro SAVE1x1
      vldr        s0 , ALPHA_R
      vldr        s1 , ALPHA_I

      vldmia      CO1, { s4 - s5 }
      FMAC_R1     s4 , s0 , s8
      FMAC_I1     s5 , s0 , s9
      FMAC_R2     s4 , s1 , s9
      FMAC_I2     s5 , s1 , s8
      vstmia      CO1, { s4 - s5 }

      add         CO1, CO1, #8
  .endm
  562. /**************************************************************************************
  563. * End of macro definitions
  564. **************************************************************************************/
  /*
   * Kernel entry point (symbol name comes from the PROLOGUE macro).
   *
   * In:   r0 = M, r1 = N, r2 = K
   *       hard-float: s0/s1 = alpha, r3 = A, stack = B, C, ldc
   *       otherwise : r3 and the stack carry alpha, then A, B, C, ldc
   * Out:  r0 = 0
   *
   * Structure:
   *   - J = N / 2 passes over pairs of C pointers (CO1, CO1 + LDC), then
   *     one more pass if N is odd.
   *   - Inside a pass, I = M / 2 tiles of height 2, then one tile of
   *     height 1 if M is odd.
   *   - Per tile, BO restarts at BC while AO keeps advancing from A; the
   *     K loop runs in groups of eight steps plus a K % 8 remainder.
   *   - After a pair pass BC advances by K * 16 bytes.
   *
   * Flag discipline: asr/asrs, tst and ands never write the V flag, and
   * the flags at entry belong to the caller. Signed conditions (ble, blt,
   * bgt) are therefore only used directly after cmp or subs, and bit
   * tests are followed by beq. With V clear this takes exactly the same
   * branches as testing ble after asrs/tst/ands would; it just no longer
   * depends on a stale V.
   */
  PROLOGUE

  .align 5

      push    {r4 - r9, fp}
      add     fp, sp, #24
      sub     sp, sp, #STACKSIZE              // reserve stack

  #if !defined(__ARM_PCS_VFP)
      vmov    OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
      vldr    OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
      ldr     OLD_A, OLD_A_SOFTFP
  #endif

      /* r0-r2 are reused as I, J, L below, so spill the arguments first */
      str     OLD_M, M
      str     OLD_N, N
      str     OLD_K, K
      str     OLD_A, A
      vstr    OLD_ALPHA_R, ALPHA_R
      vstr    OLD_ALPHA_I, ALPHA_I

      sub     r3, fp, #128
      vstm    r3, { s8 - s15}                 // store floating point registers

      movs    r4, #0
      str     r4, FP_ZERO
      str     r4, FP_ZERO_1

      ldr     r3, OLD_LDC
      lsl     r3, r3, #3                      // ldc = ldc * 4 * 2
      str     r3, LDC

      ldr     K1, K
      ldr     BC, B

      ldr     J, N
      asr     J, J, #1                        // J = J / 2
      cmp     J, #0
      ble     cgemm_kernel_L1_BEGIN

  cgemm_kernel_L2_BEGIN:

      ldr     CO1, C                          // CO1 = C
      ldr     r4 , LDC
      lsl     r4 , r4 , #1                    // LDC * 2
      add     r3 , r4, CO1
      str     r3 , C                          // store C

      ldr     AO, A                           // AO = A
      pld     [AO , #A_PRE-64]
      pld     [AO , #A_PRE-32]

  cgemm_kernel_L2_M2_BEGIN:

      ldr     I, M
      asr     I, I, #1                        // I = I / 2
      cmp     I, #0
      ble     cgemm_kernel_L2_M1_BEGIN

  cgemm_kernel_L2_M2_20:

      mov     BO, BC
      asrs    L , K1, #3                      // L = L / 8
      cmp     L , #3
      blt     cgemm_kernel_L2_M2_30
  .align 5

      /* L >= 3: first group of eight, L - 2 looped groups, last group */
      KERNEL2x2_I
      KERNEL2x2_M2
  .rept 3
      KERNEL2x2_M1
      KERNEL2x2_M2
  .endr

      sub     L, L, #2

  cgemm_kernel_L2_M2_22:

  .rept 4
      KERNEL2x2_M1
      KERNEL2x2_M2
  .endr

      subs    L, L, #1
      bgt     cgemm_kernel_L2_M2_22

  .rept 3
      KERNEL2x2_M1
      KERNEL2x2_M2
  .endr
      KERNEL2x2_M1
      KERNEL2x2_E

      b       cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_30:
      /* L is 0, 1 or 2 here */
      tst     L, #3
      beq     cgemm_kernel_L2_M2_40

      tst     L, #2
      beq     cgemm_kernel_L2_M2_32

      /* L == 2: sixteen steps */
      KERNEL2x2_I
      KERNEL2x2_M2
  .rept 6
      KERNEL2x2_M1
      KERNEL2x2_M2
  .endr
      KERNEL2x2_M1
      KERNEL2x2_E

      b       cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_32:

      tst     L, #1
      beq     cgemm_kernel_L2_M2_40

      /* L == 1: eight steps */
      KERNEL2x2_I
      KERNEL2x2_M2
  .rept 2
      KERNEL2x2_M1
      KERNEL2x2_M2
  .endr
      KERNEL2x2_M1
      KERNEL2x2_E

      b       cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_40:

      INIT2x2

  cgemm_kernel_L2_M2_44:

      ands    L , K1, #7                      // L = L % 8
      beq     cgemm_kernel_L2_M2_100

  cgemm_kernel_L2_M2_46:

      KERNEL2x2_SUB

      subs    L, L, #1
      bne     cgemm_kernel_L2_M2_46

  cgemm_kernel_L2_M2_100:

      SAVE2x2

  cgemm_kernel_L2_M2_END:

      subs    I, I, #1
      bne     cgemm_kernel_L2_M2_20

  cgemm_kernel_L2_M1_BEGIN:

      ldr     I, M
      tst     I, #1                           // I = I % 2
      beq     cgemm_kernel_L2_END

  cgemm_kernel_L2_M1_20:

      INIT1x2

      mov     BO, BC
      asr     L , K1, #3                      // L = L / 8
      cmp     L , #0
      ble     cgemm_kernel_L2_M1_40

  cgemm_kernel_L2_M1_22:

  .rept 8
      KERNEL1x2_SUB
  .endr

      subs    L, L, #1
      bgt     cgemm_kernel_L2_M1_22

  cgemm_kernel_L2_M1_40:

      ands    L , K1, #7                      // L = L % 8
      beq     cgemm_kernel_L2_M1_100

  cgemm_kernel_L2_M1_42:

      KERNEL1x2_SUB

      subs    L, L, #1
      bgt     cgemm_kernel_L2_M1_42

  cgemm_kernel_L2_M1_100:

      SAVE1x2

  cgemm_kernel_L2_END:

      mov     r3, BC
      mov     r4, K1
      lsl     r4, r4, #4                      // k * 2 * 4 * 2
      add     r3, r3, r4                      // B = B + K * 2 * 8
      mov     BC, r3

      subs    J , #1                          // j--
      bgt     cgemm_kernel_L2_BEGIN

  /*********************************************************************************************/

  cgemm_kernel_L1_BEGIN:

      ldr     J , N
      tst     J , #1
      beq     cgemm_kernel_L999

      ldr     CO1, C                          // CO1 = C
      ldr     r4 , LDC
      add     r3 , r4, CO1
      str     r3 , C                          // store C

      ldr     AO, A                           // AO = A

  cgemm_kernel_L1_M2_BEGIN:

      ldr     I, M
      asr     I, I, #1                        // I = I / 2
      cmp     I, #0
      ble     cgemm_kernel_L1_M1_BEGIN

  cgemm_kernel_L1_M2_20:

      mov     BO, BC
      asrs    L , K1, #3                      // L = L / 8
      cmp     L , #3
      blt     cgemm_kernel_L1_M2_30
  .align 5

      /* L >= 3: first group of eight, L - 2 looped groups, last group */
      KERNEL2x1_I
      KERNEL2x1_M2
  .rept 3
      KERNEL2x1_M1
      KERNEL2x1_M2
  .endr

      sub     L, L, #2

  cgemm_kernel_L1_M2_22:

  .rept 4
      KERNEL2x1_M1
      KERNEL2x1_M2
  .endr

      subs    L, L, #1
      bgt     cgemm_kernel_L1_M2_22

  .rept 3
      KERNEL2x1_M1
      KERNEL2x1_M2
  .endr
      KERNEL2x1_M1
      KERNEL2x1_E

      b       cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_30:
      /* L is 0, 1 or 2 here */
      tst     L, #3
      beq     cgemm_kernel_L1_M2_40

      tst     L, #2
      beq     cgemm_kernel_L1_M2_32

      /* L == 2: sixteen steps */
      KERNEL2x1_I
      KERNEL2x1_M2
  .rept 6
      KERNEL2x1_M1
      KERNEL2x1_M2
  .endr
      KERNEL2x1_M1
      KERNEL2x1_E

      b       cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_32:

      tst     L, #1
      beq     cgemm_kernel_L1_M2_40

      /* L == 1: eight steps */
      KERNEL2x1_I
      KERNEL2x1_M2
  .rept 2
      KERNEL2x1_M1
      KERNEL2x1_M2
  .endr
      KERNEL2x1_M1
      KERNEL2x1_E

      b       cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_40:

      INIT2x1

  cgemm_kernel_L1_M2_44:

      ands    L , K1, #7                      // L = L % 8
      beq     cgemm_kernel_L1_M2_100

  cgemm_kernel_L1_M2_46:

      KERNEL2x1_SUB

      subs    L, L, #1
      bne     cgemm_kernel_L1_M2_46

  cgemm_kernel_L1_M2_100:

      SAVE2x1

  cgemm_kernel_L1_M2_END:

      subs    I, I, #1
      bne     cgemm_kernel_L1_M2_20

  cgemm_kernel_L1_M1_BEGIN:

      ldr     I, M
      tst     I, #1                           // I = I % 2
      beq     cgemm_kernel_L1_END

  cgemm_kernel_L1_M1_20:

      INIT1x1

      mov     BO, BC
      asr     L , K1, #3                      // L = L / 8
      cmp     L , #0
      ble     cgemm_kernel_L1_M1_40

  cgemm_kernel_L1_M1_22:

  .rept 8
      KERNEL1x1_SUB
  .endr

      subs    L, L, #1
      bgt     cgemm_kernel_L1_M1_22

  cgemm_kernel_L1_M1_40:

      ands    L , K1, #7                      // L = L % 8
      beq     cgemm_kernel_L1_M1_100

  cgemm_kernel_L1_M1_42:

      KERNEL1x1_SUB

      subs    L, L, #1
      bgt     cgemm_kernel_L1_M1_42

  cgemm_kernel_L1_M1_100:

      SAVE1x1

  cgemm_kernel_L1_END:

  cgemm_kernel_L999:

      sub     r3, fp, #128
      vldm    r3, { s8 - s15}                 // restore floating point registers

      movs    r0, #0                          // set return value
      sub     sp, fp, #24
      pop     {r4 - r9, fp}
      bx      lr

  EPILOGUE