You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ctrmm_kernel_2x2_vfpv3.S 26 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/10/16 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  #define ASSEMBLER
  #include "common.h"

  /* Scratch area (in bytes) that the prologue carves out below the pushed
     core registers. */
  #define STACKSIZE       256

  /* Arguments as they arrive in registers. */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA_R     s0
  #define OLD_ALPHA_I     s1

  /******************************************************
  * Local frame, addressed relative to fp.
  *
  * The prologue spills s8-s31 (24 single registers,
  * 96 bytes) starting at [fp, #-128], so the range
  * [fp, #-128] up to [fp, #-32] is reserved for the
  * store and restore of floating point registers.
  * The slots below are listed from highest to lowest
  * address.
  *******************************************************/
  #define FP_ZERO_1       [fp, #-232]
  #define FP_ZERO_0       [fp, #-236]
  #define FP_ZERO         [fp, #-236]
  #define KKK             [fp, #-240]
  #define KK              [fp, #-244]
  #define A               [fp, #-248]
  #define LDC             [fp, #-252]
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]
  #define ALPHA_I         [fp, #-272]
  #define ALPHA_R         [fp, #-280]

  /* Stack-passed arguments, addressed above fp.  Without the VFP calling
     convention the real part of alpha arrives in r3 and everything after it
     moves one slot further up the stack. */
  #if !defined(__ARM_PCS_VFP)
  #define OLD_ALPHAR_SOFTFP       r3
  #define OLD_ALPHAI_SOFTFP       [fp, #4]
  #define OLD_A_SOFTFP            [fp, #8]
  #define B                       [fp, #12]
  #define C                       [fp, #16]
  #define OLD_LDC                 [fp, #20]
  #define OFFSET                  [fp, #24]
  #else
  #define B                       [fp, #4]
  #define C                       [fp, #8]
  #define OLD_LDC                 [fp, #12]
  #define OFFSET                  [fp, #16]
  #endif

  /* Loop counters; they reuse the argument registers once the arguments
     have been stored to the frame. */
  #define I       r0
  #define J       r1
  #define L       r2

  /* Working pointers and temporaries. */
  #define AO      r5
  #define BO      r6
  #define K1      r7
  #define CO1     r8
  #define CO2     r9
  #define BC      r12

  /* Prefetch distances in bytes used by the pld instructions. */
  #define A_PRE   96
  #define B_PRE   96
  #define C_PRE   64
  /* Select the instructions used by the SAVE macros for each
     transpose/conjugate variant.
       FADD_R / FADD_I   combine the two partial sums of each accumulator pair
       FMAC_R1 / FMAC_I1 first product with ALPHA_R (plain or negated multiply)
       FMAC_R2 / FMAC_I2 second product with ALPHA_I (multiply-add or -subtract)
     The UAL spellings below assemble to the same VFP instructions as the
     older fadds / fsubs / fmuls / fmacs mnemonics. */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vnmul.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmul.f32
  #define FMAC_I2         vmls.f32

  #elif defined(CN) || defined(CT)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmul.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vnmul.f32
  #define FMAC_I2         vmla.f32

  #elif defined(NC) || defined(TC)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmul.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmul.f32
  #define FMAC_I2         vmla.f32

  #else

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vnmul.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vnmul.f32
  #define FMAC_I2         vmls.f32

  #endif
  115. /**************************************************************************************
  116. * Macro definitions
  117. **************************************************************************************/
  /* INIT2x2: clear the sixteen accumulators s16-s31 by loading the zero word
     kept in the FP_ZERO frame slot and copying it to the other registers. */
  .macro INIT2x2

        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
        vmov.f32        s18, s16
        vmov.f32        s19, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
        vmov.f32        s22, s16
        vmov.f32        s23, s16
        vmov.f32        s24, s16
        vmov.f32        s25, s16
        vmov.f32        s26, s16
        vmov.f32        s27, s16
        vmov.f32        s28, s16
        vmov.f32        s29, s16
        vmov.f32        s30, s16
        vmov.f32        s31, s16

  .endm
  /* KERNEL2x2_I: first step of the software-pipelined 2x2 loop.
     Loads four floats from AO into s0-s3 and four from BO into s8-s11,
     writes their sixteen products into s16-s31 (plain multiplies, so the
     accumulators need no prior clearing), and preloads the next four floats
     of each stream into s4-s7 / s12-s15.  AO and BO each advance 32 bytes. */
  .macro KERNEL2x2_I

        pld             [AO, #A_PRE]
        pld             [BO, #B_PRE]
        vldmia.f32      AO!, {s0 - s1}
        vldmia.f32      BO!, {s8 - s9}

        vmul.f32        s16, s0, s8
        vmul.f32        s24, s1, s9
        vldmia.f32      AO!, {s2 - s3}
        vmul.f32        s17, s0, s9
        vmul.f32        s25, s1, s8

        vldmia.f32      BO!, {s10 - s11}
        vmul.f32        s18, s2, s8
        vmul.f32        s26, s3, s9
        vldmia.f32      AO!, {s4 - s5}
        vmul.f32        s19, s2, s9
        vmul.f32        s27, s3, s8

        vldmia.f32      BO!, {s12 - s13}
        vmul.f32        s20, s0, s10
        vmul.f32        s28, s1, s11
        vldmia.f32      AO!, {s6 - s7}
        vmul.f32        s21, s0, s11
        vmul.f32        s29, s1, s10

        vldmia.f32      BO!, {s14 - s15}
        vmul.f32        s22, s2, s10
        vmul.f32        s30, s3, s11
        vmul.f32        s23, s2, s11
        vmul.f32        s31, s3, s10

  .endm
  /* KERNEL2x2_M1: pipelined middle step working on the s0-s3 / s8-s11 operand
     set.  Accumulates sixteen products into s16-s31 while fetching the next
     operands into s4-s7 (from AO) and s12-s15 (from BO); each pointer
     advances 16 bytes. */
  .macro KERNEL2x2_M1

        vmla.f32        s16, s0, s8
        vldmia.f32      AO!, {s4 - s5}
        vmla.f32        s24, s1, s9
        vmla.f32        s17, s0, s9
        vldmia.f32      BO!, {s12 - s13}
        vmla.f32        s25, s1, s8

        vmla.f32        s18, s2, s8
        vldmia.f32      AO!, {s6 - s7}
        vmla.f32        s26, s3, s9
        vmla.f32        s19, s2, s9
        vldmia.f32      BO!, {s14 - s15}
        vmla.f32        s27, s3, s8

        vmla.f32        s20, s0, s10
        vmla.f32        s28, s1, s11
        vmla.f32        s21, s0, s11
        vmla.f32        s29, s1, s10

        vmla.f32        s22, s2, s10
        vmla.f32        s30, s3, s11
        vmla.f32        s23, s2, s11
        vmla.f32        s31, s3, s10

  .endm
  /* KERNEL2x2_M2: pipelined middle step working on the s4-s7 / s12-s15 operand
     set.  Accumulates sixteen products into s16-s31, issues prefetches for
     both streams, and reloads s0-s3 (from AO) and s8-s11 (from BO) for the
     following M1 step; each pointer advances 16 bytes. */
  .macro KERNEL2x2_M2

        pld             [AO, #A_PRE]
        vmla.f32        s16, s4, s12
        pld             [BO, #B_PRE]
        vmla.f32        s24, s5, s13
        vmla.f32        s17, s4, s13
        vldmia.f32      AO!, {s0 - s1}
        vmla.f32        s25, s5, s12

        vmla.f32        s18, s6, s12
        vmla.f32        s26, s7, s13
        vldmia.f32      BO!, {s8 - s9}
        vmla.f32        s19, s6, s13
        vmla.f32        s27, s7, s12

        vldmia.f32      AO!, {s2 - s3}
        vmla.f32        s20, s4, s14
        vmla.f32        s28, s5, s15
        vldmia.f32      BO!, {s10 - s11}
        vmla.f32        s21, s4, s15
        vmla.f32        s29, s5, s14

        vmla.f32        s22, s6, s14
        vmla.f32        s30, s7, s15
        vmla.f32        s23, s6, s15
        vmla.f32        s31, s7, s14

  .endm
  /* KERNEL2x2_E: final step of the pipelined 2x2 loop.  Consumes the operands
     already sitting in s4-s7 / s12-s15 and accumulates into s16-s31.
     Performs no loads and leaves AO and BO untouched. */
  .macro KERNEL2x2_E

        vmla.f32        s16, s4, s12
        vmla.f32        s24, s5, s13
        vmla.f32        s17, s4, s13
        vmla.f32        s25, s5, s12

        vmla.f32        s18, s6, s12
        vmla.f32        s26, s7, s13
        vmla.f32        s19, s6, s13
        vmla.f32        s27, s7, s12

        vmla.f32        s20, s4, s14
        vmla.f32        s28, s5, s15
        vmla.f32        s21, s4, s15
        vmla.f32        s29, s5, s14

        vmla.f32        s22, s6, s14
        vmla.f32        s30, s7, s15
        vmla.f32        s23, s6, s15
        vmla.f32        s31, s7, s14

  .endm
  /* KERNEL2x2_SUB: one stand-alone (non-pipelined) 2x2 step.
     Loads s0-s3 from AO and s8-s11 from BO (16 bytes each, with writeback)
     and accumulates the sixteen products into s16-s31. */
  .macro KERNEL2x2_SUB

        vldmia.f32      AO!, {s0 - s1}
        vldmia.f32      BO!, {s8 - s9}

        vmla.f32        s16, s0, s8
        vmla.f32        s24, s1, s9
        vldmia.f32      AO!, {s2 - s3}
        vmla.f32        s17, s0, s9
        vmla.f32        s25, s1, s8

        vldmia.f32      BO!, {s10 - s11}
        vmla.f32        s18, s2, s8
        vmla.f32        s26, s3, s9
        vmla.f32        s19, s2, s9
        vmla.f32        s27, s3, s8

        vmla.f32        s20, s0, s10
        vmla.f32        s28, s1, s11
        vmla.f32        s21, s0, s11
        vmla.f32        s29, s1, s10

        vmla.f32        s22, s2, s10
        vmla.f32        s30, s3, s11
        vmla.f32        s23, s2, s11
        vmla.f32        s31, s3, s10

  .endm
  /* SAVE2x2: fold the accumulator pairs, scale by alpha and store a 2x2 tile.
     CO2 is set to CO1 + LDC (LDC read from the frame).  Each pair
     (s16..s23 with s24..s31) is combined via FADD_R / FADD_I, the results are
     multiplied by ALPHA_R / ALPHA_I through the FMAC_* selections into
     s4-s11, and four floats are stored at CO1 and at CO2.  The stores
     overwrite the destination (nothing is read back from it).  CO1 then
     advances 16 bytes.  Uses r3 as scratch. */
  .macro SAVE2x2

        ldr             r3, LDC
        add             CO2, CO1, r3

        vldr            s0, ALPHA_R
        vldr            s1, ALPHA_I

        FADD_R          s16, s24, s16
        FADD_I          s17, s25, s17
        FADD_R          s18, s26, s18
        FADD_I          s19, s27, s19
        FADD_R          s20, s28, s20
        FADD_I          s21, s29, s21
        FADD_R          s22, s30, s22
        FADD_I          s23, s31, s23

        FMAC_R1         s4, s0, s16
        FMAC_I1         s5, s0, s17
        FMAC_R2         s4, s1, s17
        FMAC_I2         s5, s1, s16

        FMAC_R1         s6, s0, s18
        FMAC_I1         s7, s0, s19
        FMAC_R2         s6, s1, s19
        FMAC_I2         s7, s1, s18

        FMAC_R1         s8, s0, s20
        FMAC_I1         s9, s0, s21
        FMAC_R2         s8, s1, s21
        FMAC_I2         s9, s1, s20

        FMAC_R1         s10, s0, s22
        FMAC_I1         s11, s0, s23
        FMAC_R2         s10, s1, s23
        FMAC_I2         s11, s1, s22

        vstmia.f32      CO1, {s4 - s7}
        vstmia.f32      CO2, {s8 - s11}

        add             CO1, CO1, #16

  .endm
  283. /******************************************************************************/
  /* INIT1x2: clear the eight accumulators used by the 1x2 tile
     (s16, s17, s20, s21, s24, s25, s28, s29) from the FP_ZERO frame slot. */
  .macro INIT1x2

        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
        vmov.f32        s24, s16
        vmov.f32        s25, s16
        vmov.f32        s28, s16
        vmov.f32        s29, s16

  .endm
  /* KERNEL1x2_I: first step of a pipelined 1x2 sequence.
     Loads s0-s1 from AO and s8-s11 from BO, writes the eight products into
     the 1x2 accumulators with plain multiplies, then preloads the next
     operands into s4-s5 / s12-s15.  In total AO advances 16 bytes and BO
     advances 32 bytes. */
  .macro KERNEL1x2_I

        pld             [AO, #A_PRE]
        pld             [BO, #B_PRE]

        vldr            s0,  [AO]
        vldr            s1,  [AO, #4]
        vldr            s8,  [BO]
        vldr            s9,  [BO, #4]
        vldr            s10, [BO, #8]
        vldr            s11, [BO, #12]

        vmul.f32        s16, s0, s8
        vmul.f32        s24, s1, s9
        vmul.f32        s17, s0, s9
        vmul.f32        s25, s1, s8

        vmul.f32        s20, s0, s10
        vmul.f32        s28, s1, s11
        vmul.f32        s21, s0, s11
        vmul.f32        s29, s1, s10

        add             BO, BO, #16
        add             AO, AO, #8

        pld             [BO, #B_PRE]

        vldr            s4,  [AO, #0]
        vldr            s5,  [AO, #4]
        vldr            s12, [BO]
        vldr            s13, [BO, #4]
        vldr            s14, [BO, #8]
        vldr            s15, [BO, #12]

        add             BO, BO, #16
        add             AO, AO, #8

  .endm
  /* KERNEL1x2_M1: pipelined 1x2 step on the s0-s1 / s8-s11 operand set.
     Accumulates eight products, then loads the next operands into
     s4-s5 / s12-s15.  AO advances 8 bytes, BO advances 16 bytes. */
  .macro KERNEL1x2_M1

        pld             [BO, #B_PRE]

        vmla.f32        s16, s0, s8
        vmla.f32        s24, s1, s9
        vmla.f32        s17, s0, s9
        vmla.f32        s25, s1, s8

        vmla.f32        s20, s0, s10
        vmla.f32        s28, s1, s11
        vmla.f32        s21, s0, s11
        vmla.f32        s29, s1, s10

        vldr            s4,  [AO, #0]
        vldr            s5,  [AO, #4]
        vldr            s12, [BO]
        vldr            s13, [BO, #4]
        vldr            s14, [BO, #8]
        vldr            s15, [BO, #12]

        add             BO, BO, #16
        add             AO, AO, #8

  .endm
  /* KERNEL1x2_M2: pipelined 1x2 step on the s4-s5 / s12-s15 operand set.
     Accumulates eight products, then reloads s0-s1 / s8-s11 for the next
     M1 step.  AO advances 8 bytes, BO advances 16 bytes. */
  .macro KERNEL1x2_M2

        pld             [AO, #A_PRE]
        pld             [BO, #B_PRE]

        vmla.f32        s16, s4, s12
        vmla.f32        s24, s5, s13
        vmla.f32        s17, s4, s13
        vmla.f32        s25, s5, s12

        vmla.f32        s20, s4, s14
        vmla.f32        s28, s5, s15
        vmla.f32        s21, s4, s15
        vmla.f32        s29, s5, s14

        vldr            s0,  [AO, #0]
        vldr            s1,  [AO, #4]
        vldr            s8,  [BO]
        vldr            s9,  [BO, #4]
        vldr            s10, [BO, #8]
        vldr            s11, [BO, #12]

        add             BO, BO, #16
        add             AO, AO, #8

  .endm
  /* KERNEL1x2_E: closing step of a pipelined 1x2 sequence.  Accumulates the
     eight products of the operands already held in s4-s5 / s12-s15.
     No loads; AO and BO are not modified. */
  .macro KERNEL1x2_E

        vmla.f32        s16, s4, s12
        vmla.f32        s24, s5, s13
        vmla.f32        s17, s4, s13
        vmla.f32        s25, s5, s12

        vmla.f32        s20, s4, s14
        vmla.f32        s28, s5, s15
        vmla.f32        s21, s4, s15
        vmla.f32        s29, s5, s14

  .endm
  /* KERNEL1x2_SUB: one stand-alone 1x2 step.
     Loads s0-s1 from AO and s8-s11 from BO, accumulates the eight products
     into the 1x2 accumulators, then advances AO by 8 and BO by 16 bytes. */
  .macro KERNEL1x2_SUB

        pld             [AO, #A_PRE]
        pld             [BO, #B_PRE]

        vldr            s0,  [AO]
        vldr            s1,  [AO, #4]
        vldr            s8,  [BO]
        vldr            s9,  [BO, #4]
        vldr            s10, [BO, #8]
        vldr            s11, [BO, #12]

        vmla.f32        s16, s0, s8
        vmla.f32        s24, s1, s9
        vmla.f32        s17, s0, s9
        vmla.f32        s25, s1, s8

        vmla.f32        s20, s0, s10
        vmla.f32        s28, s1, s11
        vmla.f32        s21, s0, s11
        vmla.f32        s29, s1, s10

        add             BO, BO, #16
        add             AO, AO, #8

  .endm
  /* SAVE1x2: fold, scale and store a 1x2 tile.
     CO2 is set to CO1 + LDC.  The accumulator pairs are combined with
     FADD_R / FADD_I, scaled by ALPHA_R / ALPHA_I through the FMAC_*
     selections into s4-s5 and s8-s9, and two floats are stored at CO1 and
     at CO2 (overwriting the destination).  CO1 advances 8 bytes.
     Uses r3 as scratch. */
  .macro SAVE1x2

        ldr             r3, LDC
        add             CO2, CO1, r3

        vldr            s0, ALPHA_R
        vldr            s1, ALPHA_I

        FADD_R          s16, s24, s16
        FADD_I          s17, s25, s17
        FADD_R          s20, s28, s20
        FADD_I          s21, s29, s21

        FMAC_R1         s4, s0, s16
        FMAC_I1         s5, s0, s17
        FMAC_R2         s4, s1, s17
        FMAC_I2         s5, s1, s16

        FMAC_R1         s8, s0, s20
        FMAC_I1         s9, s0, s21
        FMAC_R2         s8, s1, s21
        FMAC_I2         s9, s1, s20

        vstmia.f32      CO1, {s4 - s5}
        vstmia.f32      CO2, {s8 - s9}

        add             CO1, CO1, #8

  .endm
  413. /******************************************************************************/
  /* INIT2x1: clear the eight accumulators used by the 2x1 tile
     (s16-s19 and s24-s27) from the FP_ZERO frame slot. */
  .macro INIT2x1

        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
        vmov.f32        s18, s16
        vmov.f32        s19, s16
        vmov.f32        s24, s16
        vmov.f32        s25, s16
        vmov.f32        s26, s16
        vmov.f32        s27, s16

  .endm
  /* KERNEL2x1_I: first step of the pipelined 2x1 loop.
     Loads s0-s3 from AO and s8-s9 from BO, writes the eight products into
     s16-s19 / s24-s27 with plain multiplies, then preloads the next operands
     into s4-s7 / s12-s13.  In total AO advances 32 bytes and BO 16 bytes. */
  .macro KERNEL2x1_I

        pld             [AO, #A_PRE]
        pld             [BO, #B_PRE]

        vldr            s0, [AO]
        vldr            s1, [AO, #4]
        vldr            s2, [AO, #8]
        vldr            s3, [AO, #12]
        vldr            s8, [BO]
        vldr            s9, [BO, #4]

        vmul.f32        s16, s0, s8
        vmul.f32        s24, s1, s9
        vmul.f32        s17, s0, s9
        vmul.f32        s25, s1, s8

        vmul.f32        s18, s2, s8
        vmul.f32        s26, s3, s9
        vmul.f32        s19, s2, s9
        vmul.f32        s27, s3, s8

        add             BO, BO, #8
        add             AO, AO, #16

        pld             [BO, #B_PRE]
        pld             [AO, #A_PRE]

        vldr            s4,  [AO, #0]
        vldr            s5,  [AO, #4]
        vldr            s6,  [AO, #8]
        vldr            s7,  [AO, #12]
        vldr            s12, [BO]
        vldr            s13, [BO, #4]

        add             BO, BO, #8
        add             AO, AO, #16

  .endm
  /* KERNEL2x1_M1: pipelined 2x1 step on the s0-s3 / s8-s9 operand set.
     Accumulates eight products, then loads the next operands into
     s4-s7 / s12-s13.  AO advances 16 bytes, BO advances 8 bytes. */
  .macro KERNEL2x1_M1

        pld             [AO, #A_PRE]
        pld             [BO, #B_PRE]

        vmla.f32        s16, s0, s8
        vmla.f32        s24, s1, s9
        vmla.f32        s17, s0, s9
        vmla.f32        s25, s1, s8

        vmla.f32        s18, s2, s8
        vmla.f32        s26, s3, s9
        vmla.f32        s19, s2, s9
        vmla.f32        s27, s3, s8

        vldr            s4,  [AO, #0]
        vldr            s5,  [AO, #4]
        vldr            s6,  [AO, #8]
        vldr            s7,  [AO, #12]
        vldr            s12, [BO]
        vldr            s13, [BO, #4]

        add             BO, BO, #8
        add             AO, AO, #16

  .endm
  /* KERNEL2x1_M2: pipelined 2x1 step on the s4-s7 / s12-s13 operand set.
     Accumulates eight products, then reloads s0-s3 / s8-s9 for the next
     M1 step.  AO advances 16 bytes, BO advances 8 bytes. */
  .macro KERNEL2x1_M2

        pld             [AO, #A_PRE]
        pld             [BO, #B_PRE]

        vmla.f32        s16, s4, s12
        vmla.f32        s24, s5, s13
        vmla.f32        s17, s4, s13
        vmla.f32        s25, s5, s12

        vmla.f32        s18, s6, s12
        vmla.f32        s26, s7, s13
        vmla.f32        s19, s6, s13
        vmla.f32        s27, s7, s12

        vldr            s0, [AO, #0]
        vldr            s1, [AO, #4]
        vldr            s2, [AO, #8]
        vldr            s3, [AO, #12]
        vldr            s8, [BO]
        vldr            s9, [BO, #4]

        add             BO, BO, #8
        add             AO, AO, #16

  .endm
  /* KERNEL2x1_E: closing step of the pipelined 2x1 loop.  Accumulates the
     eight products of the operands already held in s4-s7 / s12-s13.
     No loads; AO and BO are not modified. */
  .macro KERNEL2x1_E

        vmla.f32        s16, s4, s12
        vmla.f32        s24, s5, s13
        vmla.f32        s17, s4, s13
        vmla.f32        s25, s5, s12

        vmla.f32        s18, s6, s12
        vmla.f32        s26, s7, s13
        vmla.f32        s19, s6, s13
        vmla.f32        s27, s7, s12

  .endm
  /* KERNEL2x1_SUB: one stand-alone 2x1 step.
     Loads s0-s3 from AO and s8-s9 from BO, accumulates the eight products
     into s16-s19 / s24-s27, then advances AO by 16 and BO by 8 bytes. */
  .macro KERNEL2x1_SUB

        pld             [AO, #A_PRE]
        pld             [BO, #B_PRE]

        vldr            s0, [AO]
        vldr            s1, [AO, #4]
        vldr            s2, [AO, #8]
        vldr            s3, [AO, #12]
        vldr            s8, [BO]
        vldr            s9, [BO, #4]

        vmla.f32        s16, s0, s8
        vmla.f32        s24, s1, s9
        vmla.f32        s17, s0, s9
        vmla.f32        s25, s1, s8

        vmla.f32        s18, s2, s8
        vmla.f32        s26, s3, s9
        vmla.f32        s19, s2, s9
        vmla.f32        s27, s3, s8

        add             BO, BO, #8
        add             AO, AO, #16

  .endm
  /* SAVE2x1: fold, scale and store a 2x1 tile.
     The accumulator pairs are combined with FADD_R / FADD_I, scaled by
     ALPHA_R / ALPHA_I through the FMAC_* selections into s4-s7, and four
     floats are stored at CO1 (overwriting the destination).  CO1 advances
     16 bytes. */
  .macro SAVE2x1

        vldr            s0, ALPHA_R
        vldr            s1, ALPHA_I

        FADD_R          s16, s24, s16
        FADD_I          s17, s25, s17
        FADD_R          s18, s26, s18
        FADD_I          s19, s27, s19

        FMAC_R1         s4, s0, s16
        FMAC_I1         s5, s0, s17
        FMAC_R2         s4, s1, s17
        FMAC_I2         s5, s1, s16

        FMAC_R1         s6, s0, s18
        FMAC_I1         s7, s0, s19
        FMAC_R2         s6, s1, s19
        FMAC_I2         s7, s1, s18

        vstmia.f32      CO1, {s4 - s7}

        add             CO1, CO1, #16

  .endm
  542. /******************************************************************************/
  /* INIT1x1: clear the four accumulators used by the 1x1 tile
     (s16, s17, s24, s25) from the FP_ZERO frame slot. */
  .macro INIT1x1

        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
        vmov.f32        s24, s16
        vmov.f32        s25, s16

  .endm
  /* KERNEL1x1_I: first step of a pipelined 1x1 sequence.
     Loads s0-s1 from AO and s8-s9 from BO, writes the four products into
     s16, s24, s17, s25 with plain multiplies, then preloads the next
     operands into s4-s5 / s12-s13.  AO and BO each advance 16 bytes. */
  .macro KERNEL1x1_I

        pld             [AO, #A_PRE]
        pld             [BO, #B_PRE]

        vldr            s0, [AO]
        vldr            s1, [AO, #4]
        vldr            s8, [BO]
        vldr            s9, [BO, #4]

        vmul.f32        s16, s0, s8
        vmul.f32        s24, s1, s9
        vmul.f32        s17, s0, s9
        vmul.f32        s25, s1, s8

        add             BO, BO, #8
        add             AO, AO, #8

        pld             [BO, #B_PRE]
        pld             [AO, #A_PRE]

        vldr            s4,  [AO, #0]
        vldr            s5,  [AO, #4]
        vldr            s12, [BO]
        vldr            s13, [BO, #4]

        add             BO, BO, #8
        add             AO, AO, #8

  .endm
  /* KERNEL1x1_M1: pipelined 1x1 step on the s0-s1 / s8-s9 operand set.
     Accumulates four products, then loads the next operands into
     s4-s5 / s12-s13.  AO and BO each advance 8 bytes. */
  .macro KERNEL1x1_M1

        vmla.f32        s16, s0, s8
        vmla.f32        s24, s1, s9
        vmla.f32        s17, s0, s9
        vmla.f32        s25, s1, s8

        vldr            s4,  [AO, #0]
        vldr            s5,  [AO, #4]
        vldr            s12, [BO]
        vldr            s13, [BO, #4]

        add             BO, BO, #8
        add             AO, AO, #8

  .endm
  /* KERNEL1x1_M2: pipelined 1x1 step on the s4-s5 / s12-s13 operand set.
     Accumulates four products, then reloads s0-s1 / s8-s9 for the next
     M1 step.  AO and BO each advance 8 bytes. */
  .macro KERNEL1x1_M2

        vmla.f32        s16, s4, s12
        vmla.f32        s24, s5, s13
        vmla.f32        s17, s4, s13
        vmla.f32        s25, s5, s12

        vldr            s0, [AO, #0]
        vldr            s1, [AO, #4]
        vldr            s8, [BO]
        vldr            s9, [BO, #4]

        add             BO, BO, #8
        add             AO, AO, #8

  .endm
  /* KERNEL1x1_E: closing step of a pipelined 1x1 sequence.  Accumulates the
     four products of the operands already held in s4-s5 / s12-s13.
     No loads; AO and BO are not modified. */
  .macro KERNEL1x1_E

        vmla.f32        s16, s4, s12
        vmla.f32        s24, s5, s13
        vmla.f32        s17, s4, s13
        vmla.f32        s25, s5, s12

  .endm
  /* KERNEL1x1_SUB: one stand-alone 1x1 step.
     Loads s0-s1 from AO and s8-s9 from BO, accumulates the four products
     into s16, s24, s17, s25, then advances AO and BO by 8 bytes each. */
  .macro KERNEL1x1_SUB

        vldr            s0, [AO]
        vldr            s1, [AO, #4]
        vldr            s8, [BO]
        vldr            s9, [BO, #4]

        vmla.f32        s16, s0, s8
        vmla.f32        s24, s1, s9
        vmla.f32        s17, s0, s9
        vmla.f32        s25, s1, s8

        add             BO, BO, #8
        add             AO, AO, #8

  .endm
  /* SAVE1x1: fold, scale and store a 1x1 tile.
     The accumulator pair is combined with FADD_R / FADD_I, scaled by
     ALPHA_R / ALPHA_I through the FMAC_* selections into s4-s5, and two
     floats are stored at CO1 (overwriting the destination).  CO1 advances
     8 bytes. */
  .macro SAVE1x1

        vldr            s0, ALPHA_R
        vldr            s1, ALPHA_I

        FADD_R          s16, s24, s16
        FADD_I          s17, s25, s17

        FMAC_R1         s4, s0, s16
        FMAC_I1         s5, s0, s17
        FMAC_R2         s4, s1, s17
        FMAC_I2         s5, s1, s16

        vstmia.f32      CO1, {s4 - s5}

        add             CO1, CO1, #8

  .endm
  625. /******************************************************************************/
  626. /**************************************************************************************
  627. * End of macro definitions
  628. **************************************************************************************/
  /**************************************************************************************
  * Kernel entry point (name supplied by the PROLOGUE macro from common.h).
  *
  * Arguments, as described by the register and frame defines of this file:
  *   r0 = M, r1 = N, r2 = K
  *   alpha (real, imag) in s0, s1 and A in r3 when __ARM_PCS_VFP is defined,
  *   otherwise alpha real in r3 and alpha imag / A on the stack
  *   B, C, ldc, offset on the stack
  * Returns 0 in r0.
  *
  * Structure:
  *   for each pair of columns (J = N / 2)      -> _L2_BEGIN
  *       for each pair of rows (I = M / 2)     -> 2x2 tiles, pipelined K loop
  *       one remaining row if M is odd         -> 1x2 tile
  *   one remaining column if N is odd          -> _L1_BEGIN
  *       for each pair of rows                 -> 2x1 tiles, pipelined K loop
  *       one remaining row if M is odd         -> 1x1 tile
  *
  * Condition-flag note: tst, ands and asrs update N, Z (and C) but leave V
  * unchanged, and the flags are undefined on entry to a public function.
  * A ble placed directly after one of them therefore depends on a stale V.
  * In this routine every bit test is followed by beq, and every signed
  * "shifted count <= 0" test uses an explicit cmp against zero, so that no
  * branch depends on flags inherited from the caller.  With V clear these
  * branches behave exactly like the ble forms they replace.
  *
  * r3 and r4 are used as scratch throughout.
  **************************************************************************************/
  PROLOGUE

        .align 5

        push    {r4 - r9, fp}
        add     fp, sp, #24
        sub     sp, sp, #STACKSIZE              // reserve stack

  #if !defined(__ARM_PCS_VFP)
        vmov    OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
        vldr    OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
        ldr     OLD_A, OLD_A_SOFTFP
  #endif
        str     OLD_M, M
        str     OLD_N, N
        str     OLD_K, K
        str     OLD_A, A
        vstr    OLD_ALPHA_R, ALPHA_R
        vstr    OLD_ALPHA_I, ALPHA_I

        sub     r3, fp, #128
        vstm    r3, { s8 - s31}                 // store floating point registers

        movs    r4, #0                          // zero word read back by the INIT macros
        str     r4, FP_ZERO
        str     r4, FP_ZERO_1

        ldr     r3, OLD_LDC
        lsl     r3, r3, #3                      // ldc = ldc * 4 * 2
        str     r3, LDC

        ldr     r3, OFFSET
  #ifndef LEFT
        neg     r3 , r3
  #endif
        str     r3 , KK

        ldr     BC, B

        ldr     J, N
        asr     J, J, #1                        // J = J / 2
        cmp     J, #0                           // explicit compare: flags are undefined on entry
        ble     _L1_BEGIN

  _L2_BEGIN:

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        lsl     r4 , r4 , #1                    // LDC * 2
        add     r3 , r4, CO1
        str     r3 , C                          // store C

  #if defined(LEFT)
        ldr     r3 , OFFSET
        str     r3 , KK
  #endif

        ldr     AO, A                           // AO = A
        pld     [AO , #A_PRE-64]
        pld     [AO , #A_PRE-32]

  _L2_M2_BEGIN:

        ldr     I, M
        asr     I, I, #1                        // I = I / 2
        cmp     I, #0
        ble     _L2_M1_BEGIN

  _L2_M2_20:

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     K1, K
        ldr     r3, KK
        sub     K1, K1, r3
        str     K1, KKK
  #else
        ldr     K1, KK
  #ifdef LEFT
        add     K1, K1, #2                      // number of values in AO
  #else
        add     K1, K1, #2                      // number of values in BO
  #endif
        str     K1, KKK
  #endif

        asrs    L , K1, #3                      // L = L / 8
        cmp     L , #3
        blt     _L2_M2_30
        .align 5

        // L >= 3: prime the pipeline with the first group of eight steps.
        KERNEL2x2_I
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        sub     L, L, #2                        // first and last group are outside the loop

  _L2_M2_22:

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        subs    L, L, #1
        bgt     _L2_M2_22

        // Last group of eight steps; KERNEL2x2_E drains the pipeline.
        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_E

        b       _L2_M2_44

  _L2_M2_30:
        tst     L, #3
        beq     _L2_M2_40                       // no full group of eight: just clear accumulators

        tst     L, #2
        beq     _L2_M2_32

        // Two groups of eight steps.
        KERNEL2x2_I
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_E

        b       _L2_M2_44

  _L2_M2_32:

        tst     L, #1
        beq     _L2_M2_40

        // One group of eight steps.
        KERNEL2x2_I
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_E

        b       _L2_M2_44

  _L2_M2_40:

        INIT2x2

  _L2_M2_44:

        ands    L , K1, #7                      // L = L % 8
        beq     _L2_M2_100

  _L2_M2_46:

        KERNEL2x2_SUB

        subs    L, L, #1
        bne     _L2_M2_46

  _L2_M2_100:

        SAVE2x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in AO
        str     r3 , KK
  #endif

  _L2_M2_END:

        subs    I, I, #1
        bne     _L2_M2_20

  _L2_M1_BEGIN:

        ldr     I, M
        tst     I, #1                           // I = I % 2
        beq     _L2_END

  _L2_M1_20:

        INIT1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     K1, K
        ldr     r3, KK
        sub     K1, K1, r3
        str     K1, KKK
  #else
        ldr     K1, KK
  #ifdef LEFT
        add     K1, K1, #1                      // number of values in AO
  #else
        add     K1, K1, #2                      // number of values in BO
  #endif
        str     K1, KKK
  #endif

        asr     L , K1, #3                      // L = L / 8
        cmp     L , #0
        ble     _L2_M1_40

  _L2_M1_22:

        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB

        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB

        subs    L, L, #1
        bgt     _L2_M1_22

  _L2_M1_40:

        ands    L , K1, #7                      // L = L % 8
        beq     _L2_M1_100

  _L2_M1_42:

        KERNEL1x2_SUB

        subs    L, L, #1
        bgt     _L2_M1_42

  _L2_M1_100:

        SAVE1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #1                    // number of values in AO
        str     r3 , KK
  #endif

  _L2_END:

        mov     r3, BC
        ldr     r4, K
        lsl     r4, r4, #4                      // k * 2 * 4 * 2
        add     r3, r3, r4                      // B = B + K * 2 * 8
        mov     BC, r3

  #if !defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in BO
        str     r3 , KK
  #endif

        subs    J , #1                          // j--
        bgt     _L2_BEGIN

  /*********************************************************************************************/

  _L1_BEGIN:

        ldr     J , N
        tst     J , #1
        beq     _L999                           // N is even: no single trailing column

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        add     r3 , r4, CO1
        str     r3 , C                          // store C

  #if defined(LEFT)
        ldr     r3 , OFFSET
        str     r3 , KK
  #endif

        ldr     AO, A                           // AO = A

  _L1_M2_BEGIN:

        ldr     I, M
        asr     I, I, #1                        // I = I / 2
        cmp     I, #0
        ble     _L1_M1_BEGIN

  _L1_M2_20:

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     K1, K
        ldr     r3, KK
        sub     K1, K1, r3
        str     K1, KKK
  #else
        ldr     K1, KK
  #ifdef LEFT
        add     K1, K1, #2                      // number of values in AO
  #else
        add     K1, K1, #1                      // number of values in BO
  #endif
        str     K1, KKK
  #endif

        asrs    L , K1, #3                      // L = L / 8
        cmp     L , #3
        blt     _L1_M2_30
        .align 5

        // L >= 3: prime the pipeline with the first group of eight steps.
        KERNEL2x1_I
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        sub     L, L, #2                        // first and last group are outside the loop

  _L1_M2_22:

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        subs    L, L, #1
        bgt     _L1_M2_22

        // Last group of eight steps; KERNEL2x1_E drains the pipeline.
        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_E

        b       _L1_M2_44

  _L1_M2_30:
        tst     L, #3
        beq     _L1_M2_40                       // no full group of eight: just clear accumulators

        tst     L, #2
        beq     _L1_M2_32

        // Two groups of eight steps.
        KERNEL2x1_I
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_E

        b       _L1_M2_44

  _L1_M2_32:

        tst     L, #1
        beq     _L1_M2_40

        // One group of eight steps.
        KERNEL2x1_I
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_E

        b       _L1_M2_44

  _L1_M2_40:

        INIT2x1

  _L1_M2_44:

        ands    L , K1, #7                      // L = L % 8
        beq     _L1_M2_100

  _L1_M2_46:

        KERNEL2x1_SUB

        subs    L, L, #1
        bne     _L1_M2_46

  _L1_M2_100:

        SAVE2x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in AO
        str     r3 , KK
  #endif

  _L1_M2_END:

        subs    I, I, #1
        bne     _L1_M2_20

  _L1_M1_BEGIN:

        ldr     I, M
        tst     I, #1                           // I = I % 2
        beq     _L1_END

  _L1_M1_20:

        INIT1x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     K1, K
        ldr     r3, KK
        sub     K1, K1, r3
        str     K1, KKK
  #else
        ldr     K1, KK
  #ifdef LEFT
        add     K1, K1, #1                      // number of values in AO
  #else
        add     K1, K1, #1                      // number of values in BO
  #endif
        str     K1, KKK
  #endif

        asr     L , K1, #3                      // L = L / 8
        cmp     L , #0
        ble     _L1_M1_40

  _L1_M1_22:

        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB

        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB

        subs    L, L, #1
        bgt     _L1_M1_22

  _L1_M1_40:

        ands    L , K1, #7                      // L = L % 8
        beq     _L1_M1_100

  _L1_M1_42:

        KERNEL1x1_SUB

        subs    L, L, #1
        bgt     _L1_M1_42

  _L1_M1_100:

        SAVE1x1

  _L1_END:

  _L999:

        sub     r3, fp, #128
        vldm    r3, { s8 - s31}                 // restore floating point registers

        movs    r0, #0                          // set return value
        sub     sp, fp, #24
        pop     {r4 - r9, fp}
        bx      lr

  EPILOGUE