You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ctrmm_kernel_2x2_vfpv3.S 25 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/10/16 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  // Bytes of frame reserved below the pushed registers (subtracted from sp in the prologue).
  36. #define STACKSIZE 256
  // Incoming arguments in their entry registers. The prologue spills them to the
  // frame slots defined below because r0-r3 and s0/s1 are reused as scratch later.
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA_R s0
  42. #define OLD_ALPHA_I s1
  43. /******************************************************
  44. * [fp, #-128] - [fp, #-32] is reserved
  45. * for store and restore of floating point
  46. * registers (s8 - s31, 24 words = 96 bytes)
  47. *******************************************************/
  // Frame-relative scratch slots (negative offsets from fp).
  // KK is initialised from OFFSET; KKK keeps the K1 count used for the pointer fix-up after SAVE.
  48. #define KKK [fp, #-240]
  49. #define KK [fp, #-244 ]
  50. #define A [fp, #-248 ]
  51. #define LDC [fp, #-252 ]
  52. #define M [fp, #-256 ]
  53. #define N [fp, #-260 ]
  54. #define K [fp, #-264 ]
  // FP_ZERO and FP_ZERO_0 name the same slot. The prologue stores integer 0 to
  // FP_ZERO and FP_ZERO_1; the INIT* macros load FP_ZERO as a single-precision zero.
  55. #define FP_ZERO [fp, #-236]
  56. #define FP_ZERO_0 [fp, #-236]
  57. #define FP_ZERO_1 [fp, #-232]
  // Alpha slots: written with vstr in the prologue, read with flds by the SAVE* macros.
  58. #define ALPHA_I [fp, #-272]
  59. #define ALPHA_R [fp, #-280]
  // Positive offsets from fp, read with ldr in the function body.
  // NOTE(review): presumably the stack-passed arguments of the caller — confirm against the C prototype.
  60. #define B [fp, #4 ]
  61. #define C [fp, #8 ]
  62. #define OLD_LDC [fp, #12 ]
  63. #define OFFSET [fp, #16 ]
  // Loop counters. They alias r0-r2, i.e. the same registers as OLD_M / OLD_N / OLD_K.
  64. #define I r0
  65. #define J r1
  66. #define L r2
  // Working pointers: AO/BO are post-incremented by the kernel macros, CO1/CO2 are the
  // two destinations written by the SAVE* macros.
  67. #define AO r5
  68. #define BO r6
  69. #define CO1 r8
  70. #define CO2 r9
  71. #define K1 r7 // inner-loop iteration count
  72. #define BC r12 // base of the current B panel; copied into BO for every tile
  // Byte offsets used with pld. C_PRE is not referenced in the code visible here.
  73. #define A_PRE 96
  74. #define B_PRE 96
  75. #define C_PRE 64
  // Select the VFP mnemonics used by the SAVE* macros according to which variant
  // symbol is defined at build time.
  //   FADD_R / FADD_I  : combine the two partial-product sums of each result.
  //   FMAC_R1 / FMAC_I1: plain multiplies (fmuls / fnmuls) - they overwrite the destination.
  //   FMAC_R2 / FMAC_I2: multiply-accumulate forms (fmacs / fnmacs) applied on top of them.
  // NOTE(review): the two-letter symbols presumably encode the transpose/conjugate
  // variant of the two operands - confirm against the build files.
  76. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  77. #define FADD_R fsubs
  78. #define FADD_I fadds
  79. #define FMAC_R1 fnmuls
  80. #define FMAC_R2 fnmacs
  81. #define FMAC_I1 fmuls
  82. #define FMAC_I2 fnmacs
  83. #elif defined(CN) || defined(CT)
  84. #define FADD_R fadds
  85. #define FADD_I fsubs
  86. #define FMAC_R1 fmuls
  87. #define FMAC_R2 fmacs
  88. #define FMAC_I1 fnmuls
  89. #define FMAC_I2 fmacs
  90. #elif defined(NC) || defined(TC)
  91. #define FADD_R fadds
  92. #define FADD_I fsubs
  93. #define FMAC_R1 fmuls
  94. #define FMAC_R2 fnmacs
  95. #define FMAC_I1 fmuls
  96. #define FMAC_I2 fmacs
  // Fallback used when none of the symbols tested above is defined.
  97. #else
  98. #define FADD_R fsubs
  99. #define FADD_I fadds
  100. #define FMAC_R1 fnmuls
  101. #define FMAC_R2 fmacs
  102. #define FMAC_I1 fnmuls
  103. #define FMAC_I2 fnmacs
  104. #endif
  105. /**************************************************************************************
  106. * Macro definitions
  107. **************************************************************************************/
  // INIT2x2: clear the sixteen accumulators s16-s31 used by the 2x2 kernels.
  // The zero comes from the FP_ZERO frame slot and is then copied register to register.
  108. .macro INIT2x2
  109. flds s16 , FP_ZERO
  110. vmov.f32 s17, s16
  111. vmov.f32 s18, s16
  112. vmov.f32 s19, s16
  113. vmov.f32 s20, s16
  114. vmov.f32 s21, s16
  115. vmov.f32 s22, s16
  116. vmov.f32 s23, s16
  117. vmov.f32 s24, s16
  118. vmov.f32 s25, s16
  119. vmov.f32 s26, s16
  120. vmov.f32 s27, s16
  121. vmov.f32 s28, s16
  122. vmov.f32 s29, s16
  123. vmov.f32 s30, s16
  124. vmov.f32 s31, s16
  125. .endm
  // KERNEL2x2_I: first step of the pipelined 2x2 sequence (I, M2, M1, ..., E).
  // Loads s0-s3 from AO and s8-s11 from BO and writes their products into s16-s31
  // with fmuls, so no prior INIT2x2 is needed. It also preloads s4-s7 / s12-s15 for
  // the following M2 step. AO and BO each advance by 32 bytes.
  // NOTE(review): each register pair is presumably the (real, imaginary) part of one
  // single-precision complex value - confirm against the packing routines.
  126. .macro KERNEL2x2_I
  127. pld [ AO , #A_PRE ]
  128. pld [ BO , #B_PRE ]
  129. fldmias AO!, { s0 - s1 }
  130. fldmias BO!, { s8 - s9 }
  131. fmuls s16 , s0, s8
  132. fmuls s24 , s1, s9
  133. fldmias AO!, { s2 - s3 }
  134. fmuls s17 , s0, s9
  135. fmuls s25 , s1, s8
  136. fldmias BO!, { s10 - s11 }
  137. fmuls s18 , s2, s8
  138. fmuls s26 , s3, s9
  139. fldmias AO!, { s4 - s5 } // operands for the next (M2 / E) step
  140. fmuls s19 , s2, s9
  141. fmuls s27 , s3, s8
  142. fldmias BO!, { s12 - s13 }
  143. fmuls s20 , s0, s10
  144. fmuls s28 , s1, s11
  145. fldmias AO!, { s6 - s7 }
  146. fmuls s21 , s0, s11
  147. fmuls s29 , s1, s10
  148. fldmias BO!, { s14 - s15 }
  149. fmuls s22 , s2, s10
  150. fmuls s30 , s3, s11
  151. fmuls s23 , s2, s11
  152. fmuls s31 , s3, s10
  153. .endm
  // KERNEL2x2_M1: pipelined middle step. Accumulates the products of s0-s3 (from A)
  // with s8-s11 (from B) into s16-s31, and interleaves the loads of s4-s7 / s12-s15
  // that the following M2 or E step consumes. AO and BO each advance by 16 bytes.
  154. .macro KERNEL2x2_M1
  155. fmacs s16 , s0, s8
  156. fldmias AO!, { s4 - s5 }
  157. fmacs s24 , s1, s9
  158. fmacs s17 , s0, s9
  159. fldmias BO!, { s12 - s13 }
  160. fmacs s25 , s1, s8
  161. fmacs s18 , s2, s8
  162. fldmias AO!, { s6 - s7 }
  163. fmacs s26 , s3, s9
  164. fmacs s19 , s2, s9
  165. fldmias BO!, { s14 - s15 }
  166. fmacs s27 , s3, s8
  167. fmacs s20 , s0, s10
  168. fmacs s28 , s1, s11
  169. fmacs s21 , s0, s11
  170. fmacs s29 , s1, s10
  171. fmacs s22 , s2, s10
  172. fmacs s30 , s3, s11
  173. fmacs s23 , s2, s11
  174. fmacs s31 , s3, s10
  175. .endm
  // KERNEL2x2_M2: pipelined middle step, counterpart of M1. Accumulates the products
  // of s4-s7 with s12-s15 into s16-s31 while reloading s0-s3 / s8-s11 for the next
  // M1 step. Issues the pld prefetches; AO and BO each advance by 16 bytes.
  176. .macro KERNEL2x2_M2
  177. pld [ AO , #A_PRE ]
  178. fmacs s16 , s4, s12
  179. pld [ BO , #B_PRE ]
  180. fmacs s24 , s5, s13
  181. fmacs s17 , s4, s13
  182. fldmias AO!, { s0 - s1 }
  183. fmacs s25 , s5, s12
  184. fmacs s18 , s6, s12
  185. fmacs s26 , s7, s13
  186. fldmias BO!, { s8 - s9 }
  187. fmacs s19 , s6, s13
  188. fmacs s27 , s7, s12
  189. fldmias AO!, { s2 - s3 }
  190. fmacs s20 , s4, s14
  191. fmacs s28 , s5, s15
  192. fldmias BO!, { s10 - s11 }
  193. fmacs s21 , s4, s15
  194. fmacs s29 , s5, s14
  195. fmacs s22 , s6, s14
  196. fmacs s30 , s7, s15
  197. fmacs s23 , s6, s15
  198. fmacs s31 , s7, s14
  199. .endm
  // KERNEL2x2_E: last step of a pipelined sequence. Same arithmetic as KERNEL2x2_M2
  // (consumes s4-s7 / s12-s15) but performs no loads, so AO and BO are left unchanged.
  200. .macro KERNEL2x2_E
  201. fmacs s16 , s4, s12
  202. fmacs s24 , s5, s13
  203. fmacs s17 , s4, s13
  204. fmacs s25 , s5, s12
  205. fmacs s18 , s6, s12
  206. fmacs s26 , s7, s13
  207. fmacs s19 , s6, s13
  208. fmacs s27 , s7, s12
  209. fmacs s20 , s4, s14
  210. fmacs s28 , s5, s15
  211. fmacs s21 , s4, s15
  212. fmacs s29 , s5, s14
  213. fmacs s22 , s6, s14
  214. fmacs s30 , s7, s15
  215. fmacs s23 , s6, s15
  216. fmacs s31 , s7, s14
  217. .endm
  // KERNEL2x2_SUB: self-contained step used for the K % 8 remainder. Loads s0-s3 from
  // AO and s8-s11 from BO (16 bytes each) and accumulates into s16-s31. It uses fmacs
  // throughout, so the accumulators must already hold valid values (INIT2x2 or a
  // preceding kernel sequence).
  218. .macro KERNEL2x2_SUB
  219. fldmias AO!, { s0 - s1 }
  220. fldmias BO!, { s8 - s9 }
  221. fmacs s16 , s0, s8
  222. fmacs s24 , s1, s9
  223. fldmias AO!, { s2 - s3 }
  224. fmacs s17 , s0, s9
  225. fmacs s25 , s1, s8
  226. fldmias BO!, { s10 - s11 }
  227. fmacs s18 , s2, s8
  228. fmacs s26 , s3, s9
  229. fmacs s19 , s2, s9
  230. fmacs s27 , s3, s8
  231. fmacs s20 , s0, s10
  232. fmacs s28 , s1, s11
  233. fmacs s21 , s0, s11
  234. fmacs s29 , s1, s10
  235. fmacs s22 , s2, s10
  236. fmacs s30 , s3, s11
  237. fmacs s23 , s2, s11
  238. fmacs s31 , s3, s10
  239. .endm
  // SAVE2x2: finish a 2x2 tile. Merges the paired partial sums (s16..s23 with
  // s24..s31) via FADD_R / FADD_I, multiplies by alpha (s0 = ALPHA_R, s1 = ALPHA_I)
  // and stores four words at CO1 and four at CO2 = CO1 + LDC.
  // FMAC_R1 / FMAC_I1 are multiply-only in every configuration and nothing is loaded
  // from CO1 / CO2, so the destination is overwritten rather than accumulated into.
  // Clobbers r3 and s0, s1, s4-s11; advances CO1 by 16 bytes.
  240. .macro SAVE2x2
  241. ldr r3 , LDC
  242. add CO2 , CO1, r3
  243. flds s0, ALPHA_R
  244. flds s1, ALPHA_I
  245. FADD_R s16, s24 , s16
  246. FADD_I s17, s25 , s17
  247. FADD_R s18, s26 , s18
  248. FADD_I s19, s27 , s19
  249. FADD_R s20, s28 , s20
  250. FADD_I s21, s29 , s21
  251. FADD_R s22, s30 , s22
  252. FADD_I s23, s31 , s23
  253. FMAC_R1 s4 , s0 , s16
  254. FMAC_I1 s5 , s0 , s17
  255. FMAC_R2 s4 , s1 , s17
  256. FMAC_I2 s5 , s1 , s16
  257. FMAC_R1 s6 , s0 , s18
  258. FMAC_I1 s7 , s0 , s19
  259. FMAC_R2 s6 , s1 , s19
  260. FMAC_I2 s7 , s1 , s18
  261. FMAC_R1 s8 , s0 , s20
  262. FMAC_I1 s9 , s0 , s21
  263. FMAC_R2 s8 , s1 , s21
  264. FMAC_I2 s9 , s1 , s20
  265. FMAC_R1 s10, s0 , s22
  266. FMAC_I1 s11, s0 , s23
  267. FMAC_R2 s10, s1 , s23
  268. FMAC_I2 s11, s1 , s22
  269. fstmias CO1, { s4 - s7 } // no writeback; CO1 is advanced explicitly below
  270. fstmias CO2, { s8 - s11 }
  271. add CO1, CO1, #16
  272. .endm
  273. /******************************************************************************/
  // INIT1x2: clear the eight accumulators used by the 1x2 kernels
  // (s16, s17, s20, s21, s24, s25, s28, s29) from the FP_ZERO frame slot.
  274. .macro INIT1x2
  275. flds s16 , FP_ZERO
  276. vmov.f32 s17, s16
  277. vmov.f32 s20, s16
  278. vmov.f32 s21, s16
  279. vmov.f32 s24, s16
  280. vmov.f32 s25, s16
  281. vmov.f32 s28, s16
  282. vmov.f32 s29, s16
  283. .endm
  // KERNEL1x2_I: first step of a pipelined 1x2 sequence. Initialises the eight
  // accumulators with fmuls from s0-s1 (A) and s8-s11 (B), then preloads s4-s5 and
  // s12-s15 for the next step. In total AO advances 16 bytes and BO 32 bytes.
  // Not invoked by the function body visible in this file (only KERNEL1x2_SUB is).
  284. .macro KERNEL1x2_I
  285. pld [ AO , #A_PRE ]
  286. pld [ BO , #B_PRE ]
  287. flds s0 , [ AO ]
  288. flds s1 , [ AO, #4 ]
  289. flds s8 , [ BO ]
  290. flds s9 , [ BO, #4 ]
  291. flds s10, [ BO, #8 ]
  292. flds s11, [ BO, #12 ]
  293. fmuls s16 , s0, s8
  294. fmuls s24 , s1, s9
  295. fmuls s17 , s0, s9
  296. fmuls s25 , s1, s8
  297. fmuls s20 , s0, s10
  298. fmuls s28 , s1, s11
  299. fmuls s21 , s0, s11
  300. fmuls s29 , s1, s10
  301. add BO , BO, #16
  302. add AO , AO, #8
  303. pld [ BO , #B_PRE ]
  304. flds s4 , [ AO, #0 ]
  305. flds s5 , [ AO, #4 ]
  306. flds s12, [ BO ]
  307. flds s13, [ BO, #4 ]
  308. flds s14, [ BO, #8 ]
  309. flds s15, [ BO, #12 ]
  310. add BO , BO, #16
  311. add AO , AO, #8
  312. .endm
  // KERNEL1x2_M1: pipelined middle step. Accumulates s0-s1 x s8-s11 and then loads
  // s4-s5 / s12-s15 for the following M2 or E step (AO += 8, BO += 16).
  // Not invoked by the function body visible in this file.
  313. .macro KERNEL1x2_M1
  314. pld [ BO , #B_PRE ]
  315. fmacs s16 , s0, s8
  316. fmacs s24 , s1, s9
  317. fmacs s17 , s0, s9
  318. fmacs s25 , s1, s8
  319. fmacs s20 , s0, s10
  320. fmacs s28 , s1, s11
  321. fmacs s21 , s0, s11
  322. fmacs s29 , s1, s10
  323. flds s4 , [ AO, #0 ]
  324. flds s5 , [ AO, #4 ]
  325. flds s12, [ BO ]
  326. flds s13, [ BO, #4 ]
  327. flds s14, [ BO, #8 ]
  328. flds s15, [ BO, #12 ]
  329. add BO , BO, #16
  330. add AO , AO, #8
  331. .endm
  // KERNEL1x2_M2: pipelined middle step, counterpart of M1. Accumulates s4-s5 x
  // s12-s15 and then reloads s0-s1 / s8-s11 for the next M1 step (AO += 8, BO += 16).
  // Not invoked by the function body visible in this file.
  332. .macro KERNEL1x2_M2
  333. pld [ AO , #A_PRE ]
  334. pld [ BO , #B_PRE ]
  335. fmacs s16 , s4, s12
  336. fmacs s24 , s5, s13
  337. fmacs s17 , s4, s13
  338. fmacs s25 , s5, s12
  339. fmacs s20 , s4, s14
  340. fmacs s28 , s5, s15
  341. fmacs s21 , s4, s15
  342. fmacs s29 , s5, s14
  343. flds s0 , [ AO, #0 ]
  344. flds s1 , [ AO, #4 ]
  345. flds s8 , [ BO ]
  346. flds s9 , [ BO, #4 ]
  347. flds s10, [ BO, #8 ]
  348. flds s11, [ BO, #12 ]
  349. add BO , BO, #16
  350. add AO , AO, #8
  351. .endm
  // KERNEL1x2_E: final pipelined step. Accumulates s4-s5 x s12-s15 with no loads and
  // no pointer updates. Not invoked by the function body visible in this file.
  352. .macro KERNEL1x2_E
  353. fmacs s16 , s4, s12
  354. fmacs s24 , s5, s13
  355. fmacs s17 , s4, s13
  356. fmacs s25 , s5, s12
  357. fmacs s20 , s4, s14
  358. fmacs s28 , s5, s15
  359. fmacs s21 , s4, s15
  360. fmacs s29 , s5, s14
  361. .endm
  // KERNEL1x2_SUB: self-contained 1x2 step. Loads two words from AO and four from BO,
  // accumulates the eight products with fmacs, then advances AO by 8 and BO by 16
  // bytes. Requires the accumulators to be initialised (INIT1x2).
  362. .macro KERNEL1x2_SUB
  363. pld [ AO , #A_PRE ]
  364. pld [ BO , #B_PRE ]
  365. flds s0 , [ AO ]
  366. flds s1 , [ AO, #4 ]
  367. flds s8 , [ BO ]
  368. flds s9 , [ BO, #4 ]
  369. flds s10, [ BO, #8 ]
  370. flds s11, [ BO, #12 ]
  371. fmacs s16 , s0, s8
  372. fmacs s24 , s1, s9
  373. fmacs s17 , s0, s9
  374. fmacs s25 , s1, s8
  375. fmacs s20 , s0, s10
  376. fmacs s28 , s1, s11
  377. fmacs s21 , s0, s11
  378. fmacs s29 , s1, s10
  379. add BO , BO, #16
  380. add AO , AO, #8
  381. .endm
  // SAVE1x2: finish a 1x2 tile. Merges the partial sums, multiplies by alpha
  // (s0 = ALPHA_R, s1 = ALPHA_I) and stores two words at CO1 and two at
  // CO2 = CO1 + LDC. The destination is overwritten (nothing is loaded from it).
  // Clobbers r3 and s0, s1, s4, s5, s8, s9; advances CO1 by 8 bytes.
  382. .macro SAVE1x2
  383. ldr r3 , LDC
  384. add CO2 , CO1, r3
  385. flds s0, ALPHA_R
  386. flds s1, ALPHA_I
  387. FADD_R s16, s24 , s16
  388. FADD_I s17, s25 , s17
  389. FADD_R s20, s28 , s20
  390. FADD_I s21, s29 , s21
  391. FMAC_R1 s4 , s0 , s16
  392. FMAC_I1 s5 , s0 , s17
  393. FMAC_R2 s4 , s1 , s17
  394. FMAC_I2 s5 , s1 , s16
  395. FMAC_R1 s8 , s0 , s20
  396. FMAC_I1 s9 , s0 , s21
  397. FMAC_R2 s8 , s1 , s21
  398. FMAC_I2 s9 , s1 , s20
  399. fstmias CO1, { s4 - s5 }
  400. fstmias CO2, { s8 - s9 }
  401. add CO1, CO1, #8
  402. .endm
  403. /******************************************************************************/
  // INIT2x1: clear the eight accumulators used by the 2x1 kernels
  // (s16-s19 and s24-s27) from the FP_ZERO frame slot.
  404. .macro INIT2x1
  405. flds s16 , FP_ZERO
  406. vmov.f32 s17, s16
  407. vmov.f32 s18, s16
  408. vmov.f32 s19, s16
  409. vmov.f32 s24, s16
  410. vmov.f32 s25, s16
  411. vmov.f32 s26, s16
  412. vmov.f32 s27, s16
  413. .endm
  // KERNEL2x1_I: first step of the pipelined 2x1 sequence. Initialises s16-s19 and
  // s24-s27 with fmuls from s0-s3 (A) and s8-s9 (B), then preloads s4-s7 / s12-s13
  // for the next step. In total AO advances 32 bytes and BO 16 bytes.
  414. .macro KERNEL2x1_I
  415. pld [ AO , #A_PRE ]
  416. pld [ BO , #B_PRE ]
  417. flds s0 , [ AO ]
  418. flds s1 , [ AO, #4 ]
  419. flds s2 , [ AO, #8 ]
  420. flds s3 , [ AO, #12 ]
  421. flds s8 , [ BO ]
  422. flds s9 , [ BO, #4 ]
  423. fmuls s16 , s0, s8
  424. fmuls s24 , s1, s9
  425. fmuls s17 , s0, s9
  426. fmuls s25 , s1, s8
  427. fmuls s18 , s2, s8
  428. fmuls s26 , s3, s9
  429. fmuls s19 , s2, s9
  430. fmuls s27 , s3, s8
  431. add BO , BO, #8
  432. add AO , AO, #16
  433. pld [ BO , #B_PRE ]
  434. pld [ AO , #A_PRE ]
  435. flds s4 , [ AO, #0 ]
  436. flds s5 , [ AO, #4 ]
  437. flds s6 , [ AO, #8 ]
  438. flds s7 , [ AO, #12 ]
  439. flds s12, [ BO ]
  440. flds s13, [ BO, #4 ]
  441. add BO , BO, #8
  442. add AO , AO, #16
  443. .endm
  // KERNEL2x1_M1: pipelined middle step. Accumulates s0-s3 x s8-s9, then loads
  // s4-s7 / s12-s13 for the following M2 or E step (AO += 16, BO += 8).
  444. .macro KERNEL2x1_M1
  445. pld [ AO , #A_PRE ]
  446. pld [ BO , #B_PRE ]
  447. fmacs s16 , s0, s8
  448. fmacs s24 , s1, s9
  449. fmacs s17 , s0, s9
  450. fmacs s25 , s1, s8
  451. fmacs s18 , s2, s8
  452. fmacs s26 , s3, s9
  453. fmacs s19 , s2, s9
  454. fmacs s27 , s3, s8
  455. flds s4 , [ AO, #0 ]
  456. flds s5 , [ AO, #4 ]
  457. flds s6 , [ AO, #8 ]
  458. flds s7 , [ AO, #12 ]
  459. flds s12, [ BO ]
  460. flds s13, [ BO, #4 ]
  461. add BO , BO, #8
  462. add AO , AO, #16
  463. .endm
  // KERNEL2x1_M2: pipelined middle step, counterpart of M1. Accumulates s4-s7 x
  // s12-s13, then reloads s0-s3 / s8-s9 for the next M1 step (AO += 16, BO += 8).
  464. .macro KERNEL2x1_M2
  465. pld [ AO , #A_PRE ]
  466. pld [ BO , #B_PRE ]
  467. fmacs s16 , s4, s12
  468. fmacs s24 , s5, s13
  469. fmacs s17 , s4, s13
  470. fmacs s25 , s5, s12
  471. fmacs s18 , s6, s12
  472. fmacs s26 , s7, s13
  473. fmacs s19 , s6, s13
  474. fmacs s27 , s7, s12
  475. flds s0 , [ AO, #0 ]
  476. flds s1 , [ AO, #4 ]
  477. flds s2 , [ AO, #8 ]
  478. flds s3 , [ AO, #12 ]
  479. flds s8 , [ BO ]
  480. flds s9 , [ BO, #4 ]
  481. add BO , BO, #8
  482. add AO , AO, #16
  483. .endm
  // KERNEL2x1_E: final pipelined step. Accumulates s4-s7 x s12-s13 with no loads and
  // no pointer updates.
  484. .macro KERNEL2x1_E
  485. fmacs s16 , s4, s12
  486. fmacs s24 , s5, s13
  487. fmacs s17 , s4, s13
  488. fmacs s25 , s5, s12
  489. fmacs s18 , s6, s12
  490. fmacs s26 , s7, s13
  491. fmacs s19 , s6, s13
  492. fmacs s27 , s7, s12
  493. .endm
  // KERNEL2x1_SUB: self-contained 2x1 step used for the K % 8 remainder. Loads four
  // words from AO and two from BO, accumulates the eight products with fmacs, then
  // advances AO by 16 and BO by 8 bytes. Requires initialised accumulators.
  494. .macro KERNEL2x1_SUB
  495. pld [ AO , #A_PRE ]
  496. pld [ BO , #B_PRE ]
  497. flds s0 , [ AO ]
  498. flds s1 , [ AO, #4 ]
  499. flds s2 , [ AO, #8 ]
  500. flds s3 , [ AO, #12 ]
  501. flds s8 , [ BO ]
  502. flds s9 , [ BO, #4 ]
  503. fmacs s16 , s0, s8
  504. fmacs s24 , s1, s9
  505. fmacs s17 , s0, s9
  506. fmacs s25 , s1, s8
  507. fmacs s18 , s2, s8
  508. fmacs s26 , s3, s9
  509. fmacs s19 , s2, s9
  510. fmacs s27 , s3, s8
  511. add BO , BO, #8
  512. add AO , AO, #16
  513. .endm
  // SAVE2x1: finish a 2x1 tile. Merges the partial sums, multiplies by alpha
  // (s0 = ALPHA_R, s1 = ALPHA_I) and stores four words at CO1. The destination is
  // overwritten (nothing is loaded from it). Clobbers s0, s1, s4-s7; CO1 += 16.
  514. .macro SAVE2x1
  515. flds s0, ALPHA_R
  516. flds s1, ALPHA_I
  517. FADD_R s16, s24 , s16
  518. FADD_I s17, s25 , s17
  519. FADD_R s18, s26 , s18
  520. FADD_I s19, s27 , s19
  521. FMAC_R1 s4 , s0 , s16
  522. FMAC_I1 s5 , s0 , s17
  523. FMAC_R2 s4 , s1 , s17
  524. FMAC_I2 s5 , s1 , s16
  525. FMAC_R1 s6 , s0 , s18
  526. FMAC_I1 s7 , s0 , s19
  527. FMAC_R2 s6 , s1 , s19
  528. FMAC_I2 s7 , s1 , s18
  529. fstmias CO1, { s4 - s7 }
  530. add CO1, CO1, #16
  531. .endm
  532. /******************************************************************************/
  // INIT1x1: clear the four accumulators used by the 1x1 kernels
  // (s16, s17, s24, s25) from the FP_ZERO frame slot.
  533. .macro INIT1x1
  534. flds s16 , FP_ZERO
  535. vmov.f32 s17, s16
  536. vmov.f32 s24, s16
  537. vmov.f32 s25, s16
  538. .endm
  // KERNEL1x1_I: first step of a pipelined 1x1 sequence. Initialises s16, s24, s17,
  // s25 with fmuls from s0-s1 (A) and s8-s9 (B), then preloads s4-s5 / s12-s13.
  // In total AO and BO each advance 16 bytes.
  // Not invoked by the function body visible in this file (only KERNEL1x1_SUB is).
  539. .macro KERNEL1x1_I
  540. pld [ AO , #A_PRE ]
  541. pld [ BO , #B_PRE ]
  542. flds s0 , [ AO ]
  543. flds s1 , [ AO, #4 ]
  544. flds s8 , [ BO ]
  545. flds s9 , [ BO, #4 ]
  546. fmuls s16 , s0, s8
  547. fmuls s24 , s1, s9
  548. fmuls s17 , s0, s9
  549. fmuls s25 , s1, s8
  550. add BO , BO, #8
  551. add AO , AO, #8
  552. pld [ BO , #B_PRE ]
  553. pld [ AO , #A_PRE ]
  554. flds s4 , [ AO, #0 ]
  555. flds s5 , [ AO, #4 ]
  556. flds s12, [ BO ]
  557. flds s13, [ BO, #4 ]
  558. add BO , BO, #8
  559. add AO , AO, #8
  560. .endm
  // KERNEL1x1_M1: pipelined middle step. Accumulates s0-s1 x s8-s9, then loads
  // s4-s5 / s12-s13 for the following M2 or E step (AO += 8, BO += 8).
  // Not invoked by the function body visible in this file.
  561. .macro KERNEL1x1_M1
  562. fmacs s16 , s0, s8
  563. fmacs s24 , s1, s9
  564. fmacs s17 , s0, s9
  565. fmacs s25 , s1, s8
  566. flds s4 , [ AO, #0 ]
  567. flds s5 , [ AO, #4 ]
  568. flds s12, [ BO ]
  569. flds s13, [ BO, #4 ]
  570. add BO , BO, #8
  571. add AO , AO, #8
  572. .endm
  // KERNEL1x1_M2: pipelined middle step, counterpart of M1. Accumulates s4-s5 x
  // s12-s13, then reloads s0-s1 / s8-s9 for the next M1 step (AO += 8, BO += 8).
  // Not invoked by the function body visible in this file.
  573. .macro KERNEL1x1_M2
  574. fmacs s16 , s4, s12
  575. fmacs s24 , s5, s13
  576. fmacs s17 , s4, s13
  577. fmacs s25 , s5, s12
  578. flds s0 , [ AO, #0 ]
  579. flds s1 , [ AO, #4 ]
  580. flds s8 , [ BO ]
  581. flds s9 , [ BO, #4 ]
  582. add BO , BO, #8
  583. add AO , AO, #8
  584. .endm
  // KERNEL1x1_E: final pipelined step. Accumulates s4-s5 x s12-s13 with no loads and
  // no pointer updates. Not invoked by the function body visible in this file.
  585. .macro KERNEL1x1_E
  586. fmacs s16 , s4, s12
  587. fmacs s24 , s5, s13
  588. fmacs s17 , s4, s13
  589. fmacs s25 , s5, s12
  590. .endm
  // KERNEL1x1_SUB: self-contained 1x1 step. Loads two words from AO and two from BO,
  // accumulates the four products with fmacs, then advances AO and BO by 8 bytes
  // each. Requires the accumulators to be initialised (INIT1x1).
  591. .macro KERNEL1x1_SUB
  592. flds s0 , [ AO ]
  593. flds s1 , [ AO, #4 ]
  594. flds s8 , [ BO ]
  595. flds s9 , [ BO, #4 ]
  596. fmacs s16 , s0, s8
  597. fmacs s24 , s1, s9
  598. fmacs s17 , s0, s9
  599. fmacs s25 , s1, s8
  600. add BO , BO, #8
  601. add AO , AO, #8
  602. .endm
  // SAVE1x1: finish a 1x1 tile. Merges the partial sums, multiplies by alpha
  // (s0 = ALPHA_R, s1 = ALPHA_I) and stores two words at CO1. The destination is
  // overwritten (nothing is loaded from it). Clobbers s0, s1, s4, s5; CO1 += 8.
  603. .macro SAVE1x1
  604. flds s0, ALPHA_R
  605. flds s1, ALPHA_I
  606. FADD_R s16, s24 , s16
  607. FADD_I s17, s25 , s17
  608. FMAC_R1 s4 , s0 , s16
  609. FMAC_I1 s5 , s0 , s17
  610. FMAC_R2 s4 , s1 , s17
  611. FMAC_I2 s5 , s1 , s16
  612. fstmias CO1, { s4 - s5 }
  613. add CO1, CO1, #8
  614. .endm
  615. /******************************************************************************/
  616. /**************************************************************************************
  617. * End of macro definitions
  618. **************************************************************************************/
  // Function entry: build the frame, spill the register arguments, save s8-s31,
  // prepare the zero constant, LDC in bytes and the initial KK, then enter the
  // column-pair loop if N / 2 > 0.
  // NOTE(review): PROLOGUE is not defined in this file - presumably provided by common.h.
  619. PROLOGUE
  620. .align 5
  621. push {r4 - r9, fp} // 7 words = 28 bytes
  622. add fp, sp, #24 // fp -> saved fp slot, so [fp, #4] is the first word above the pushed registers
  623. sub sp, sp, #STACKSIZE // reserve stack
  // Spill the register arguments: r0-r2 are reused as I / J / L, r3 as scratch and
  // s0 / s1 as kernel operands.
  624. str OLD_M, M
  625. str OLD_N, N
  626. str OLD_K, K
  627. str OLD_A, A
  628. vstr OLD_ALPHA_R, ALPHA_R
  629. vstr OLD_ALPHA_I, ALPHA_I
  630. sub r3, fp, #128
  631. vstm r3, { s8 - s31} // store floating point registers
  632. movs r4, #0
  633. str r4, FP_ZERO // all-zero word, loaded as float 0.0 by the INIT* macros
  634. str r4, FP_ZERO_1
  635. ldr r3, OLD_LDC
  636. lsl r3, r3, #3 // ldc = ldc * 4 * 2
  637. str r3, LDC
  638. ldr r3, OFFSET
  639. #ifndef LEFT
  640. neg r3 , r3 // without LEFT, KK starts at -OFFSET
  641. #endif
  642. str r3 , KK
  643. ldr BC, B
  644. ldr J, N
  645. asrs J, J, #1 // J = J / 2
  646. ble _L1_BEGIN
  // _L2_BEGIN: outer loop over pairs of columns (J = N / 2 passes). Each pass
  // processes M / 2 tiles of 2x2 and, if M is odd, one 1x2 tile, then moves BC to
  // the next B panel.
  647. _L2_BEGIN:
  648. ldr CO1, C // CO1 = C
  649. ldr r4 , LDC
  650. lsl r4 , r4 , #1 // LDC * 2
  651. add r3 , r4, CO1
  652. str r3 , C // store C
  653. #if defined(LEFT)
  654. ldr r3 , OFFSET
  655. str r3 , KK
  656. #endif
  657. ldr AO, A // AO = A
  658. pld [AO , #A_PRE-64]
  659. pld [AO , #A_PRE-32]
  660. _L2_M2_BEGIN:
  661. ldr I, M
  662. asrs I, I, #1 // I = I / 2
  663. ble _L2_M1_BEGIN
  // ---- one 2x2 tile per iteration of the I loop ----
  664. _L2_M2_20:
  665. #if (defined(LEFT) && defined(TRANSA)) || \
  666. (!defined(LEFT) && !defined(TRANSA))
  667. mov BO, BC
  668. #else
  // Otherwise start KK steps into both panels (16 bytes per step each).
  669. mov BO, BC
  670. ldr r3 , KK
  671. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  672. add BO , BO , r4
  673. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  674. add AO , AO , r4
  675. #endif
  // K1 = number of k-steps for this tile; the TRMM variants also record it in KKK.
  676. #ifndef TRMMKERNEL
  677. ldr K1, K
  678. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  679. ldr K1, K
  680. ldr r3, KK
  681. sub K1, K1, r3
  682. str K1, KKK
  683. #else
  684. ldr K1, KK
  685. #ifdef LEFT
  686. add K1, K1, #2 // number of values in AO
  687. #else
  688. add K1, K1, #2 // number of values in BO
  689. #endif
  690. str K1, KKK
  691. #endif
  692. asrs L , K1, #3 // L = L / 8
  693. cmp L , #3
  694. blt _L2_M2_30
  // L >= 3: pipelined path. Head (8 steps) + (L - 2) loop passes of 8 + tail (8 steps) = 8 * L steps.
  695. .align 5
  696. KERNEL2x2_I
  697. KERNEL2x2_M2
  698. KERNEL2x2_M1
  699. KERNEL2x2_M2
  700. KERNEL2x2_M1
  701. KERNEL2x2_M2
  702. KERNEL2x2_M1
  703. KERNEL2x2_M2
  704. sub L, L, #2
  705. _L2_M2_22:
  706. KERNEL2x2_M1
  707. KERNEL2x2_M2
  708. KERNEL2x2_M1
  709. KERNEL2x2_M2
  710. KERNEL2x2_M1
  711. KERNEL2x2_M2
  712. KERNEL2x2_M1
  713. KERNEL2x2_M2
  714. subs L, L, #1
  715. bgt _L2_M2_22
  716. KERNEL2x2_M1
  717. KERNEL2x2_M2
  718. KERNEL2x2_M1
  719. KERNEL2x2_M2
  720. KERNEL2x2_M1
  721. KERNEL2x2_M2
  722. KERNEL2x2_M1
  723. KERNEL2x2_E
  724. b _L2_M2_44
  // Reached with L < 3: dispatch on the low bits of L.
  725. _L2_M2_30:
  726. tst L, #3
  727. ble _L2_M2_40
  728. tst L, #2
  729. ble _L2_M2_32
  // Bit 1 of L set: 16 fully unrolled steps.
  730. KERNEL2x2_I
  731. KERNEL2x2_M2
  732. KERNEL2x2_M1
  733. KERNEL2x2_M2
  734. KERNEL2x2_M1
  735. KERNEL2x2_M2
  736. KERNEL2x2_M1
  737. KERNEL2x2_M2
  738. KERNEL2x2_M1
  739. KERNEL2x2_M2
  740. KERNEL2x2_M1
  741. KERNEL2x2_M2
  742. KERNEL2x2_M1
  743. KERNEL2x2_M2
  744. KERNEL2x2_M1
  745. KERNEL2x2_E
  746. b _L2_M2_44
  747. _L2_M2_32:
  748. tst L, #1
  749. ble _L2_M2_40
  // Bit 0 of L set: 8 fully unrolled steps.
  750. KERNEL2x2_I
  751. KERNEL2x2_M2
  752. KERNEL2x2_M1
  753. KERNEL2x2_M2
  754. KERNEL2x2_M1
  755. KERNEL2x2_M2
  756. KERNEL2x2_M1
  757. KERNEL2x2_E
  758. b _L2_M2_44
  // No unrolled block ran, so the accumulators must be zeroed explicitly
  // (the paths above initialise them through KERNEL2x2_I).
  759. _L2_M2_40:
  760. INIT2x2
  761. _L2_M2_44:
  762. ands L , K1, #7 // L = L % 8
  763. ble _L2_M2_100
  764. _L2_M2_46:
  765. KERNEL2x2_SUB
  766. subs L, L, #1
  767. bne _L2_M2_46
  768. _L2_M2_100:
  769. SAVE2x2
  // Skip the K - KKK steps that were not consumed so AO / BO land on the next tile.
  770. #if (defined(LEFT) && defined(TRANSA)) || \
  771. (!defined(LEFT) && !defined(TRANSA))
  772. ldr r3 , K
  773. ldr r4 , KKK
  774. sub r3 , r3 , r4
  775. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  776. add BO , BO , r4
  777. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  778. add AO , AO , r4
  779. #endif
  780. #if defined(LEFT)
  781. ldr r3 , KK
  782. add r3 , r3 , #2 // number of values in AO
  783. str r3 , KK
  784. #endif
  785. _L2_M2_END:
  786. subs I, I, #1
  787. bne _L2_M2_20
  // ---- leftover 1x2 tile, executed only when bit 0 of M is set ----
  788. _L2_M1_BEGIN:
  789. ldr I, M
  790. tst I, #1 // I = I % 2
  791. ble _L2_END
  792. _L2_M1_20:
  793. INIT1x2
  794. #if (defined(LEFT) && defined(TRANSA)) || \
  795. (!defined(LEFT) && !defined(TRANSA))
  796. mov BO, BC
  797. #else
  798. mov BO, BC
  799. ldr r3 , KK
  800. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  801. add BO , BO , r4
  802. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  803. add AO , AO , r4
  804. #endif
  805. #ifndef TRMMKERNEL
  806. ldr K1, K
  807. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  808. ldr K1, K
  809. ldr r3, KK
  810. sub K1, K1, r3
  811. str K1, KKK
  812. #else
  813. ldr K1, KK
  814. #ifdef LEFT
  815. add K1, K1, #1 // number of values in AO
  816. #else
  817. add K1, K1, #2 // number of values in BO
  818. #endif
  819. str K1, KKK
  820. #endif
  821. asrs L , K1, #3 // L = L / 8
  822. ble _L2_M1_40
  // Unrolled by 8 using only the non-pipelined KERNEL1x2_SUB step.
  823. _L2_M1_22:
  824. KERNEL1x2_SUB
  825. KERNEL1x2_SUB
  826. KERNEL1x2_SUB
  827. KERNEL1x2_SUB
  828. KERNEL1x2_SUB
  829. KERNEL1x2_SUB
  830. KERNEL1x2_SUB
  831. KERNEL1x2_SUB
  832. subs L, L, #1
  833. bgt _L2_M1_22
  834. _L2_M1_40:
  835. ands L , K1, #7 // L = L % 8
  836. ble _L2_M1_100
  837. _L2_M1_42:
  838. KERNEL1x2_SUB
  839. subs L, L, #1
  840. bgt _L2_M1_42
  841. _L2_M1_100:
  842. SAVE1x2
  843. #if (defined(LEFT) && defined(TRANSA)) || \
  844. (!defined(LEFT) && !defined(TRANSA))
  845. ldr r3 , K
  846. ldr r4 , KKK
  847. sub r3 , r3 , r4
  848. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  849. add BO , BO , r4
  850. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  851. add AO , AO , r4
  852. #endif
  853. #if defined(LEFT)
  854. ldr r3 , KK
  855. add r3 , r3 , #1 // number of values in AO
  856. str r3 , KK
  857. #endif
  // End of one column pair: advance BC past this B panel (K * 16 bytes).
  858. _L2_END:
  859. mov r3, BC
  860. ldr r4, K
  861. lsl r4, r4, #4 // k * 2 * 4 * 2
  862. add r3, r3, r4 // B = B + K * 2 * 8
  863. mov BC, r3
  864. #if !defined(LEFT)
  865. ldr r3 , KK
  866. add r3 , r3 , #2 // number of values in BO
  867. str r3 , KK
  868. #endif
  869. subs J , #1 // j--
  870. bgt _L2_BEGIN
  871. /*********************************************************************************************/
  // _L1_BEGIN: handles the last single column, executed only when bit 0 of N is set.
  // Processes M / 2 tiles of 2x1 and, if M is odd, one final 1x1 tile.
  872. _L1_BEGIN:
  873. ldr J , N
  874. tst J , #1
  875. ble _L999
  876. ldr CO1, C // CO1 = C
  877. ldr r4 , LDC
  878. add r3 , r4, CO1
  879. str r3 , C // store C
  880. #if defined(LEFT)
  881. ldr r3 , OFFSET
  882. str r3 , KK
  883. #endif
  884. ldr AO, A // AO = A
  885. _L1_M2_BEGIN:
  886. ldr I, M
  887. asrs I, I, #1 // I = I / 2
  888. ble _L1_M1_BEGIN
  // ---- one 2x1 tile per iteration of the I loop ----
  889. _L1_M2_20:
  890. #if (defined(LEFT) && defined(TRANSA)) || \
  891. (!defined(LEFT) && !defined(TRANSA))
  892. mov BO, BC
  893. #else
  // Otherwise start KK steps into the panels (8 bytes per step in B, 16 in A).
  894. mov BO, BC
  895. ldr r3 , KK
  896. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  897. add BO , BO , r4
  898. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  899. add AO , AO , r4
  900. #endif
  // K1 = number of k-steps for this tile; the TRMM variants also record it in KKK.
  901. #ifndef TRMMKERNEL
  902. ldr K1, K
  903. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  904. ldr K1, K
  905. ldr r3, KK
  906. sub K1, K1, r3
  907. str K1, KKK
  908. #else
  909. ldr K1, KK
  910. #ifdef LEFT
  911. add K1, K1, #2 // number of values in AO
  912. #else
  913. add K1, K1, #1 // number of values in BO
  914. #endif
  915. str K1, KKK
  916. #endif
  917. asrs L , K1, #3 // L = L / 8
  918. cmp L , #3
  919. blt _L1_M2_30
  // L >= 3: pipelined path. Head (8 steps) + (L - 2) loop passes of 8 + tail (8 steps) = 8 * L steps.
  920. .align 5
  921. KERNEL2x1_I
  922. KERNEL2x1_M2
  923. KERNEL2x1_M1
  924. KERNEL2x1_M2
  925. KERNEL2x1_M1
  926. KERNEL2x1_M2
  927. KERNEL2x1_M1
  928. KERNEL2x1_M2
  929. sub L, L, #2
  930. _L1_M2_22:
  931. KERNEL2x1_M1
  932. KERNEL2x1_M2
  933. KERNEL2x1_M1
  934. KERNEL2x1_M2
  935. KERNEL2x1_M1
  936. KERNEL2x1_M2
  937. KERNEL2x1_M1
  938. KERNEL2x1_M2
  939. subs L, L, #1
  940. bgt _L1_M2_22
  941. KERNEL2x1_M1
  942. KERNEL2x1_M2
  943. KERNEL2x1_M1
  944. KERNEL2x1_M2
  945. KERNEL2x1_M1
  946. KERNEL2x1_M2
  947. KERNEL2x1_M1
  948. KERNEL2x1_E
  949. b _L1_M2_44
  // Reached with L < 3: dispatch on the low bits of L.
  950. _L1_M2_30:
  951. tst L, #3
  952. ble _L1_M2_40
  953. tst L, #2
  954. ble _L1_M2_32
  // Bit 1 of L set: 16 fully unrolled steps.
  955. KERNEL2x1_I
  956. KERNEL2x1_M2
  957. KERNEL2x1_M1
  958. KERNEL2x1_M2
  959. KERNEL2x1_M1
  960. KERNEL2x1_M2
  961. KERNEL2x1_M1
  962. KERNEL2x1_M2
  963. KERNEL2x1_M1
  964. KERNEL2x1_M2
  965. KERNEL2x1_M1
  966. KERNEL2x1_M2
  967. KERNEL2x1_M1
  968. KERNEL2x1_M2
  969. KERNEL2x1_M1
  970. KERNEL2x1_E
  971. b _L1_M2_44
  972. _L1_M2_32:
  973. tst L, #1
  974. ble _L1_M2_40
  // Bit 0 of L set: 8 fully unrolled steps.
  975. KERNEL2x1_I
  976. KERNEL2x1_M2
  977. KERNEL2x1_M1
  978. KERNEL2x1_M2
  979. KERNEL2x1_M1
  980. KERNEL2x1_M2
  981. KERNEL2x1_M1
  982. KERNEL2x1_E
  983. b _L1_M2_44
  // No unrolled block ran, so the accumulators must be zeroed explicitly
  // (the paths above initialise them through KERNEL2x1_I).
  984. _L1_M2_40:
  985. INIT2x1
  986. _L1_M2_44:
  987. ands L , K1, #7 // L = L % 8
  988. ble _L1_M2_100
  989. _L1_M2_46:
  990. KERNEL2x1_SUB
  991. subs L, L, #1
  992. bne _L1_M2_46
  993. _L1_M2_100:
  994. SAVE2x1
  // Skip the K - KKK steps that were not consumed so AO / BO land on the next tile.
  995. #if (defined(LEFT) && defined(TRANSA)) || \
  996. (!defined(LEFT) && !defined(TRANSA))
  997. ldr r3 , K
  998. ldr r4 , KKK
  999. sub r3 , r3 , r4
  1000. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  1001. add BO , BO , r4
  1002. lsls r4 , r3 , #4 // 2 * 4 * 2 float values
  1003. add AO , AO , r4
  1004. #endif
  1005. #if defined(LEFT)
  1006. ldr r3 , KK
  1007. add r3 , r3 , #2 // number of values in AO
  1008. str r3 , KK
  1009. #endif
  1010. _L1_M2_END:
  1011. subs I, I, #1
  1012. bne _L1_M2_20
  // ---- leftover 1x1 tile, executed only when bit 0 of M is set ----
  1013. _L1_M1_BEGIN:
  1014. ldr I, M
  1015. tst I, #1 // I = I % 2
  1016. ble _L1_END
  1017. _L1_M1_20:
  1018. INIT1x1
  1019. #if (defined(LEFT) && defined(TRANSA)) || \
  1020. (!defined(LEFT) && !defined(TRANSA))
  1021. mov BO, BC
  1022. #else
  1023. mov BO, BC
  1024. ldr r3 , KK
  1025. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  1026. add BO , BO , r4
  1027. lsls r4 , r3 , #3 // 1 * 4 * 2 float values
  1028. add AO , AO , r4
  1029. #endif
  1030. #ifndef TRMMKERNEL
  1031. ldr K1, K
  1032. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1033. ldr K1, K
  1034. ldr r3, KK
  1035. sub K1, K1, r3
  1036. str K1, KKK
  1037. #else
  1038. ldr K1, KK
  1039. #ifdef LEFT
  1040. add K1, K1, #1 // number of values in AO
  1041. #else
  1042. add K1, K1, #1 // number of values in BO
  1043. #endif
  1044. str K1, KKK
  1045. #endif
  1046. asrs L , K1, #3 // L = L / 8
  1047. ble _L1_M1_40
  // Unrolled by 8 using only the non-pipelined KERNEL1x1_SUB step.
  1048. _L1_M1_22:
  1049. KERNEL1x1_SUB
  1050. KERNEL1x1_SUB
  1051. KERNEL1x1_SUB
  1052. KERNEL1x1_SUB
  1053. KERNEL1x1_SUB
  1054. KERNEL1x1_SUB
  1055. KERNEL1x1_SUB
  1056. KERNEL1x1_SUB
  1057. subs L, L, #1
  1058. bgt _L1_M1_22
  1059. _L1_M1_40:
  1060. ands L , K1, #7 // L = L % 8
  1061. ble _L1_M1_100
  1062. _L1_M1_42:
  1063. KERNEL1x1_SUB
  1064. subs L, L, #1
  1065. bgt _L1_M1_42
  1066. _L1_M1_100:
  // Last tile of the routine: no AO / BO / KK fix-up follows the store.
  1067. SAVE1x1
  1068. _L1_END:
  // _L999: common exit. Reloads s8-s31 from the save area written in the prologue,
  // returns 0 in r0, unwinds the frame and restores r4-r9 / fp.
  // NOTE(review): EPILOGUE is not defined in this file - presumably provided by common.h.
  1069. _L999:
  1070. sub r3, fp, #128
  1071. vldm r3, { s8 - s31} // restore floating point registers
  1072. movs r0, #0 // set return value
  1073. sub sp, fp, #24 // back to the value sp had right after the push
  1074. pop {r4 - r9, fp}
  1075. bx lr
  1076. EPILOGUE