You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strmm_kernel_4x2_vfp.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /*
   * Frame layout and register roles.
   *
   * The prologue pushes r4-r9 and fp, sets fp = sp + 24 and then lowers sp by
   * STACKSIZE, so the local area spans [fp, #-276] .. [fp, #-25] and the
   * lowest slot (ALPHA) sits exactly at the final sp.
   */
  #define STACKSIZE       252

  /* Arguments as they arrive in registers. */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA       s0

  /******************************************************
  * [fp, #-128] - [fp, #-64] is reserved
  * for store and restore of floating point
  * registers (the prologue saves s8-s15 starting at fp-128)
  *******************************************************/

  /* Local variables, addressed relative to fp. */
  #define KK              [fp, #-240]
  #define KKK             [fp, #-244]
  #define C               [fp, #-248]
  #define LDC             [fp, #-252]
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]
  #define A               [fp, #-268]
  #define ALPHA           [fp, #-276]

  /* Arguments that arrive on the stack; [fp, #4] is the first word above the pushed registers. */
  #define B               [fp, #4]
  #define OLD_C           [fp, #8]
  #define OLD_LDC         [fp, #12]
  #define OFFSET          [fp, #16]

  /* Loop counters. */
  #define I               r0      // row-block counter (overwrites OLD_M once M is spilled)
  #define J               r1      // column-block counter
  #define L               r2      // inner (k) loop counter

  /* Running pointers. */
  #define AO              r5      // current position in the A panel
  #define BO              r6      // current position in the B panel
  #define CO1             r8      // first output column
  #define CO2             r9      // second output column (CO1 + LDC)

  #define K1              r7      // k trip count for the current tile
  #define BC              r12     // start of the current B panel

  /* Prefetch distances in bytes; C_PRE is defined but not used in the code shown here. */
  #define A_PRE           64
  #define B_PRE           64
  #define C_PRE           64
  72. /**************************************************************************************
  73. * Macro definitions
  74. **************************************************************************************/
  /*
   * INIT4x2 - clear the eight accumulators s8-s15 of a 4x2 tile.
   *
   * The zero is built in a core register instead of using the
   * "vldr sN, =0" literal-pool pseudo-instruction, so the macro does not
   * depend on assembler support for that form or on a literal pool being
   * within VLDR range of every expansion site.
   *
   * Clobbers: r3 (already a scratch register at every expansion site).
   */
  .macro INIT4x2

          mov             r3 , #0                 // +0.0f is the all-zero bit pattern
          vmov            s8 , r3
          vmov.f32        s9 , s8
          vmov.f32        s10, s8
          vmov.f32        s11, s8
          vmov.f32        s12, s8
          vmov.f32        s13, s8
          vmov.f32        s14, s8
          vmov.f32        s15, s8

  .endm
  /*
   * KERNEL4x2_SUB - one k step of the 4x2 tile.
   *
   * Loads four floats from AO and two from BO (both pointers are advanced by
   * the write-back loads) and accumulates the outer product:
   *   s8 ..s11 += {s0..s3} * s4
   *   s12..s15 += {s0..s3} * s5
   */
  .macro KERNEL4x2_SUB

          vldmia          AO!, { s0 - s3 }        // a0..a3, AO += 16
          vldmia          BO!, { s4 - s5 }        // b0, b1, BO += 8

          vmla.f32        s8  , s0, s4
          vmla.f32        s9  , s1, s4
          vmla.f32        s10 , s2, s4
          vmla.f32        s11 , s3, s4

          vmla.f32        s12 , s0, s5
          vmla.f32        s13 , s1, s5
          vmla.f32        s14 , s2, s5
          vmla.f32        s15 , s3, s5

  .endm
  /*
   * SAVE4x2 - scale the 4x2 tile by ALPHA and store it.
   *
   * s8..s11 go to CO1[0..3] and s12..s15 to CO2[0..3], where CO2 = CO1 + LDC.
   * The stores overwrite the destination; nothing is read back from it.
   * CO1 is advanced by 16 bytes afterwards.
   *
   * Clobbers: r3, CO2, s0, s4-s7.
   */
  .macro SAVE4x2

          ldr             r3 , LDC
          add             CO2, CO1, r3            // second column of the tile

          vldr.f32        s0 , ALPHA

          // first column
          vmul.f32        s4 , s0, s8
          vmul.f32        s5 , s0, s9
          vmul.f32        s6 , s0, s10
          vmul.f32        s7 , s0, s11

          vstr.f32        s4 , [CO1]
          vstr.f32        s5 , [CO1, #4]
          vstr.f32        s6 , [CO1, #8]
          vstr.f32        s7 , [CO1, #12]

          // second column
          vmul.f32        s4 , s0, s12
          vmul.f32        s5 , s0, s13
          vmul.f32        s6 , s0, s14
          vmul.f32        s7 , s0, s15

          vstr.f32        s4 , [CO2]
          vstr.f32        s5 , [CO2, #4]
          vstr.f32        s6 , [CO2, #8]
          vstr.f32        s7 , [CO2, #12]

          add             CO1, CO1, #16           // next 4-row tile

  .endm
  119. /******************************************************************************/
  /*
   * INIT2x2 - clear the four accumulators s8, s9, s12, s13 of a 2x2 tile.
   *
   * The zero is built in a core register instead of using the
   * "vldr sN, =0" literal-pool pseudo-instruction, so the macro does not
   * depend on assembler support for that form or on a literal pool being
   * within VLDR range of every expansion site.
   *
   * Clobbers: r3 (already a scratch register at every expansion site).
   */
  .macro INIT2x2

          mov             r3 , #0                 // +0.0f is the all-zero bit pattern
          vmov            s8 , r3
          vmov.f32        s9 , s8
          vmov.f32        s12, s8
          vmov.f32        s13, s8

  .endm
  /*
   * KERNEL2x2_SUB - one k step of the 2x2 tile.
   *
   * Reads two floats from AO and two from BO, accumulates
   *   s8 , s9  += {s0, s1} * s4
   *   s12, s13 += {s0, s1} * s5
   * and then advances AO and BO by 8 bytes each.
   */
  .macro KERNEL2x2_SUB

          vldr.f32        s4 , [BO]               // b0
          vldr.f32        s5 , [BO, #4]           // b1

          vldr.f32        s0 , [AO]               // a0
          vldr.f32        s1 , [AO, #4]           // a1

          vmla.f32        s8  , s0, s4
          vmla.f32        s9  , s1, s4

          vmla.f32        s12 , s0, s5
          vmla.f32        s13 , s1, s5

          add             AO , AO, #8
          add             BO , BO, #8

  .endm
  /*
   * SAVE2x2 - scale the 2x2 tile by ALPHA and store it.
   *
   * s8, s9 go to CO1[0..1] and s12, s13 to CO2[0..1], where CO2 = CO1 + LDC.
   * CO1 is advanced by 8 bytes afterwards.
   *
   * Clobbers: r3, CO2, s0, s4, s5.
   */
  .macro SAVE2x2

          ldr             r3 , LDC
          add             CO2, CO1, r3            // second column of the tile

          vldr.f32        s0 , ALPHA

          // first column
          vmul.f32        s4 , s0, s8
          vmul.f32        s5 , s0, s9

          vstr.f32        s4 , [CO1]
          vstr.f32        s5 , [CO1, #4]

          // second column
          vmul.f32        s4 , s0, s12
          vmul.f32        s5 , s0, s13

          vstr.f32        s4 , [CO2]
          vstr.f32        s5 , [CO2, #4]

          add             CO1, CO1, #8            // next 2-row tile

  .endm
  152. /******************************************************************************/
  /*
   * INIT1x2 - clear the two accumulators s8 and s12 of a 1x2 tile.
   *
   * The zero is built in a core register instead of using the
   * "vldr sN, =0" literal-pool pseudo-instruction, so the macro does not
   * depend on assembler support for that form or on a literal pool being
   * within VLDR range of every expansion site.
   *
   * Clobbers: r3 (already a scratch register at every expansion site).
   */
  .macro INIT1x2

          mov             r3 , #0                 // +0.0f is the all-zero bit pattern
          vmov            s8 , r3
          vmov.f32        s12, s8

  .endm
  /*
   * KERNEL1x2_SUB - one k step of the 1x2 tile.
   *
   * Reads one float from AO and two from BO, accumulates
   *   s8  += s0 * s4
   *   s12 += s0 * s5
   * and then advances AO by 4 bytes and BO by 8 bytes.
   */
  .macro KERNEL1x2_SUB

          vldr.f32        s4 , [BO]               // b0
          vldr.f32        s5 , [BO, #4]           // b1

          vldr.f32        s0 , [AO]               // a0

          vmla.f32        s8  , s0, s4
          vmla.f32        s12 , s0, s5

          add             AO , AO, #4
          add             BO , BO, #8

  .endm
  /*
   * SAVE1x2 - scale the 1x2 tile by ALPHA and store it.
   *
   * s8 goes to [CO1] and s12 to [CO2], where CO2 = CO1 + LDC.
   * CO1 is advanced by 4 bytes afterwards.
   *
   * Clobbers: r3, CO2, s0, s4.
   */
  .macro SAVE1x2

          ldr             r3 , LDC
          add             CO2, CO1, r3            // second column of the tile

          vldr.f32        s0 , ALPHA

          vmul.f32        s4 , s0, s8
          vstr.f32        s4 , [CO1]

          vmul.f32        s4 , s0, s12
          vstr.f32        s4 , [CO2]

          add             CO1, CO1, #4            // next row

  .endm
  176. /******************************************************************************/
  /*
   * INIT4x1 - clear the four accumulators s8-s11 of a 4x1 tile.
   *
   * The zero is built in a core register instead of using the
   * "vldr sN, =0" literal-pool pseudo-instruction, so the macro does not
   * depend on assembler support for that form or on a literal pool being
   * within VLDR range of every expansion site.
   *
   * Clobbers: r3 (already a scratch register at every expansion site).
   */
  .macro INIT4x1

          mov             r3 , #0                 // +0.0f is the all-zero bit pattern
          vmov            s8 , r3
          vmov.f32        s9 , s8
          vmov.f32        s10, s8
          vmov.f32        s11, s8

  .endm
  /*
   * KERNEL4x1_SUB - one k step of the 4x1 tile.
   *
   * Reads four floats from AO and one from BO, accumulates
   *   s8..s11 += {s0..s3} * s4
   * and then advances AO by 16 bytes and BO by 4 bytes.
   */
  .macro KERNEL4x1_SUB

          vldr.f32        s4 , [BO]               // b0

          vldr.f32        s0 , [AO]               // a0
          vldr.f32        s1 , [AO, #4]           // a1
          vldr.f32        s2 , [AO, #8]           // a2
          vldr.f32        s3 , [AO, #12]          // a3

          vmla.f32        s8  , s0, s4
          vmla.f32        s9  , s1, s4
          vmla.f32        s10 , s2, s4
          vmla.f32        s11 , s3, s4

          add             AO , AO, #16
          add             BO , BO, #4

  .endm
  /*
   * SAVE4x1 - scale the 4x1 tile by ALPHA and store it.
   *
   * s8..s11 go to CO1[0..3]; CO1 is advanced by 16 bytes afterwards.
   *
   * Clobbers: s0, s4-s7.
   */
  .macro SAVE4x1

          vldr.f32        s0 , ALPHA

          vmul.f32        s4 , s0, s8
          vmul.f32        s5 , s0, s9
          vmul.f32        s6 , s0, s10
          vmul.f32        s7 , s0, s11

          vstr.f32        s4 , [CO1]
          vstr.f32        s5 , [CO1, #4]
          vstr.f32        s6 , [CO1, #8]
          vstr.f32        s7 , [CO1, #12]

          add             CO1, CO1, #16           // next 4-row tile

  .endm
  208. /******************************************************************************/
  /*
   * INIT2x1 - clear the two accumulators s8 and s9 of a 2x1 tile.
   *
   * The zero is built in a core register instead of using the
   * "vldr sN, =0" literal-pool pseudo-instruction, so the macro does not
   * depend on assembler support for that form or on a literal pool being
   * within VLDR range of every expansion site.
   *
   * Clobbers: r3 (already a scratch register at every expansion site).
   */
  .macro INIT2x1

          mov             r3 , #0                 // +0.0f is the all-zero bit pattern
          vmov            s8 , r3
          vmov.f32        s9 , s8

  .endm
  /*
   * KERNEL2x1_SUB - one k step of the 2x1 tile.
   *
   * Reads two floats from AO and one from BO, accumulates
   *   s8, s9 += {s0, s1} * s4
   * and then advances AO by 8 bytes and BO by 4 bytes.
   */
  .macro KERNEL2x1_SUB

          vldr.f32        s4 , [BO]               // b0

          vldr.f32        s0 , [AO]               // a0
          vldr.f32        s1 , [AO, #4]           // a1

          vmla.f32        s8  , s0, s4
          vmla.f32        s9  , s1, s4

          add             AO , AO, #8
          add             BO , BO, #4

  .endm
  /*
   * SAVE2x1 - scale the 2x1 tile by ALPHA and store it.
   *
   * s8, s9 go to CO1[0..1]; CO1 is advanced by 8 bytes afterwards.
   *
   * Clobbers: s0, s4, s5.
   */
  .macro SAVE2x1

          vldr.f32        s0 , ALPHA

          vmul.f32        s4 , s0, s8
          vmul.f32        s5 , s0, s9

          vstr.f32        s4 , [CO1]
          vstr.f32        s5 , [CO1, #4]

          add             CO1, CO1, #8            // next 2-row tile

  .endm
  230. /******************************************************************************/
  /*
   * INIT1x1 - clear the single accumulator s8 of a 1x1 tile.
   *
   * The zero is built in a core register instead of using the
   * "vldr sN, =0" literal-pool pseudo-instruction, so the macro does not
   * depend on assembler support for that form or on a literal pool being
   * within VLDR range of every expansion site.
   *
   * Clobbers: r3 (already a scratch register at every expansion site).
   */
  .macro INIT1x1

          mov             r3 , #0                 // +0.0f is the all-zero bit pattern
          vmov            s8 , r3

  .endm
  /*
   * KERNEL1x1_SUB - one k step of the 1x1 tile.
   *
   * Reads one float from AO and one from BO, accumulates s8 += s0 * s4,
   * and then advances both pointers by 4 bytes.
   */
  .macro KERNEL1x1_SUB

          vldr.f32        s4 , [BO]               // b0
          vldr.f32        s0 , [AO]               // a0

          vmla.f32        s8  , s0, s4

          add             AO , AO, #4
          add             BO , BO, #4

  .endm
  /*
   * SAVE1x1 - scale the 1x1 tile by ALPHA and store it.
   *
   * s8 goes to [CO1]; CO1 is advanced by 4 bytes afterwards.
   *
   * Clobbers: s0, s4.
   */
  .macro SAVE1x1

          vldr.f32        s0 , ALPHA

          vmul.f32        s4 , s0, s8
          vstr.f32        s4 , [CO1]

          add             CO1, CO1, #4            // next row

  .endm
  247. /**************************************************************************************
  248. * End of macro definitions
  249. **************************************************************************************/
  /*
   * Kernel body. The symbol name and section setup come from the PROLOGUE and
   * EPILOGUE macros supplied by common.h.
   *
   * Inputs:  r0 = M, r1 = N, r2 = K, r3 = A, s0 = alpha,
   *          stack: B, C, ldc (in elements), offset.
   * Output:  r0 = 0.
   *
   * Structure: an outer loop over pairs of columns (_L2_*) followed by one
   * optional single column (_L1_*). Inside each, rows are handled in tiles of
   * 4, then 2, then 1. Every tile runs its k loop unrolled by 8 plus a
   * remainder loop, and is written out with the matching SAVE macro. The
   * LEFT / TRANSA / TRMMKERNEL conditionals select how KK offsets the A and B
   * pointers and how many k iterations a tile performs.
   *
   * Flag handling: ASR, TST and ANDS do not write the V flag, while the LE
   * condition reads it. Signed "count <= 0" tests therefore use an explicit
   * CMP against 0, and bit tests branch on EQ, so no branch depends on a V
   * flag inherited from the caller or from an earlier instruction.
   */
  PROLOGUE

          .align 5

          push    {r4 - r9, fp}
          add     fp, sp, #24
          sub     sp, sp, #STACKSIZE              // reserve stack

          // spill the register arguments into their frame slots
          str     OLD_M, M
          str     OLD_N, N
          str     OLD_K, K
          str     OLD_A, A
          vstr    OLD_ALPHA, ALPHA

          sub     r3, fp, #128
          vstm    r3, { s8 - s15}                 // store floating point registers

          ldr     r3, OLD_LDC
          lsl     r3, r3, #2                      // ldc = ldc * 4
          str     r3, LDC

          ldr     r3, OLD_C
          str     r3, C

          ldr     BC, B

          ldr     r3, OFFSET
  #ifndef LEFT
          neg     r3 , r3
  #endif
          str     r3 , KK

          ldr     J, N
          asr     J, J, #1                        // J = N / 2
          cmp     J, #0
          ble     _L1_BEGIN

  /************************ two columns of C per iteration ************************/

  _L2_BEGIN:

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          lsl     r4 , r4 , #1                    // LDC * 2
          add     r3 , r4, CO1
          str     r3 , C                          // store C

  #if defined(LEFT)
          ldr     r3 , OFFSET
          str     r3 , KK
  #endif

          ldr     AO, A                           // AO = A

  _L2_M4_BEGIN:

          ldr     I, M
          asr     I, I, #2                        // I = M / 4
          cmp     I, #0
          ble     _L2_M2_BEGIN

  _L2_M4_20:

          INIT4x2

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #3                    // 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // 4 float values
          add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #4                      // number of values in AO
  #else
          add     L , L , #2                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L2_M4_40
          .align 5

  _L2_M4_22:

          pld     [ AO , #A_PRE ]
          pld     [ BO , #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB
          pld     [ AO , #A_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB

          pld     [ AO , #A_PRE ]
          pld     [ BO , #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB
          pld     [ AO , #A_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB

          subs    L, L, #1
          bgt     _L2_M4_22

  _L2_M4_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L2_M4_100

  _L2_M4_42:

          KERNEL4x2_SUB

          subs    L, L, #1
          bgt     _L2_M4_42

  _L2_M4_100:

          SAVE4x2

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #3                    // 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // 4 float values
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #4                    // number of values in AO
          str     r3 , KK
  #endif

  _L2_M4_END:

          subs    I, I, #1
          bgt     _L2_M4_20

  _L2_M2_BEGIN:

          ldr     I, M
          tst     I , #3                          // any rows left after the 4-row tiles?
          beq     _L2_END

          tst     I, #2                           // is there a 2-row tile?
          beq     _L2_M1_BEGIN

  _L2_M2_20:

          INIT2x2

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #3                    // 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // 2 float values
          add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #2                      // number of values in AO
  #else
          add     L , L , #2                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L2_M2_40

  _L2_M2_22:

          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB

          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB

          subs    L, L, #1
          bgt     _L2_M2_22

  _L2_M2_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L2_M2_100

  _L2_M2_42:

          KERNEL2x2_SUB

          subs    L, L, #1
          bgt     _L2_M2_42

  _L2_M2_100:

          SAVE2x2

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #3                    // 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // 2 float values
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #2                    // number of values in AO
          str     r3 , KK
  #endif

  _L2_M2_END:

  _L2_M1_BEGIN:

          tst     I, #1                           // is there a final single row?
          beq     _L2_END

  _L2_M1_20:

          INIT1x2

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #3                    // 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #2                    // 1 float value
          add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #1                      // number of values in AO
  #else
          add     L , L , #2                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L2_M1_40

  _L2_M1_22:

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     _L2_M1_22

  _L2_M1_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L2_M1_100

  _L2_M1_42:

          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     _L2_M1_42

  _L2_M1_100:

          SAVE1x2

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #3                    // 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #2                    // 1 float value
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #1                    // number of values in AO
          str     r3 , KK
  #endif

  _L2_END:

          mov     r3, BC
          ldr     r4, K
          lsl     r4, r4, #3                      // k * 2 * 4
          add     r3, r3, r4                      // B = B + K * 2 * 4
          mov     BC, r3

  #if !defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #2                    // number of values in BO
          str     r3 , KK
  #endif

          subs    J , J , #1                      // j--
          bgt     _L2_BEGIN

  /*********************************************************************************************/

  _L1_BEGIN:

          ldr     J , N
          tst     J , #1                          // is there a final odd column?
          beq     _L999

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          add     r3 , r4, CO1
          str     r3 , C                          // store C

  #if defined(LEFT)
          ldr     r3 , OFFSET
          str     r3 , KK
  #endif

          ldr     AO, A                           // AO = A
          //pld   [AO , #A_PRE-96]
          //pld   [AO , #A_PRE-64]
          //pld   [AO , #A_PRE-32]

  _L1_M4_BEGIN:

          ldr     I, M
          asr     I, I, #2                        // I = M / 4
          cmp     I, #0
          ble     _L1_M2_BEGIN

  _L1_M4_20:

          INIT4x1

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #2                    // 1 float value
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // 4 float values
          add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #4                      // number of values in AO
  #else
          add     L , L , #1                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L1_M4_40
          .align 5

  _L1_M4_22:

          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB

          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB

          subs    L, L, #1
          bgt     _L1_M4_22

  _L1_M4_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L1_M4_100

  _L1_M4_42:

          KERNEL4x1_SUB

          subs    L, L, #1
          bgt     _L1_M4_42

  _L1_M4_100:

          SAVE4x1

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #2                    // 1 float value
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // 4 float values
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #4                    // number of values in AO
          str     r3 , KK
  #endif

  _L1_M4_END:

          subs    I, I, #1
          bgt     _L1_M4_20

  _L1_M2_BEGIN:

          ldr     I, M
          tst     I , #3                          // any rows left after the 4-row tiles?
          beq     _L1_END

          tst     I, #2                           // is there a 2-row tile?
          beq     _L1_M1_BEGIN

  _L1_M2_20:

          INIT2x1

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #2                    // 1 float value
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // 2 float values
          add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #2                      // number of values in AO
  #else
          add     L , L , #1                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L1_M2_40

  _L1_M2_22:

          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB

          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB

          subs    L, L, #1
          bgt     _L1_M2_22

  _L1_M2_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L1_M2_100

  _L1_M2_42:

          KERNEL2x1_SUB

          subs    L, L, #1
          bgt     _L1_M2_42

  _L1_M2_100:

          SAVE2x1

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #2                    // 1 float value
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // 2 float values
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #2                    // number of values in AO
          str     r3 , KK
  #endif

  _L1_M2_END:

  _L1_M1_BEGIN:

          tst     I, #1                           // is there a final single row?
          beq     _L1_END

  _L1_M1_20:

          INIT1x1

  #if  (defined(LEFT) &&  defined(TRANSA)) || \
       (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #2                    // 1 float value
          add     BO , BO , r4
          lsls    r4 , r3 , #2                    // 1 float value
          add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #1                      // number of values in AO
  #else
          add     L , L , #1                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L1_M1_40

  _L1_M1_22:

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     _L1_M1_22

  _L1_M1_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L1_M1_100

  _L1_M1_42:

          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     _L1_M1_42

  _L1_M1_100:

          SAVE1x1

  _L1_END:

  _L999:

          sub     r3, fp, #128
          vldm    r3, { s8 - s15}                 // restore floating point registers

          movs    r0, #0                          // set return value
          sub     sp, fp, #24
          pop     {r4 - r9, fp}
          bx      lr

  EPILOGUE