You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

strmm_kernel_4x2_vfp.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /*
   * Stack frame size. The prologue pushes 7 words, sets fp = sp + 24 and then
   * lowers sp by STACKSIZE, so the final sp is fp - 276: exactly the address
   * of the lowest slot used below (ALPHA).
   */
  #define STACKSIZE       252

  /* Incoming arguments that the prologue reads from registers. */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA       s0

  /******************************************************
  * [fp, #-128] - [fp, #-64] is reserved
  * for store and restore of floating point
  * registers (the prologue saves s8 - s15 starting
  * at fp - 128)
  *******************************************************/

  /* Local slots in the frame, addressed at negative offsets from fp. */
  #define FP_ZERO         [fp, #-232]
  #define FP_ZERO_0       [fp, #-232]
  #define FP_ZERO_1       [fp, #-228]

  #define KK              [fp, #-240]
  #define KKK             [fp, #-244]
  #define C               [fp, #-248]
  #define LDC             [fp, #-252]
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]
  #define A               [fp, #-268]

  #define ALPHA           [fp, #-276]

  /* Incoming arguments read from the caller side of the frame (above fp). */
  #define B               [fp, #4]
  #define OLD_C           [fp, #8]
  #define OLD_LDC         [fp, #12]
  #define OFFSET          [fp, #16]

  /* Register roles inside the kernel. I/J/L reuse r0-r2 once M/N/K are spilled. */
  #define I               r0
  #define J               r1
  #define L               r2

  #define AO              r5
  #define BO              r6

  #define CO1             r8
  #define CO2             r9

  #define K1              r7
  #define BC              r12

  /* Prefetch distances in bytes, used as pld offsets. */
  #define A_PRE           64
  #define B_PRE           64
  #define C_PRE           64
  75. /**************************************************************************************
  76. * Macro definitions
  77. **************************************************************************************/
  /*
   * INIT4x2: clear the eight accumulators s8 - s15 of the 4x2 tile.
   * s8 is loaded from the FP_ZERO frame slot and copied to the others.
   */
  .macro INIT4x2

        vldr            s8 , FP_ZERO
        vmov.f32        s9 , s8
        vmov.f32        s10, s8
        vmov.f32        s11, s8
        vmov.f32        s12, s8
        vmov.f32        s13, s8
        vmov.f32        s14, s8
        vmov.f32        s15, s8

  .endm
  /*
   * KERNEL4x2_SUB: one k-step of the 4x2 tile.
   *
   * Loads four floats from AO into s0 - s3 and two floats from BO into
   * s4 - s5 (both pointers are advanced by write-back), then accumulates
   *   s8  - s11 += s0..s3 * s4
   *   s12 - s15 += s0..s3 * s5
   *
   * Written with UAL mnemonics (vldmia / vmla.f32) instead of the pre-UAL
   * fldmias / fmacs spellings; the instructions are the same, but the UAL
   * names are the ones every current ARM assembler accepts.
   */
  .macro KERNEL4x2_SUB

        vldmia.f32      AO!, { s0 - s3 }
        vldmia.f32      BO!, { s4 - s5 }

        vmla.f32        s8  , s0, s4
        vmla.f32        s9  , s1, s4
        vmla.f32        s10 , s2, s4
        vmla.f32        s11 , s3, s4

        vmla.f32        s12 , s0, s5
        vmla.f32        s13 , s1, s5
        vmla.f32        s14 , s2, s5
        vmla.f32        s15 , s3, s5

  .endm
  /*
   * SAVE4x2: scale the 4x2 tile by alpha and write it out.
   *
   * s8 - s11 go to [CO1 .. CO1+12], s12 - s15 go to [CO2 .. CO2+12] where
   * CO2 = CO1 + LDC. The old contents of C are not read. CO1 is advanced
   * by 16 bytes. Clobbers r3, s0 and s4 - s7.
   */
  .macro SAVE4x2

        vldr            s0, ALPHA
        ldr             r3, LDC
        add             CO2, CO1, r3

        // first column of the tile
        vmul.f32        s4, s0, s8
        vmul.f32        s5, s0, s9
        vmul.f32        s6, s0, s10
        vmul.f32        s7, s0, s11

        vstr            s4, [CO1]
        vstr            s5, [CO1, #4]
        vstr            s6, [CO1, #8]
        vstr            s7, [CO1, #12]

        // second column of the tile
        vmul.f32        s4, s0, s12
        vmul.f32        s5, s0, s13
        vmul.f32        s6, s0, s14
        vmul.f32        s7, s0, s15

        vstr            s4, [CO2]
        vstr            s5, [CO2, #4]
        vstr            s6, [CO2, #8]
        vstr            s7, [CO2, #12]

        add             CO1, CO1, #16

  .endm
  122. /******************************************************************************/
  /*
   * INIT2x2: clear the four accumulators of the 2x2 tile
   * (s8, s9 and s12, s13), starting from the FP_ZERO frame slot.
   */
  .macro INIT2x2

        vldr            s8 , FP_ZERO
        vmov.f32        s9 , s8
        vmov.f32        s12, s8
        vmov.f32        s13, s8

  .endm
  /*
   * KERNEL2x2_SUB: one k-step of the 2x2 tile.
   *
   * Reads two floats at AO (s0, s1) and two floats at BO (s4, s5),
   * advances both pointers by 8 bytes and accumulates
   *   s8,  s9  += s0, s1 * s4
   *   s12, s13 += s0, s1 * s5
   */
  .macro KERNEL2x2_SUB

        vldr            s0, [AO]
        vldr            s1, [AO, #4]
        vldr            s4, [BO]
        vldr            s5, [BO, #4]

        add             AO, AO, #8
        add             BO, BO, #8

        vmla.f32        s8 , s0, s4
        vmla.f32        s9 , s1, s4
        vmla.f32        s12, s0, s5
        vmla.f32        s13, s1, s5

  .endm
  /*
   * SAVE2x2: scale the 2x2 tile by alpha and write it out.
   *
   * s8, s9 go to [CO1], [CO1+4]; s12, s13 go to [CO2], [CO2+4] where
   * CO2 = CO1 + LDC. CO1 is advanced by 8 bytes.
   * Clobbers r3, s0, s4 and s5.
   */
  .macro SAVE2x2

        vldr            s0, ALPHA
        ldr             r3, LDC
        add             CO2, CO1, r3

        vmul.f32        s4, s0, s8
        vmul.f32        s5, s0, s9

        vstr            s4, [CO1]
        vstr            s5, [CO1, #4]

        vmul.f32        s4, s0, s12
        vmul.f32        s5, s0, s13

        vstr            s4, [CO2]
        vstr            s5, [CO2, #4]

        add             CO1, CO1, #8

  .endm
  155. /******************************************************************************/
  /*
   * INIT1x2: clear the two accumulators of the 1x2 tile (s8 and s12),
   * starting from the FP_ZERO frame slot.
   */
  .macro INIT1x2

        vldr            s8 , FP_ZERO
        vmov.f32        s12, s8

  .endm
  /*
   * KERNEL1x2_SUB: one k-step of the 1x2 tile.
   *
   * Reads one float at AO (s0) and two floats at BO (s4, s5), advances
   * AO by 4 bytes and BO by 8 bytes, and accumulates
   *   s8  += s0 * s4
   *   s12 += s0 * s5
   */
  .macro KERNEL1x2_SUB

        vldr            s0, [AO]
        vldr            s4, [BO]
        vldr            s5, [BO, #4]

        add             AO, AO, #4
        add             BO, BO, #8

        vmla.f32        s8 , s0, s4
        vmla.f32        s12, s0, s5

  .endm
  /*
   * SAVE1x2: scale the 1x2 tile by alpha and write it out.
   *
   * s8 goes to [CO1], s12 goes to [CO2] where CO2 = CO1 + LDC.
   * CO1 is advanced by 4 bytes. Clobbers r3, s0 and s4.
   */
  .macro SAVE1x2

        vldr            s0, ALPHA
        ldr             r3, LDC
        add             CO2, CO1, r3

        vmul.f32        s4, s0, s8
        vstr            s4, [CO1]

        vmul.f32        s4, s0, s12
        vstr            s4, [CO2]

        add             CO1, CO1, #4

  .endm
  179. /******************************************************************************/
  /*
   * INIT4x1: clear the four accumulators of the 4x1 tile (s8 - s11),
   * starting from the FP_ZERO frame slot.
   */
  .macro INIT4x1

        vldr            s8 , FP_ZERO
        vmov.f32        s9 , s8
        vmov.f32        s10, s8
        vmov.f32        s11, s8

  .endm
  /*
   * KERNEL4x1_SUB: one k-step of the 4x1 tile.
   *
   * Reads four floats at AO (s0 - s3) and one float at BO (s4), advances
   * AO by 16 bytes and BO by 4 bytes, and accumulates
   *   s8 - s11 += s0..s3 * s4
   */
  .macro KERNEL4x1_SUB

        vldr            s0, [AO]
        vldr            s1, [AO, #4]
        vldr            s2, [AO, #8]
        vldr            s3, [AO, #12]
        vldr            s4, [BO]

        add             AO, AO, #16
        add             BO, BO, #4

        vmla.f32        s8 , s0, s4
        vmla.f32        s9 , s1, s4
        vmla.f32        s10, s2, s4
        vmla.f32        s11, s3, s4

  .endm
  /*
   * SAVE4x1: scale the 4x1 tile by alpha and write it out.
   *
   * s8 - s11 go to [CO1 .. CO1+12]; CO1 is advanced by 16 bytes.
   * Clobbers s0 and s4 - s7.
   */
  .macro SAVE4x1

        vldr            s0, ALPHA

        vmul.f32        s4, s0, s8
        vmul.f32        s5, s0, s9
        vmul.f32        s6, s0, s10
        vmul.f32        s7, s0, s11

        vstr            s4, [CO1]
        vstr            s5, [CO1, #4]
        vstr            s6, [CO1, #8]
        vstr            s7, [CO1, #12]

        add             CO1, CO1, #16

  .endm
  211. /******************************************************************************/
  /*
   * INIT2x1: clear the two accumulators of the 2x1 tile (s8 and s9),
   * starting from the FP_ZERO frame slot.
   */
  .macro INIT2x1

        vldr            s8, FP_ZERO
        vmov.f32        s9, s8

  .endm
  /*
   * KERNEL2x1_SUB: one k-step of the 2x1 tile.
   *
   * Reads two floats at AO (s0, s1) and one float at BO (s4), advances
   * AO by 8 bytes and BO by 4 bytes, and accumulates
   *   s8, s9 += s0, s1 * s4
   */
  .macro KERNEL2x1_SUB

        vldr            s0, [AO]
        vldr            s1, [AO, #4]
        vldr            s4, [BO]

        add             AO, AO, #8
        add             BO, BO, #4

        vmla.f32        s8, s0, s4
        vmla.f32        s9, s1, s4

  .endm
  /*
   * SAVE2x1: scale the 2x1 tile by alpha and write it out.
   *
   * s8, s9 go to [CO1], [CO1+4]; CO1 is advanced by 8 bytes.
   * Clobbers s0, s4 and s5.
   */
  .macro SAVE2x1

        vldr            s0, ALPHA

        vmul.f32        s4, s0, s8
        vmul.f32        s5, s0, s9

        vstr            s4, [CO1]
        vstr            s5, [CO1, #4]

        add             CO1, CO1, #8

  .endm
  233. /******************************************************************************/
  /*
   * INIT1x1: clear the single accumulator s8 of the 1x1 tile by loading
   * it from the FP_ZERO frame slot.
   */
  .macro INIT1x1

        vldr            s8, FP_ZERO

  .endm
  /*
   * KERNEL1x1_SUB: one k-step of the 1x1 tile.
   *
   * Reads one float at AO (s0) and one float at BO (s4), advances both
   * pointers by 4 bytes and accumulates s8 += s0 * s4.
   */
  .macro KERNEL1x1_SUB

        vldr            s0, [AO]
        vldr            s4, [BO]

        add             AO, AO, #4
        add             BO, BO, #4

        vmla.f32        s8, s0, s4

  .endm
  /*
   * SAVE1x1: scale the 1x1 tile by alpha and write it out.
   *
   * s8 * alpha goes to [CO1]; CO1 is advanced by 4 bytes.
   * Clobbers s0 and s4.
   */
  .macro SAVE1x1

        vldr            s0, ALPHA

        vmul.f32        s4, s0, s8
        vstr            s4, [CO1]

        add             CO1, CO1, #4

  .endm
  250. /**************************************************************************************
  251. * End of macro definitions
  252. **************************************************************************************/
  /*
   * Kernel entry point (single precision, 4x2 register tile).
   *
   * In:     r0 = M, r1 = N, r2 = K, r3 = A, s0 = alpha      (OLD_* defines)
   *         [fp, #4] = B, [fp, #8] = C, [fp, #12] = ldc, [fp, #16] = offset
   *         ldc is scaled by 4 below, i.e. it is taken as a count of floats.
   * Out:    r0 = 0
   * Saved:  r4 - r9, fp (push/pop) and s8 - s15 (frame slot at fp - 128)
   * Clobb:  r0 - r3, r12, s0 - s7, condition flags
   *
   * Structure: the N direction is walked two columns at a time (_L2_*), then
   * one trailing column if N is odd (_L1_*). Inside each, M is walked in
   * tiles of 4 rows, then one tile of 2 rows, then one tile of 1 row. Each
   * tile runs its k-loop unrolled by 8 followed by a remainder loop.
   *
   * Flag discipline: asrs, tst, ands, lsls and movs leave the V flag alone,
   * while the signed LE/GT conditions read it. The caller's V flag is not
   * defined on entry, so the first loop test is made with cmp (which clears
   * V for a compare against 0). After that the only instructions that write
   * V are "subs x, x, #1" on positive counters, which keep it clear, so the
   * remaining "asrs ; ble" pairs are well defined. Tests on a masked value
   * (tst / ands) can only be zero or positive and use beq.
   */
  PROLOGUE

  .align 5

        push    {r4 - r9, fp}
        add     fp, sp, #24                     // fp -> saved fp; stack arguments start at [fp, #4]
        sub     sp, sp, #STACKSIZE              // reserve stack

        str     OLD_M, M
        str     OLD_N, N
        str     OLD_K, K
        str     OLD_A, A
        vstr    OLD_ALPHA, ALPHA

        sub     r3, fp, #128
        vstm    r3, { s8 - s15}                 // store floating point registers

        movs    r4, #0
        str     r4, FP_ZERO                     // 0.0f constant used by the INIT macros
        str     r4, FP_ZERO_1

        ldr     r3, OLD_LDC
        lsl     r3, r3, #2                      // ldc = ldc * 4
        str     r3, LDC

        ldr     r3, OLD_C
        str     r3, C

        ldr     BC, B

        ldr     r3, OFFSET
  #ifndef LEFT
        neg     r3 , r3
  #endif
        str     r3 , KK

        ldr     J, N
        asr     J, J, #1                        // J = N / 2
        cmp     J, #0                           // defines N, Z and V (V = 0) for the signed test
        ble     _L1_BEGIN

  _L2_BEGIN:

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        lsl     r4 , r4 , #1                    // LDC * 2
        add     r3 , r4, CO1
        str     r3 , C                          // store C

  #if defined(LEFT)
        ldr     r3 , OFFSET
        str     r3 , KK
  #endif

        ldr     AO, A                           // AO = A

  _L2_M4_BEGIN:

        ldr     I, M
        asrs    I, I, #2                        // I = I / 4
        ble     _L2_M2_BEGIN

  _L2_M4_20:

        INIT4x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 4 float values
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #4                      // number of values in AO
  #else
        add     L , L , #2                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L                           // K1 keeps the full trip count for the remainder loop
        asrs    L , K1, #3                      // L = L / 8
        ble     _L2_M4_40
        .align 5

  _L2_M4_22:

        pld     [ AO , #A_PRE ]
        pld     [ BO , #B_PRE ]
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        pld     [ AO , #A_PRE ]
        KERNEL4x2_SUB
        KERNEL4x2_SUB

        pld     [ AO , #A_PRE ]
        pld     [ BO , #B_PRE ]
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        pld     [ AO , #A_PRE ]
        KERNEL4x2_SUB
        KERNEL4x2_SUB

        subs    L, L, #1
        bgt     _L2_M4_22

  _L2_M4_40:

        ands    L , K1, #7                      // L = L % 8
        beq     _L2_M4_100

  _L2_M4_42:

        KERNEL4x2_SUB

        subs    L, L, #1
        bgt     _L2_M4_42

  _L2_M4_100:

        SAVE4x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4                    // k-steps not consumed by this tile
        lsls    r4 , r3 , #3                    // 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 4 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #4                    // number of values in AO
        str     r3 , KK
  #endif

  _L2_M4_END:

        subs    I, I, #1
        bgt     _L2_M4_20

  _L2_M2_BEGIN:

        ldr     I, M
        tst     I , #3                          // any rows left after the 4-row tiles?
        beq     _L2_END

        tst     I, #2                           // bit 1 of M set: one 2-row tile
        beq     _L2_M1_BEGIN

  _L2_M2_20:

        INIT2x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 2 float values
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #2                      // number of values in AO
  #else
        add     L , L , #2                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = L / 8
        ble     _L2_M2_40

  _L2_M2_22:

        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB

        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB
        KERNEL2x2_SUB

        subs    L, L, #1
        bgt     _L2_M2_22

  _L2_M2_40:

        ands    L , K1, #7                      // L = L % 8
        beq     _L2_M2_100

  _L2_M2_42:

        KERNEL2x2_SUB

        subs    L, L, #1
        bgt     _L2_M2_42

  _L2_M2_100:

        SAVE2x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #3                    // 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 2 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in AO
        str     r3 , KK
  #endif

  _L2_M2_END:

  _L2_M1_BEGIN:

        tst     I, #1                           // bit 0 of M set: one 1-row tile
        beq     _L2_END

  _L2_M1_20:

        INIT1x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #2                    // 1 float value
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #1                      // number of values in AO
  #else
        add     L , L , #2                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = L / 8
        ble     _L2_M1_40

  _L2_M1_22:

        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB

        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB

        subs    L, L, #1
        bgt     _L2_M1_22

  _L2_M1_40:

        ands    L , K1, #7                      // L = L % 8
        beq     _L2_M1_100

  _L2_M1_42:

        KERNEL1x2_SUB

        subs    L, L, #1
        bgt     _L2_M1_42

  _L2_M1_100:

        SAVE1x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #3                    // 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #2                    // 1 float value
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #1                    // number of values in AO
        str     r3 , KK
  #endif

  _L2_END:

        mov     r3, BC
        ldr     r4, K
        lsl     r4, r4, #3                      // k * 2 * 4
        add     r3, r3, r4                      // B = B + K * 2 * 4
        mov     BC, r3

  #if !defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in BO
        str     r3 , KK
  #endif

        subs    J , #1                          // j--
        bgt     _L2_BEGIN

  /*********************************************************************************************/

  _L1_BEGIN:

        ldr     J , N
        tst     J , #1                          // odd N: one trailing column
        beq     _L999

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        add     r3 , r4, CO1
        str     r3 , C                          // store C

  #if defined(LEFT)
        ldr     r3 , OFFSET
        str     r3 , KK
  #endif

        ldr     AO, A                           // AO = A
        //pld [AO , #A_PRE-96]
        //pld [AO , #A_PRE-64]
        //pld [AO , #A_PRE-32]

  _L1_M4_BEGIN:

        ldr     I, M
        asrs    I, I, #2                        // I = I / 4
        ble     _L1_M2_BEGIN

  _L1_M4_20:

        INIT4x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #2                    // 1 float value
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 4 float values
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #4                      // number of values in AO
  #else
        add     L , L , #1                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = L / 8
        ble     _L1_M4_40
        .align 5

  _L1_M4_22:

        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB

        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB

        subs    L, L, #1
        bgt     _L1_M4_22

  _L1_M4_40:

        ands    L , K1, #7                      // L = L % 8
        beq     _L1_M4_100

  _L1_M4_42:

        KERNEL4x1_SUB

        subs    L, L, #1
        bgt     _L1_M4_42

  _L1_M4_100:

        SAVE4x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #2                    // 1 float value
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 4 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #4                    // number of values in AO
        str     r3 , KK
  #endif

  _L1_M4_END:

        subs    I, I, #1
        bgt     _L1_M4_20

  _L1_M2_BEGIN:

        ldr     I, M
        tst     I , #3                          // any rows left after the 4-row tiles?
        beq     _L1_END

        tst     I, #2                           // bit 1 of M set: one 2-row tile
        beq     _L1_M1_BEGIN

  _L1_M2_20:

        INIT2x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #2                    // 1 float value
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 2 float values
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #2                      // number of values in AO
  #else
        add     L , L , #1                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = L / 8
        ble     _L1_M2_40

  _L1_M2_22:

        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB

        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB
        KERNEL2x1_SUB

        subs    L, L, #1
        bgt     _L1_M2_22

  _L1_M2_40:

        ands    L , K1, #7                      // L = L % 8
        beq     _L1_M2_100

  _L1_M2_42:

        KERNEL2x1_SUB

        subs    L, L, #1
        bgt     _L1_M2_42

  _L1_M2_100:

        SAVE2x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #2                    // 1 float value
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 2 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in AO
        str     r3 , KK
  #endif

  _L1_M2_END:

  _L1_M1_BEGIN:

        tst     I, #1                           // bit 0 of M set: one 1-row tile
        beq     _L1_END

  _L1_M1_20:

        INIT1x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))

        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #2                    // 1 float value
        add     BO , BO , r4
        lsls    r4 , r3 , #2                    // 1 float value
        add     AO , AO , r4

  #endif

  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #1                      // number of values in AO
  #else
        add     L , L , #1                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = L / 8
        ble     _L1_M1_40

  _L1_M1_22:

        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB

        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB

        subs    L, L, #1
        bgt     _L1_M1_22

  _L1_M1_40:

        ands    L , K1, #7                      // L = L % 8
        beq     _L1_M1_100

  _L1_M1_42:

        KERNEL1x1_SUB

        subs    L, L, #1
        bgt     _L1_M1_42

  _L1_M1_100:

        SAVE1x1                                 // last tile: AO, BO and KK are not read again

  _L1_END:

  _L999:

        sub     r3, fp, #128
        vldm    r3, { s8 - s15}                 // restore floating point registers

        movs    r0, #0                          // set return value
        sub     sp, fp, #24
        pop     {r4 - r9, fp}
        bx      lr

  EPILOGUE