You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dtrmm_kernel_4x2_vfp.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  36. #define STACKSIZE 252
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA d0
  42. /******************************************************
  43. * [fp, #-128] - [fp, #-64] is reserved
  44. * for store and restore of floating point
  45. * registers
  46. *******************************************************/
  47. #define KK [fp, #-240 ]
  48. #define KKK [fp, #-244]
  49. #define C [fp, #-248 ]
  50. #define LDC [fp, #-252 ]
  51. #define M [fp, #-256 ]
  52. #define N [fp, #-260 ]
  53. #define K [fp, #-264 ]
  54. #define A [fp, #-268 ]
  55. #define FP_ZERO [fp, #-232]
  56. #define FP_ZERO_0 [fp, #-232]
  57. #define FP_ZERO_1 [fp, #-228]
  58. #define ALPHA [fp, #-276 ]
  59. #if !defined(__ARM_PCS_VFP)
  60. #define OLD_ALPHA_SOFTFP [fp, #4]
  61. #define OLD_A_SOFTFP [fp, #12 ]
  62. #define B [fp, #16 ]
  63. #define OLD_C [fp, #20 ]
  64. #define OLD_LDC [fp, #24 ]
  65. #define OFFSET [fp, #28 ]
  66. #else
  67. #define B [fp, #4 ]
  68. #define OLD_C [fp, #8 ]
  69. #define OLD_LDC [fp, #12 ]
  70. #define OFFSET [fp, #16 ]
  71. #endif
  72. #define I r0
  73. #define J r1
  74. #define L r2
  75. #define AO r5
  76. #define BO r6
  77. #define CO1 r8
  78. #define CO2 r9
  79. #define K1 r7
  80. #define BC r12
  81. #define A_PRE 64
  82. #define B_PRE 64
  83. #define C_PRE 64
  84. /**************************************************************************************
  85. * Macro definitions
  86. **************************************************************************************/
  87. .macro INIT4x2
  88. fldd d8 , FP_ZERO
  89. vmov.f64 d9, d8
  90. vmov.f64 d10, d8
  91. vmov.f64 d11, d8
  92. vmov.f64 d12, d8
  93. vmov.f64 d13, d8
  94. vmov.f64 d14, d8
  95. vmov.f64 d15, d8
  96. .endm
  97. .macro KERNEL4x2_SUB
  98. fldd d4 , [ BO ]
  99. fldd d0 , [ AO ]
  100. fldd d1 , [ AO, #8 ]
  101. pld [ AO , #A_PRE ]
  102. fmacd d8 , d0, d4
  103. fldd d2 , [ AO, #16 ]
  104. fmacd d9 , d1, d4
  105. fldd d3 , [ AO, #24 ]
  106. fmacd d10 , d2, d4
  107. fldd d5 , [ BO, #8 ]
  108. fmacd d11 , d3, d4
  109. fmacd d12 , d0, d5
  110. fmacd d13 , d1, d5
  111. add AO , AO, #32
  112. fmacd d14 , d2, d5
  113. add BO , BO, #16
  114. fmacd d15 , d3, d5
  115. .endm
  116. .macro SAVE4x2
  117. ldr r3 , LDC
  118. add CO2 , CO1, r3
  119. fldd d0, ALPHA
  120. fmuld d4 , d0 , d8
  121. fmuld d5 , d0 , d9
  122. fmuld d6 , d0 , d10
  123. fmuld d7 , d0 , d11
  124. fstd d4 , [CO1]
  125. fstd d5 , [CO1, #8 ]
  126. fstd d6 , [CO1, #16 ]
  127. fstd d7 , [CO1, #24 ]
  128. fmuld d4 , d0 , d12
  129. fmuld d5 , d0 , d13
  130. fmuld d6 , d0 , d14
  131. fmuld d7 , d0 , d15
  132. fstd d4 , [CO2]
  133. fstd d5 , [CO2, #8 ]
  134. fstd d6 , [CO2, #16 ]
  135. fstd d7 , [CO2, #24 ]
  136. add CO1, CO1, #32
  137. .endm
  138. /******************************************************************************/
  139. .macro INIT2x2
  140. fldd d8 , FP_ZERO
  141. vmov.f64 d9, d8
  142. vmov.f64 d12, d8
  143. vmov.f64 d13, d8
  144. .endm
  145. .macro KERNEL2x2_SUB
  146. fldd d4 , [ BO ]
  147. fldd d5 , [ BO, #8 ]
  148. fldd d0 , [ AO ]
  149. fldd d1 , [ AO, #8 ]
  150. fmacd d8 , d0, d4
  151. fmacd d9 , d1, d4
  152. fmacd d12 , d0, d5
  153. fmacd d13 , d1, d5
  154. add AO , AO, #16
  155. add BO , BO, #16
  156. .endm
  157. .macro SAVE2x2
  158. ldr r3 , LDC
  159. add CO2 , CO1, r3
  160. fldd d0, ALPHA
  161. fmuld d4 , d0 , d8
  162. fmuld d5 , d0 , d9
  163. fstd d4 , [CO1]
  164. fstd d5 , [CO1, #8 ]
  165. fmuld d4 , d0 , d12
  166. fmuld d5 , d0 , d13
  167. fstd d4 , [CO2]
  168. fstd d5 , [CO2, #8 ]
  169. add CO1, CO1, #16
  170. .endm
  171. /******************************************************************************/
  172. .macro INIT1x2
  173. fldd d8 , FP_ZERO
  174. vmov.f64 d12, d8
  175. .endm
  176. .macro KERNEL1x2_SUB
  177. fldd d4 , [ BO ]
  178. fldd d5 , [ BO, #8 ]
  179. fldd d0 , [ AO ]
  180. fmacd d8 , d0, d4
  181. fmacd d12 , d0, d5
  182. add AO , AO, #8
  183. add BO , BO, #16
  184. .endm
  185. .macro SAVE1x2
  186. ldr r3 , LDC
  187. add CO2 , CO1, r3
  188. fldd d0, ALPHA
  189. fmuld d4 , d0 , d8
  190. fstd d4 , [CO1]
  191. fmuld d4 , d0 , d12
  192. fstd d4 , [CO2]
  193. add CO1, CO1, #8
  194. .endm
  195. /******************************************************************************/
  196. .macro INIT4x1
  197. fldd d8 , FP_ZERO
  198. vmov.f64 d9, d8
  199. vmov.f64 d10, d8
  200. vmov.f64 d11, d8
  201. .endm
  202. .macro KERNEL4x1_SUB
  203. fldd d4 , [ BO ]
  204. fldd d0 , [ AO ]
  205. fldd d1 , [ AO, #8 ]
  206. fldd d2 , [ AO, #16 ]
  207. fldd d3 , [ AO, #24 ]
  208. fmacd d8 , d0, d4
  209. fmacd d9 , d1, d4
  210. fmacd d10 , d2, d4
  211. fmacd d11 , d3, d4
  212. add AO , AO, #32
  213. add BO , BO, #8
  214. .endm
  215. .macro SAVE4x1
  216. fldd d0, ALPHA
  217. fmuld d4 , d0 , d8
  218. fmuld d5 , d0 , d9
  219. fmuld d6 , d0 , d10
  220. fmuld d7 , d0 , d11
  221. fstd d4 , [CO1]
  222. fstd d5 , [CO1, #8 ]
  223. fstd d6 , [CO1, #16 ]
  224. fstd d7 , [CO1, #24 ]
  225. add CO1, CO1, #32
  226. .endm
  227. /******************************************************************************/
  228. .macro INIT2x1
  229. fldd d8 , FP_ZERO
  230. vmov.f64 d9 , d8
  231. .endm
  232. .macro KERNEL2x1_SUB
  233. fldd d4 , [ BO ]
  234. fldd d0 , [ AO ]
  235. fldd d1 , [ AO, #8 ]
  236. fmacd d8 , d0, d4
  237. fmacd d9 , d1, d4
  238. add AO , AO, #16
  239. add BO , BO, #8
  240. .endm
  241. .macro SAVE2x1
  242. fldd d0, ALPHA
  243. fmuld d4 , d0 , d8
  244. fmuld d5 , d0 , d9
  245. fstd d4 , [CO1]
  246. fstd d5 , [CO1, #8 ]
  247. add CO1, CO1, #16
  248. .endm
  249. /******************************************************************************/
  250. .macro INIT1x1
  251. fldd d8 , FP_ZERO
  252. .endm
  253. .macro KERNEL1x1_SUB
  254. fldd d4 , [ BO ]
  255. fldd d0 , [ AO ]
  256. fmacd d8 , d0, d4
  257. add AO , AO, #8
  258. add BO , BO, #8
  259. .endm
  260. .macro SAVE1x1
  261. fldd d0, ALPHA
  262. fmuld d4 , d0 , d8
  263. fstd d4 , [CO1]
  264. add CO1, CO1, #8
  265. .endm
  266. /**************************************************************************************
  267. * End of macro definitions
  268. **************************************************************************************/
  269. PROLOGUE
  270. .align 5
  271. push {r4 - r9, fp}
  272. add fp, sp, #24
  273. sub sp, sp, #STACKSIZE // reserve stack
  274. #if !defined(__ARM_PCS_VFP)
  275. vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
  276. ldr OLD_A, OLD_A_SOFTFP
  277. #endif
  278. str OLD_M, M
  279. str OLD_N, N
  280. str OLD_K, K
  281. str OLD_A, A
  282. vstr OLD_ALPHA, ALPHA
  283. sub r3, fp, #128
  284. vstm r3, { d8 - d15} // store floating point registers
  285. movs r4, #0
  286. str r4, FP_ZERO
  287. str r4, FP_ZERO_1
  288. ldr r3, OLD_LDC
  289. lsl r3, r3, #3 // ldc = ldc * 8
  290. str r3, LDC
  291. ldr r3, OLD_C
  292. str r3, C
  293. ldr BC, B
  294. ldr r3, OFFSET
  295. #ifndef LEFT
  296. neg r3 , r3
  297. #endif
  298. str r3 , KK
  299. ldr J, N
  300. asrs J, J, #1 // J = J / 2
  301. ble _L1_BEGIN
  302. _L2_BEGIN:
  303. ldr CO1, C // CO1 = C
  304. ldr r4 , LDC
  305. lsl r4 , r4 , #1 // LDC * 2
  306. add r3 , r4, CO1
  307. str r3 , C // store C
  308. #if defined(LEFT)
  309. ldr r3 , OFFSET
  310. str r3 , KK
  311. #endif
  312. ldr AO, A // AO = A
  313. _L2_M4_BEGIN:
  314. ldr I, M
  315. asrs I, I, #2 // I = I / 4
  316. ble _L2_M2_BEGIN
  317. _L2_M4_20:
  318. INIT4x2
  319. #if (defined(LEFT) && defined(TRANSA)) || \
  320. (!defined(LEFT) && !defined(TRANSA))
  321. mov BO, BC
  322. #else
  323. mov BO, BC
  324. ldr r3 , KK
  325. lsls r4 , r3 , #4 // 2 double values
  326. add BO , BO , r4
  327. lsls r4 , r3 , #5 // 4 double values
  328. add AO , AO , r4
  329. #endif
  330. #ifndef TRMMKERNEL
  331. ldr L , K
  332. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  333. ldr L , K
  334. ldr r3, KK
  335. sub L , L, r3
  336. str L , KKK
  337. #else
  338. ldr L , KK
  339. #ifdef LEFT
  340. add L , L , #4 // number of values in AO
  341. #else
  342. add L , L , #2 // number of values in BO
  343. #endif
  344. str L , KKK
  345. #endif
  346. mov K1, L
  347. asrs L , K1, #3 // L = L / 8
  348. ble _L2_M4_40
  349. .align 5
  350. _L2_M4_22:
  351. pld [ BO , #B_PRE ]
  352. KERNEL4x2_SUB
  353. KERNEL4x2_SUB
  354. pld [ BO , #B_PRE ]
  355. KERNEL4x2_SUB
  356. KERNEL4x2_SUB
  357. pld [ BO , #B_PRE ]
  358. KERNEL4x2_SUB
  359. KERNEL4x2_SUB
  360. pld [ BO , #B_PRE ]
  361. KERNEL4x2_SUB
  362. KERNEL4x2_SUB
  363. subs L, L, #1
  364. bgt _L2_M4_22
  365. _L2_M4_40:
  366. ands L , K1, #7 // L = L % 8
  367. ble _L2_M4_100
  368. _L2_M4_42:
  369. KERNEL4x2_SUB
  370. subs L, L, #1
  371. bgt _L2_M4_42
  372. _L2_M4_100:
  373. SAVE4x2
  374. #if (defined(LEFT) && defined(TRANSA)) || \
  375. (!defined(LEFT) && !defined(TRANSA))
  376. ldr r3 , K
  377. ldr r4 , KKK
  378. sub r3 , r3 , r4
  379. lsls r4 , r3 , #4 // 2 double values
  380. add BO , BO , r4
  381. lsls r4 , r3 , #5 // 4 double values
  382. add AO , AO , r4
  383. #endif
  384. #if defined(LEFT)
  385. ldr r3 , KK
  386. add r3 , r3 , #4 // number of values in AO
  387. str r3 , KK
  388. #endif
  389. _L2_M4_END:
  390. subs I, I, #1
  391. bgt _L2_M4_20
  392. _L2_M2_BEGIN:
  393. ldr I, M
  394. tst I , #3
  395. ble _L2_END
  396. tst I, #2 // I = I / 2
  397. ble _L2_M1_BEGIN
  398. _L2_M2_20:
  399. INIT2x2
  400. #if (defined(LEFT) && defined(TRANSA)) || \
  401. (!defined(LEFT) && !defined(TRANSA))
  402. mov BO, BC
  403. #else
  404. mov BO, BC
  405. ldr r3 , KK
  406. lsls r4 , r3 , #4 // 2 double values
  407. add BO , BO , r4
  408. lsls r4 , r3 , #4 // 2 double values
  409. add AO , AO , r4
  410. #endif
  411. #ifndef TRMMKERNEL
  412. ldr L , K
  413. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  414. ldr L , K
  415. ldr r3, KK
  416. sub L , L, r3
  417. str L , KKK
  418. #else
  419. ldr L , KK
  420. #ifdef LEFT
  421. add L , L , #2 // number of values in AO
  422. #else
  423. add L , L , #2 // number of values in BO
  424. #endif
  425. str L , KKK
  426. #endif
  427. mov K1, L
  428. asrs L , K1, #3 // L = L / 8
  429. ble _L2_M2_40
  430. _L2_M2_22:
  431. KERNEL2x2_SUB
  432. KERNEL2x2_SUB
  433. KERNEL2x2_SUB
  434. KERNEL2x2_SUB
  435. KERNEL2x2_SUB
  436. KERNEL2x2_SUB
  437. KERNEL2x2_SUB
  438. KERNEL2x2_SUB
  439. subs L, L, #1
  440. bgt _L2_M2_22
  441. _L2_M2_40:
  442. ands L , K1, #7 // L = L % 8
  443. ble _L2_M2_100
  444. _L2_M2_42:
  445. KERNEL2x2_SUB
  446. subs L, L, #1
  447. bgt _L2_M2_42
  448. _L2_M2_100:
  449. SAVE2x2
  450. #if (defined(LEFT) && defined(TRANSA)) || \
  451. (!defined(LEFT) && !defined(TRANSA))
  452. ldr r3 , K
  453. ldr r4 , KKK
  454. sub r3 , r3 , r4
  455. lsls r4 , r3 , #4 // 2 double values
  456. add BO , BO , r4
  457. lsls r4 , r3 , #4 // 2 double values
  458. add AO , AO , r4
  459. #endif
  460. #if defined(LEFT)
  461. ldr r3 , KK
  462. add r3 , r3 , #2 // number of values in AO
  463. str r3 , KK
  464. #endif
  465. _L2_M2_END:
  466. _L2_M1_BEGIN:
  467. tst I, #1 // I = I % 2
  468. ble _L2_END
  469. _L2_M1_20:
  470. INIT1x2
  471. #if (defined(LEFT) && defined(TRANSA)) || \
  472. (!defined(LEFT) && !defined(TRANSA))
  473. mov BO, BC
  474. #else
  475. mov BO, BC
  476. ldr r3 , KK
  477. lsls r4 , r3 , #4 // 2 double values
  478. add BO , BO , r4
  479. lsls r4 , r3 , #3 // 1 double value
  480. add AO , AO , r4
  481. #endif
  482. #ifndef TRMMKERNEL
  483. ldr L , K
  484. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  485. ldr L , K
  486. ldr r3, KK
  487. sub L , L, r3
  488. str L , KKK
  489. #else
  490. ldr L , KK
  491. #ifdef LEFT
  492. add L , L , #1 // number of values in AO
  493. #else
  494. add L , L , #2 // number of values in BO
  495. #endif
  496. str L , KKK
  497. #endif
  498. mov K1, L
  499. asrs L , K1, #3 // L = L / 8
  500. ble _L2_M1_40
  501. _L2_M1_22:
  502. KERNEL1x2_SUB
  503. KERNEL1x2_SUB
  504. KERNEL1x2_SUB
  505. KERNEL1x2_SUB
  506. KERNEL1x2_SUB
  507. KERNEL1x2_SUB
  508. KERNEL1x2_SUB
  509. KERNEL1x2_SUB
  510. subs L, L, #1
  511. bgt _L2_M1_22
  512. _L2_M1_40:
  513. ands L , K1, #7 // L = L % 8
  514. ble _L2_M1_100
  515. _L2_M1_42:
  516. KERNEL1x2_SUB
  517. subs L, L, #1
  518. bgt _L2_M1_42
  519. _L2_M1_100:
  520. SAVE1x2
  521. #if (defined(LEFT) && defined(TRANSA)) || \
  522. (!defined(LEFT) && !defined(TRANSA))
  523. ldr r3 , K
  524. ldr r4 , KKK
  525. sub r3 , r3 , r4
  526. lsls r4 , r3 , #4 // 2 double values
  527. add BO , BO , r4
  528. lsls r4 , r3 , #3 // 1 double value
  529. add AO , AO , r4
  530. #endif
  531. #if defined(LEFT)
  532. ldr r3 , KK
  533. add r3 , r3 , #1 // number of values in AO
  534. str r3 , KK
  535. #endif
  536. _L2_END:
  537. mov r3, BC
  538. ldr r4, K
  539. lsl r4, r4, #4 // k * 2 * 8
  540. add r3, r3, r4 // B = B + K * 2 * 8
  541. mov BC, r3
  542. #if !defined(LEFT)
  543. ldr r3 , KK
  544. add r3 , r3 , #2 // number of values in BO
  545. str r3 , KK
  546. #endif
  547. subs J , #1 // j--
  548. bgt _L2_BEGIN
  549. /*********************************************************************************************/
  550. _L1_BEGIN:
  551. ldr J , N
  552. tst J , #1
  553. ble _L999
  554. ldr CO1, C // CO1 = C
  555. ldr r4 , LDC
  556. add r3 , r4, CO1
  557. str r3 , C // store C
  558. #if defined(LEFT)
  559. ldr r3 , OFFSET
  560. str r3 , KK
  561. #endif
  562. ldr AO, A // AO = A
  563. //pld [AO , #A_PRE-96]
  564. //pld [AO , #A_PRE-64]
  565. //pld [AO , #A_PRE-32]
  566. _L1_M4_BEGIN:
  567. ldr I, M
  568. asrs I, I, #2 // I = I / 4
  569. ble _L1_M2_BEGIN
  570. _L1_M4_20:
  571. INIT4x1
  572. #if (defined(LEFT) && defined(TRANSA)) || \
  573. (!defined(LEFT) && !defined(TRANSA))
  574. mov BO, BC
  575. #else
  576. mov BO, BC
  577. ldr r3 , KK
  578. lsls r4 , r3 , #3 // 1 double value
  579. add BO , BO , r4
  580. lsls r4 , r3 , #5 // 4 double values
  581. add AO , AO , r4
  582. #endif
  583. #ifndef TRMMKERNEL
  584. ldr L , K
  585. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  586. ldr L , K
  587. ldr r3, KK
  588. sub L , L, r3
  589. str L , KKK
  590. #else
  591. ldr L , KK
  592. #ifdef LEFT
  593. add L , L , #4 // number of values in AO
  594. #else
  595. add L , L , #1 // number of values in BO
  596. #endif
  597. str L , KKK
  598. #endif
  599. mov K1, L
  600. asrs L , K1, #3 // L = L / 8
  601. ble _L1_M4_40
  602. .align 5
  603. _L1_M4_22:
  604. KERNEL4x1_SUB
  605. KERNEL4x1_SUB
  606. KERNEL4x1_SUB
  607. KERNEL4x1_SUB
  608. KERNEL4x1_SUB
  609. KERNEL4x1_SUB
  610. KERNEL4x1_SUB
  611. KERNEL4x1_SUB
  612. subs L, L, #1
  613. bgt _L1_M4_22
  614. _L1_M4_40:
  615. ands L , K1, #7 // L = L % 8
  616. ble _L1_M4_100
  617. _L1_M4_42:
  618. KERNEL4x1_SUB
  619. subs L, L, #1
  620. bgt _L1_M4_42
  621. _L1_M4_100:
  622. SAVE4x1
  623. #if (defined(LEFT) && defined(TRANSA)) || \
  624. (!defined(LEFT) && !defined(TRANSA))
  625. ldr r3 , K
  626. ldr r4 , KKK
  627. sub r3 , r3 , r4
  628. lsls r4 , r3 , #3 // 1 double value
  629. add BO , BO , r4
  630. lsls r4 , r3 , #5 // 4 double values
  631. add AO , AO , r4
  632. #endif
  633. #if defined(LEFT)
  634. ldr r3 , KK
  635. add r3 , r3 , #4 // number of values in AO
  636. str r3 , KK
  637. #endif
  638. _L1_M4_END:
  639. subs I, I, #1
  640. bgt _L1_M4_20
  641. _L1_M2_BEGIN:
  642. ldr I, M
  643. tst I , #3
  644. ble _L1_END
  645. tst I, #2 // I = I / 2
  646. ble _L1_M1_BEGIN
  647. _L1_M2_20:
  648. INIT2x1
  649. #if (defined(LEFT) && defined(TRANSA)) || \
  650. (!defined(LEFT) && !defined(TRANSA))
  651. mov BO, BC
  652. #else
  653. mov BO, BC
  654. ldr r3 , KK
  655. lsls r4 , r3 , #3 // 1 double value
  656. add BO , BO , r4
  657. lsls r4 , r3 , #4 // 2 double values
  658. add AO , AO , r4
  659. #endif
  660. #ifndef TRMMKERNEL
  661. ldr L , K
  662. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  663. ldr L , K
  664. ldr r3, KK
  665. sub L , L, r3
  666. str L , KKK
  667. #else
  668. ldr L , KK
  669. #ifdef LEFT
  670. add L , L , #2 // number of values in AO
  671. #else
  672. add L , L , #1 // number of values in BO
  673. #endif
  674. str L , KKK
  675. #endif
  676. mov K1, L
  677. asrs L , K1, #3 // L = L / 8
  678. ble _L1_M2_40
  679. _L1_M2_22:
  680. KERNEL2x1_SUB
  681. KERNEL2x1_SUB
  682. KERNEL2x1_SUB
  683. KERNEL2x1_SUB
  684. KERNEL2x1_SUB
  685. KERNEL2x1_SUB
  686. KERNEL2x1_SUB
  687. KERNEL2x1_SUB
  688. subs L, L, #1
  689. bgt _L1_M2_22
  690. _L1_M2_40:
  691. ands L , K1, #7 // L = L % 8
  692. ble _L1_M2_100
  693. _L1_M2_42:
  694. KERNEL2x1_SUB
  695. subs L, L, #1
  696. bgt _L1_M2_42
  697. _L1_M2_100:
  698. SAVE2x1
  699. #if (defined(LEFT) && defined(TRANSA)) || \
  700. (!defined(LEFT) && !defined(TRANSA))
  701. ldr r3 , K
  702. ldr r4 , KKK
  703. sub r3 , r3 , r4
  704. lsls r4 , r3 , #3 // 1 double value
  705. add BO , BO , r4
  706. lsls r4 , r3 , #4 // 2 double values
  707. add AO , AO , r4
  708. #endif
  709. #if defined(LEFT)
  710. ldr r3 , KK
  711. add r3 , r3 , #2 // number of values in AO
  712. str r3 , KK
  713. #endif
  714. _L1_M2_END:
  715. _L1_M1_BEGIN:
  716. tst I, #1 // I = I % 2
  717. ble _L1_END
  718. _L1_M1_20:
  719. INIT1x1
  720. #if (defined(LEFT) && defined(TRANSA)) || \
  721. (!defined(LEFT) && !defined(TRANSA))
  722. mov BO, BC
  723. #else
  724. mov BO, BC
  725. ldr r3 , KK
  726. lsls r4 , r3 , #3 // 1 double value
  727. add BO , BO , r4
  728. lsls r4 , r3 , #3 // 1 double value
  729. add AO , AO , r4
  730. #endif
  731. #ifndef TRMMKERNEL
  732. ldr L , K
  733. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  734. ldr L , K
  735. ldr r3, KK
  736. sub L , L, r3
  737. str L , KKK
  738. #else
  739. ldr L , KK
  740. #ifdef LEFT
  741. add L , L , #1 // number of values in AO
  742. #else
  743. add L , L , #1 // number of values in BO
  744. #endif
  745. str L , KKK
  746. #endif
  747. mov K1, L
  748. asrs L , K1, #3 // L = L / 8
  749. ble _L1_M1_40
  750. _L1_M1_22:
  751. KERNEL1x1_SUB
  752. KERNEL1x1_SUB
  753. KERNEL1x1_SUB
  754. KERNEL1x1_SUB
  755. KERNEL1x1_SUB
  756. KERNEL1x1_SUB
  757. KERNEL1x1_SUB
  758. KERNEL1x1_SUB
  759. subs L, L, #1
  760. bgt _L1_M1_22
  761. _L1_M1_40:
  762. ands L , K1, #7 // L = L % 8
  763. ble _L1_M1_100
  764. _L1_M1_42:
  765. KERNEL1x1_SUB
  766. subs L, L, #1
  767. bgt _L1_M1_42
  768. _L1_M1_100:
  769. SAVE1x1
  770. _L1_END:
  771. _L999:
  772. sub r3, fp, #128
  773. vldm r3, { d8 - d15} // restore floating point registers
  774. movs r0, #0 // set return value
  775. sub sp, fp, #24
  776. pop {r4 - r9, fp}
  777. bx lr
  778. EPILOGUE