You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strmm_kernel_4x2_vfp.S 19 kB


  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  #define ASSEMBLER
  #include "common.h"

  /* Bytes of local frame reserved below the registers pushed in the prologue. */
  #define STACKSIZE       252

  /* Arguments as they arrive in registers. */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA       s0

  /******************************************************
  * [fp, #-128] - [fp, #-64] is reserved
  * for store and restore of floating point
  * registers
  *******************************************************/

  /*
   * Frame slots, addressed relative to fp.  The prologue sets fp = sp + 24
   * after pushing seven registers and then lowers sp by STACKSIZE, so the
   * lowest slot (ALPHA at fp - 276) sits exactly at the final sp.
   */
  #define FP_ZERO_1       [fp, #-228]
  #define FP_ZERO         [fp, #-232]
  #define FP_ZERO_0       [fp, #-232]

  #define KK              [fp, #-240]
  #define KKK             [fp, #-244]
  #define C               [fp, #-248]
  #define LDC             [fp, #-252]
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]
  #define A               [fp, #-268]

  #define ALPHA           [fp, #-276]

  /*
   * Stack-passed arguments.  Without __ARM_PCS_VFP, alpha arrives in r3 and
   * the A pointer becomes the first stack argument, which shifts the
   * remaining stack arguments up by one word.
   */
  #if !defined(__ARM_PCS_VFP)
  #define OLD_ALPHA_SOFTFP        r3
  #define OLD_A_SOFTFP            [fp, #4]
  #define B                       [fp, #8]
  #define OLD_C                   [fp, #12]
  #define OLD_LDC                 [fp, #16]
  #define OFFSET                  [fp, #20]
  #else
  #define B                       [fp, #4]
  #define OLD_C                   [fp, #8]
  #define OLD_LDC                 [fp, #12]
  #define OFFSET                  [fp, #16]
  #endif

  /* Working registers.  I, J and L reuse r0-r2 once M, N and K are spilled. */
  #define I               r0
  #define J               r1
  #define L               r2

  #define AO              r5
  #define BO              r6
  #define K1              r7
  #define CO1             r8
  #define CO2             r9
  #define BC              r12

  /* Prefetch distances in bytes, used as pld offsets. */
  #define A_PRE           64
  #define B_PRE           64
  #define C_PRE           64
  84. /**************************************************************************************
  85. * Macro definitions
  86. **************************************************************************************/
  /* INIT4x2: load the FP_ZERO stack word into s8 and copy it to s9-s15. */
  .macro INIT4x2

          vldr            s8 , FP_ZERO            // s8 = word stored at FP_ZERO
          vmov.f32        s9 , s8                 // first group:  s8  - s11
          vmov.f32        s10, s8
          vmov.f32        s11, s8
          vmov.f32        s12, s8                 // second group: s12 - s15
          vmov.f32        s13, s8
          vmov.f32        s14, s8
          vmov.f32        s15, s8

  .endm
  /*
   * KERNEL4x2_SUB: one k-step of the 4x2 tile.
   *   Loads four floats from AO into s0-s3 and two from BO into s4-s5, with
   *   write-back, so AO advances by 16 bytes and BO by 8.
   *   Accumulates s8-s11 += s0-s3 * s4 and s12-s15 += s0-s3 * s5.
   * The loads use the UAL spelling vldmia, which has the same encoding as
   * the legacy fldmias mnemonic.
   */
  .macro KERNEL4x2_SUB

          vldmia          AO!, { s0 - s3 }
          vldmia          BO!, { s4 - s5 }

          vmla.f32        s8  , s0, s4
          vmla.f32        s9  , s1, s4
          vmla.f32        s10 , s2, s4
          vmla.f32        s11 , s3, s4

          vmla.f32        s12 , s0, s5
          vmla.f32        s13 , s1, s5
          vmla.f32        s14 , s2, s5
          vmla.f32        s15 , s3, s5

  .endm
  /*
   * SAVE4x2: scale the 4x2 tile by ALPHA and store it.
   *   CO2 = CO1 + LDC; s8-s11 go to CO1[0..3], s12-s15 to CO2[0..3].
   *   The products overwrite the destination (C is not read).
   *   Uses r3 and s0, s4-s7 as scratch; CO1 advances by 16 bytes.
   */
  .macro SAVE4x2

          vldr            s0 , ALPHA
          ldr             r3 , LDC
          add             CO2, CO1, r3

          vmul.f32        s4 , s0 , s8
          vmul.f32        s5 , s0 , s9
          vmul.f32        s6 , s0 , s10
          vmul.f32        s7 , s0 , s11

          vstr            s4 , [CO1]
          vstr            s5 , [CO1, #4 ]
          vstr            s6 , [CO1, #8 ]
          vstr            s7 , [CO1, #12 ]

          vmul.f32        s4 , s0 , s12
          vmul.f32        s5 , s0 , s13
          vmul.f32        s6 , s0 , s14
          vmul.f32        s7 , s0 , s15

          vstr            s4 , [CO2]
          vstr            s5 , [CO2, #4 ]
          vstr            s6 , [CO2, #8 ]
          vstr            s7 , [CO2, #12 ]

          add             CO1, CO1, #16

  .endm
  131. /******************************************************************************/
  /* INIT2x2: load the FP_ZERO stack word into s8 and copy it to s9, s12, s13. */
  .macro INIT2x2

          vldr            s8 , FP_ZERO
          vmov.f32        s9 , s8
          vmov.f32        s12, s8
          vmov.f32        s13, s8

  .endm
  /*
   * KERNEL2x2_SUB: one k-step of the 2x2 tile.
   *   s0-s1 come from AO, s4-s5 from BO; both pointers advance by 8 bytes.
   *   s8-s9 += s0-s1 * s4 and s12-s13 += s0-s1 * s5.
   */
  .macro KERNEL2x2_SUB

          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s4 , [ BO ]
          vldr            s5 , [ BO, #4 ]

          vmla.f32        s8  , s0, s4
          vmla.f32        s9  , s1, s4
          vmla.f32        s12 , s0, s5
          vmla.f32        s13 , s1, s5

          add             AO , AO, #8
          add             BO , BO, #8

  .endm
  /*
   * SAVE2x2: scale the 2x2 tile by ALPHA and store it.
   *   CO2 = CO1 + LDC; s8-s9 go to CO1[0..1], s12-s13 to CO2[0..1].
   *   Uses r3 and s0, s4-s5 as scratch; CO1 advances by 8 bytes.
   */
  .macro SAVE2x2

          vldr            s0 , ALPHA
          ldr             r3 , LDC
          add             CO2, CO1, r3

          vmul.f32        s4 , s0 , s8
          vmul.f32        s5 , s0 , s9
          vstr            s4 , [CO1]
          vstr            s5 , [CO1, #4 ]

          vmul.f32        s4 , s0 , s12
          vmul.f32        s5 , s0 , s13
          vstr            s4 , [CO2]
          vstr            s5 , [CO2, #4 ]

          add             CO1, CO1, #8

  .endm
  164. /******************************************************************************/
  /* INIT1x2: load the FP_ZERO stack word into s8 and copy it to s12. */
  .macro INIT1x2

          vldr            s8 , FP_ZERO
          vmov.f32        s12, s8

  .endm
  /*
   * KERNEL1x2_SUB: one k-step of the 1x2 tile.
   *   s0 comes from AO (AO += 4), s4-s5 from BO (BO += 8).
   *   s8 += s0 * s4 and s12 += s0 * s5.
   */
  .macro KERNEL1x2_SUB

          vldr            s0 , [ AO ]
          vldr            s4 , [ BO ]
          vldr            s5 , [ BO, #4 ]

          vmla.f32        s8  , s0, s4
          vmla.f32        s12 , s0, s5

          add             AO , AO, #4
          add             BO , BO, #8

  .endm
  /*
   * SAVE1x2: scale the 1x2 tile by ALPHA and store it.
   *   CO2 = CO1 + LDC; s8 goes to [CO1], s12 to [CO2].
   *   Uses r3, s0 and s4 as scratch; CO1 advances by 4 bytes.
   */
  .macro SAVE1x2

          vldr            s0 , ALPHA
          ldr             r3 , LDC
          add             CO2, CO1, r3

          vmul.f32        s4 , s0 , s8
          vstr            s4 , [CO1]

          vmul.f32        s4 , s0 , s12
          vstr            s4 , [CO2]

          add             CO1, CO1, #4

  .endm
  188. /******************************************************************************/
  /* INIT4x1: load the FP_ZERO stack word into s8 and copy it to s9-s11. */
  .macro INIT4x1

          vldr            s8 , FP_ZERO
          vmov.f32        s9 , s8
          vmov.f32        s10, s8
          vmov.f32        s11, s8

  .endm
  /*
   * KERNEL4x1_SUB: one k-step of the 4x1 tile.
   *   s0-s3 come from AO (AO += 16), s4 from BO (BO += 4).
   *   s8-s11 += s0-s3 * s4.
   */
  .macro KERNEL4x1_SUB

          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s2 , [ AO, #8 ]
          vldr            s3 , [ AO, #12 ]
          vldr            s4 , [ BO ]

          vmla.f32        s8  , s0, s4
          vmla.f32        s9  , s1, s4
          vmla.f32        s10 , s2, s4
          vmla.f32        s11 , s3, s4

          add             AO , AO, #16
          add             BO , BO, #4

  .endm
  /*
   * SAVE4x1: scale s8-s11 by ALPHA and store them to CO1[0..3].
   *   Uses s0 and s4-s7 as scratch; CO1 advances by 16 bytes.
   */
  .macro SAVE4x1

          vldr            s0 , ALPHA

          vmul.f32        s4 , s0 , s8
          vmul.f32        s5 , s0 , s9
          vmul.f32        s6 , s0 , s10
          vmul.f32        s7 , s0 , s11

          vstr            s4 , [CO1]
          vstr            s5 , [CO1, #4 ]
          vstr            s6 , [CO1, #8 ]
          vstr            s7 , [CO1, #12 ]

          add             CO1, CO1, #16

  .endm
  220. /******************************************************************************/
  /* INIT2x1: load the FP_ZERO stack word into s8 and copy it to s9. */
  .macro INIT2x1

          vldr            s8 , FP_ZERO
          vmov.f32        s9 , s8

  .endm
  /*
   * KERNEL2x1_SUB: one k-step of the 2x1 tile.
   *   s0-s1 come from AO (AO += 8), s4 from BO (BO += 4).
   *   s8-s9 += s0-s1 * s4.
   */
  .macro KERNEL2x1_SUB

          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s4 , [ BO ]

          vmla.f32        s8  , s0, s4
          vmla.f32        s9  , s1, s4

          add             AO , AO, #8
          add             BO , BO, #4

  .endm
  /*
   * SAVE2x1: scale s8-s9 by ALPHA and store them to CO1[0..1].
   *   Uses s0 and s4-s5 as scratch; CO1 advances by 8 bytes.
   */
  .macro SAVE2x1

          vldr            s0 , ALPHA

          vmul.f32        s4 , s0 , s8
          vmul.f32        s5 , s0 , s9

          vstr            s4 , [CO1]
          vstr            s5 , [CO1, #4 ]

          add             CO1, CO1, #8

  .endm
  242. /******************************************************************************/
  /* INIT1x1: load the FP_ZERO stack word into s8. */
  .macro INIT1x1

          vldr            s8 , FP_ZERO

  .endm
  /*
   * KERNEL1x1_SUB: one k-step of the 1x1 tile.
   *   s0 comes from AO, s4 from BO; both pointers advance by 4 bytes.
   *   s8 += s0 * s4.
   */
  .macro KERNEL1x1_SUB

          vldr            s0 , [ AO ]
          vldr            s4 , [ BO ]

          vmla.f32        s8  , s0, s4

          add             AO , AO, #4
          add             BO , BO, #4

  .endm
  /*
   * SAVE1x1: scale s8 by ALPHA and store it to [CO1].
   *   Uses s0 and s4 as scratch; CO1 advances by 4 bytes.
   */
  .macro SAVE1x1

          vldr            s0 , ALPHA

          vmul.f32        s4 , s0 , s8
          vstr            s4 , [CO1]

          add             CO1, CO1, #4

  .endm
  259. /**************************************************************************************
  260. * End of macro definitions
  261. **************************************************************************************/
  /*
   * Function entry: build the frame, spill the arguments into their frame
   * slots, derive LDC in bytes and the initial KK, then dispatch on N / 2.
   *
   * After the push, fp points at the saved fp word, so stack-passed
   * arguments start at [fp, #4] and locals live at negative offsets.
   */
          PROLOGUE

          .align 5

          push    {r4 - r9, fp}
          add     fp, sp, #24
          sub     sp, sp, #STACKSIZE              // reserve stack

  #if !defined(__ARM_PCS_VFP)
          vmov    OLD_ALPHA, OLD_ALPHA_SOFTFP     // alpha arrives in a core register
          ldr     OLD_A, OLD_A_SOFTFP             // A arrives on the stack
  #endif

          str     OLD_M, M
          str     OLD_N, N
          str     OLD_K, K
          str     OLD_A, A
          vstr    OLD_ALPHA, ALPHA

          sub     r3, fp, #128
          vstm    r3, { s8 - s15}                 // store floating point registers

          movs    r4, #0                          // zero words read back by the INIT macros
          str     r4, FP_ZERO
          str     r4, FP_ZERO_1

          ldr     r3, OLD_LDC
          lsl     r3, r3, #2                      // ldc = ldc * 4 (bytes per float)
          str     r3, LDC

          ldr     r3, OLD_C
          str     r3, C

          ldr     BC, B

          ldr     r3, OFFSET
  #ifndef LEFT
          neg     r3 , r3                         // KK starts at -OFFSET when LEFT is not defined
  #endif
          str     r3 , KK

          ldr     J, N
          asr     J, J, #1                        // J = N / 2
          // 'ble' tests N != V.  A flag-setting shift leaves V untouched, and
          // V is undefined on function entry, so compare explicitly.
          cmp     J, #0
          ble     _L1_BEGIN
  /*
   * Two-column loop: each pass handles the two LDC-strided columns CO1 and
   * CO1 + LDC for all M rows, in row tiles of 4, then 2, then 1.  J counts
   * the remaining column pairs.
   *
   * Per tile, L is the number of k-steps: K when TRMMKERNEL is not defined,
   * otherwise K - KK or KK + tile width depending on LEFT/TRANSA; the
   * TRMMKERNEL value is also kept in KKK.
   *
   * Zero/sign tests use 'beq' or an explicit 'cmp' so that no branch
   * depends on a V flag left over from earlier code (tst, ands and
   * flag-setting shifts do not write V).
   */
  _L2_BEGIN:

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          lsl     r4 , r4 , #1                    // LDC * 2
          add     r3 , r4, CO1
          str     r3 , C                          // C moves on by two columns

  #if defined(LEFT)
          ldr     r3 , OFFSET
          str     r3 , KK                         // KK restarts at OFFSET
  #endif

          ldr     AO, A                           // AO = A

  _L2_M4_BEGIN:

          ldr     I, M
          asr     I, I, #2                        // I = M / 4
          cmp     I, #0
          ble     _L2_M2_BEGIN

  _L2_M4_20:

          INIT4x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #3                    // KK * 2 floats
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // KK * 4 floats
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #4                      // number of values in AO
  #else
          add     L , L , #2                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8 (unrolled passes)
          cmp     L , #0
          ble     _L2_M4_40
          .align 5

  _L2_M4_22:

          pld     [ AO , #A_PRE ]
          pld     [ BO , #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB
          pld     [ AO , #A_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB

          pld     [ AO , #A_PRE ]
          pld     [ BO , #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB
          pld     [ AO , #A_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB

          subs    L, L, #1
          bgt     _L2_M4_22

  _L2_M4_40:

          ands    L , K1, #7                      // L = K1 % 8 (leftover steps)
          beq     _L2_M4_100

  _L2_M4_42:

          KERNEL4x2_SUB

          subs    L, L, #1
          bgt     _L2_M4_42

  _L2_M4_100:

          SAVE4x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4                    // k-steps not consumed by this tile
          lsls    r4 , r3 , #3                    // * 2 floats
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // * 4 floats
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #4                    // number of values in AO
          str     r3 , KK
  #endif

  _L2_M4_END:

          subs    I, I, #1
          bgt     _L2_M4_20

  _L2_M2_BEGIN:

          ldr     I, M
          tst     I , #3                          // any rows left over after the 4-row tiles?
          beq     _L2_END

          tst     I, #2                           // is there a 2-row tile?
          beq     _L2_M1_BEGIN

  _L2_M2_20:

          INIT2x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #3                    // KK * 2 floats
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // KK * 2 floats
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #2                      // number of values in AO
  #else
          add     L , L , #2                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L2_M2_40

  _L2_M2_22:

          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB

          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB

          subs    L, L, #1
          bgt     _L2_M2_22

  _L2_M2_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L2_M2_100

  _L2_M2_42:

          KERNEL2x2_SUB

          subs    L, L, #1
          bgt     _L2_M2_42

  _L2_M2_100:

          SAVE2x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #3                    // * 2 floats
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // * 2 floats
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #2                    // number of values in AO
          str     r3 , KK
  #endif

  _L2_M2_END:

  _L2_M1_BEGIN:

          tst     I, #1                           // is there a final single row?
          beq     _L2_END

  _L2_M1_20:

          INIT1x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #3                    // KK * 2 floats
          add     BO , BO , r4
          lsls    r4 , r3 , #2                    // KK * 1 float
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #1                      // number of values in AO
  #else
          add     L , L , #2                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L2_M1_40

  _L2_M1_22:

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     _L2_M1_22

  _L2_M1_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L2_M1_100

  _L2_M1_42:

          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     _L2_M1_42

  _L2_M1_100:

          SAVE1x2

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #3                    // * 2 floats
          add     BO , BO , r4
          lsls    r4 , r3 , #2                    // * 1 float
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #1                    // number of values in AO
          str     r3 , KK
  #endif

  _L2_END:

          mov     r3, BC
          ldr     r4, K
          lsl     r4, r4, #3                      // k * 2 * 4
          add     r3, r3, r4                      // B = B + K * 2 * 4
          mov     BC, r3

  #if !defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #2                    // number of values in BO
          str     r3 , KK
  #endif

          subs    J , J , #1                      // j--
          bgt     _L2_BEGIN
  544. /*********************************************************************************************/
  /*
   * Single-column tail: runs once when N is odd and handles the remaining
   * column at CO1 for all M rows, in row tiles of 4, then 2, then 1.
   *
   * Zero/sign tests use 'beq' or an explicit 'cmp' so that no branch
   * depends on a V flag left over from earlier code (tst, ands and
   * flag-setting shifts do not write V).
   */
  _L1_BEGIN:

          ldr     J , N
          tst     J , #1                          // odd column left?
          beq     _L999

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          add     r3 , r4, CO1
          str     r3 , C                          // C moves on by one column

  #if defined(LEFT)
          ldr     r3 , OFFSET
          str     r3 , KK                         // KK restarts at OFFSET
  #endif

          ldr     AO, A                           // AO = A
          //pld [AO , #A_PRE-96]
          //pld [AO , #A_PRE-64]
          //pld [AO , #A_PRE-32]

  _L1_M4_BEGIN:

          ldr     I, M
          asr     I, I, #2                        // I = M / 4
          cmp     I, #0
          ble     _L1_M2_BEGIN

  _L1_M4_20:

          INIT4x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #2                    // KK * 1 float
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // KK * 4 floats
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #4                      // number of values in AO
  #else
          add     L , L , #1                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8 (unrolled passes)
          cmp     L , #0
          ble     _L1_M4_40
          .align 5

  _L1_M4_22:

          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB

          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB

          subs    L, L, #1
          bgt     _L1_M4_22

  _L1_M4_40:

          ands    L , K1, #7                      // L = K1 % 8 (leftover steps)
          beq     _L1_M4_100

  _L1_M4_42:

          KERNEL4x1_SUB

          subs    L, L, #1
          bgt     _L1_M4_42

  _L1_M4_100:

          SAVE4x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4                    // k-steps not consumed by this tile
          lsls    r4 , r3 , #2                    // * 1 float
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // * 4 floats
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #4                    // number of values in AO
          str     r3 , KK
  #endif

  _L1_M4_END:

          subs    I, I, #1
          bgt     _L1_M4_20

  _L1_M2_BEGIN:

          ldr     I, M
          tst     I , #3                          // any rows left over after the 4-row tiles?
          beq     _L1_END

          tst     I, #2                           // is there a 2-row tile?
          beq     _L1_M1_BEGIN

  _L1_M2_20:

          INIT2x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #2                    // KK * 1 float
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // KK * 2 floats
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #2                      // number of values in AO
  #else
          add     L , L , #1                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L1_M2_40

  _L1_M2_22:

          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB

          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB

          subs    L, L, #1
          bgt     _L1_M2_22

  _L1_M2_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L1_M2_100

  _L1_M2_42:

          KERNEL2x1_SUB

          subs    L, L, #1
          bgt     _L1_M2_42

  _L1_M2_100:

          SAVE2x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #2                    // * 1 float
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // * 2 floats
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #2                    // number of values in AO
          str     r3 , KK
  #endif

  _L1_M2_END:

  _L1_M1_BEGIN:

          tst     I, #1                           // is there a final single row?
          beq     _L1_END

  _L1_M1_20:

          INIT1x1

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #2                    // KK * 1 float
          add     BO , BO , r4
          lsls    r4 , r3 , #2                    // KK * 1 float
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     L , K
          ldr     r3, KK
          sub     L , L, r3
          str     L , KKK
  #else
          ldr     L , KK
  #ifdef LEFT
          add     L , L , #1                      // number of values in AO
  #else
          add     L , L , #1                      // number of values in BO
  #endif
          str     L , KKK
  #endif

          mov     K1, L
          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L1_M1_40

  _L1_M1_22:

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     _L1_M1_22

  _L1_M1_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L1_M1_100

  _L1_M1_42:

          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     _L1_M1_42

  _L1_M1_100:

          SAVE1x1

  _L1_END:
  /*
   * Common exit: reload s8-s15 from the save area at fp - 128, return 0 in
   * r0, and unwind the frame built in the prologue.
   */
  _L999:

          movs    r0, #0                          // set return value

          sub     r3, fp, #128
          vldm    r3, { s8 - s15}                 // restore floating point registers

          sub     sp, fp, #24                     // sp back to the pushed-register block
          pop     {r4 - r9, fp}
          bx      lr

          EPILOGUE