You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_kernel_4x2_vfp.S 13 kB


  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/27 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  #define STACKSIZE       256             /* bytes of local frame reserved below the pushed registers */

  /* Incoming arguments that arrive in registers. */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA       d0

  /*
   * Frame layout, all offsets relative to fp:
   *
   *   [fp, #4], [fp, #8], [fp, #12]   B, C and ldc as read from above the
   *                                   pushed registers
   *   [fp, #-128] .. [fp, #-64]       reserved for store and restore of the
   *                                   floating point registers d8 - d15
   *   [fp, #-252] .. [fp, #-268]      spilled copies of ldc, M, N, K and A
   *   [fp, #-280]                     spilled alpha (8 bytes)
   */
  #define LDC             [fp, #-252]
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]
  #define A               [fp, #-268]
  #define ALPHA           [fp, #-280]

  #define B               [fp, #4]
  #define C               [fp, #8]
  #define OLD_LDC         [fp, #12]

  /* Loop counters: they reuse r0 - r2 once M, N and K have been spilled. */
  #define I               r0
  #define J               r1
  #define L               r2

  /* Working pointers. */
  #define AO              r5              /* read cursor into A                  */
  #define BO              r6              /* read cursor into B                  */
  #define CO1             r8              /* write cursor, first column of C     */
  #define CO2             r9              /* write cursor, second column of C    */
  #define K1              r7              /* K kept in a register                */
  #define BC              r12             /* start of the current B panel        */

  /* Byte offsets handed to pld when prefetching A, B and C. */
  #define A_PRE           96
  #define B_PRE           96
  #define C_PRE           32
  68. /**************************************************************************************
  69. * Macro definitions
  70. **************************************************************************************/
  /*
   * INIT4x2 - set the eight accumulators d8 - d15 of the 4x2 tile to +0.0.
   *
   * The zero is built from an integer register instead of computing d8 - d8:
   * that subtraction produces NaN whenever d8 already holds NaN or +/-Inf,
   * and the NaN would then be copied into every other accumulator.
   *
   * Clobbers: r3 (scratch), d8 - d15. Condition flags are left untouched.
   */
  .macro INIT4x2

          mov             r3 , #0
          vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
          vmov.f64        d9 , d8
          vmov.f64        d10, d8
          vmov.f64        d11, d8
          vmov.f64        d12, d8
          vmov.f64        d13, d8
          vmov.f64        d14, d8
          vmov.f64        d15, d8

  .endm
  /*
   * KERNEL4x2_SUB - one k-step of the 4x2 tile.
   *
   * Reads four doubles a0..a3 at AO and two doubles b0, b1 at BO, then
   *     d8  .. d11 += a0..a3 * b0
   *     d12 .. d15 += a0..a3 * b1
   * and advances AO by 32 bytes and BO by 16 bytes. Loads, multiply-adds and
   * pointer updates are interleaved; the order below is the original order.
   *
   * Clobbers: d0 - d5.
   */
  .macro KERNEL4x2_SUB

          pld             [ AO , #A_PRE ]         // prefetch ahead in A

          vldr            d4 , [ BO ]             // b0
          vldr            d0 , [ AO ]             // a0
          vldr            d1 , [ AO , #8 ]        // a1

          vmla.f64        d8 , d0 , d4
          vldr            d2 , [ AO , #16 ]       // a2
          vmla.f64        d9 , d1 , d4
          vldr            d3 , [ AO , #24 ]       // a3
          vmla.f64        d10, d2 , d4
          vldr            d5 , [ BO , #8 ]        // b1
          vmla.f64        d11, d3 , d4

          vmla.f64        d12, d0 , d5
          vmla.f64        d13, d1 , d5
          add             AO , AO , #32           // 4 doubles consumed from A
          vmla.f64        d14, d2 , d5
          add             BO , BO , #16           // 2 doubles consumed from B
          vmla.f64        d15, d3 , d5

  .endm
  /*
   * SAVE4x2 - fold the 4x2 accumulators into C.
   *
   * CO2 is set to CO1 + LDC (the byte stride spilled in the frame). Then
   *     CO1[0..3] += alpha * d8  .. d11
   *     CO2[0..3] += alpha * d12 .. d15
   * and CO1 moves forward by 32 bytes (four doubles).
   *
   * Clobbers: r3, CO2, d0, d4 - d7.
   */
  .macro SAVE4x2

          ldr             r3 , LDC
          add             CO2 , CO1 , r3          // second column of the tile
          vldr            d0 , ALPHA

          /* first column */
          vldr            d4 , [ CO1 ]
          vldr            d5 , [ CO1 , #8 ]
          pld             [ CO1 , #C_PRE ]
          vmla.f64        d4 , d0 , d8
          vldr            d6 , [ CO1 , #16 ]
          vmla.f64        d5 , d0 , d9
          vldr            d7 , [ CO1 , #24 ]
          vmla.f64        d6 , d0 , d10
          vstr            d4 , [ CO1 ]
          vmla.f64        d7 , d0 , d11
          vstr            d5 , [ CO1 , #8 ]
          vstr            d6 , [ CO1 , #16 ]
          vstr            d7 , [ CO1 , #24 ]

          /* second column */
          vldr            d4 , [ CO2 ]
          vldr            d5 , [ CO2 , #8 ]
          pld             [ CO2 , #C_PRE ]
          vmla.f64        d4 , d0 , d12
          vldr            d6 , [ CO2 , #16 ]
          vmla.f64        d5 , d0 , d13
          vldr            d7 , [ CO2 , #24 ]
          vmla.f64        d6 , d0 , d14
          vstr            d4 , [ CO2 ]
          vmla.f64        d7 , d0 , d15
          add             CO1 , CO1 , #32         // next tile, four rows further
          vstr            d5 , [ CO2 , #8 ]
          vstr            d6 , [ CO2 , #16 ]
          vstr            d7 , [ CO2 , #24 ]

  .endm
  132. /******************************************************************************/
  /*
   * INIT2x2 - set the four accumulators of the 2x2 tile (d8, d9, d12, d13)
   * to +0.0.
   *
   * The zero comes from an integer register rather than from d8 - d8, because
   * that subtraction yields NaN whenever d8 already holds NaN or +/-Inf.
   *
   * Clobbers: r3 (scratch), d8, d9, d12, d13. Condition flags are untouched.
   */
  .macro INIT2x2

          mov             r3 , #0
          vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
          vmov.f64        d9 , d8
          vmov.f64        d12, d8
          vmov.f64        d13, d8

  .endm
  /*
   * KERNEL2x2_SUB - one k-step of the 2x2 tile.
   *
   * Reads b0, b1 at BO and a0, a1 at AO, then
   *     d8  += a0 * b0      d9  += a1 * b0
   *     d12 += a0 * b1      d13 += a1 * b1
   * and advances both AO and BO by 16 bytes.
   *
   * Clobbers: d0, d1, d4, d5.
   */
  .macro KERNEL2x2_SUB

          vldr            d4 , [ BO ]             // b0
          vldr            d5 , [ BO , #8 ]        // b1

          vldr            d0 , [ AO ]             // a0
          vldr            d1 , [ AO , #8 ]        // a1

          vmla.f64        d8 , d0 , d4
          vmla.f64        d9 , d1 , d4

          vmla.f64        d12, d0 , d5
          vmla.f64        d13, d1 , d5

          add             AO , AO , #16
          add             BO , BO , #16

  .endm
  /*
   * SAVE2x2 - fold the 2x2 accumulators into C.
   *
   * CO2 is set to CO1 + LDC (byte stride spilled in the frame). Then
   *     CO1[0..1] += alpha * d8,  d9
   *     CO2[0..1] += alpha * d12, d13
   * and CO1 moves forward by 16 bytes (two doubles).
   *
   * Clobbers: r3, CO2, d0, d4, d5.
   */
  .macro SAVE2x2

          ldr             r3 , LDC
          add             CO2 , CO1 , r3          // second column of the tile
          vldr            d0 , ALPHA

          /* first column */
          vldr            d4 , [ CO1 ]
          vldr            d5 , [ CO1 , #8 ]
          vmla.f64        d4 , d0 , d8
          vmla.f64        d5 , d0 , d9
          vstr            d4 , [ CO1 ]
          vstr            d5 , [ CO1 , #8 ]

          /* second column */
          vldr            d4 , [ CO2 ]
          vldr            d5 , [ CO2 , #8 ]
          vmla.f64        d4 , d0 , d12
          vmla.f64        d5 , d0 , d13
          vstr            d4 , [ CO2 ]
          vstr            d5 , [ CO2 , #8 ]

          add             CO1 , CO1 , #16

  .endm
  169. /******************************************************************************/
  /*
   * INIT1x2 - set the two accumulators of the 1x2 tile (d8, d12) to +0.0.
   *
   * The zero comes from an integer register rather than from d8 - d8, because
   * that subtraction yields NaN whenever d8 already holds NaN or +/-Inf.
   *
   * Clobbers: r3 (scratch), d8, d12. Condition flags are untouched.
   */
  .macro INIT1x2

          mov             r3 , #0
          vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
          vmov.f64        d12, d8

  .endm
  /*
   * KERNEL1x2_SUB - one k-step of the 1x2 tile.
   *
   * Reads b0, b1 at BO and a0 at AO, then
   *     d8  += a0 * b0
   *     d12 += a0 * b1
   * and advances AO by 8 bytes and BO by 16 bytes.
   *
   * Clobbers: d0, d4, d5.
   */
  .macro KERNEL1x2_SUB

          vldr            d4 , [ BO ]             // b0
          vldr            d5 , [ BO , #8 ]        // b1

          vldr            d0 , [ AO ]             // a0

          vmla.f64        d8 , d0 , d4
          vmla.f64        d12, d0 , d5

          add             AO , AO , #8
          add             BO , BO , #16

  .endm
  /*
   * SAVE1x2 - fold the 1x2 accumulators into C.
   *
   * CO2 is set to CO1 + LDC (byte stride spilled in the frame). Then
   *     CO1[0] += alpha * d8
   *     CO2[0] += alpha * d12
   * and CO1 moves forward by 8 bytes (one double).
   *
   * Clobbers: r3, CO2, d0, d4.
   */
  .macro SAVE1x2

          ldr             r3 , LDC
          add             CO2 , CO1 , r3          // second column of the tile
          vldr            d0 , ALPHA

          vldr            d4 , [ CO1 ]
          vmla.f64        d4 , d0 , d8
          vstr            d4 , [ CO1 ]

          vldr            d4 , [ CO2 ]
          vmla.f64        d4 , d0 , d12
          vstr            d4 , [ CO2 ]

          add             CO1 , CO1 , #8

  .endm
  195. /******************************************************************************/
  /*
   * INIT4x1 - set the four accumulators of the 4x1 tile (d8 - d11) to +0.0.
   *
   * The zero comes from an integer register rather than from d8 - d8, because
   * that subtraction yields NaN whenever d8 already holds NaN or +/-Inf.
   *
   * Clobbers: r3 (scratch), d8 - d11. Condition flags are untouched.
   */
  .macro INIT4x1

          mov             r3 , #0
          vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
          vmov.f64        d9 , d8
          vmov.f64        d10, d8
          vmov.f64        d11, d8

  .endm
  /*
   * KERNEL4x1_SUB - one k-step of the 4x1 tile.
   *
   * Reads b0 at BO and a0..a3 at AO, then
   *     d8 .. d11 += a0..a3 * b0
   * and advances AO by 32 bytes and BO by 8 bytes.
   *
   * Clobbers: d0 - d4.
   */
  .macro KERNEL4x1_SUB

          vldr            d4 , [ BO ]             // b0

          vldr            d0 , [ AO ]             // a0
          vldr            d1 , [ AO , #8 ]        // a1
          vldr            d2 , [ AO , #16 ]       // a2
          vldr            d3 , [ AO , #24 ]       // a3

          vmla.f64        d8 , d0 , d4
          vmla.f64        d9 , d1 , d4
          vmla.f64        d10, d2 , d4
          vmla.f64        d11, d3 , d4

          add             AO , AO , #32
          add             BO , BO , #8

  .endm
  /*
   * SAVE4x1 - fold the 4x1 accumulators into C.
   *
   *     CO1[0..3] += alpha * d8 .. d11
   * then CO1 moves forward by 32 bytes (four doubles).
   *
   * Clobbers: d0, d4 - d7.
   */
  .macro SAVE4x1

          vldr            d0 , ALPHA

          vldr            d4 , [ CO1 ]
          vldr            d5 , [ CO1 , #8 ]
          vldr            d6 , [ CO1 , #16 ]
          vldr            d7 , [ CO1 , #24 ]

          vmla.f64        d4 , d0 , d8
          vmla.f64        d5 , d0 , d9
          vmla.f64        d6 , d0 , d10
          vmla.f64        d7 , d0 , d11

          vstr            d4 , [ CO1 ]
          vstr            d5 , [ CO1 , #8 ]
          vstr            d6 , [ CO1 , #16 ]
          vstr            d7 , [ CO1 , #24 ]

          add             CO1 , CO1 , #32

  .endm
  231. /******************************************************************************/
  /*
   * INIT2x1 - set the two accumulators of the 2x1 tile (d8, d9) to +0.0.
   *
   * The zero comes from an integer register rather than from d8 - d8, because
   * that subtraction yields NaN whenever d8 already holds NaN or +/-Inf.
   *
   * Clobbers: r3 (scratch), d8, d9. Condition flags are untouched.
   */
  .macro INIT2x1

          mov             r3 , #0
          vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
          vmov.f64        d9 , d8

  .endm
  /*
   * KERNEL2x1_SUB - one k-step of the 2x1 tile.
   *
   * Reads b0 at BO and a0, a1 at AO, then
   *     d8 += a0 * b0
   *     d9 += a1 * b0
   * and advances AO by 16 bytes and BO by 8 bytes.
   *
   * Clobbers: d0, d1, d4.
   */
  .macro KERNEL2x1_SUB

          vldr            d4 , [ BO ]             // b0

          vldr            d0 , [ AO ]             // a0
          vldr            d1 , [ AO , #8 ]        // a1

          vmla.f64        d8 , d0 , d4
          vmla.f64        d9 , d1 , d4

          add             AO , AO , #16
          add             BO , BO , #8

  .endm
  /*
   * SAVE2x1 - fold the 2x1 accumulators into C.
   *
   *     CO1[0..1] += alpha * d8, d9
   * then CO1 moves forward by 16 bytes (two doubles).
   *
   * Clobbers: d0, d4, d5.
   */
  .macro SAVE2x1

          vldr            d0 , ALPHA

          vldr            d4 , [ CO1 ]
          vldr            d5 , [ CO1 , #8 ]

          vmla.f64        d4 , d0 , d8
          vmla.f64        d5 , d0 , d9

          vstr            d4 , [ CO1 ]
          vstr            d5 , [ CO1 , #8 ]

          add             CO1 , CO1 , #16

  .endm
  255. /******************************************************************************/
  /*
   * INIT1x1 - set the single accumulator of the 1x1 tile (d8) to +0.0.
   *
   * The zero comes from an integer register rather than from d8 - d8, because
   * that subtraction yields NaN whenever d8 already holds NaN or +/-Inf.
   *
   * Clobbers: r3 (scratch), d8. Condition flags are untouched.
   */
  .macro INIT1x1

          mov             r3 , #0
          vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)

  .endm
  /*
   * KERNEL1x1_SUB - one k-step of the 1x1 tile.
   *
   * Reads b0 at BO and a0 at AO, accumulates d8 += a0 * b0, and advances
   * both AO and BO by 8 bytes.
   *
   * Clobbers: d0, d4.
   */
  .macro KERNEL1x1_SUB

          vldr            d4 , [ BO ]             // b0
          vldr            d0 , [ AO ]             // a0

          vmla.f64        d8 , d0 , d4

          add             AO , AO , #8
          add             BO , BO , #8

  .endm
  /*
   * SAVE1x1 - fold the 1x1 accumulator into C.
   *
   *     CO1[0] += alpha * d8
   * then CO1 moves forward by 8 bytes (one double).
   *
   * Clobbers: d0, d4.
   */
  .macro SAVE1x1

          vldr            d0 , ALPHA

          vldr            d4 , [ CO1 ]
          vmla.f64        d4 , d0 , d8
          vstr            d4 , [ CO1 ]

          add             CO1 , CO1 , #8

  .endm
  273. /**************************************************************************************
  274. * End of macro definitions
  275. **************************************************************************************/
  /*
   * Kernel entry point. PROLOGUE / EPILOGUE come from common.h.
   *
   * Inputs as consumed below:
   *     r0 = M, r1 = N, r2 = K, r3 = A, d0 = alpha
   *     [fp, #4] = B, [fp, #8] = C, [fp, #12] = ldc (scaled by 8 to bytes here)
   *
   * Work done: N is walked two columns at a time (then one leftover column),
   * M four rows at a time (then a 2-row and a 1-row remainder). For each tile
   * the matching INIT / KERNEL / SAVE macros accumulate K products and add
   * alpha times the result into C. A is consumed sequentially through AO over
   * a whole column block; BO restarts at BC for every tile and BC advances by
   * K * 16 bytes after each column pair.
   *
   * Returns r0 = 0. r4 - r9, fp and d8 - d15 are saved and restored.
   * r3 and r4 are used as scratch throughout.
   *
   * Note on the branch conditions: tst, ands and asrs write N and Z (and
   * possibly C) but never V, so a following "ble" (taken when Z is set or
   * N != V) would depend on whatever V was left behind by earlier code.
   * Bit tests therefore branch with "beq", and signed counts are shifted
   * with "asr" and then compared against zero before "ble".
   */
  PROLOGUE

          .align 5

          push    {r4 - r9, fp}
          add     fp, sp, #24                     // fp = address of the saved fp slot
          sub     sp, sp, #STACKSIZE              // reserve stack

          str     OLD_M, M
          str     OLD_N, N
          str     OLD_K, K
          str     OLD_A, A
          vstr    OLD_ALPHA, ALPHA

          sub     r3, fp, #128
          vstm    r3, { d8 - d15 }                // store floating point registers

          ldr     r3, OLD_LDC
          lsl     r3, r3, #3                      // ldc = ldc * 8
          str     r3, LDC

          ldr     K1, K
          ldr     BC, B

          ldr     J, N
          asr     J, J, #1                        // J = N / 2
          cmp     J, #0
          ble     dgemm_kernel_L1_BEGIN           // no complete column pair

  /*********************************************************************************************/

  dgemm_kernel_L2_BEGIN:

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          add     r3 , CO1, r4, lsl #1            // C + 2 * ldc bytes
          str     r3 , C                          // store C for the next column block

          ldr     AO, A                           // AO = A

  dgemm_kernel_L2_M4_BEGIN:

          ldr     I, M
          asr     I, I, #2                        // I = M / 4
          cmp     I, #0
          ble     dgemm_kernel_L2_M2_BEGIN

  dgemm_kernel_L2_M4_20:

          INIT4x2

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     dgemm_kernel_L2_M4_40
          .align 5

  dgemm_kernel_L2_M4_22:                          // eight k-steps per iteration

          pld     [ BO, #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB
          pld     [ BO, #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB

          pld     [ BO, #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB
          pld     [ BO, #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L2_M4_22

  dgemm_kernel_L2_M4_40:

          ands    L , K1, #7                      // L = K % 8
          beq     dgemm_kernel_L2_M4_100

  dgemm_kernel_L2_M4_42:

          KERNEL4x2_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L2_M4_42

  dgemm_kernel_L2_M4_100:

          SAVE4x2

  dgemm_kernel_L2_M4_END:

          subs    I, I, #1
          bgt     dgemm_kernel_L2_M4_20

  dgemm_kernel_L2_M2_BEGIN:

          ldr     I, M
          tst     I , #3
          beq     dgemm_kernel_L2_END             // M % 4 == 0: no leftover rows

          tst     I, #2
          beq     dgemm_kernel_L2_M1_BEGIN        // bit 1 clear: no 2-row tile

  dgemm_kernel_L2_M2_20:

          INIT2x2

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     dgemm_kernel_L2_M2_40

  dgemm_kernel_L2_M2_22:

          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB

          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L2_M2_22

  dgemm_kernel_L2_M2_40:

          ands    L , K1, #7                      // L = K % 8
          beq     dgemm_kernel_L2_M2_100

  dgemm_kernel_L2_M2_42:

          KERNEL2x2_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L2_M2_42

  dgemm_kernel_L2_M2_100:

          SAVE2x2

  dgemm_kernel_L2_M2_END:

  dgemm_kernel_L2_M1_BEGIN:

          tst     I, #1
          beq     dgemm_kernel_L2_END             // bit 0 clear: no single row left

  dgemm_kernel_L2_M1_20:

          INIT1x2

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     dgemm_kernel_L2_M1_40

  dgemm_kernel_L2_M1_22:

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L2_M1_22

  dgemm_kernel_L2_M1_40:

          ands    L , K1, #7                      // L = K % 8
          beq     dgemm_kernel_L2_M1_100

  dgemm_kernel_L2_M1_42:

          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L2_M1_42

  dgemm_kernel_L2_M1_100:

          SAVE1x2

  dgemm_kernel_L2_END:

          add     BC, BC, K1, lsl #4              // B = B + K * 2 * 8

          subs    J , J , #1                      // j--
          bgt     dgemm_kernel_L2_BEGIN

  /*********************************************************************************************/

  dgemm_kernel_L1_BEGIN:

          ldr     J , N
          tst     J , #1
          beq     dgemm_kernel_L999               // N even: no single column left

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          add     r3 , r4, CO1                    // C + ldc bytes
          str     r3 , C                          // store C

          ldr     AO, A                           // AO = A

  dgemm_kernel_L1_M4_BEGIN:

          ldr     I, M
          asr     I, I, #2                        // I = M / 4
          cmp     I, #0
          ble     dgemm_kernel_L1_M2_BEGIN

  dgemm_kernel_L1_M4_20:

          INIT4x1

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     dgemm_kernel_L1_M4_40
          .align 5

  dgemm_kernel_L1_M4_22:

          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB

          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L1_M4_22

  dgemm_kernel_L1_M4_40:

          ands    L , K1, #7                      // L = K % 8
          beq     dgemm_kernel_L1_M4_100

  dgemm_kernel_L1_M4_42:

          KERNEL4x1_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L1_M4_42

  dgemm_kernel_L1_M4_100:

          SAVE4x1

  dgemm_kernel_L1_M4_END:

          subs    I, I, #1
          bgt     dgemm_kernel_L1_M4_20

  dgemm_kernel_L1_M2_BEGIN:

          ldr     I, M
          tst     I , #3
          beq     dgemm_kernel_L1_END             // M % 4 == 0: no leftover rows

          tst     I, #2
          beq     dgemm_kernel_L1_M1_BEGIN        // bit 1 clear: no 2-row tile

  dgemm_kernel_L1_M2_20:

          INIT2x1

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     dgemm_kernel_L1_M2_40

  dgemm_kernel_L1_M2_22:

          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB

          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L1_M2_22

  dgemm_kernel_L1_M2_40:

          ands    L , K1, #7                      // L = K % 8
          beq     dgemm_kernel_L1_M2_100

  dgemm_kernel_L1_M2_42:

          KERNEL2x1_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L1_M2_42

  dgemm_kernel_L1_M2_100:

          SAVE2x1

  dgemm_kernel_L1_M2_END:

  dgemm_kernel_L1_M1_BEGIN:

          tst     I, #1
          beq     dgemm_kernel_L1_END             // bit 0 clear: no single row left

  dgemm_kernel_L1_M1_20:

          INIT1x1

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     dgemm_kernel_L1_M1_40

  dgemm_kernel_L1_M1_22:

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L1_M1_22

  dgemm_kernel_L1_M1_40:

          ands    L , K1, #7                      // L = K % 8
          beq     dgemm_kernel_L1_M1_100

  dgemm_kernel_L1_M1_42:

          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     dgemm_kernel_L1_M1_42

  dgemm_kernel_L1_M1_100:

          SAVE1x1

  dgemm_kernel_L1_END:

  dgemm_kernel_L999:

          sub     r3, fp, #128
          vldm    r3, { d8 - d15 }                // restore floating point registers

          movs    r0, #0                          // set return value
          sub     sp, fp, #24                     // drop the local frame
          pop     {r4 - r9, fp}
          bx      lr

  EPILOGUE