You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sdot_vfp.S 6.8 kB

12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago

  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/11 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK (no test for dsdot)
  31. * TEST : OK (no test for dsdot)
  32. *
  33. * 2016/01/23 Saar
  34. * Bugfix for Refs #750 and #740
  35. **************************************************************************************/
  36. #define ASSEMBLER
  37. #include "common.h"
  38. #define STACKSIZE 256 /* bytes of scratch stack reserved below the pushed registers */
  39. #define N r0 /* element count; tested against 0 and split into N / 4 and N & 3 */
  40. #define X r1 /* read pointer into the first input vector */
  41. #define INC_X r2 /* stride of X in elements; scaled by 4 before the strided loops */
  42. #define OLD_Y r3 /* second vector pointer as received; copied into Y at entry */
  43. /******************************************************
  44. * [fp, #-128] - [fp, #-64] is reserved for saving and
  45. * restoring floating point registers; this kernel uses
  46. * the 32 bytes starting at [fp, #-128] for s8 - s15
  47. *******************************************************/
  48. #define OLD_INC_Y [fp, #4 ] /* stride of Y: first word above the seven pushed registers (fp = sp + 24 after the push) */
  49. #define I r5 /* loop counter */
  50. #define Y r6 /* read pointer into the second input vector */
  51. #define INC_Y r7 /* stride of Y in elements; scaled by 4 before the strided loops */
  52. #define X_PRE 512 /* not referenced in the visible code; presumably a prefetch distance - TODO confirm */
  53. /**************************************************************************************
  54. * Macro definitions
  55. **************************************************************************************/
  56. #if defined(DSDOT)
  /*
   * KERNEL_F4 (DSDOT build): add four contiguous products x[i] * y[i] to d0.
   * X and Y are post-incremented by one float per element.
   * Both operands are widened to double BEFORE the multiply, so every product
   * is formed in double precision; multiplying in f32 first would round (and
   * possibly overflow/underflow) the product before it is accumulated.
   * Scratch: s14, s15, d4, d5. d4/d5 alias s8 - s11, which this file's entry
   * code stores to the stack and reloads before returning.
   */
  57. .macro KERNEL_F4
  58. vldmia.f32 X!, { s14 } // s14 = next x, X advances 4 bytes
  59. vldmia.f32 Y!, { s15 } // s15 = next y, Y advances 4 bytes
  60. vcvt.f64.f32 d4, s14 // d4 = (double) x
  61. vcvt.f64.f32 d5, s15 // d5 = (double) y
  62. vmla.f64 d0 , d4, d5 // d0 += d4 * d5
  63. vldmia.f32 X!, { s14 }
  64. vldmia.f32 Y!, { s15 }
  65. vcvt.f64.f32 d4, s14
  66. vcvt.f64.f32 d5, s15
  67. vmla.f64 d0 , d4, d5
  68. vldmia.f32 X!, { s14 }
  69. vldmia.f32 Y!, { s15 }
  70. vcvt.f64.f32 d4, s14
  71. vcvt.f64.f32 d5, s15
  72. vmla.f64 d0 , d4, d5
  73. vldmia.f32 X!, { s14 }
  74. vldmia.f32 Y!, { s15 }
  75. vcvt.f64.f32 d4, s14
  76. vcvt.f64.f32 d5, s15
  77. vmla.f64 d0 , d4, d5
  78. .endm
  /*
   * KERNEL_F1 (DSDOT build): add one contiguous product x[i] * y[i] to d0.
   * X and Y are post-incremented by one float.
   * The operands are widened to double before the multiply so the product is
   * not rounded to single precision first.
   * Scratch: s14, s15, d4, d5 (d4/d5 alias s8 - s11, which this file's entry
   * code stores to the stack and reloads before returning).
   */
  79. .macro KERNEL_F1
  80. vldmia.f32 X!, { s14 } // s14 = next x, X advances 4 bytes
  81. vldmia.f32 Y!, { s15 } // s15 = next y, Y advances 4 bytes
  82. vcvt.f64.f32 d4, s14 // d4 = (double) x
  83. vcvt.f64.f32 d5, s15 // d5 = (double) y
  84. vmla.f64 d0 , d4, d5 // d0 += d4 * d5
  85. .endm
  /*
   * KERNEL_S4 (DSDOT build): add four strided products x[i] * y[i] to d0.
   * After each element X advances by INC_X and Y by INC_Y; both are byte
   * strides by the time the strided loop runs (the caller shifts them by 2).
   * The operands are widened to double before the multiply so each product is
   * formed in double precision rather than rounded to single first.
   * Scratch: s14, s15, d4, d5 (d4/d5 alias s8 - s11, which this file's entry
   * code stores to the stack and reloads before returning).
   */
  86. .macro KERNEL_S4
  87. nop
  88. vldmia.f32 X, { s14 } // s14 = *X (no writeback)
  89. vldmia.f32 Y, { s15 } // s15 = *Y (no writeback)
  90. vcvt.f64.f32 d4, s14 // d4 = (double) x
  91. vcvt.f64.f32 d5, s15 // d5 = (double) y
  92. vmla.f64 d0 , d4, d5 // d0 += d4 * d5
  93. add X, X, INC_X // step to the next x
  94. add Y, Y, INC_Y // step to the next y
  95. vldmia.f32 X, { s14 }
  96. vldmia.f32 Y, { s15 }
  97. vcvt.f64.f32 d4, s14
  98. vcvt.f64.f32 d5, s15
  99. vmla.f64 d0 , d4, d5
  100. add X, X, INC_X
  101. add Y, Y, INC_Y
  102. vldmia.f32 X, { s14 }
  103. vldmia.f32 Y, { s15 }
  104. vcvt.f64.f32 d4, s14
  105. vcvt.f64.f32 d5, s15
  106. vmla.f64 d0 , d4, d5
  107. add X, X, INC_X
  108. add Y, Y, INC_Y
  109. vldmia.f32 X, { s14 }
  110. vldmia.f32 Y, { s15 }
  111. vcvt.f64.f32 d4, s14
  112. vcvt.f64.f32 d5, s15
  113. vmla.f64 d0 , d4, d5
  114. add X, X, INC_X
  115. add Y, Y, INC_Y
  116. .endm
  /*
   * KERNEL_S1 (DSDOT build): add one strided product x[i] * y[i] to d0, then
   * advance X by INC_X and Y by INC_Y (byte strides at this point).
   * The operands are widened to double before the multiply so the product is
   * not rounded to single precision first.
   * Scratch: s14, s15, d4, d5 (d4/d5 alias s8 - s11, which this file's entry
   * code stores to the stack and reloads before returning).
   */
  117. .macro KERNEL_S1
  118. vldmia.f32 X, { s14 } // s14 = *X (no writeback)
  119. vldmia.f32 Y, { s15 } // s15 = *Y (no writeback)
  120. vcvt.f64.f32 d4, s14 // d4 = (double) x
  121. vcvt.f64.f32 d5, s15 // d5 = (double) y
  122. vmla.f64 d0 , d4, d5 // d0 += d4 * d5
  123. add X, X, INC_X // step to the next x
  124. add Y, Y, INC_Y // step to the next y
  125. .endm
  126. #else
  /*
   * KERNEL_F4 (single precision build): consume four contiguous elements.
   * Even-numbered products accumulate into s0, odd-numbered ones into s1.
   * X and Y are post-incremented by two floats per load (16 bytes in total).
   * Loads stay interleaved with the multiply-accumulates, as before.
   * Scratch: s4 - s11.
   */
  127. .macro KERNEL_F4
  128. vldmia.f32 X!, { s8, s9 } // x[0], x[1]
  129. vldmia.f32 Y!, { s4, s5 } // y[0], y[1]
  130. vmla.f32 s0, s4, s8 // s0 += y[0] * x[0]
  131. vldmia.f32 X!, { s10, s11 } // x[2], x[3]
  132. vmla.f32 s1, s5, s9 // s1 += y[1] * x[1]
  133. vldmia.f32 Y!, { s6, s7 } // y[2], y[3]
  134. vmla.f32 s0, s6, s10 // s0 += y[2] * x[2]
  135. vmla.f32 s1, s7, s11 // s1 += y[3] * x[3]
  136. .endm
  /*
   * KERNEL_F1 (single precision build): consume one contiguous element,
   * accumulating the product into s0. X and Y are post-incremented by one
   * float. Scratch: s4, s8.
   */
  137. .macro KERNEL_F1
  138. vldmia.f32 X!, {s4} // s4 = next x, X advances 4 bytes
  139. vldmia.f32 Y!, {s8} // s8 = next y, Y advances 4 bytes
  140. vmla.f32 s0, s4, s8 // s0 += s4 * s8
  141. .endm
  /*
   * KERNEL_S4 (single precision build): consume four strided elements.
   * Products alternate between the s0 and s1 accumulators. After each load
   * pair X advances by INC_X and Y by INC_Y (byte strides at this point).
   * Scratch: s4 - s11.
   */
  142. .macro KERNEL_S4
  143. nop
  144. vldr s4, [X] // element 0 of x
  145. vldr s8, [Y] // element 0 of y
  146. add X, X, INC_X
  147. add Y, Y, INC_Y
  148. vmla.f32 s0, s4, s8 // s0 += x0 * y0
  149. vldr s5, [X] // element 1
  150. vldr s9, [Y]
  151. add X, X, INC_X
  152. add Y, Y, INC_Y
  153. vmla.f32 s1, s5, s9 // s1 += x1 * y1
  154. vldr s6, [X] // element 2
  155. vldr s10, [Y]
  156. add X, X, INC_X
  157. add Y, Y, INC_Y
  158. vmla.f32 s0, s6, s10 // s0 += x2 * y2
  159. vldr s7, [X] // element 3
  160. vldr s11, [Y]
  161. add X, X, INC_X
  162. add Y, Y, INC_Y
  163. vmla.f32 s1, s7, s11 // s1 += x3 * y3
  164. .endm
  /*
   * KERNEL_S1 (single precision build): consume one strided element,
   * accumulating the product into s0, then advance X by INC_X and Y by INC_Y
   * (byte strides at this point). Scratch: s4, s8.
   */
  165. .macro KERNEL_S1
  166. vldr s4, [X] // s4 = *X
  167. vldr s8, [Y] // s8 = *Y
  168. add X, X, INC_X // step to the next x
  169. vmla.f32 s0, s4, s8 // s0 += s4 * s8
  170. add Y, Y, INC_Y // step to the next y
  171. .endm
  172. #endif
  173. /**************************************************************************************
  174. * End of macro definitions
  175. **************************************************************************************/
  /*
   * Dot-product kernel entry point. The symbol itself is emitted by the
   * PROLOGUE macro (defined outside this file's visible text).
   *
   * In:   r0 = N, r1 = X, r2 = INC_X, r3 = Y, [fp, #4] (after the push) = INC_Y
   * Out:  s0, or d0 when DSDOT is defined; the value is additionally copied to
   *       r0 (r0:r1 for DSDOT) when __ARM_PCS_VFP is not defined
   * Flow: N <= 0 returns zero. INC_X == 1 and INC_Y == 1 selects the
   *       contiguous "F" path, anything else the strided "S" path. Each path
   *       runs N / 4 blocks of four elements, then N & 3 single elements.
   * Saved and restored: r4 - r9, fp, s8 - s15.
   */
  176. PROLOGUE
  177. .align 5
  178. push {r4 - r9, fp} // 7 words pushed
  179. add fp, sp, #24 // fp -> saved fp slot, so [fp, #4] is the word above the pushed registers
  180. sub sp, sp, #STACKSIZE // reserve stack
  181. sub r4, fp, #128 // r4 -> save area inside the reserved stack
  182. vstm r4, { s8 - s15 } // store floating point registers
  183. mov Y, OLD_Y // move the Y pointer out of r3 into its working register
  184. ldr INC_Y, OLD_INC_Y // load the Y stride from the stack
  185. movs r4, #0 // clear floating point register
  186. vmov s0, r4 // accumulator 0 = 0.0f
  187. vmov s1, r4 // accumulator 1 = 0.0f
  188. #if defined(DSDOT)
  189. vcvt.f64.f32 d0, s0 // d0 = 0.0; d0 overlaps s0/s1, and an all-zero d0 leaves s1 reading as 0.0f
  190. vcvt.f64.f32 d1, s1 // d1 = 0.0
  191. #endif
  192. cmp N, #0
  193. ble sdot_kernel_L999 // nothing to do for N <= 0: return the zeroed accumulator
  /* The four lines below start with '#'; they appear to be disabled zero-stride
     early exits (commented out) and are kept exactly as found. */
  194. # cmp INC_X, #0
  195. # beq sdot_kernel_L999
  196. # cmp INC_Y, #0
  197. # beq sdot_kernel_L999
  198. cmp INC_X, #1
  199. bne sdot_kernel_S_BEGIN // non-unit X stride -> strided path
  200. cmp INC_Y, #1
  201. bne sdot_kernel_S_BEGIN // non-unit Y stride -> strided path
  202. sdot_kernel_F_BEGIN: // contiguous path: both strides are 1
  203. asrs I, N, #2 // I = N / 4
  /* NOTE(review): asrs and ands do not update V, so each `ble` that follows them
     also depends on the V flag left by the preceding cmp/subs; `beq` would
     express the zero test directly - confirm before changing. */
  204. ble sdot_kernel_F1 // no full block of four
  205. sdot_kernel_F4: // loop: four contiguous elements per pass
  206. KERNEL_F4
  207. subs I, I, #1
  208. bne sdot_kernel_F4
  209. sdot_kernel_F1:
  210. ands I, N, #3 // I = N % 4 leftover elements
  211. ble sdot_kernel_L999
  212. sdot_kernel_F10: // loop: one contiguous element per pass
  213. KERNEL_F1
  214. subs I, I, #1
  215. bne sdot_kernel_F10
  216. b sdot_kernel_L999 // skip over the strided path
  217. sdot_kernel_S_BEGIN: // strided path
  218. lsl INC_X, INC_X, #2 // INC_X * SIZE
  219. lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
  220. asrs I, N, #2 // I = N / 4
  221. ble sdot_kernel_S1 // no full block of four
  222. sdot_kernel_S4: // loop: four strided elements per pass
  223. KERNEL_S4
  224. subs I, I, #1
  225. bne sdot_kernel_S4
  226. sdot_kernel_S1:
  227. ands I, N, #3 // I = N % 4 leftover elements
  228. ble sdot_kernel_L999
  229. sdot_kernel_S10: // loop: one strided element per pass
  230. KERNEL_S1
  231. subs I, I, #1
  232. bne sdot_kernel_S10
  233. sdot_kernel_L999: // common exit
  234. sub r3, fp, #128 // r3 -> save area written at entry
  235. vldm r3, { s8 - s15} // restore floating point registers
  236. #if defined(DSDOT)
  237. vadd.f64 d0 , d0, d1 // set return value
  238. #else
  239. vadd.f32 s0 , s0, s1 // set return value
  240. #endif
  241. #if !defined(__ARM_PCS_VFP)
  242. #if defined(DSDOT)
  243. vmov r0, r1, d0 // also hand the double result back in r0:r1
  244. #else
  245. vmov r0, s0 // also hand the float result back in r0
  246. #endif
  247. #endif
  248. sub sp, fp, #24 // sp back to the bottom of the pushed registers
  249. pop {r4 - r9, fp}
  250. bx lr
  251. EPILOGUE