You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zaxpy.S 7.5 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* General-purpose register assignments used by the macros and the routine. */
  #define N       x0      /* vector length */
  #define X       x3      /* X vector address */
  #define INC_X   x4      /* X stride */
  #define Y       x5      /* Y vector address */
  #define INC_Y   x6      /* Y stride */
  #define I       x1      /* loop counter */
  #define Y_COPY  x7      /* copy of the Y address; KERNEL_F4 loads Y through it */

  /*******************************************************************************
  * Macro definitions
  *******************************************************************************/

  /*
   * Precision-dependent names. DA_R / DA_I are the scalar views of the real and
   * imaginary scale inputs. SZ is 4 or 8; presumably the size in bytes of one
   * real element (it is not referenced by the code visible in this file).
   */
  #if defined(DOUBLE)
  #define DA_R    d0      /* scale input value (real part) */
  #define DA_I    d1      /* scale input value (imaginary part) */
  #define SZ      8
  #else
  #define DA_R    s0      /* scale input value (real part) */
  #define DA_I    s1      /* scale input value (imaginary part) */
  #define SZ      4
  #endif
  48. /******************************************************************************/
  /*
   * INIT: build the two scale vectors consumed by KERNEL_F1 and KERNEL_S1.
   *
   * Inputs:  DA_R in lane 0 of v0, DA_I in lane 0 of v1.
   * Outputs (written as { lane 0, lane 1 }):
   *     CONJ not defined:  v0 = { DA_R,  DA_R }   v1 = { -DA_I, DA_I }
   *     CONJ defined:      v0 = { DA_R, -DA_R }   v1 = {  DA_I, DA_I }
   * Clobbers: v2.
   *
   * The negated component is produced with fneg, which only flips the sign
   * bit. Computing it as 0 - value would turn a +0 input into +0 instead of
   * -0, which can change the sign of a zero result in the kernels.
   */
  .macro INIT
  #if !defined(CONJ)
  #if !defined(DOUBLE)
          ins     v0.s[1], v0.s[0]                // v0 = { DA_R, DA_R }
          fneg    s2, DA_I                        // s2 = -DA_I
          ins     v1.s[1], v2.s[0]                // v1 = { DA_I, -DA_I }
          ext     v1.8b, v1.8b, v1.8b, #4         // swap lanes: v1 = { -DA_I, DA_I }
  #else
          ins     v0.d[1], v0.d[0]                // v0 = { DA_R, DA_R }
          fneg    d2, DA_I                        // d2 = -DA_I
          ins     v1.d[1], v2.d[0]                // v1 = { DA_I, -DA_I }
          ext     v1.16b, v1.16b, v1.16b, #8      // swap lanes: v1 = { -DA_I, DA_I }
  #endif
  #else
  #if !defined(DOUBLE)
          fneg    s2, DA_R                        // s2 = -DA_R
          ins     v0.s[1], v2.s[0]                // v0 = { DA_R, -DA_R }
          ins     v1.s[1], v1.s[0]                // v1 = { DA_I, DA_I }
  #else
          fneg    d2, DA_R                        // d2 = -DA_R
          ins     v0.d[1], v2.d[0]                // v0 = { DA_R, -DA_R }
          ins     v1.d[1], v1.d[0]                // v1 = { DA_I, DA_I }
  #endif
  #endif
  .endm
  /*
   * KERNEL_F1: unit-stride update of one complex element,
   *     y += alpha * x        (alpha * conj(x) when CONJ is defined)
   * using the scale vectors v0 / v1 prepared by INIT.
   *
   * X and Y are each post-incremented by one complex element.
   * Clobbers: v2, v3, v4.
   */
  .macro KERNEL_F1
  #if !defined(DOUBLE)
          ld1     {v2.2s}, [X], #8                // v2 = { x_re, x_im }; X advances 8 bytes
          ext     v4.8b, v2.8b, v2.8b, #4         // v4 = { x_im, x_re } (lanes swapped)
          ld1     {v3.2s}, [Y]                    // v3 = { y_re, y_im }
          fmla    v3.2s, v0.2s, v2.2s             // y_re += DA_R * x_re
                                                  // y_im += +-DA_R * x_im
          fmla    v3.2s, v1.2s, v4.2s             // y_re += +-DA_I * x_im
                                                  // y_im += DA_I * x_re
          st1     {v3.2s}, [Y], #8                // store result; Y advances 8 bytes
  #else
          ld1     {v2.2d}, [X], #16               // v2 = { x_re, x_im }; X advances 16 bytes
          ext     v4.16b, v2.16b, v2.16b, #8      // v4 = { x_im, x_re } (lanes swapped)
          ld1     {v3.2d}, [Y]                    // v3 = { y_re, y_im }
          fmla    v3.2d, v0.2d, v2.2d             // y_re += DA_R * x_re
                                                  // y_im += +-DA_R * x_im
          fmla    v3.2d, v1.2d, v4.2d             // y_re += +-DA_I * x_im
                                                  // y_im += DA_I * x_re
          st1     {v3.2d}, [Y], #16               // store result; Y advances 16 bytes
  #endif
  .endm
  /*
   * KERNEL_INIT_F4: prepare the broadcast scale vectors used by KERNEL_F4.
   *
   *     v16 = lane 0 of v0 replicated into every lane
   *     v17 = one lane of v1 replicated into every lane
   *           (lane 1 when CONJ is not defined, lane 0 when it is)
   *
   * After INIT those source lanes hold DA_R and DA_I respectively.
   * Must run after INIT. Writes only v16 and v17.
   */
  .macro KERNEL_INIT_F4
  #if !defined(DOUBLE)
          dup     v16.4s, v0.s[0]                 // v16 = DA_R in all four lanes
  #if !defined(CONJ)
          dup     v17.4s, v1.s[1]                 // v17 = DA_I in all four lanes
  #else
          dup     v17.4s, v1.s[0]                 // v17 = DA_I in all four lanes
  #endif
  #else //DOUBLE
          dup     v16.2d, v0.d[0]                 // v16 = DA_R in both lanes
  #if !defined(CONJ)
          dup     v17.2d, v1.d[1]                 // v17 = DA_I in both lanes
  #else
          dup     v17.2d, v1.d[0]                 // v17 = DA_I in both lanes
  #endif
  #endif
  .endm
  /*
   * KERNEL_F4: unit-stride update of four complex elements per invocation.
   *
   * ld2 / st2 de-interleave the data, so real and imaginary parts sit in
   * separate registers and are scaled with v16 / v17 as set up by
   * KERNEL_INIT_F4:
   *     y_re += x_re * v16   then   y_re -+= x_im * v17
   *     y_im += x_re * v17   then   y_im +-= x_im * v16
   * The sign of the second step in each pair depends on CONJ and is selected
   * once through the two preprocessor names below.
   *
   * Y is read through Y_COPY and written through Y; X, Y_COPY and Y all
   * advance by the same number of bytes. Ends with prefetch hints 512 bytes
   * ahead of X and Y.
   * Clobbers: v2-v5 (and v18-v21 in the DOUBLE build).
   */
  #if !defined(CONJ)
  #define ZAXPY_RE_OP     fmls    /* y_re -= x_im * v17 */
  #define ZAXPY_IM_OP     fmla    /* y_im += x_im * v16 */
  #else
  #define ZAXPY_RE_OP     fmla    /* y_re += x_im * v17 */
  #define ZAXPY_IM_OP     fmls    /* y_im -= x_im * v16 */
  #endif
  .macro KERNEL_F4
  #if !defined(DOUBLE)
          ld2     {v2.4s, v3.4s}, [X], #32        // v2 = x real parts, v3 = x imaginary parts
          ld2     {v4.4s, v5.4s}, [Y_COPY], #32   // v4 = y real parts, v5 = y imaginary parts
          fmla    v4.4s, v2.4s, v16.4s
          ZAXPY_RE_OP v4.4s, v3.4s, v17.4s
          fmla    v5.4s, v2.4s, v17.4s
          ZAXPY_IM_OP v5.4s, v3.4s, v16.4s
          st2     {v4.4s, v5.4s}, [Y], #32        // re-interleave and store four elements
  #else // DOUBLE
          /* first pair of complex elements */
          ld2     {v2.2d, v3.2d}, [X], #32        // v2 = x real parts, v3 = x imaginary parts
          ld2     {v4.2d, v5.2d}, [Y_COPY], #32   // v4 = y real parts, v5 = y imaginary parts
          fmla    v4.2d, v2.2d, v16.2d
          ZAXPY_RE_OP v4.2d, v3.2d, v17.2d
          fmla    v5.2d, v2.2d, v17.2d
          ZAXPY_IM_OP v5.2d, v3.2d, v16.2d
          st2     {v4.2d, v5.2d}, [Y], #32

          /* second pair of complex elements */
          ld2     {v18.2d, v19.2d}, [X], #32
          ld2     {v20.2d, v21.2d}, [Y_COPY], #32
          fmla    v20.2d, v18.2d, v16.2d
          ZAXPY_RE_OP v20.2d, v19.2d, v17.2d
          fmla    v21.2d, v18.2d, v17.2d
          ZAXPY_IM_OP v21.2d, v19.2d, v16.2d
          st2     {v20.2d, v21.2d}, [Y], #32
  #endif
          PRFM PLDL1KEEP, [X, #512]
          PRFM PLDL1KEEP, [Y, #512]
  .endm
  /*
   * INIT_S: scale INC_X and INC_Y in place by the left shift below
   * (3 bits in the single-precision build, 4 bits in the DOUBLE build),
   * so KERNEL_S1 can use them directly as post-index register offsets.
   */
  #if !defined(DOUBLE)
  #define ZAXPY_STRIDE_SHIFT      3
  #else
  #define ZAXPY_STRIDE_SHIFT      4
  #endif
  .macro INIT_S
          lsl     INC_X, INC_X, #ZAXPY_STRIDE_SHIFT
          lsl     INC_Y, INC_Y, #ZAXPY_STRIDE_SHIFT
  .endm
  /*
   * KERNEL_S1: strided update of one complex element,
   *     y += alpha * x        (alpha * conj(x) when CONJ is defined)
   * using the scale vectors v0 / v1 prepared by INIT.
   *
   * X is post-incremented by the value in INC_X and Y by the value in INC_Y
   * (register post-index, so both are byte offsets at this point).
   * Clobbers: v2, v3, v4.
   */
  .macro KERNEL_S1
  #if !defined(DOUBLE)
          ld1     {v2.2s}, [X], INC_X             // v2 = { x_re, x_im }; X += INC_X
          ext     v4.8b, v2.8b, v2.8b, #4         // v4 = { x_im, x_re } (lanes swapped)
          ld1     {v3.2s}, [Y]                    // v3 = { y_re, y_im }
          fmla    v3.2s, v0.2s, v2.2s             // y_re += DA_R * x_re
                                                  // y_im += +-DA_R * x_im
          fmla    v3.2s, v1.2s, v4.2s             // y_re += +-DA_I * x_im
                                                  // y_im += DA_I * x_re
          st1     {v3.2s}, [Y], INC_Y             // store result; Y += INC_Y
  #else
          ld1     {v2.2d}, [X], INC_X             // v2 = { x_re, x_im }; X += INC_X
          ext     v4.16b, v2.16b, v2.16b, #8      // v4 = { x_im, x_re } (lanes swapped)
          ld1     {v3.2d}, [Y]                    // v3 = { y_re, y_im }
          fmla    v3.2d, v0.2d, v2.2d             // y_re += DA_R * x_re
                                                  // y_im += +-DA_R * x_im
          fmla    v3.2d, v1.2d, v4.2d             // y_re += +-DA_I * x_im
                                                  // y_im += DA_I * x_re
          st1     {v3.2d}, [Y], INC_Y             // store result; Y += INC_Y
  #endif
  .endm
  204. /*******************************************************************************
  205. * End of macro definitions
  206. *******************************************************************************/
  /*
   * Routine body: y += alpha * x over N complex elements
   * (alpha * conj(x) when CONJ is defined).
   *
   * In:   N, X, INC_X, Y, INC_Y in the registers named at the top of the file;
   *       alpha in DA_R / DA_I.
   * Out:  w0 = 0.
   *
   * Returns immediately when N <= 0 or when both alpha components compare
   * equal to zero. Uses the 4-way unrolled unit-stride path when
   * INC_X == 1 and INC_Y == 1, the strided path otherwise.
   */
  PROLOGUE

          cmp     N, xzr
          ble     .Lzaxpy_kernel_L999             // N <= 0: nothing to do

          mov     Y_COPY, Y                       // separate Y load pointer for KERNEL_F4

          fcmp    DA_R, #0.0
          bne     .Lzaxpy_kernel_INIT
          fcmp    DA_I, #0.0
          beq     .Lzaxpy_kernel_L999             // alpha == 0: leave Y untouched

  .Lzaxpy_kernel_INIT:
          INIT

          cmp     INC_X, #1
          bne     .Lzaxpy_kernel_S_BEGIN
          cmp     INC_Y, #1
          bne     .Lzaxpy_kernel_S_BEGIN

          /* ---- unit-stride path ---- */
  .Lzaxpy_kernel_F_BEGIN:
          asr     I, N, #2                        // I = number of 4-element groups
          cbz     I, .Lzaxpy_kernel_F1

          KERNEL_INIT_F4

  .Lzaxpy_kernel_F4:
          KERNEL_F4
          subs    I, I, #1
          bne     .Lzaxpy_kernel_F4

  .Lzaxpy_kernel_F1:
          ands    I, N, #3                        // I = leftover elements (0..3)
          beq     .Lzaxpy_kernel_L999

  .Lzaxpy_kernel_F10:
          KERNEL_F1
          subs    I, I, #1
          bne     .Lzaxpy_kernel_F10

          b       .Lzaxpy_kernel_L999

          /* ---- strided path ---- */
  .Lzaxpy_kernel_S_BEGIN:
          INIT_S

          asr     I, N, #2                        // N > 0 here, so I >= 0
          cbz     I, .Lzaxpy_kernel_S1

  .Lzaxpy_kernel_S4:
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          subs    I, I, #1
          bne     .Lzaxpy_kernel_S4

  .Lzaxpy_kernel_S1:
          ands    I, N, #3                        // I = leftover elements (0..3)
          beq     .Lzaxpy_kernel_L999

  .Lzaxpy_kernel_S10:
          KERNEL_S1
          subs    I, I, #1
          bne     .Lzaxpy_kernel_S10

  .Lzaxpy_kernel_L999:
          mov     w0, wzr                         // return 0
          ret