You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zscal.S 8.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Integer register roles used by the macros and the kernel body below. */
  #define N        x0      /* number of complex elements to process */
  #define X        x3      /* address of the current complex element of X */
  #define INC_X    x4      /* stride of X; INIT_S rescales it to a byte offset */
  #define I        x5      /* loop counter */
  #define X_COPY   x6      /* second pointer into X, used for the stores in KERNEL_F4 */
  34. /*******************************************************************************
  35. * Macro definitions
  36. *******************************************************************************/
  /* Scale factor DA_R + i*DA_I, named at the precision selected by DOUBLE. */
  #if defined(DOUBLE)
  #define DA_R     d0      /* real part of the scale factor */
  #define DA_I     d1      /* imaginary part of the scale factor */
  #else
  #define DA_R     s0      /* real part of the scale factor */
  #define DA_I     s1      /* imaginary part of the scale factor */
  #endif
  44. /******************************************************************************/
  /*
   * INIT: copy lane 0 of v0 (which holds DA_R) into lane 1, so the two low
   * lanes of v0 both contain DA_R. Other lanes of v0 are not written.
   * "mov Vd.T[i], Vn.T[j]" is the assembler alias of INS (element).
   */
  .macro INIT
  #if defined(DOUBLE)
          mov     v0.d[1], v0.d[0]                // v0 = DA_R, DA_R
  #else
          mov     v0.s[1], v0.s[0]                // v0 = DA_R, DA_R (low two lanes)
  #endif
  .endm
  /*
   * KERNEL_F1: scale the single complex element at [X] in place and advance X
   * by one element (8 bytes single precision, 16 bytes double precision).
   *
   *   re_out = DA_R*re - DA_I*im
   *   im_out = DA_I*re + DA_R*im
   *
   * Scratch registers written: v2, v3, v4, v5.
   */
  .macro KERNEL_F1
  #if defined(DOUBLE)
          ld1     {v2.2d}, [X]                    // v2 = re, im
          fmul    d3, DA_R, v2.d[0]               // d3 = DA_R*re
          fmul    d4, DA_I, v2.d[0]               // d4 = DA_I*re
          fmul    d5, DA_I, v2.d[1]               // d5 = DA_I*im
          fsub    d3, d3, d5                      // d3 = re_out
          fmul    d5, DA_R, v2.d[1]               // d5 = DA_R*im
          fadd    d4, d4, d5                      // d4 = im_out
          mov     v3.d[1], v4.d[0]                // v3 = re_out, im_out
          st1     {v3.2d}, [X], #16
  #else
          ld1     {v2.2s}, [X]                    // v2 = re, im
          fmul    s3, DA_R, v2.s[0]               // s3 = DA_R*re
          fmul    s4, DA_I, v2.s[0]               // s4 = DA_I*re
          fmul    s5, DA_I, v2.s[1]               // s5 = DA_I*im
          fsub    s3, s3, s5                      // s3 = re_out
          fmul    s5, DA_R, v2.s[1]               // s5 = DA_R*im
          fadd    s4, s4, s5                      // s4 = im_out
          mov     v3.s[1], v4.s[0]                // v3 = re_out, im_out
          st1     {v3.2s}, [X], #8
  #endif
  .endm
  /*
   * KERNEL_INIT_F4: broadcast the scale factor for the vectorised loop.
   *   v16 = DA_R in every lane   (DA_R is lane 0 of v0)
   *   v17 = DA_I in every lane   (DA_I is lane 0 of v1)
   * DUP (element) fills all lanes in one instruction and leaves v16/v17 with
   * the same contents as a chain of lane-by-lane INS copies would.
   */
  .macro KERNEL_INIT_F4
  #if !defined(DOUBLE)
          dup     v16.4s, v0.s[0]                 // v16 = DA_R x4
          dup     v17.4s, v1.s[0]                 // v17 = DA_I x4
  #else //DOUBLE
          dup     v16.2d, v0.d[0]                 // v16 = DA_R x2
          dup     v17.2d, v1.d[0]                 // v17 = DA_I x2
  #endif
  .endm
  /*
   * KERNEL_F4: scale four consecutive complex elements.
   *
   * LD2 de-interleaves the data so one register holds the real parts and the
   * other the imaginary parts; ST2 re-interleaves the results. Loads advance X,
   * stores advance X_COPY, each by 32 bytes per LD2/ST2. Expects v16 = DA_R and
   * v17 = DA_I in every lane (as set up by KERNEL_INIT_F4).
   *
   *   re_out = re*DA_R - im*DA_I
   *   im_out = re*DA_I + im*DA_R
   *
   * Single precision handles all four elements with one LD2/ST2 pair; double
   * precision handles them as two groups of two. Ends with a prefetch hint
   * 1024 bytes beyond the updated X.
   */
  .macro KERNEL_F4
  #if defined(DOUBLE)
          // elements 0-1
          ld2     {v2.2d, v3.2d}, [X], #32        // v2 = re, v3 = im
          fmul    v4.2d, v2.2d, v16.2d            // re*DA_R
          fmul    v5.2d, v2.2d, v17.2d            // re*DA_I
          fmul    v6.2d, v3.2d, v17.2d            // im*DA_I
          fsub    v4.2d, v4.2d, v6.2d             // re_out
          fmul    v6.2d, v3.2d, v16.2d            // im*DA_R
          fadd    v5.2d, v5.2d, v6.2d             // im_out
          st2     {v4.2d, v5.2d}, [X_COPY], #32

          // elements 2-3
          ld2     {v18.2d, v19.2d}, [X], #32      // v18 = re, v19 = im
          fmul    v20.2d, v18.2d, v16.2d          // re*DA_R
          fmul    v21.2d, v18.2d, v17.2d          // re*DA_I
          fmul    v6.2d, v19.2d, v17.2d           // im*DA_I
          fsub    v20.2d, v20.2d, v6.2d           // re_out
          fmul    v6.2d, v19.2d, v16.2d           // im*DA_R
          fadd    v21.2d, v21.2d, v6.2d           // im_out
          st2     {v20.2d, v21.2d}, [X_COPY], #32
  #else
          // elements 0-3
          ld2     {v2.4s, v3.4s}, [X], #32        // v2 = re, v3 = im
          fmul    v4.4s, v2.4s, v16.4s            // re*DA_R
          fmul    v5.4s, v2.4s, v17.4s            // re*DA_I
          fmul    v6.4s, v3.4s, v17.4s            // im*DA_I
          fsub    v4.4s, v4.4s, v6.4s             // re_out
          fmul    v6.4s, v3.4s, v16.4s            // im*DA_R
          fadd    v5.4s, v5.4s, v6.4s             // im_out
          st2     {v4.4s, v5.4s}, [X_COPY], #32
  #endif
          PRFM    PLDL1KEEP, [X, #1024]
  .endm
  /*
   * INIT_S: scale INC_X in place by the size of one complex element so it can
   * be added directly to the X pointer.
   */
  .macro INIT_S
  #if defined(DOUBLE)
          lsl     INC_X, INC_X, #4                // x16: two 8-byte values per element
  #else
          lsl     INC_X, INC_X, #3                // x8: two 4-byte values per element
  #endif
  .endm
  /*
   * KERNEL_S1: strided variant of the one-element kernel. Scales the complex
   * element at [X] in place, then advances X by INC_X, which must already be a
   * byte offset (INIT_S performs that conversion).
   *
   *   re_out = DA_R*re - DA_I*im
   *   im_out = DA_I*re + DA_R*im
   *
   * Scratch registers written: v2, v3, v4, v5.
   */
  .macro KERNEL_S1
  #if defined(DOUBLE)
          ld1     {v2.2d}, [X]                    // v2 = re, im
          fmul    d3, DA_R, v2.d[0]               // d3 = DA_R*re
          fmul    d4, DA_I, v2.d[0]               // d4 = DA_I*re
          fmul    d5, DA_I, v2.d[1]               // d5 = DA_I*im
          fsub    d3, d3, d5                      // d3 = re_out
          fmul    d5, DA_R, v2.d[1]               // d5 = DA_R*im
          fadd    d4, d4, d5                      // d4 = im_out
          mov     v3.d[1], v4.d[0]                // v3 = re_out, im_out
          st1     {v3.2d}, [X], INC_X             // store, then X += INC_X
  #else
          ld1     {v2.2s}, [X]                    // v2 = re, im
          fmul    s3, DA_R, v2.s[0]               // s3 = DA_R*re
          fmul    s4, DA_I, v2.s[0]               // s4 = DA_I*re
          fmul    s5, DA_I, v2.s[1]               // s5 = DA_I*im
          fsub    s3, s3, s5                      // s3 = re_out
          fmul    s5, DA_R, v2.s[1]               // s5 = DA_R*im
          fadd    s4, s4, s5                      // s4 = im_out
          mov     v3.s[1], v4.s[0]                // v3 = re_out, im_out
          st1     {v3.2s}, [X], INC_X             // store, then X += INC_X
  #endif
  .endm
  150. /*******************************************************************************
  151. * End of macro definitions
  152. *******************************************************************************/
  /*
   * In-place complex vector scale:
   *     X[k] <- (DA_R + i*DA_I) * X[k]      for k = 0 .. N-1
   *
   * Register inputs (aliases #defined at the top of this file):
   *   N       x0        element count; N <= 0 returns immediately
   *   X       x3        address of the first complex element
   *   INC_X   x4        stride between elements, in complex elements
   *   DA_R    s0 / d0   real part of the scale factor
   *   DA_I    s1 / d1   imaginary part of the scale factor
   * Result: w0 = 0 on every exit path.
   *
   * The scale factor is classified once with FCMP against 0.0 and one of four
   * loops is selected:
   *   DA_R != 0, DA_I != 0   full complex multiply; 4-way unrolled if INC_X == 1
   *   DA_R == 0, DA_I != 0   (re, im) -> (-DA_I*im, DA_I*re)
   *   DA_R != 0, DA_I == 0   (re, im) -> ( DA_R*re, DA_R*im)
   *   DA_R == 0, DA_I == 0   DA_R, DA_I are stored over each element; X is not read
   *
   * NOTE(review): the three special-case loops never form the products with the
   * zero component, so NaN/Inf values in X are not propagated the way the
   * general path would propagate them. Confirm callers expect this.
   */
  PROLOGUE

          mov     X_COPY, X                       // store pointer used by KERNEL_F4
          cmp     N, xzr
          ble     .Lzscal_kernel_L999             // nothing to do for N <= 0

          fcmp    DA_R, #0.0
          bne     .Lzscal_kernel_R_non_zero

          fcmp    DA_I, #0.0
          beq     .Lzscal_kernel_RI_zero

          b       .Lzscal_kernel_R_zero

  .Lzscal_kernel_R_non_zero:

          fcmp    DA_I, #0.0
          beq     .Lzscal_kernel_I_zero

  /*******************************************************************************
  * A_R != 0 && A_I != 0
  *******************************************************************************/

  .Lzscal_kernel_RI_non_zero:

          INIT

          cmp     INC_X, #1
          bne     .Lzscal_kernel_S_BEGIN          // non-unit stride: scalar loop

  .Lzscal_kernel_F_BEGIN:                         // unit stride

          asr     I, N, #2                        // I = number of 4-element groups
          cmp     I, xzr
          beq     .Lzscal_kernel_F1

          KERNEL_INIT_F4

  .Lzscal_kernel_F4:

          KERNEL_F4

          subs    I, I, #1
          bne     .Lzscal_kernel_F4

  .Lzscal_kernel_F1:

          ands    I, N, #3                        // I = leftover elements (0..3)
          ble     .Lzscal_kernel_L999

  .Lzscal_kernel_F10:

          KERNEL_F1

          subs    I, I, #1
          bne     .Lzscal_kernel_F10

          mov     w0, wzr
          ret

  .Lzscal_kernel_S_BEGIN:

          INIT_S                                  // INC_X becomes a byte offset

          asr     I, N, #2                        // I = number of 4-element groups
          cmp     I, xzr
          ble     .Lzscal_kernel_S1

  .Lzscal_kernel_S4:

          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1

          subs    I, I, #1
          bne     .Lzscal_kernel_S4

  .Lzscal_kernel_S1:

          ands    I, N, #3                        // I = leftover elements (0..3)
          ble     .Lzscal_kernel_L999

  .Lzscal_kernel_S10:

          KERNEL_S1

          subs    I, I, #1
          bne     .Lzscal_kernel_S10

  .Lzscal_kernel_L999:

          mov     w0, wzr
          ret

  /*******************************************************************************
  * A_R == 0 && A_I != 0
  *******************************************************************************/

  .Lzscal_kernel_R_zero:

          INIT_S                                  // INC_X becomes a byte offset

  #if !defined(DOUBLE)
          eor     v2.16b, v2.16b, v2.16b          // v2 = 0
          fsub    s2, s2, DA_I                    // s2 = 0 - DA_I
          ins     v1.s[1], v2.s[0]                // v1 lanes: [0] = DA_I, [1] = -DA_I
  #else
          eor     v2.16b, v2.16b, v2.16b          // v2 = 0
          fsub    d2, d2, DA_I                    // d2 = 0 - DA_I
          ins     v1.d[1], v2.d[0]                // v1 lanes: [0] = DA_I, [1] = -DA_I
  #endif

  .Lzscal_kernel_R_zero_1:

  #if !defined(DOUBLE)
          ld1     {v2.2s}, [X]                    // v2 = re, im
          fmul    v2.2s, v2.2s, v1.2s             // v2 = DA_I*re, -DA_I*im
          ext     v2.8b, v2.8b, v2.8b, #4         // swap lanes: -DA_I*im, DA_I*re
          st1     {v2.2s}, [X]
  #else
          ld1     {v2.2d}, [X]                    // v2 = re, im
          fmul    v2.2d, v2.2d, v1.2d             // v2 = DA_I*re, -DA_I*im
          ext     v2.16b, v2.16b, v2.16b, #8      // swap lanes: -DA_I*im, DA_I*re
          st1     {v2.2d}, [X]
  #endif
          add     X, X, INC_X
          subs    N, N, #1
          bne     .Lzscal_kernel_R_zero_1

          mov     w0, wzr
          ret

  /*******************************************************************************
  * A_R != 0 && A_I == 0
  *******************************************************************************/

  .Lzscal_kernel_I_zero:

          INIT_S                                  // INC_X becomes a byte offset

  #if !defined(DOUBLE)
          ins     v0.s[1], v0.s[0]                // v0 = DA_R, DA_R
  #else
          ins     v0.d[1], v0.d[0]                // v0 = DA_R, DA_R
  #endif

  .Lzscal_kernel_I_zero_1:

  #if !defined(DOUBLE)
          ld1     {v2.2s}, [X]                    // v2 = re, im
          fmul    v2.2s, v2.2s, v0.2s             // v2 = DA_R*re, DA_R*im
          st1     {v2.2s}, [X]
  #else
          ld1     {v2.2d}, [X]                    // v2 = re, im
          fmul    v2.2d, v2.2d, v0.2d             // v2 = DA_R*re, DA_R*im
          st1     {v2.2d}, [X]
  #endif
          add     X, X, INC_X
          subs    N, N, #1
          bne     .Lzscal_kernel_I_zero_1

          mov     w0, wzr
          ret

  /*******************************************************************************
  * A_R == 0 && A_I == 0
  *******************************************************************************/

  .Lzscal_kernel_RI_zero:

          INIT_S                                  // INC_X becomes a byte offset

  .Lzscal_kernel_RI_zero_1:

          stp     DA_R, DA_I, [X]                 // both compared equal to 0.0 above
          add     X, X, INC_X
          subs    N, N, #1
          bne     .Lzscal_kernel_RI_zero_1

          mov     w0, wzr
          ret

  EPILOGUE