You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

scal.S 5.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  /*******************************************************************************
  Copyright (c) 2015, The OpenBLAS Project
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.
  3. Neither the name of the OpenBLAS project nor the names of
  its contributors may be used to endorse or promote products
  derived from this software without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *******************************************************************************/
  /* ASSEMBLER must be defined before common.h is pulled in (presumably it
   * selects the assembly-side declarations of that header -- confirm there). */
  #define ASSEMBLER
  #include "common.h"

  /*
   * Register roles used by the macros and the routine in this file.
   */
  #define N       x0      /* vector length (number of elements) */
  #define I       x1      /* loop counter */
  #define X       x3      /* X vector address; advances as elements are read */
  #define INC_X   x4      /* X stride: in elements on entry, in bytes after INIT_S */
  #define X_COPY  x5      /* second X pointer: store address used by KERNEL_S4 */
  /*******************************************************************************
   * Macro definitions
   *******************************************************************************/

  /*
   * Precision-dependent aliases, selected by DOUBLE:
   *   DA     scalar view of the scale factor (lane 0 of v0)
   *   DAV    lane 0 of v0 written as a one-element register list (st1 operand)
   *   TMPF   scalar scratch register (lane 0 of v1)
   *   TMPVF  lane 0 of v1 written as a one-element register list (st1 operand)
   *   SZ     size of one element in bytes
   */
  #if defined(DOUBLE)
  #define DA      d0
  #define DAV     {v0.d}[0]
  #define TMPF    d1
  #define TMPVF   {v1.d}[0]
  #define SZ      8
  #else
  #define DA      s0
  #define DAV     {v0.s}[0]
  #define TMPF    s1
  #define TMPVF   {v1.s}[0]
  #define SZ      4
  #endif

  /******************************************************************************/
  /*
   * KERNEL_F1 -- scale one element of a contiguous vector in place.
   *   In/out: X   element address; advanced by SZ bytes afterwards.
   *   In:     DA  scale factor.
   *   Clobbers: TMPF.
   */
  .macro KERNEL_F1
          ldr     TMPF, [X]                       /* TMPF = *X            */
          fmul    TMPF, TMPF, DA                  /* TMPF = TMPF * DA     */
          str     TMPF, [X], #SZ                  /* *X = TMPF; X += SZ   */
  .endm
  /*
   * KERNEL_INIT_F8 -- replicate the scale factor held in lane 0 of v0 into
   * every lane of v0, so that the vector multiplies in KERNEL_F8 can use v0
   * directly. Lane 0 keeps its value.
   */
  .macro KERNEL_INIT_F8
  #if defined(DOUBLE)
          dup     v0.2d, v0.d[0]                  /* both 64-bit lanes = DA */
  #else
          dup     v0.4s, v0.s[0]                  /* all four 32-bit lanes = DA */
  #endif
  .endm
  /*
   * KERNEL_F8 -- scale eight consecutive elements in place.
   *   In/out: X   address of the first element; advanced past the eight
   *               elements (32 bytes single precision, 64 bytes double).
   *   In:     v0  scale factor in every lane (set up by KERNEL_INIT_F8).
   *   Clobbers: v1-v2 (single precision) or v1-v4 (double precision).
   * After the store, a prefetch hint is issued for the address 1024 bytes
   * beyond the updated X.
   */
  .macro KERNEL_F8
  #if defined(DOUBLE)
          ld1     {v1.2d, v2.2d, v3.2d, v4.2d}, [X]
          fmul    v1.2d, v1.2d, v0.2d
          fmul    v2.2d, v2.2d, v0.2d
          fmul    v3.2d, v3.2d, v0.2d
          fmul    v4.2d, v4.2d, v0.2d
          st1     {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
  #else
          ld1     {v1.4s, v2.4s}, [X]
          fmul    v1.4s, v1.4s, v0.4s
          fmul    v2.4s, v2.4s, v0.4s
          st1     {v1.4s, v2.4s}, [X], #32
  #endif
          PRFM    PLDL1KEEP, [X, #1024]
  .endm
  /*
   * INIT_S -- convert INC_X from an element count to a byte offset by
   * shifting it left by log2 of the element size (3 for double, 2 for single).
   */
  .macro INIT_S
  #if defined(DOUBLE)
          lsl     INC_X, INC_X, #3                /* INC_X *= 8 */
  #else
          lsl     INC_X, INC_X, #2                /* INC_X *= 4 */
  #endif
  .endm
  /*
   * KERNEL_S1 -- scale one element of a strided vector in place.
   *   In/out: X      element address; advanced by INC_X afterwards.
   *   In:     INC_X  stride in bytes (already scaled by INIT_S).
   *   In:     DA     scale factor.
   *   Clobbers: TMPF.
   */
  .macro KERNEL_S1
          ldr     TMPF, [X]                       /* TMPF = *X            */
          fmul    TMPF, TMPF, DA                  /* TMPF = TMPF * DA     */
          str     TMPF, [X]                       /* *X = TMPF            */
          add     X, X, INC_X                     /* step to next element */
  .endm
  /*
   * KERNEL_S4_ELEM idx -- helper for KERNEL_S4: process one strided element
   * using scratch register s<idx> (single precision) or d<idx> (double).
   * The element is loaded through X, X is stepped by INC_X, the value is
   * multiplied by the scale factor in s0/d0 and stored through X_COPY, and
   * X_COPY is then stepped by INC_X.
   */
  .macro KERNEL_S4_ELEM idx
  #if defined(DOUBLE)
          ldr     d\idx, [X]
          add     X, X, INC_X
          fmul    d\idx, d\idx, d0
          str     d\idx, [X_COPY]
  #else
          ldr     s\idx, [X]
          add     X, X, INC_X
          fmul    s\idx, s\idx, s0
          str     s\idx, [X_COPY]
  #endif
          add     X_COPY, X_COPY, INC_X
  .endm

  /*
   * KERNEL_S4 -- scale four strided elements in place.
   *   In/out: X       load address; advanced by 4 * INC_X.
   *   In/out: X_COPY  store address; advanced by 4 * INC_X. The routine sets
   *                   it equal to X before the first use.
   *   In:     INC_X   stride in bytes (already scaled by INIT_S).
   *   Clobbers: lane registers 1 to 4 (s1-s4 or d1-d4).
   */
  .macro KERNEL_S4
          KERNEL_S4_ELEM 1
          KERNEL_S4_ELEM 2
          KERNEL_S4_ELEM 3
          KERNEL_S4_ELEM 4
  .endm
  /*******************************************************************************
   * End of macro definitions
   *******************************************************************************/

  /*
   * Routine body: scale N elements of X by DA in place, then return 0 in w0.
   * PROLOGUE / EPILOGUE come from common.h and are not visible here.
   *
   *   N <= 0         nothing is touched.
   *   DA == 0        every visited element is overwritten with DA itself
   *                  (no load, no multiply), for any stride.
   *   INC_X == 1     contiguous path: blocks of 8 elements, then 0-7 leftovers.
   *   otherwise      strided path: blocks of 4 elements, then 0-3 leftovers.
   *
   * All paths leave through .Lscal_kernel_L999.
   */
          PROLOGUE

          cmp     N, xzr
          ble     .Lscal_kernel_L999              /* N <= 0: nothing to do */

          fcmp    DA, #0.0
          beq     .Lscal_kernel_zero              /* DA == 0: fill instead of multiply */

          cmp     INC_X, #1
          bne     .Lscal_kernel_S_BEGIN           /* non-unit stride */

  /* ---- contiguous vector ---------------------------------------------------- */
  .Lscal_kernel_F_BEGIN:
          asr     I, N, #3                        /* I = number of 8-element blocks */
          cbz     I, .Lscal_kernel_F1
          KERNEL_INIT_F8                          /* broadcast DA across v0 */

  .Lscal_kernel_F8:
          KERNEL_F8
          subs    I, I, #1
          bne     .Lscal_kernel_F8

  .Lscal_kernel_F1:
          ands    I, N, #7                        /* I = leftover elements (0-7) */
          beq     .Lscal_kernel_L999

  .Lscal_kernel_F10:
          KERNEL_F1
          subs    I, I, #1
          bne     .Lscal_kernel_F10
          b       .Lscal_kernel_L999

  /* ---- strided vector ------------------------------------------------------- */
  .Lscal_kernel_S_BEGIN:
          INIT_S                                  /* INC_X: elements -> bytes */
          mov     X_COPY, X                       /* store pointer starts at X */
          asr     I, N, #2                        /* I = number of 4-element blocks */
          cbz     I, .Lscal_kernel_S1             /* N > 0 here, so I is never negative */

  .Lscal_kernel_S4:
          KERNEL_S4
          subs    I, I, #1
          bne     .Lscal_kernel_S4

  .Lscal_kernel_S1:
          ands    I, N, #3                        /* I = leftover elements (0-3) */
          beq     .Lscal_kernel_L999

  .Lscal_kernel_S10:
          KERNEL_S1
          subs    I, I, #1
          bne     .Lscal_kernel_S10

  /* ---- common exit ---------------------------------------------------------- */
  .Lscal_kernel_L999:
          mov     w0, wzr                         /* return value 0 */
          ret

  /* ---- DA == 0 -------------------------------------------------------------- */
  .Lscal_kernel_zero:
          INIT_S                                  /* INC_X: elements -> bytes */

  .Lscal_kernel_Z1:
          st1     DAV, [X], INC_X                 /* *X = DA; X += INC_X */
          subs    N, N, #1
          bne     .Lscal_kernel_Z1
          b       .Lscal_kernel_L999

          EPILOGUE