You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

asum.S 4.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Register aliases used throughout this kernel. */
  #define N       x0      /* vector length */
  #define X       x1      /* address of the current X element */
  #define INC_X   x2      /* X stride (INIT_S scales it by the element size) */
  #define I       x5      /* loop trip counter */
  33. /*******************************************************************************
  34. * Macro definitions
  35. *******************************************************************************/
  /*
   * Precision-dependent names. DOUBLE selects 64-bit elements,
   * otherwise the kernel operates on 32-bit elements.
   *   REG0  - zero register used to clear SUMF
   *   SUMF  - scalar accumulator
   *   TMPF  - scalar temporary
   *   TMPVF - lane 0 of v1, written as an ld1 single-lane operand
   *   SZ    - element size in bytes
   */
  #if defined(DOUBLE)
  #define REG0    xzr
  #define SUMF    d0
  #define TMPF    d1
  #define TMPVF   {v1.d}[0]
  #define SZ      8
  #else
  #define REG0    wzr
  #define SUMF    s0
  #define TMPF    s1
  #define TMPVF   {v1.s}[0]
  #define SZ      4
  #endif
  49. /******************************************************************************/
  /*
   * KERNEL_F1: SUMF += |*X| for one contiguous element.
   * X is post-incremented by SZ bytes. Clobbers TMPF.
   */
  .macro KERNEL_F1
          ldr     TMPF, [X], #SZ          // TMPF = *X, then X += SZ
          fabs    TMPF, TMPF              // TMPF = |TMPF|
          fadd    SUMF, SUMF, TMPF        // SUMF += TMPF
  .endm
  /*
   * KERNEL_F8: add the absolute values of the next eight contiguous
   * elements into the vector accumulator v0 and move X past them.
   * Single precision clobbers v1-v2, double precision clobbers v2-v5.
   * The lanes of v0 hold partial sums until KERNEL_F8_FINALIZE runs.
   */
  .macro KERNEL_F8
  #if defined(DOUBLE)
          ld1     {v2.2d, v3.2d, v4.2d, v5.2d}, [X]   // 8 doubles, two per register
          add     X, X, #64                           // X += 8 * 8 bytes
          fabs    v2.2d, v2.2d
          fabs    v3.2d, v3.2d
          fabs    v4.2d, v4.2d
          fabs    v5.2d, v5.2d
          PRFM    PLDL1KEEP, [X, #1024]               // prefetch hint 1024 bytes past X
          fadd    v2.2d, v2.2d, v3.2d                 // v2 = |x0..1| + |x2..3|
          fadd    v4.2d, v4.2d, v5.2d                 // v4 = |x4..5| + |x6..7|
          fadd    v0.2d, v0.2d, v2.2d
          fadd    v0.2d, v0.2d, v4.2d
  #else   // single precision
          ld1     {v1.4s, v2.4s}, [X], #32            // 8 floats, then X += 32 bytes
          fabs    v1.4s, v1.4s
          fabs    v2.4s, v2.4s
          fadd    v1.4s, v1.4s, v2.4s                 // v1 = |x0..3| + |x4..7|
          fadd    v0.4s, v0.4s, v1.4s                 // fold into the four partial sums
          PRFM    PLDL1KEEP, [X, #1024]               // prefetch hint 1024 bytes past X
  #endif
  .endm
  /*
   * KERNEL_F8_FINALIZE: reduce the partial sums held in the lanes of v0
   * to the scalar SUMF. The single precision variant clobbers v1.
   */
  .macro KERNEL_F8_FINALIZE
  #if defined(DOUBLE)
          faddp   SUMF, v0.2d                     // SUMF = v0[0] + v0[1]
  #else
          ext     v1.16b, v0.16b, v0.16b, #8      // v1 = v0 with its 64-bit halves swapped
          fadd    v0.2s, v0.2s, v1.2s             // lanes 0,1 = v0[0]+v0[2], v0[1]+v0[3]
          faddp   SUMF, v0.2s                     // SUMF = lane 0 + lane 1
  #endif
  .endm
  /*
   * INIT_S: multiply INC_X by the element size (8 or 4 bytes) so that
   * KERNEL_S1 can use it directly as a post-index register offset.
   */
  .macro INIT_S
  #if defined(DOUBLE)
          lsl     INC_X, INC_X, #3        // INC_X *= 8
  #else
          lsl     INC_X, INC_X, #2        // INC_X *= 4
  #endif
  .endm
  /*
   * KERNEL_S1: SUMF += |*X| for one strided element.
   * X is post-incremented by the register INC_X. Clobbers TMPF (lane 0 of v1).
   */
  .macro KERNEL_S1
          ld1     TMPVF, [X], INC_X       // lane 0 of v1 = *X, then X += INC_X
          fabs    TMPF, TMPF              // TMPF = |TMPF|
          fadd    SUMF, SUMF, TMPF        // SUMF += TMPF
  .endm
  98. /*******************************************************************************
  99. * End of macro definitions
  100. *******************************************************************************/
  /*
   * Sum of absolute values of a strided vector.
   *
   * In:    N (x0)      element count
   *        X (x1)      address of the first element
   *        INC_X (x2)  stride between elements
   * Out:   SUMF (s0 or d0) holds the sum on return.
   *        SUMF is 0 when N <= 0 or INC_X <= 0.
   * Clobb: X, INC_X, I, flags, and the vector registers used by the
   *        KERNEL_* macros.
   * NOTE(review): PROLOGUE and EPILOGUE come from common.h and are not
   * visible here - presumably they emit the symbol and its size.
   */
  PROLOGUE

          fmov    SUMF, REG0              // sum = 0 (scalar write clears all of v0)
          fmov    TMPF, SUMF              // temporary starts at zero as well

          cmp     N, xzr
          ble     .Lasum_done             // N <= 0: return 0
          cmp     INC_X, xzr
          ble     .Lasum_done             // INC_X <= 0: return 0

          cmp     INC_X, #1
          bne     .Lasum_strided          // only stride 1 takes the vector path

  /* ---- Unit stride: blocks of eight, then a scalar tail ---- */
  .Lasum_unit:
          asr     I, N, #3                // I = N / 8 full blocks
          cmp     I, xzr
          beq     .Lasum_unit_tail

  .Lasum_unit_loop8:
          KERNEL_F8
          subs    I, I, #1
          bne     .Lasum_unit_loop8

          KERNEL_F8_FINALIZE              // lanes of v0 -> SUMF

  .Lasum_unit_tail:
          ands    I, N, #7                // I = N % 8 leftover elements
          ble     .Lasum_done             // remainder is 0..7, so this fires only on zero

  .Lasum_unit_tail_loop:
          KERNEL_F1
          subs    I, I, #1
          bne     .Lasum_unit_tail_loop

  .Lasum_done:
          ret

  /* ---- General stride: scalar, unrolled four times, then a tail ---- */
  .Lasum_strided:
          INIT_S                          // scale INC_X by the element size

          asr     I, N, #2                // I = N / 4 unrolled iterations
          cmp     I, xzr
          ble     .Lasum_strided_tail

  .Lasum_strided_loop4:
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          subs    I, I, #1
          bne     .Lasum_strided_loop4

  .Lasum_strided_tail:
          ands    I, N, #3                // I = N % 4 leftover elements
          ble     .Lasum_done

  .Lasum_strided_tail_loop:
          KERNEL_S1
          subs    I, I, #1
          bne     .Lasum_strided_tail_loop

          ret

  EPILOGUE