You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zaxpy_microk_power8.c 8.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2016/03/23 Werner Saar (wernsaar@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  34. #define HAVE_KERNEL_4 1
  35. static void zaxpy_kernel_4 (long n, double *x, double *y,
  36. double alpha_r, double alpha_i)
  37. {
  38. #if !defined(CONJ)
  39. static const double mvec[2] = { -1.0, 1.0 };
  40. #else
  41. static const double mvec[2] = { 1.0, -1.0 };
  42. #endif
  43. const double *mvecp = mvec;
  44. __vector double t0;
  45. __vector double t1;
  46. __vector double t2;
  47. __vector double t3;
  48. __vector double t4;
  49. __vector double t5;
  50. __vector double t6;
  51. __vector double t7;
  52. __vector double t8;
  53. __vector double t9;
  54. __vector double t10;
  55. __vector double t11;
  56. long ytmp;
  57. __asm__
  58. (
  59. XXSPLTD_S(32,%x19,0) // alpha_r
  60. XXSPLTD_S(33,%x20,0) // alpha_i
  61. "lxvd2x 36, 0, %21 \n\t" // mvec
  62. #if !defined(CONJ)
  63. "xvmuldp 33, 33, 36 \n\t" // alpha_i * mvec
  64. #else
  65. "xvmuldp 32, 32, 36 \n\t" // alpha_r * mvec
  66. #endif
  67. "mr %16, %3 \n\t"
  68. "dcbt 0, %2 \n\t"
  69. "dcbt 0, %3 \n\t"
  70. "lxvd2x 40, 0, %2 \n\t" // x0
  71. "lxvd2x 41, %22, %2 \n\t" // x1
  72. "lxvd2x 42, %23, %2 \n\t" // x2
  73. "lxvd2x 43, %24, %2 \n\t" // x3
  74. "lxvd2x 48, 0, %3 \n\t" // y0
  75. "lxvd2x 49, %22, %3 \n\t" // y1
  76. "lxvd2x 50, %23, %3 \n\t" // y2
  77. "lxvd2x 51, %24, %3 \n\t" // y3
  78. XXSWAPD_S(%x8,40) // exchange real and imag part
  79. XXSWAPD_S(%x9,41) // exchange real and imag part
  80. XXSWAPD_S(%x10,42) // exchange real and imag part
  81. XXSWAPD_S(%x11,43) // exchange real and imag part
  82. "addi %2, %2, 64 \n\t"
  83. "addi %3, %3, 64 \n\t"
  84. "lxvd2x 44, 0, %2 \n\t" // x4
  85. "lxvd2x 45, %22, %2 \n\t" // x5
  86. "lxvd2x 46, %23, %2 \n\t" // x6
  87. "lxvd2x 47, %24, %2 \n\t" // x7
  88. "lxvd2x %x4, 0, %3 \n\t" // y4
  89. "lxvd2x %x5, %22, %3 \n\t" // y5
  90. "lxvd2x %x6, %23, %3 \n\t" // y6
  91. "lxvd2x %x7, %24, %3 \n\t" // y7
  92. XXSWAPD_S(%x12,44) // exchange real and imag part
  93. XXSWAPD_S(%x13,45) // exchange real and imag part
  94. XXSWAPD_S(%x14,46) // exchange real and imag part
  95. XXSWAPD_S(%x15,47) // exchange real and imag part
  96. "addi %2, %2, 64 \n\t"
  97. "addi %3, %3, 64 \n\t"
  98. "addic. %1, %1, -8 \n\t"
  99. "ble two%= \n\t"
  100. ".align 5 \n"
  101. "one%=: \n\t"
  102. "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
  103. "xvmaddadp 49, 41, 32 \n\t"
  104. "lxvd2x 40, 0, %2 \n\t" // x0
  105. "lxvd2x 41, %22, %2 \n\t" // x1
  106. "xvmaddadp 50, 42, 32 \n\t"
  107. "xvmaddadp 51, 43, 32 \n\t"
  108. "lxvd2x 42, %23, %2 \n\t" // x2
  109. "lxvd2x 43, %24, %2 \n\t" // x3
  110. "xvmaddadp %x4, 44, 32 \n\t"
  111. "addi %2, %2, 64 \n\t"
  112. "xvmaddadp %x5, 45, 32 \n\t"
  113. "lxvd2x 44, 0, %2 \n\t" // x4
  114. "lxvd2x 45, %22, %2 \n\t" // x5
  115. "xvmaddadp %x6, 46, 32 \n\t"
  116. "xvmaddadp %x7, 47, 32 \n\t"
  117. "lxvd2x 46, %23, %2 \n\t" // x6
  118. "lxvd2x 47, %24, %2 \n\t" // x7
  119. "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
  120. "addi %2, %2, 64 \n\t"
  121. "xvmaddadp 49, %x9, 33 \n\t"
  122. "xvmaddadp 50, %x10, 33 \n\t"
  123. "xvmaddadp 51, %x11, 33 \n\t"
  124. "xvmaddadp %x4, %x12, 33 \n\t"
  125. "xvmaddadp %x5, %x13, 33 \n\t"
  126. "xvmaddadp %x6, %x14, 33 \n\t"
  127. "xvmaddadp %x7, %x15, 33 \n\t"
  128. "stxvd2x 48, 0, %16 \n\t"
  129. "stxvd2x 49, %22, %16 \n\t"
  130. "stxvd2x 50, %23, %16 \n\t"
  131. "stxvd2x 51, %24, %16 \n\t"
  132. "addi %16, %16, 64 \n\t"
  133. "stxvd2x %x4, 0, %16 \n\t"
  134. "stxvd2x %x5, %22, %16 \n\t"
  135. "stxvd2x %x6, %23, %16 \n\t"
  136. "stxvd2x %x7, %24, %16 \n\t"
  137. "addi %16, %16, 64 \n\t"
  138. XXSWAPD_S(%x8,40) // exchange real and imag part
  139. XXSWAPD_S(%x9,41) // exchange real and imag part
  140. "lxvd2x 48, 0, %3 \n\t" // y0
  141. "lxvd2x 49, %22, %3 \n\t" // y1
  142. XXSWAPD_S(%x10,42) // exchange real and imag part
  143. XXSWAPD_S(%x11,43) // exchange real and imag part
  144. "lxvd2x 50, %23, %3 \n\t" // y2
  145. "lxvd2x 51, %24, %3 \n\t" // y3
  146. XXSWAPD_S(%x12,44) // exchange real and imag part
  147. "addi %3, %3, 64 \n\t"
  148. XXSWAPD_S(%x13,45) // exchange real and imag part
  149. "lxvd2x %x4, 0, %3 \n\t" // y4
  150. "lxvd2x %x5, %22, %3 \n\t" // y5
  151. XXSWAPD_S(%x14,46) // exchange real and imag part
  152. XXSWAPD_S(%x15,47) // exchange real and imag part
  153. "lxvd2x %x6, %23, %3 \n\t" // y6
  154. "lxvd2x %x7, %24, %3 \n\t" // y7
  155. "addi %3, %3, 64 \n\t"
  156. "addic. %1, %1, -8 \n\t"
  157. "bgt one%= \n"
  158. "two%=: \n\t"
  159. "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
  160. "xvmaddadp 49, 41, 32 \n\t"
  161. "xvmaddadp 50, 42, 32 \n\t"
  162. "xvmaddadp 51, 43, 32 \n\t"
  163. "xvmaddadp %x4, 44, 32 \n\t"
  164. "xvmaddadp %x5, 45, 32 \n\t"
  165. "xvmaddadp %x6, 46, 32 \n\t"
  166. "xvmaddadp %x7, 47, 32 \n\t"
  167. "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
  168. "xvmaddadp 49, %x9, 33 \n\t"
  169. "xvmaddadp 50, %x10, 33 \n\t"
  170. "xvmaddadp 51, %x11, 33 \n\t"
  171. "xvmaddadp %x4, %x12, 33 \n\t"
  172. "xvmaddadp %x5, %x13, 33 \n\t"
  173. "xvmaddadp %x6, %x14, 33 \n\t"
  174. "xvmaddadp %x7, %x15, 33 \n\t"
  175. "stxvd2x 48, 0, %16 \n\t"
  176. "stxvd2x 49, %22, %16 \n\t"
  177. "stxvd2x 50, %23, %16 \n\t"
  178. "stxvd2x 51, %24, %16 \n\t"
  179. "addi %16, %16, 64 \n\t"
  180. "stxvd2x %x4, 0, %16 \n\t"
  181. "stxvd2x %x5, %22, %16 \n\t"
  182. "stxvd2x %x6, %23, %16 \n\t"
  183. "stxvd2x %x7, %24, %16 \n"
  184. "#n=%1 x=%17=%2 y=%0=%3 alpha=(%19,%20) mvecp=%18=%16 o16=%22 o32=%23 o48=%24 ytmp=%16\n"
  185. "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15"
  186. :
  187. "+m" (*y),
  188. "+r" (n), // 1
  189. "+b" (x), // 2
  190. "+b" (y), // 3
  191. "=wa" (t0), // 4
  192. "=wa" (t1), // 5
  193. "=wa" (t2), // 6
  194. "=wa" (t3), // 7
  195. "=wa" (t4), // 8
  196. "=wa" (t5), // 9
  197. "=wa" (t6), // 10
  198. "=wa" (t7), // 11
  199. "=wa" (t8), // 12
  200. "=wa" (t9), // 13
  201. "=wa" (t10), // 14
  202. "=wa" (t11), // 15
  203. "=b" (ytmp) // 16
  204. :
  205. "m" (*x),
  206. "m" (*mvecp),
  207. "d" (alpha_r), // 19
  208. "d" (alpha_i), // 20
  209. "16" (mvecp), // 21
  210. "b" (16), // 22
  211. "b" (32), // 23
  212. "b" (48) // 24
  213. :
  214. "cr0",
  215. "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
  216. "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
  217. "vs48","vs49","vs50","vs51"
  218. );
  219. }