You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemv_n_microk_power10.c 15 kB


  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define HAVE_KERNEL_4x4 1
  28. static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
  29. {
  30. double *a0;
  31. double *a1;
  32. double *a2;
  33. double *a3;
  34. __asm__
  35. (
  36. "lxvp 40, 0(%10) \n\t" // x0, x1
  37. XXSPLTD_S(32,%x9,0) // alpha, alpha
  38. "sldi %6, %13, 3 \n\t" // lda * sizeof (double)
  39. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  40. "xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha
  41. "xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha
  42. #else
  43. "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
  44. "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
  45. #endif
  46. "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
  47. "add %6, %6, %6 \n\t" // 2 * lda
  48. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  49. XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
  50. XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
  51. XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
  52. XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
  53. #else
  54. XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
  55. XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
  56. XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
  57. XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
  58. #endif
  59. "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
  60. "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
  61. "dcbt 0, %3 \n\t"
  62. "dcbt 0, %4 \n\t"
  63. "dcbt 0, %5 \n\t"
  64. "dcbt 0, %6 \n\t"
  65. "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
  66. "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
  67. "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
  68. "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
  69. "dcbt 0, %2 \n\t"
  70. "addi %3, %3, 32 \n\t"
  71. "addi %4, %4, 32 \n\t"
  72. "addi %5, %5, 32 \n\t"
  73. "addi %6, %6, 32 \n\t"
  74. "addic. %1, %1, -4 \n\t"
  75. "ble two%= \n\t"
  76. ".align 5 \n"
  77. "one%=: \n\t"
  78. "lxvp 36, 0(%2) \n\t" // y0, y1
  79. "xvmaddadp 36, 40, 32 \n\t"
  80. "xvmaddadp 37, 41, 32 \n\t"
  81. "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
  82. "xvmaddadp 36, 42, 33 \n\t"
  83. "addi %3, %3, 32 \n\t"
  84. "xvmaddadp 37, 43, 33 \n\t"
  85. "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
  86. "xvmaddadp 36, 44, 34 \n\t"
  87. "addi %4, %4, 32 \n\t"
  88. "xvmaddadp 37, 45, 34 \n\t"
  89. "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
  90. "xvmaddadp 36, 46, 35 \n\t"
  91. "addi %5, %5, 32 \n\t"
  92. "xvmaddadp 37, 47, 35 \n\t"
  93. "stxvp 36, 0(%2) \n\t" // y0, y1
  94. "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
  95. "addi %6, %6, 32 \n\t"
  96. "addi %2, %2, 32 \n\t"
  97. "addic. %1, %1, -4 \n\t"
  98. "ble two%= \n\t"
  99. "lxvp 36, 0(%2) \n\t" // y0, y1
  100. "xvmaddadp 36, 40, 32 \n\t"
  101. "xvmaddadp 37, 41, 32 \n\t"
  102. "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
  103. "xvmaddadp 36, 42, 33 \n\t"
  104. "addi %3, %3, 32 \n\t"
  105. "xvmaddadp 37, 43, 33 \n\t"
  106. "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
  107. "xvmaddadp 36, 44, 34 \n\t"
  108. "addi %4, %4, 32 \n\t"
  109. "xvmaddadp 37, 45, 34 \n\t"
  110. "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
  111. "xvmaddadp 36, 46, 35 \n\t"
  112. "addi %5, %5, 32 \n\t"
  113. "xvmaddadp 37, 47, 35 \n\t"
  114. "stxvp 36, 0(%2) \n\t" // y0, y1
  115. "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
  116. "addi %6, %6, 32 \n\t"
  117. "addi %2, %2, 32 \n\t"
  118. "addic. %1, %1, -4 \n\t"
  119. "ble two%= \n\t"
  120. "lxvp 36, 0(%2) \n\t" // y0, y1
  121. "xvmaddadp 36, 40, 32 \n\t"
  122. "xvmaddadp 37, 41, 32 \n\t"
  123. "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
  124. "xvmaddadp 36, 42, 33 \n\t"
  125. "addi %3, %3, 32 \n\t"
  126. "xvmaddadp 37, 43, 33 \n\t"
  127. "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
  128. "xvmaddadp 36, 44, 34 \n\t"
  129. "addi %4, %4, 32 \n\t"
  130. "xvmaddadp 37, 45, 34 \n\t"
  131. "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
  132. "xvmaddadp 36, 46, 35 \n\t"
  133. "addi %5, %5, 32 \n\t"
  134. "xvmaddadp 37, 47, 35 \n\t"
  135. "stxvp 36, 0(%2) \n\t" // y0, y1
  136. "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
  137. "addi %6, %6, 32 \n\t"
  138. "addi %2, %2, 32 \n\t"
  139. "addic. %1, %1, -4 \n\t"
  140. "ble two%= \n\t"
  141. "lxvp 36, 0(%2) \n\t" // y0, y1
  142. "xvmaddadp 36, 40, 32 \n\t"
  143. "xvmaddadp 37, 41, 32 \n\t"
  144. "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
  145. "xvmaddadp 36, 42, 33 \n\t"
  146. "addi %3, %3, 32 \n\t"
  147. "xvmaddadp 37, 43, 33 \n\t"
  148. "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
  149. "xvmaddadp 36, 44, 34 \n\t"
  150. "addi %4, %4, 32 \n\t"
  151. "xvmaddadp 37, 45, 34 \n\t"
  152. "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
  153. "xvmaddadp 36, 46, 35 \n\t"
  154. "addi %5, %5, 32 \n\t"
  155. "xvmaddadp 37, 47, 35 \n\t"
  156. "stxvp 36, 0(%2) \n\t" // y0, y1
  157. "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
  158. "addi %6, %6, 32 \n\t"
  159. "addi %2, %2, 32 \n\t"
  160. "addic. %1, %1, -4 \n\t"
  161. "bgt one%= \n"
  162. "two%=: \n\t"
  163. "lxvp 36, 0(%2) \n\t" // y0, y1
  164. "xvmaddadp 36, 40, 32 \n\t"
  165. "xvmaddadp 37, 41, 32 \n\t"
  166. "xvmaddadp 36, 42, 33 \n\t"
  167. "xvmaddadp 37, 43, 33 \n\t"
  168. "xvmaddadp 36, 44, 34 \n\t"
  169. "xvmaddadp 37, 45, 34 \n\t"
  170. "xvmaddadp 36, 46, 35 \n\t"
  171. "xvmaddadp 37, 47, 35 \n\t"
  172. "stxvp 36, 0(%2) \n\t" // y0, y1
  173. "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n"
  174. "#a0=%3 a1=%4 a2=%5 a3=%6"
  175. :
  176. "+m" (*y),
  177. "+r" (n), // 1
  178. "+b" (y), // 2
  179. "=b" (a0), // 3
  180. "=b" (a1), // 4
  181. "=&b" (a2), // 5
  182. "=&b" (a3) // 6
  183. :
  184. "m" (*x),
  185. "m" (*ap),
  186. "d" (alpha), // 9
  187. "r" (x), // 10
  188. "b" (16), // 11
  189. "3" (ap), // 12
  190. "4" (lda) // 13
  191. :
  192. "cr0",
  193. "vs32","vs33","vs34","vs35","vs36","vs37",
  194. "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
  195. );
  196. }
  197. static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha)
  198. {
  199. double *a0;
  200. double *a1;
  201. double *a2;
  202. double *a3;
  203. double *a4;
  204. double *a5;
  205. double *a6;
  206. double *a7;
  207. long tmp;
  208. __asm__
  209. (
  210. "lxvp 34, 0( %15) \n\t" // x0, x1
  211. "lxvp 38, 32( %15) \n\t" // x4, x5
  212. XXSPLTD_S(58,%x14,0) // alpha, alpha
  213. "sldi %10, %17, 3 \n\t" // lda * sizeof (double)
  214. "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha
  215. "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha
  216. "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha
  217. "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha
  218. "li %11, 32 \n\t"
  219. "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
  220. "add %10, %10, %10 \n\t" // 2 * lda
  221. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  222. XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
  223. XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
  224. XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
  225. XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
  226. XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha
  227. XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha
  228. XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha
  229. XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha
  230. #else
  231. XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
  232. XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
  233. XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
  234. XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
  235. XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha
  236. XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
  237. XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
  238. XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
  239. #endif
  240. "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
  241. "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
  242. "add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda
  243. "add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda
  244. "add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda
  245. "add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda
  246. "lxvp 40, 0( %3) \n\t" // a0[0], a0[1]
  247. "lxvp 42, 0( %4) \n\t" // a1[0], a1[1]
  248. "lxvp 44, 0( %5) \n\t" // a2[0], a2[1]
  249. "lxvp 46, 0( %6) \n\t" // a3[0], a3[1]
  250. "lxvp 50, 0( %7) \n\t" // a4[0]
  251. "lxvp 52, 0( %8) \n\t" // a5[0]
  252. "lxvp 54, 0( %9) \n\t" // a6[0]
  253. "lxvp 56, 0( %10) \n\t" // a7[0]
  254. "addic. %1, %1, -4 \n\t"
  255. "ble two%= \n\t"
  256. ".align 5 \n"
  257. "one%=: \n\t"
  258. "lxvp 36, 0( %2) \n\t" // y0, y1
  259. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  260. "xvmaddadp 36, 40, 32 \n\t"
  261. "xvmaddadp 37, 41, 32 \n\t"
  262. #else
  263. "xvmaddadp 36, 40, 34 \n\t"
  264. "xvmaddadp 37, 41, 34 \n\t"
  265. #endif
  266. "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
  267. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  268. "xvmaddadp 36, 42, 33 \n\t"
  269. "xvmaddadp 37, 43, 33 \n\t"
  270. #else
  271. "xvmaddadp 36, 42, 35 \n\t"
  272. "xvmaddadp 37, 43, 35 \n\t"
  273. #endif
  274. "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
  275. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  276. "xvmaddadp 36, 44, 34 \n\t"
  277. "xvmaddadp 37, 45, 34 \n\t"
  278. #else
  279. "xvmaddadp 36, 44, 32 \n\t"
  280. "xvmaddadp 37, 45, 32 \n\t"
  281. #endif
  282. "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
  283. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  284. "xvmaddadp 36, 46, 35 \n\t"
  285. "xvmaddadp 37, 47, 35 \n\t"
  286. #else
  287. "xvmaddadp 36, 46, 33 \n\t"
  288. "xvmaddadp 37, 47, 33 \n\t"
  289. #endif
  290. "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
  291. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  292. "xvmaddadp 36, 50, 38 \n\t"
  293. "xvmaddadp 37, 51, 38 \n\t"
  294. #else
  295. "xvmaddadp 36, 50, 48 \n\t"
  296. "xvmaddadp 37, 51, 48 \n\t"
  297. #endif
  298. "lxvpx 50, %7, %11 \n\t" // a4[0]
  299. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  300. "xvmaddadp 36, 52, 39 \n\t"
  301. "xvmaddadp 37, 53, 39 \n\t"
  302. #else
  303. "xvmaddadp 36, 52, 49 \n\t"
  304. "xvmaddadp 37, 53, 49 \n\t"
  305. #endif
  306. "lxvpx 52, %8, %11 \n\t" // a5[0]
  307. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  308. "xvmaddadp 36, 54, 48 \n\t"
  309. "xvmaddadp 37, 55, 48 \n\t"
  310. #else
  311. "xvmaddadp 36, 54, 38 \n\t"
  312. "xvmaddadp 37, 55, 38 \n\t"
  313. #endif
  314. "lxvpx 54, %9, %11 \n\t" // a6[0]
  315. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  316. "xvmaddadp 36, 56, 49 \n\t"
  317. "xvmaddadp 37, 57, 49 \n\t"
  318. #else
  319. "xvmaddadp 36, 56, 39 \n\t"
  320. "xvmaddadp 37, 57, 39 \n\t"
  321. #endif
  322. "lxvpx 56, %10, %11 \n\t" // a7[0]
  323. "addi %11, %11, 32 \n\t"
  324. "stxvp 36, 0( %2) \n\t" // y0, y1
  325. "addi %2, %2, 32 \n\t"
  326. "addic. %1, %1, -4 \n\t"
  327. "bgt one%= \n"
  328. "two%=: \n\t"
  329. "lxvp 36, 0( %2) \n\t" // y0, y1
  330. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  331. "xvmaddadp 36, 40, 32 \n\t"
  332. "xvmaddadp 37, 41, 32 \n\t"
  333. "xvmaddadp 36, 42, 33 \n\t"
  334. "xvmaddadp 37, 43, 33 \n\t"
  335. "xvmaddadp 36, 44, 34 \n\t"
  336. "xvmaddadp 37, 45, 34 \n\t"
  337. "xvmaddadp 36, 46, 35 \n\t"
  338. "xvmaddadp 37, 47, 35 \n\t"
  339. "xvmaddadp 36, 50, 38 \n\t"
  340. "xvmaddadp 37, 51, 38 \n\t"
  341. "xvmaddadp 36, 52, 39 \n\t"
  342. "xvmaddadp 37, 53, 39 \n\t"
  343. "xvmaddadp 36, 54, 48 \n\t"
  344. "xvmaddadp 37, 55, 48 \n\t"
  345. "xvmaddadp 36, 56, 49 \n\t"
  346. "xvmaddadp 37, 57, 49 \n\t"
  347. #else
  348. "xvmaddadp 36, 40, 34 \n\t"
  349. "xvmaddadp 37, 41, 34 \n\t"
  350. "xvmaddadp 36, 42, 35 \n\t"
  351. "xvmaddadp 37, 43, 35 \n\t"
  352. "xvmaddadp 36, 44, 32 \n\t"
  353. "xvmaddadp 37, 45, 32 \n\t"
  354. "xvmaddadp 36, 46, 33 \n\t"
  355. "xvmaddadp 37, 47, 33 \n\t"
  356. "xvmaddadp 36, 50, 48 \n\t"
  357. "xvmaddadp 37, 51, 48 \n\t"
  358. "xvmaddadp 36, 52, 49 \n\t"
  359. "xvmaddadp 37, 53, 49 \n\t"
  360. "xvmaddadp 36, 54, 38 \n\t"
  361. "xvmaddadp 37, 55, 38 \n\t"
  362. "xvmaddadp 36, 56, 39 \n\t"
  363. "xvmaddadp 37, 57, 39 \n\t"
  364. #endif
  365. "stxvp 36, 0( %2) \n\t" // y0, y1
  366. :
  367. "+m" (*y),
  368. "+r" (n), // 1
  369. "+b" (y), // 2
  370. "=b" (a0), // 3
  371. "=b" (a1), // 4
  372. "=&b" (a2), // 5
  373. "=&b" (a3), // 6
  374. "=&b" (a4), // 7
  375. "=&b" (a5), // 8
  376. "=&b" (a6), // 9
  377. "=&b" (a7), // 10
  378. "=b" (tmp)
  379. :
  380. "m" (*x),
  381. "m" (*ap),
  382. "d" (alpha), // 14
  383. "r" (x), // 15
  384. "3" (ap), // 16
  385. "4" (lda) // 17
  386. :
  387. "cr0",
  388. "vs32","vs33","vs34","vs35","vs36","vs37",
  389. "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48",
  390. "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58"
  391. );
  392. }