/* dgemv_n_msa.c — double-precision GEMV (no-transpose) kernel for MIPS MSA. */
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
  29. #define DGEMV_N_8x8() \
  30. { \
  31. LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
  32. LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
  33. LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
  34. LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
  35. LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
  36. LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
  37. LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
  38. LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
  39. \
  40. y0 += tp0 * t0; \
  41. y1 += tp0 * t1; \
  42. y2 += tp0 * t2; \
  43. y3 += tp0 * t3; \
  44. \
  45. y0 += tp1 * t4; \
  46. y1 += tp1 * t5; \
  47. y2 += tp1 * t6; \
  48. y3 += tp1 * t7; \
  49. \
  50. y0 += tp2 * t8; \
  51. y1 += tp2 * t9; \
  52. y2 += tp2 * t10; \
  53. y3 += tp2 * t11; \
  54. \
  55. y0 += tp3 * t12; \
  56. y1 += tp3 * t13; \
  57. y2 += tp3 * t14; \
  58. y3 += tp3 * t15; \
  59. \
  60. y0 += tp4 * t16; \
  61. y1 += tp4 * t17; \
  62. y2 += tp4 * t18; \
  63. y3 += tp4 * t19; \
  64. \
  65. y0 += tp5 * t20; \
  66. y1 += tp5 * t21; \
  67. y2 += tp5 * t22; \
  68. y3 += tp5 * t23; \
  69. \
  70. y0 += tp6 * t24; \
  71. y1 += tp6 * t25; \
  72. y2 += tp6 * t26; \
  73. y3 += tp6 * t27; \
  74. \
  75. y0 += tp7 * t28; \
  76. y1 += tp7 * t29; \
  77. y2 += tp7 * t30; \
  78. y3 += tp7 * t31; \
  79. }
  80. #define DGEMV_N_4x8() \
  81. { \
  82. LD_DP2(pa0 + k, 2, t0, t1); \
  83. LD_DP2(pa1 + k, 2, t4, t5); \
  84. LD_DP2(pa2 + k, 2, t8, t9); \
  85. LD_DP2(pa3 + k, 2, t12, t13); \
  86. LD_DP2(pa4 + k, 2, t16, t17); \
  87. LD_DP2(pa5 + k, 2, t20, t21); \
  88. LD_DP2(pa6 + k, 2, t24, t25); \
  89. LD_DP2(pa7 + k, 2, t28, t29); \
  90. \
  91. y0 += tp0 * t0; \
  92. y1 += tp0 * t1; \
  93. \
  94. y0 += tp1 * t4; \
  95. y1 += tp1 * t5; \
  96. \
  97. y0 += tp2 * t8; \
  98. y1 += tp2 * t9; \
  99. \
  100. y0 += tp3 * t12; \
  101. y1 += tp3 * t13; \
  102. \
  103. y0 += tp4 * t16; \
  104. y1 += tp4 * t17; \
  105. \
  106. y0 += tp5 * t20; \
  107. y1 += tp5 * t21; \
  108. \
  109. y0 += tp6 * t24; \
  110. y1 += tp6 * t25; \
  111. \
  112. y0 += tp7 * t28; \
  113. y1 += tp7 * t29; \
  114. }
  115. #define DGEMV_N_8x4() \
  116. { \
  117. LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
  118. LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
  119. LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
  120. LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
  121. \
  122. y0 += tp0 * t0; \
  123. y1 += tp0 * t1; \
  124. y2 += tp0 * t2; \
  125. y3 += tp0 * t3; \
  126. \
  127. y0 += tp1 * t4; \
  128. y1 += tp1 * t5; \
  129. y2 += tp1 * t6; \
  130. y3 += tp1 * t7; \
  131. \
  132. y0 += tp2 * t8; \
  133. y1 += tp2 * t9; \
  134. y2 += tp2 * t10; \
  135. y3 += tp2 * t11; \
  136. \
  137. y0 += tp3 * t12; \
  138. y1 += tp3 * t13; \
  139. y2 += tp3 * t14; \
  140. y3 += tp3 * t15; \
  141. }
  142. #define DGEMV_N_4x4() \
  143. { \
  144. LD_DP2(pa0 + k, 2, t0, t1); \
  145. LD_DP2(pa1 + k, 2, t4, t5); \
  146. LD_DP2(pa2 + k, 2, t8, t9); \
  147. LD_DP2(pa3 + k, 2, t12, t13); \
  148. \
  149. y0 += tp0 * t0; \
  150. y1 += tp0 * t1; \
  151. \
  152. y0 += tp1 * t4; \
  153. y1 += tp1 * t5; \
  154. \
  155. y0 += tp2 * t8; \
  156. y1 += tp2 * t9; \
  157. \
  158. y0 += tp3 * t12; \
  159. y1 += tp3 * t13; \
  160. }
  161. #define DGEMV_N_8x2() \
  162. { \
  163. LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
  164. LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
  165. \
  166. y0 += tp0 * t0; \
  167. y1 += tp0 * t1; \
  168. y2 += tp0 * t2; \
  169. y3 += tp0 * t3; \
  170. \
  171. y0 += tp1 * t4; \
  172. y1 += tp1 * t5; \
  173. y2 += tp1 * t6; \
  174. y3 += tp1 * t7; \
  175. }
  176. #define DGEMV_N_4x2() \
  177. { \
  178. LD_DP2(pa0 + k, 2, t0, t1); \
  179. LD_DP2(pa1 + k, 2, t4, t5); \
  180. \
  181. y0 += tp0 * t0; \
  182. y1 += tp0 * t1; \
  183. \
  184. y0 += tp1 * t4; \
  185. y1 += tp1 * t5; \
  186. }
  187. #define DLOAD_X8_SCALE_GP() \
  188. temp0 = alpha * x[0 * inc_x]; \
  189. temp1 = alpha * x[1 * inc_x]; \
  190. temp2 = alpha * x[2 * inc_x]; \
  191. temp3 = alpha * x[3 * inc_x]; \
  192. temp4 = alpha * x[4 * inc_x]; \
  193. temp5 = alpha * x[5 * inc_x]; \
  194. temp6 = alpha * x[6 * inc_x]; \
  195. temp7 = alpha * x[7 * inc_x]; \
  196. \
  197. tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
  198. tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
  199. tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
  200. tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
  201. tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \
  202. tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \
  203. tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \
  204. tp7 = COPY_DOUBLE_TO_VECTOR(temp7); \
  205. #define DLOAD_X4_SCALE_GP() \
  206. temp0 = alpha * x[0 * inc_x]; \
  207. temp1 = alpha * x[1 * inc_x]; \
  208. temp2 = alpha * x[2 * inc_x]; \
  209. temp3 = alpha * x[3 * inc_x]; \
  210. \
  211. tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
  212. tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
  213. tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
  214. tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
  215. #define DLOAD_X8_SCALE_VECTOR() \
  216. LD_DP4(x, 2, x0, x1, x2, x3); \
  217. \
  218. x0 = x0 * v_alpha; \
  219. x1 = x1 * v_alpha; \
  220. x2 = x2 * v_alpha; \
  221. x3 = x3 * v_alpha; \
  222. \
  223. SPLATI_D2_DP(x0, tp0, tp1); \
  224. SPLATI_D2_DP(x1, tp2, tp3); \
  225. SPLATI_D2_DP(x2, tp4, tp5); \
  226. SPLATI_D2_DP(x3, tp6, tp7); \
  227. #define DLOAD_X4_SCALE_VECTOR() \
  228. LD_DP2(x, 2, x0, x1); \
  229. \
  230. x0 = x0 * v_alpha; \
  231. x1 = x1 * v_alpha; \
  232. \
  233. SPLATI_D2_DP(x0, tp0, tp1); \
  234. SPLATI_D2_DP(x1, tp2, tp3); \
  235. #define DLOAD_Y8_GP() \
  236. y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
  237. y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
  238. y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
  239. y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \
  240. y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \
  241. y2 = (v2f64) __msa_insert_d((v2i64) y2, 1, *((long long *)(y + 5 * inc_y))); \
  242. y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \
  243. y3 = (v2f64) __msa_insert_d((v2i64) y3, 1, *((long long *)(y + 7 * inc_y))); \
  244. #define DLOAD_Y4_GP() \
  245. y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
  246. y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
  247. y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
  248. y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \
  249. #define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3);
  250. #define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1);
  251. #define DSTORE_Y8_GP() \
  252. *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
  253. *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
  254. *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
  255. *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
  256. *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \
  257. *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \
  258. *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \
  259. *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1); \
  260. #define DSTORE_Y4_GP() \
  261. *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
  262. *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
  263. *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
  264. *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
  265. #define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2);
  266. #define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2);
  267. #define DGEMV_N_MSA() \
  268. for (j = (n >> 3); j--;) \
  269. { \
  270. DLOAD_X8_SCALE(); \
  271. \
  272. k = 0; \
  273. y = y_org; \
  274. \
  275. for (i = (m >> 3); i--;) \
  276. { \
  277. DLOAD_Y8(); \
  278. DGEMV_N_8x8(); \
  279. DSTORE_Y8(); \
  280. \
  281. y += 8 * inc_y; \
  282. k += 8; \
  283. } \
  284. \
  285. if (m & 4) \
  286. { \
  287. DLOAD_Y4(); \
  288. DGEMV_N_4x8(); \
  289. DSTORE_Y4(); \
  290. \
  291. y += 4 * inc_y; \
  292. k += 4; \
  293. } \
  294. \
  295. if (m & 3) \
  296. { \
  297. temp0 = alpha * x[0 * inc_x]; \
  298. temp1 = alpha * x[1 * inc_x]; \
  299. temp2 = alpha * x[2 * inc_x]; \
  300. temp3 = alpha * x[3 * inc_x]; \
  301. temp4 = alpha * x[4 * inc_x]; \
  302. temp5 = alpha * x[5 * inc_x]; \
  303. temp6 = alpha * x[6 * inc_x]; \
  304. temp7 = alpha * x[7 * inc_x]; \
  305. \
  306. for (i = (m & 3); i--;) \
  307. { \
  308. temp = y[0]; \
  309. temp += temp0 * pa0[k]; \
  310. temp += temp1 * pa1[k]; \
  311. temp += temp2 * pa2[k]; \
  312. temp += temp3 * pa3[k]; \
  313. temp += temp4 * pa4[k]; \
  314. temp += temp5 * pa5[k]; \
  315. temp += temp6 * pa6[k]; \
  316. temp += temp7 * pa7[k]; \
  317. y[0] = temp; \
  318. \
  319. y += inc_y; \
  320. k++; \
  321. } \
  322. } \
  323. pa0 += 8 * lda; \
  324. pa1 += 8 * lda; \
  325. pa2 += 8 * lda; \
  326. pa3 += 8 * lda; \
  327. pa4 += 8 * lda; \
  328. pa5 += 8 * lda; \
  329. pa6 += 8 * lda; \
  330. pa7 += 8 * lda; \
  331. \
  332. x += 8 * inc_x; \
  333. } \
  334. \
  335. if (n & 4) \
  336. { \
  337. DLOAD_X4_SCALE(); \
  338. \
  339. k = 0; \
  340. y = y_org; \
  341. \
  342. for (i = (m >> 3); i--;) \
  343. { \
  344. DLOAD_Y8(); \
  345. DGEMV_N_8x4(); \
  346. DSTORE_Y8(); \
  347. \
  348. y += 8 * inc_y; \
  349. k += 8; \
  350. } \
  351. \
  352. if (m & 4) \
  353. { \
  354. DLOAD_Y4(); \
  355. DGEMV_N_4x4(); \
  356. DSTORE_Y4(); \
  357. \
  358. y += 4 * inc_y; \
  359. k += 4; \
  360. } \
  361. \
  362. if (m & 3) \
  363. { \
  364. temp0 = alpha * x[0 * inc_x]; \
  365. temp1 = alpha * x[1 * inc_x]; \
  366. temp2 = alpha * x[2 * inc_x]; \
  367. temp3 = alpha * x[3 * inc_x]; \
  368. \
  369. for (i = (m & 3); i--;) \
  370. { \
  371. temp = y[0]; \
  372. temp += temp0 * pa0[k]; \
  373. temp += temp1 * pa1[k]; \
  374. temp += temp2 * pa2[k]; \
  375. temp += temp3 * pa3[k]; \
  376. y[0] = temp; \
  377. \
  378. y += inc_y; \
  379. k++; \
  380. } \
  381. } \
  382. \
  383. pa0 += 4 * lda; \
  384. pa1 += 4 * lda; \
  385. pa2 += 4 * lda; \
  386. pa3 += 4 * lda; \
  387. \
  388. x += 4 * inc_x; \
  389. } \
  390. \
  391. if (n & 2) \
  392. { \
  393. temp0 = alpha * x[0 * inc_x]; \
  394. temp1 = alpha * x[1 * inc_x]; \
  395. \
  396. tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
  397. tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
  398. \
  399. k = 0; \
  400. y = y_org; \
  401. \
  402. for (i = (m >> 3); i--;) \
  403. { \
  404. DLOAD_Y8(); \
  405. DGEMV_N_8x2(); \
  406. DSTORE_Y8(); \
  407. \
  408. y += 8 * inc_y; \
  409. k += 8; \
  410. } \
  411. \
  412. if (m & 4) \
  413. { \
  414. DLOAD_Y4(); \
  415. DGEMV_N_4x2(); \
  416. DSTORE_Y4(); \
  417. \
  418. y += 4 * inc_y; \
  419. k += 4; \
  420. } \
  421. \
  422. if (m & 3) \
  423. { \
  424. temp0 = alpha * x[0 * inc_x]; \
  425. temp1 = alpha * x[1 * inc_x]; \
  426. \
  427. for (i = (m & 3); i--;) \
  428. { \
  429. temp = y[0]; \
  430. temp += temp0 * pa0[k]; \
  431. temp += temp1 * pa1[k]; \
  432. y[0] = temp; \
  433. \
  434. y += inc_y; \
  435. k++; \
  436. } \
  437. } \
  438. \
  439. pa0 += 2 * lda; \
  440. pa1 += 2 * lda; \
  441. \
  442. x += 2 * inc_x; \
  443. } \
  444. \
  445. if (n & 1) \
  446. { \
  447. temp = alpha * x[0]; \
  448. \
  449. k = 0; \
  450. y = y_org; \
  451. \
  452. for (i = m; i--;) \
  453. { \
  454. y[0] += temp * pa0[k]; \
  455. y += inc_y; \
  456. k++; \
  457. } \
  458. } \
  459. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
  460. BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  461. FLOAT *buffer)
  462. {
  463. BLASLONG i, j, k;
  464. FLOAT *y_org = y;
  465. FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
  466. FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  467. v2f64 v_alpha;
  468. v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0};
  469. v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
  470. v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
  471. v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0};
  472. v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);
  473. pa0 = A;
  474. pa1 = A + lda;
  475. pa2 = A + 2 * lda;
  476. pa3 = A + 3 * lda;
  477. pa4 = A + 4 * lda;
  478. pa5 = A + 5 * lda;
  479. pa6 = A + 6 * lda;
  480. pa7 = A + 7 * lda;
  481. if ((1 == inc_x) && (1 == inc_y))
  482. {
  483. #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
  484. #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR
  485. #define DLOAD_Y8 DLOAD_Y8_VECTOR
  486. #define DLOAD_Y4 DLOAD_Y4_VECTOR
  487. #define DSTORE_Y8 DSTORE_Y8_VECTOR
  488. #define DSTORE_Y4 DSTORE_Y4_VECTOR
  489. DGEMV_N_MSA();
  490. #undef DLOAD_X8_SCALE
  491. #undef DLOAD_X4_SCALE
  492. #undef DLOAD_Y8
  493. #undef DLOAD_Y4
  494. #undef DSTORE_Y8
  495. #undef DSTORE_Y4
  496. }
  497. else if (1 == inc_y)
  498. {
  499. #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
  500. #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP
  501. #define DLOAD_Y8 DLOAD_Y8_VECTOR
  502. #define DLOAD_Y4 DLOAD_Y4_VECTOR
  503. #define DSTORE_Y8 DSTORE_Y8_VECTOR
  504. #define DSTORE_Y4 DSTORE_Y4_VECTOR
  505. DGEMV_N_MSA();
  506. #undef DLOAD_X8_SCALE
  507. #undef DLOAD_X4_SCALE
  508. #undef DLOAD_Y8
  509. #undef DLOAD_Y4
  510. #undef DSTORE_Y8
  511. #undef DSTORE_Y4
  512. }
  513. else if (1 == inc_x)
  514. {
  515. #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
  516. #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR
  517. #define DLOAD_Y8 DLOAD_Y8_GP
  518. #define DLOAD_Y4 DLOAD_Y4_GP
  519. #define DSTORE_Y8 DSTORE_Y8_GP
  520. #define DSTORE_Y4 DSTORE_Y4_GP
  521. DGEMV_N_MSA();
  522. #undef DLOAD_X8_SCALE
  523. #undef DLOAD_X4_SCALE
  524. #undef DLOAD_Y8
  525. #undef DLOAD_Y4
  526. #undef DSTORE_Y8
  527. #undef DSTORE_Y4
  528. }
  529. else
  530. {
  531. #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
  532. #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP
  533. #define DLOAD_Y8 DLOAD_Y8_GP
  534. #define DLOAD_Y4 DLOAD_Y4_GP
  535. #define DSTORE_Y8 DSTORE_Y8_GP
  536. #define DSTORE_Y4 DSTORE_Y4_GP
  537. DGEMV_N_MSA();
  538. #undef DLOAD_X8_SCALE
  539. #undef DLOAD_X4_SCALE
  540. #undef DLOAD_Y8
  541. #undef DLOAD_Y4
  542. #undef DSTORE_Y8
  543. #undef DSTORE_Y4
  544. }
  545. return(0);
  546. }