You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zscal_msa.c 29 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717
  1. /*******************************************************************************
  2. Copyright (c) 2017, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. /* SHF_78 is the immediate passed to __msa_shf_w below: 78 == 0x4E == bit pairs 01 00 11 10.
  30. It reorders the four 32-bit words of a vector 0 1 2 3 => 2 3 0 1, i.e. swaps its two 64-bit halves. */
  31. #define SHF_78 78
  /*
   * Scales the complex vector x in place by the complex scalar
   * (da_r + i * da_i).
   *
   * Parameters:
   *   n            number of complex elements to process
   *   da_r, da_i   real and imaginary parts of the scalar
   *   x            vector data; each element is a (real, imaginary) pair of
   *                adjacent FLOATs, handled as one v2f64 in the vector code
   *   inc_x        element stride; the pointers advance by 2 * inc_x FLOATs
   *                per element (exactly 2 when inc_x == 1)
   *   dummy0, dummy1, y, inc_y, dummy, dummy2
   *                never referenced in this function body
   *
   * Returns: always 0.
   *
   * Structure: the code is split first on stride (inc_x == 1 or not) and then
   * on the scalar value, giving the same four cases in each half:
   *   1. da_r == 0 and da_i == 0 : zeros are stored; x is never read, so
   *                                whatever was in x (NaN/Inf included) is
   *                                replaced by 0
   *   2. da_r == 0               : multiply by a purely imaginary scalar
   *   3. da_i == 0               : multiply by a purely real scalar
   *   4. otherwise               : full complex multiply
   * Each case handles blocks of 16 elements and then a tail selected by the
   * bits 8, 4, 2 and 1 of n.
   *
   * NOTE(review): LD_DP*/ST_DP*/LD_GP2_INC/ST_GP2_INC/MUL*/FMADD*/SHF_W*_DP,
   * COPY_DOUBLE_TO_VECTOR and PREF_OFFSET are defined outside this file;
   * the comments below describe them as they appear to be used here
   * (load/store with post-increment, element-wise multiply, d += x * scalar,
   * word shuffle, broadcast, prefetch) -- confirm against their definitions.
   */
  32. int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
  33. FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
  34. BLASLONG dummy2)
  35. {
  36. BLASLONG i, inc_x2;
  37. FLOAT *px;
  38. FLOAT tp0, tp1, f0, f1;
  39. v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
  40. v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
  41. v2f64 da_i_vec, da_i_vec_neg, da_r_vec;
  /* px is the read cursor and x the write cursor; both walk the same buffer. */
  42. px = x;
  /* ---- Unit stride: consecutive elements are 2 FLOATs apart. ---- */
  43. if (1 == inc_x)
  44. {
  /* Case 1: scalar is exactly zero -- store zeros without loading x. */
  45. if ((0.0 == da_r) && (0.0 == da_i))
  46. {
  /* Both 64-bit lanes of zero_v are explicitly set to 0. */
  47. v2f64 zero_v = __msa_cast_to_vector_double(0);
  48. zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
  49. zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
  /* Two 8-vector stores per iteration: 16 elements zeroed per pass. */
  50. for (i = (n >> 4); i--;)
  51. {
  52. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  53. zero_v, zero_v, x, 2);
  54. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  55. zero_v, zero_v, x, 2);
  56. }
  /* Tail: the remaining n & 15 elements, taken as 8 + 4 + 2 + 1. */
  57. if (n & 15)
  58. {
  59. if (n & 8)
  60. {
  61. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  62. zero_v, zero_v, x, 2);
  63. }
  64. if (n & 4)
  65. {
  66. ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
  67. }
  68. if (n & 2)
  69. {
  70. ST_DP2_INC(zero_v, zero_v, x, 2);
  71. }
  72. if (n & 1)
  73. {
  74. ST_DP(zero_v, x);
  75. }
  76. }
  77. }
  /* Case 2: purely imaginary scalar.
     (re + i*im) * (i*da_i) = (-im*da_i) + i*(re*da_i). */
  78. else if (0.0 == da_r)
  79. {
  /* Build a multiplier holding da_i and -da_i in its two lanes; after the
     multiply, SHF_78 swaps the lanes to give (-im*da_i, re*da_i). The scalar
     tail below does the same with f0 * da_i and f1 * -da_i, then swaps. */
  80. da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
  81. da_i_vec_neg = -da_i_vec;
  82. da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
  /* At least one full block of 16: software-pipelined main loop. */
  83. if (n > 15)
  84. {
  85. FLOAT *x_pref;
  86. BLASLONG pref_offset;
  /* pref_offset: distance from x up to the next L1_DATA_LINESIZE boundary,
     first in bytes, then converted to a count of FLOATs (0 if already
     aligned). */
  87. pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
  88. if (pref_offset > 0)
  89. {
  90. pref_offset = L1_DATA_LINESIZE - pref_offset;
  91. pref_offset = pref_offset / sizeof(FLOAT);
  92. }
  /* Prefetch cursor starts 32 + 16 FLOATs past that boundary. */
  93. x_pref = x + pref_offset + 32 + 16;
  /* Prologue: preload the first 8 elements of the first block. */
  94. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  /* Steady state, (n >> 4) - 1 iterations. Each iteration loads the second
     half of the current block (x8..x15), multiplies/shuffles/stores all 16
     elements, and preloads the first half of the next block (x0..x7).
     Loads, arithmetic and stores are deliberately interleaved; statement
     order matters because registers are reused. */
  95. for (i = (n >> 4)- 1; i--;)
  96. {
  97. PREF_OFFSET(x_pref, 0);
  98. PREF_OFFSET(x_pref, 32);
  99. PREF_OFFSET(x_pref, 64);
  100. PREF_OFFSET(x_pref, 96);
  101. PREF_OFFSET(x_pref, 128);
  102. PREF_OFFSET(x_pref, 160);
  103. PREF_OFFSET(x_pref, 192);
  104. PREF_OFFSET(x_pref, 224);
  /* 32 FLOATs == the 16 complex elements handled per iteration. */
  105. x_pref += 32;
  106. x8 = LD_DP(px); px += 2;
  107. x0 *= da_i_vec;
  108. x9 = LD_DP(px); px += 2;
  109. x1 *= da_i_vec;
  110. x10 = LD_DP(px); px += 2;
  111. x2 *= da_i_vec;
  112. x11 = LD_DP(px); px += 2;
  113. x3 *= da_i_vec;
  114. x12 = LD_DP(px); px += 2;
  115. x4 *= da_i_vec;
  116. x13 = LD_DP(px); px += 2;
  117. x5 *= da_i_vec;
  118. x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78);
  119. x14 = LD_DP(px); px += 2;
  120. x6 *= da_i_vec;
  121. x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78);
  122. x15 = LD_DP(px); px += 2;
  123. x7 *= da_i_vec;
  124. x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78);
  125. x8 *= da_i_vec;
  126. x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78);
  127. ST_DP(x0, x); x += 2;
  128. x9 *= da_i_vec;
  129. x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78);
  130. ST_DP(x1, x); x += 2;
  131. x10 *= da_i_vec;
  132. x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78);
  133. ST_DP(x2, x); x += 2;
  134. x11 *= da_i_vec;
  135. x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78);
  136. ST_DP(x3, x); x += 2;
  137. x12 *= da_i_vec;
  138. x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78);
  139. ST_DP(x4, x); x += 2;
  140. x13 *= da_i_vec;
  141. x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78);
  142. ST_DP(x5, x); x += 2;
  143. x14 *= da_i_vec;
  144. x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78);
  145. ST_DP(x6, x); x += 2;
  146. x15 *= da_i_vec;
  147. x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78);
  148. ST_DP(x7, x); x += 2;
  149. x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78);
  150. ST_DP(x8, x); x += 2;
  151. x0 = LD_DP(px); px += 2;
  152. x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78);
  153. ST_DP(x9, x); x += 2;
  154. x1 = LD_DP(px); px += 2;
  155. x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78);
  156. ST_DP(x10, x); x += 2;
  157. x2 = LD_DP(px); px += 2;
  158. x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78);
  159. ST_DP(x11, x); x += 2;
  160. x3 = LD_DP(px); px += 2;
  161. x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78);
  162. ST_DP(x12, x); x += 2;
  163. x4 = LD_DP(px); px += 2;
  164. ST_DP(x13, x); x += 2;
  165. x5 = LD_DP(px); px += 2;
  166. ST_DP(x14, x); x += 2;
  167. x6 = LD_DP(px); px += 2;
  168. ST_DP(x15, x); x += 2;
  169. x7 = LD_DP(px); px += 2;
  170. }
  /* Epilogue: x0..x7 are already loaded; load x8..x15 and finish the last
     block of 16 without preloading anything further. */
  171. LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
  172. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  173. x0, x1, x2, x3);
  174. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  175. x4, x5, x6, x7);
  176. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  177. x8, x9, x10, x11);
  178. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  179. x12, x13, x14, x15);
  180. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  181. SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
  182. SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
  183. SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
  184. ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  185. x12, x13, x14, x15, x, 2);
  186. }
  /* Tail: remaining n & 15 elements as 8 + 4 + 2 + 1 (multiply, swap, store). */
  187. if (n & 15)
  188. {
  189. if (n & 8)
  190. {
  191. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  192. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  193. x0, x1, x2, x3);
  194. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  195. x4, x5, x6, x7);
  196. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  197. SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
  198. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
  199. }
  200. if (n & 4)
  201. {
  202. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  203. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  204. x0, x1, x2, x3);
  205. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  206. ST_DP4_INC(x0, x1, x2, x3, x, 2);
  207. }
  208. if (n & 2)
  209. {
  210. LD_DP2_INC(px, 2, x0, x1);
  211. MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
  212. SHF_W2_DP(x0, x1, x0, x1, SHF_78);
  213. ST_DP2_INC(x0, x1, x, 2);
  214. }
  /* Last single element in scalar code: f0 = re*da_i, f1 = -im*da_i, and
     they are stored swapped (f1 as the real part, f0 as the imaginary). */
  215. if (n & 1)
  216. {
  217. LD_GP2_INC(px, 1, f0, f1);
  218. MUL2(f0, da_i, f1, -da_i, f0, f1);
  219. ST_GP2_INC(f1, f0, x, 1);
  220. }
  221. }
  222. }
  /* Case 3: purely real scalar -- both parts are simply multiplied by da_r. */
  223. else if (0.0 == da_i)
  224. {
  225. da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
  /* Same pipelined layout as the da_r == 0 case, minus the lane swap. */
  226. if (n > 15)
  227. {
  228. FLOAT *x_pref;
  229. BLASLONG pref_offset;
  /* Distance to the next L1_DATA_LINESIZE boundary, bytes -> FLOATs. */
  230. pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
  231. if (pref_offset > 0)
  232. {
  233. pref_offset = L1_DATA_LINESIZE - pref_offset;
  234. pref_offset = pref_offset / sizeof(FLOAT);
  235. }
  236. x_pref = x + pref_offset + 32 + 16;
  /* Prologue: preload the first 8 elements. */
  237. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  /* Steady state, (n >> 4) - 1 iterations: load x8..x15, scale and store
     all 16 elements, preload x0..x7 for the next block. */
  238. for (i = (n >> 4)- 1; i--;)
  239. {
  240. PREF_OFFSET(x_pref, 0);
  241. PREF_OFFSET(x_pref, 32);
  242. PREF_OFFSET(x_pref, 64);
  243. PREF_OFFSET(x_pref, 96);
  244. PREF_OFFSET(x_pref, 128);
  245. PREF_OFFSET(x_pref, 160);
  246. PREF_OFFSET(x_pref, 192);
  247. PREF_OFFSET(x_pref, 224);
  248. x_pref += 32;
  249. x8 = LD_DP(px); px += 2;
  250. x0 *= da_r_vec;
  251. x9 = LD_DP(px); px += 2;
  252. x1 *= da_r_vec;
  253. x10 = LD_DP(px); px += 2;
  254. x2 *= da_r_vec;
  255. x11 = LD_DP(px); px += 2;
  256. x3 *= da_r_vec;
  257. x12 = LD_DP(px); px += 2;
  258. x4 *= da_r_vec;
  259. x13 = LD_DP(px); px += 2;
  260. x5 *= da_r_vec;
  261. ST_DP(x0, x); x += 2;
  262. x14 = LD_DP(px); px += 2;
  263. x6 *= da_r_vec;
  264. ST_DP(x1, x); x += 2;
  265. x15 = LD_DP(px); px += 2;
  266. x7 *= da_r_vec;
  267. ST_DP(x2, x); x += 2;
  268. x8 *= da_r_vec;
  269. ST_DP(x3, x); x += 2;
  270. x9 *= da_r_vec;
  271. ST_DP(x4, x); x += 2;
  272. x10 *= da_r_vec;
  273. ST_DP(x5, x); x += 2;
  274. x11 *= da_r_vec;
  275. ST_DP(x6, x); x += 2;
  276. x12 *= da_r_vec;
  277. ST_DP(x7, x); x += 2;
  278. x13 *= da_r_vec;
  279. ST_DP(x8, x); x += 2;
  280. x0 = LD_DP(px); px += 2;
  281. x14 *= da_r_vec;
  282. ST_DP(x9, x); x += 2;
  283. x1 = LD_DP(px); px += 2;
  284. x15 *= da_r_vec;
  285. ST_DP(x10, x); x += 2;
  286. x2 = LD_DP(px); px += 2;
  287. ST_DP(x11, x); x += 2;
  288. x3 = LD_DP(px); px += 2;
  289. ST_DP(x12, x); x += 2;
  290. x4 = LD_DP(px); px += 2;
  291. ST_DP(x13, x); x += 2;
  292. x5 = LD_DP(px); px += 2;
  293. ST_DP(x14, x); x += 2;
  294. x6 = LD_DP(px); px += 2;
  295. ST_DP(x15, x); x += 2;
  296. x7 = LD_DP(px); px += 2;
  297. }
  /* Epilogue: finish the last block of 16 (x0..x7 already loaded). */
  298. LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
  299. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  300. x0, x1, x2, x3);
  301. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  302. x4, x5, x6, x7);
  303. MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
  304. x8, x9, x10, x11);
  305. MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
  306. x12, x13, x14, x15);
  307. ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  308. x12, x13, x14, x15, x, 2);
  309. }
  /* Tail: remaining n & 15 elements as 8 + 4 + 2 + 1. */
  310. if (n & 15)
  311. {
  312. if (n & 8)
  313. {
  314. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  315. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  316. x0, x1, x2, x3);
  317. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  318. x4, x5, x6, x7);
  319. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
  320. }
  321. if (n & 4)
  322. {
  323. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  324. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  325. x0, x1, x2, x3);
  326. ST_DP4_INC(x0, x1, x2, x3, x, 2);
  327. }
  328. if (n & 2)
  329. {
  330. LD_DP2_INC(px, 2, x0, x1);
  331. MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
  332. ST_DP2_INC(x0, x1, x, 2);
  333. }
  /* Last single element in scalar code: both parts times da_r. */
  334. if (n & 1)
  335. {
  336. LD_GP2_INC(px, 1, f0, f1);
  337. MUL2(f0, da_r, f1, da_r, f0, f1);
  338. ST_GP2_INC(f0, f1, x, 1);
  339. }
  340. }
  341. }
  /* Case 4: general complex scalar.
     result = (re*da_r - im*da_i) + i*(im*da_r + re*da_i). */
  342. else
  343. {
  344. FLOAT *x_pref;
  345. BLASLONG pref_offset;
  /* Distance to the next L1_DATA_LINESIZE boundary, bytes -> FLOATs. */
  346. pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
  347. if (pref_offset > 0)
  348. {
  349. pref_offset = L1_DATA_LINESIZE - pref_offset;
  350. pref_offset = pref_offset / sizeof(FLOAT);
  351. }
  /* This loop is not pipelined, so the prefetch lead is 32 FLOATs (no +16). */
  352. x_pref = x + pref_offset + 32;
  /* da_i_vec holds da_i and -da_i in its two lanes (as in case 2);
     da_r_vec holds da_r. */
  353. da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
  354. da_i_vec_neg = -da_i_vec;
  355. da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
  356. da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
  /* 16 elements per iteration: d = swap(x * da_i_vec), then the FMADD step
     presumably adds x * da_r_vec into d, and d is stored. */
  357. for (i = (n >> 4); i--;)
  358. {
  359. PREF_OFFSET(x_pref, 0);
  360. PREF_OFFSET(x_pref, 32);
  361. PREF_OFFSET(x_pref, 64);
  362. PREF_OFFSET(x_pref, 96);
  363. PREF_OFFSET(x_pref, 128);
  364. PREF_OFFSET(x_pref, 160);
  365. PREF_OFFSET(x_pref, 192);
  366. PREF_OFFSET(x_pref, 224);
  367. x_pref += 32;
  368. LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
  369. x11, x12, x13, x14, x15);
  370. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  371. d0, d1, d2, d3);
  372. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  373. d4, d5, d6, d7);
  374. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  375. d8, d9, d10, d11);
  376. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  377. d12, d13, d14, d15);
  378. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  379. SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
  380. SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
  381. SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
  382. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  383. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  384. FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
  385. FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
  386. ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
  387. d12, d13, d14, d15, x, 2);
  388. }
  /* Tail: remaining n & 15 elements as 8 + 4 + 2 + 1. */
  389. if (n & 15)
  390. {
  391. if (n & 8)
  392. {
  393. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  394. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  395. d0, d1, d2, d3);
  396. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  397. d4, d5, d6, d7);
  398. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  399. SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
  400. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  401. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  402. ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2);
  403. }
  404. if (n & 4)
  405. {
  406. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  407. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  408. d0, d1, d2, d3);
  409. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  410. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  411. ST_DP4_INC(d0, d1, d2, d3, x, 2);
  412. }
  413. if (n & 2)
  414. {
  415. LD_DP2_INC(px, 2, x0, x1);
  416. MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
  417. SHF_W2_DP(d0, d1, d0, d1, SHF_78);
  418. FMADD2(x0, x1, da_r_vec, d0, d1);
  419. ST_DP2_INC(d0, d1, x, 2);
  420. }
  /* Last single element in scalar code: the textbook complex product. */
  421. if (n & 1)
  422. {
  423. LD_GP2_INC(px, 1, f0, f1);
  424. tp0 = da_r * f0;
  425. tp0 -= da_i * f1;
  426. tp1 = da_r * f1;
  427. tp1 += da_i * f0;
  428. ST_GP2_INC(tp0, tp1, x, 1);
  429. }
  430. }
  431. }
  432. }
  /* ---- Non-unit stride: same four cases, stepping by inc_x2 FLOATs per
     element, with no prefetching and no software pipelining. ---- */
  433. else
  434. {
  /* Pointer step between consecutive elements, in FLOATs. */
  435. inc_x2 = 2 * inc_x;
  /* Case 1: scalar is exactly zero -- store zeros without loading x. */
  436. if ((0.0 == da_r) && (0.0 == da_i))
  437. {
  438. v2f64 zero_v = __msa_cast_to_vector_double(0);
  439. zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
  440. zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
  /* 16 elements zeroed per iteration. */
  441. for (i = (n >> 4); i--;)
  442. {
  443. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  444. zero_v, zero_v, x, inc_x2);
  445. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  446. zero_v, zero_v, x, inc_x2);
  447. }
  /* Tail: remaining n & 15 elements as 8 + 4 + 2 + 1. */
  448. if (n & 15)
  449. {
  450. if (n & 8)
  451. {
  452. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  453. zero_v, zero_v, x, inc_x2);
  454. }
  455. if (n & 4)
  456. {
  457. ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2);
  458. }
  459. if (n & 2)
  460. {
  461. ST_DP2_INC(zero_v, zero_v, x, inc_x2);
  462. }
  463. if (n & 1)
  464. {
  465. ST_DP(zero_v, x);
  466. }
  467. }
  468. }
  /* Case 2: purely imaginary scalar (multiply by da_i / -da_i, then swap). */
  469. else if (0.0 == da_r)
  470. {
  471. da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
  472. da_i_vec_neg = -da_i_vec;
  473. da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
  /* 16 elements per iteration: load, multiply, swap lanes, store. */
  474. for (i = (n >> 4); i--;)
  475. {
  476. LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
  477. x10, x11, x12, x13, x14, x15);
  478. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  479. x0, x1, x2, x3);
  480. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  481. x4, x5, x6, x7);
  482. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  483. x8, x9, x10, x11);
  484. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  485. x12, x13, x14, x15);
  486. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  487. SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
  488. SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
  489. SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
  490. ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  491. x12, x13, x14, x15, x, inc_x2);
  492. }
  /* Tail: remaining n & 15 elements as 8 + 4 + 2 + 1. */
  493. if (n & 15)
  494. {
  495. if (n & 8)
  496. {
  497. LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
  498. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  499. x0, x1, x2, x3);
  500. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  501. x4, x5, x6, x7);
  502. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  503. SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
  504. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
  505. }
  506. if (n & 4)
  507. {
  508. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
  509. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  510. x0, x1, x2, x3);
  511. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  512. ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
  513. }
  514. if (n & 2)
  515. {
  516. LD_DP2_INC(px, inc_x2, x0, x1);
  517. MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
  518. SHF_W2_DP(x0, x1, x0, x1, SHF_78);
  519. ST_DP2_INC(x0, x1, x, inc_x2);
  520. }
  /* Last single element: the real and imaginary parts of one element are
     adjacent, so a step of 1 FLOAT is used here regardless of inc_x. */
  521. if (n & 1)
  522. {
  523. LD_GP2_INC(px, 1, f0, f1);
  524. MUL2(f0, da_i, f1, -da_i, f0, f1);
  525. ST_GP2_INC(f1, f0, x, 1);
  526. }
  527. }
  528. }
  /* Case 3: purely real scalar -- both parts are multiplied by da_r. */
  529. else if (0.0 == da_i)
  530. {
  531. da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
  /* 16 elements per iteration: load, multiply, store. */
  532. for (i = (n >> 4); i--;)
  533. {
  534. LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
  535. x10, x11, x12, x13, x14, x15);
  536. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  537. x0, x1, x2, x3);
  538. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  539. x4, x5, x6, x7);
  540. MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
  541. x8, x9, x10, x11);
  542. MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
  543. x12, x13, x14, x15);
  544. ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  545. x12, x13, x14, x15, x, inc_x2);
  546. }
  /* Tail: remaining n & 15 elements as 8 + 4 + 2 + 1. */
  547. if (n & 15)
  548. {
  549. if (n & 8)
  550. {
  551. LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
  552. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  553. x0, x1, x2, x3);
  554. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  555. x4, x5, x6, x7);
  556. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
  557. }
  558. if (n & 4)
  559. {
  560. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
  561. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  562. x0, x1, x2, x3);
  563. ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
  564. }
  565. if (n & 2)
  566. {
  567. LD_DP2_INC(px, inc_x2, x0, x1);
  568. MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
  569. ST_DP2_INC(x0, x1, x, inc_x2);
  570. }
  /* Last single element in scalar code. */
  571. if (n & 1)
  572. {
  573. LD_GP2_INC(px, 1, f0, f1);
  574. MUL2(f0, da_r, f1, da_r, f0, f1);
  575. ST_GP2_INC(f0, f1, x, 1);
  576. }
  577. }
  578. }
  /* Case 4: general complex scalar. */
  579. else
  580. {
  581. da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
  582. da_i_vec_neg = -da_i_vec;
  583. da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
  584. da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
  /* 16 elements per iteration: d = swap(x * da_i_vec), then the FMADD step
     presumably adds x * da_r_vec into d, and d is stored. */
  585. for (i = (n >> 4); i--;)
  586. {
  587. LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
  588. x10, x11, x12, x13, x14, x15);
  589. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  590. d0, d1, d2, d3);
  591. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  592. d4, d5, d6, d7);
  593. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  594. d8, d9, d10, d11);
  595. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  596. d12, d13, d14, d15);
  597. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  598. SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
  599. SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
  600. SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
  601. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  602. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  603. FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
  604. FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
  605. ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
  606. d12, d13, d14, d15, x, inc_x2);
  607. }
  /* Tail: remaining n & 15 elements as 8 + 4 + 2 + 1. */
  608. if (n & 15)
  609. {
  610. if (n & 8)
  611. {
  612. LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
  613. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  614. d0, d1, d2, d3);
  615. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  616. d4, d5, d6, d7);
  617. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  618. SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
  619. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  620. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  621. ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2);
  622. }
  623. if (n & 4)
  624. {
  625. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
  626. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  627. d0, d1, d2, d3);
  628. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  629. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  630. ST_DP4_INC(d0, d1, d2, d3, x, inc_x2);
  631. }
  632. if (n & 2)
  633. {
  634. LD_DP2_INC(px, inc_x2, x0, x1);
  635. MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
  636. SHF_W2_DP(d0, d1, d0, d1, SHF_78);
  637. FMADD2(x0, x1, da_r_vec, d0, d1);
  638. ST_DP2_INC(d0, d1, x, inc_x2);
  639. }
  /* Last single element in scalar code: the textbook complex product. */
  640. if (n & 1)
  641. {
  642. LD_GP2_INC(px, 1, f0, f1);
  643. tp0 = da_r * f0;
  644. tp0 -= da_i * f1;
  645. tp1 = da_r * f1;
  646. tp1 += da_i * f0;
  647. ST_GP2_INC(tp0, tp1, x, 1);
  648. }
  649. }
  650. }
  651. }
  652. return (0);
  653. }