You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zscal_msa.c 29 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713
  1. /*******************************************************************************
  2. Copyright (c) 2017, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. /* SHF_78 is the immediate control value passed to __msa_shf_w in this file.
         Read as four 2-bit selectors it is 01 00 11 10 (decimal 78), which
         reorders the four 32-bit words of a vector as
             0 1 2 3 => 2 3 0 1
         Every use below applies it to a v2f64 value cast to v4i32, so the net
         effect is to exchange the two doubles held in the vector. */
  31. #define SHF_78 78
  32. int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
  33. FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
  34. BLASLONG dummy2)
  35. {
  36. BLASLONG i, inc_x2;
  37. FLOAT *px;
  38. FLOAT tp0, tp1, f0, f1;
  39. v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
  40. v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
  41. v2f64 da_i_vec, da_i_vec_neg, da_r_vec;
  42. px = x;
  43. if (1 == inc_x)
  44. {
  45. if ((0.0 == da_r) && (0.0 == da_i))
  46. {
  47. v2f64 zero_v = {0.0, 0.0};
  48. for (i = (n >> 4); i--;)
  49. {
  50. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  51. zero_v, zero_v, x, 2);
  52. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  53. zero_v, zero_v, x, 2);
  54. }
  55. if (n & 15)
  56. {
  57. if (n & 8)
  58. {
  59. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  60. zero_v, zero_v, x, 2);
  61. }
  62. if (n & 4)
  63. {
  64. ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
  65. }
  66. if (n & 2)
  67. {
  68. ST_DP2_INC(zero_v, zero_v, x, 2);
  69. }
  70. if (n & 1)
  71. {
  72. ST_DP(zero_v, x);
  73. }
  74. }
  75. }
  76. else if (0.0 == da_r)
  77. {
  78. da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
  79. da_i_vec_neg = -da_i_vec;
  80. da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
  81. if (n > 15)
  82. {
  83. FLOAT *x_pref;
  84. BLASLONG pref_offset;
  85. pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
  86. if (pref_offset > 0)
  87. {
  88. pref_offset = L1_DATA_LINESIZE - pref_offset;
  89. pref_offset = pref_offset / sizeof(FLOAT);
  90. }
  91. x_pref = x + pref_offset + 32 + 16;
  92. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  93. for (i = (n >> 4)- 1; i--;)
  94. {
  95. PREF_OFFSET(x_pref, 0);
  96. PREF_OFFSET(x_pref, 32);
  97. PREF_OFFSET(x_pref, 64);
  98. PREF_OFFSET(x_pref, 96);
  99. PREF_OFFSET(x_pref, 128);
  100. PREF_OFFSET(x_pref, 160);
  101. PREF_OFFSET(x_pref, 192);
  102. PREF_OFFSET(x_pref, 224);
  103. x_pref += 32;
  104. x8 = LD_DP(px); px += 2;
  105. x0 *= da_i_vec;
  106. x9 = LD_DP(px); px += 2;
  107. x1 *= da_i_vec;
  108. x10 = LD_DP(px); px += 2;
  109. x2 *= da_i_vec;
  110. x11 = LD_DP(px); px += 2;
  111. x3 *= da_i_vec;
  112. x12 = LD_DP(px); px += 2;
  113. x4 *= da_i_vec;
  114. x13 = LD_DP(px); px += 2;
  115. x5 *= da_i_vec;
  116. x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78);
  117. x14 = LD_DP(px); px += 2;
  118. x6 *= da_i_vec;
  119. x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78);
  120. x15 = LD_DP(px); px += 2;
  121. x7 *= da_i_vec;
  122. x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78);
  123. x8 *= da_i_vec;
  124. x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78);
  125. ST_DP(x0, x); x += 2;
  126. x9 *= da_i_vec;
  127. x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78);
  128. ST_DP(x1, x); x += 2;
  129. x10 *= da_i_vec;
  130. x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78);
  131. ST_DP(x2, x); x += 2;
  132. x11 *= da_i_vec;
  133. x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78);
  134. ST_DP(x3, x); x += 2;
  135. x12 *= da_i_vec;
  136. x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78);
  137. ST_DP(x4, x); x += 2;
  138. x13 *= da_i_vec;
  139. x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78);
  140. ST_DP(x5, x); x += 2;
  141. x14 *= da_i_vec;
  142. x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78);
  143. ST_DP(x6, x); x += 2;
  144. x15 *= da_i_vec;
  145. x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78);
  146. ST_DP(x7, x); x += 2;
  147. x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78);
  148. ST_DP(x8, x); x += 2;
  149. x0 = LD_DP(px); px += 2;
  150. x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78);
  151. ST_DP(x9, x); x += 2;
  152. x1 = LD_DP(px); px += 2;
  153. x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78);
  154. ST_DP(x10, x); x += 2;
  155. x2 = LD_DP(px); px += 2;
  156. x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78);
  157. ST_DP(x11, x); x += 2;
  158. x3 = LD_DP(px); px += 2;
  159. x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78);
  160. ST_DP(x12, x); x += 2;
  161. x4 = LD_DP(px); px += 2;
  162. ST_DP(x13, x); x += 2;
  163. x5 = LD_DP(px); px += 2;
  164. ST_DP(x14, x); x += 2;
  165. x6 = LD_DP(px); px += 2;
  166. ST_DP(x15, x); x += 2;
  167. x7 = LD_DP(px); px += 2;
  168. }
  169. LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
  170. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  171. x0, x1, x2, x3);
  172. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  173. x4, x5, x6, x7);
  174. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  175. x8, x9, x10, x11);
  176. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  177. x12, x13, x14, x15);
  178. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  179. SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
  180. SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
  181. SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
  182. ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  183. x12, x13, x14, x15, x, 2);
  184. }
  185. if (n & 15)
  186. {
  187. if (n & 8)
  188. {
  189. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  190. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  191. x0, x1, x2, x3);
  192. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  193. x4, x5, x6, x7);
  194. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  195. SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
  196. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
  197. }
  198. if (n & 4)
  199. {
  200. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  201. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  202. x0, x1, x2, x3);
  203. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  204. ST_DP4_INC(x0, x1, x2, x3, x, 2);
  205. }
  206. if (n & 2)
  207. {
  208. LD_DP2_INC(px, 2, x0, x1);
  209. MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
  210. SHF_W2_DP(x0, x1, x0, x1, SHF_78);
  211. ST_DP2_INC(x0, x1, x, 2);
  212. }
  213. if (n & 1)
  214. {
  215. LD_GP2_INC(px, 1, f0, f1);
  216. MUL2(f0, da_i, f1, -da_i, f0, f1);
  217. ST_GP2_INC(f1, f0, x, 1);
  218. }
  219. }
  220. }
  221. else if (0.0 == da_i)
  222. {
  223. da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
  224. if (n > 15)
  225. {
  226. FLOAT *x_pref;
  227. BLASLONG pref_offset;
  228. pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
  229. if (pref_offset > 0)
  230. {
  231. pref_offset = L1_DATA_LINESIZE - pref_offset;
  232. pref_offset = pref_offset / sizeof(FLOAT);
  233. }
  234. x_pref = x + pref_offset + 32 + 16;
  235. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  236. for (i = (n >> 4)- 1; i--;)
  237. {
  238. PREF_OFFSET(x_pref, 0);
  239. PREF_OFFSET(x_pref, 32);
  240. PREF_OFFSET(x_pref, 64);
  241. PREF_OFFSET(x_pref, 96);
  242. PREF_OFFSET(x_pref, 128);
  243. PREF_OFFSET(x_pref, 160);
  244. PREF_OFFSET(x_pref, 192);
  245. PREF_OFFSET(x_pref, 224);
  246. x_pref += 32;
  247. x8 = LD_DP(px); px += 2;
  248. x0 *= da_r_vec;
  249. x9 = LD_DP(px); px += 2;
  250. x1 *= da_r_vec;
  251. x10 = LD_DP(px); px += 2;
  252. x2 *= da_r_vec;
  253. x11 = LD_DP(px); px += 2;
  254. x3 *= da_r_vec;
  255. x12 = LD_DP(px); px += 2;
  256. x4 *= da_r_vec;
  257. x13 = LD_DP(px); px += 2;
  258. x5 *= da_r_vec;
  259. ST_DP(x0, x); x += 2;
  260. x14 = LD_DP(px); px += 2;
  261. x6 *= da_r_vec;
  262. ST_DP(x1, x); x += 2;
  263. x15 = LD_DP(px); px += 2;
  264. x7 *= da_r_vec;
  265. ST_DP(x2, x); x += 2;
  266. x8 *= da_r_vec;
  267. ST_DP(x3, x); x += 2;
  268. x9 *= da_r_vec;
  269. ST_DP(x4, x); x += 2;
  270. x10 *= da_r_vec;
  271. ST_DP(x5, x); x += 2;
  272. x11 *= da_r_vec;
  273. ST_DP(x6, x); x += 2;
  274. x12 *= da_r_vec;
  275. ST_DP(x7, x); x += 2;
  276. x13 *= da_r_vec;
  277. ST_DP(x8, x); x += 2;
  278. x0 = LD_DP(px); px += 2;
  279. x14 *= da_r_vec;
  280. ST_DP(x9, x); x += 2;
  281. x1 = LD_DP(px); px += 2;
  282. x15 *= da_r_vec;
  283. ST_DP(x10, x); x += 2;
  284. x2 = LD_DP(px); px += 2;
  285. ST_DP(x11, x); x += 2;
  286. x3 = LD_DP(px); px += 2;
  287. ST_DP(x12, x); x += 2;
  288. x4 = LD_DP(px); px += 2;
  289. ST_DP(x13, x); x += 2;
  290. x5 = LD_DP(px); px += 2;
  291. ST_DP(x14, x); x += 2;
  292. x6 = LD_DP(px); px += 2;
  293. ST_DP(x15, x); x += 2;
  294. x7 = LD_DP(px); px += 2;
  295. }
  296. LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
  297. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  298. x0, x1, x2, x3);
  299. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  300. x4, x5, x6, x7);
  301. MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
  302. x8, x9, x10, x11);
  303. MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
  304. x12, x13, x14, x15);
  305. ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  306. x12, x13, x14, x15, x, 2);
  307. }
  308. if (n & 15)
  309. {
  310. if (n & 8)
  311. {
  312. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  313. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  314. x0, x1, x2, x3);
  315. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  316. x4, x5, x6, x7);
  317. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
  318. }
  319. if (n & 4)
  320. {
  321. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  322. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  323. x0, x1, x2, x3);
  324. ST_DP4_INC(x0, x1, x2, x3, x, 2);
  325. }
  326. if (n & 2)
  327. {
  328. LD_DP2_INC(px, 2, x0, x1);
  329. MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
  330. ST_DP2_INC(x0, x1, x, 2);
  331. }
  332. if (n & 1)
  333. {
  334. LD_GP2_INC(px, 1, f0, f1);
  335. MUL2(f0, da_r, f1, da_r, f0, f1);
  336. ST_GP2_INC(f0, f1, x, 1);
  337. }
  338. }
  339. }
  340. else
  341. {
  342. FLOAT *x_pref;
  343. BLASLONG pref_offset;
  344. pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
  345. if (pref_offset > 0)
  346. {
  347. pref_offset = L1_DATA_LINESIZE - pref_offset;
  348. pref_offset = pref_offset / sizeof(FLOAT);
  349. }
  350. x_pref = x + pref_offset + 32;
  351. da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
  352. da_i_vec_neg = -da_i_vec;
  353. da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
  354. da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
  355. for (i = (n >> 4); i--;)
  356. {
  357. PREF_OFFSET(x_pref, 0);
  358. PREF_OFFSET(x_pref, 32);
  359. PREF_OFFSET(x_pref, 64);
  360. PREF_OFFSET(x_pref, 96);
  361. PREF_OFFSET(x_pref, 128);
  362. PREF_OFFSET(x_pref, 160);
  363. PREF_OFFSET(x_pref, 192);
  364. PREF_OFFSET(x_pref, 224);
  365. x_pref += 32;
  366. LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
  367. x11, x12, x13, x14, x15);
  368. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  369. d0, d1, d2, d3);
  370. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  371. d4, d5, d6, d7);
  372. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  373. d8, d9, d10, d11);
  374. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  375. d12, d13, d14, d15);
  376. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  377. SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
  378. SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
  379. SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
  380. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  381. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  382. FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
  383. FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
  384. ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
  385. d12, d13, d14, d15, x, 2);
  386. }
  387. if (n & 15)
  388. {
  389. if (n & 8)
  390. {
  391. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  392. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  393. d0, d1, d2, d3);
  394. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  395. d4, d5, d6, d7);
  396. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  397. SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
  398. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  399. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  400. ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2);
  401. }
  402. if (n & 4)
  403. {
  404. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  405. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  406. d0, d1, d2, d3);
  407. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  408. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  409. ST_DP4_INC(d0, d1, d2, d3, x, 2);
  410. }
  411. if (n & 2)
  412. {
  413. LD_DP2_INC(px, 2, x0, x1);
  414. MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
  415. SHF_W2_DP(d0, d1, d0, d1, SHF_78);
  416. FMADD2(x0, x1, da_r_vec, d0, d1);
  417. ST_DP2_INC(d0, d1, x, 2);
  418. }
  419. if (n & 1)
  420. {
  421. LD_GP2_INC(px, 1, f0, f1);
  422. tp0 = da_r * f0;
  423. tp0 -= da_i * f1;
  424. tp1 = da_r * f1;
  425. tp1 += da_i * f0;
  426. ST_GP2_INC(tp0, tp1, x, 1);
  427. }
  428. }
  429. }
  430. }
  431. else
  432. {
  433. inc_x2 = 2 * inc_x;
  434. if ((0.0 == da_r) && (0.0 == da_i))
  435. {
  436. v2f64 zero_v = {0.0, 0.0};
  437. for (i = (n >> 4); i--;)
  438. {
  439. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  440. zero_v, zero_v, x, inc_x2);
  441. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  442. zero_v, zero_v, x, inc_x2);
  443. }
  444. if (n & 15)
  445. {
  446. if (n & 8)
  447. {
  448. ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  449. zero_v, zero_v, x, inc_x2);
  450. }
  451. if (n & 4)
  452. {
  453. ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2);
  454. }
  455. if (n & 2)
  456. {
  457. ST_DP2_INC(zero_v, zero_v, x, inc_x2);
  458. }
  459. if (n & 1)
  460. {
  461. ST_DP(zero_v, x);
  462. }
  463. }
  464. }
  465. else if (0.0 == da_r)
  466. {
  467. da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
  468. da_i_vec_neg = -da_i_vec;
  469. da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
  470. for (i = (n >> 4); i--;)
  471. {
  472. LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
  473. x10, x11, x12, x13, x14, x15);
  474. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  475. x0, x1, x2, x3);
  476. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  477. x4, x5, x6, x7);
  478. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  479. x8, x9, x10, x11);
  480. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  481. x12, x13, x14, x15);
  482. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  483. SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
  484. SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
  485. SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
  486. ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  487. x12, x13, x14, x15, x, inc_x2);
  488. }
  489. if (n & 15)
  490. {
  491. if (n & 8)
  492. {
  493. LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
  494. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  495. x0, x1, x2, x3);
  496. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  497. x4, x5, x6, x7);
  498. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  499. SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
  500. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
  501. }
  502. if (n & 4)
  503. {
  504. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
  505. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  506. x0, x1, x2, x3);
  507. SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
  508. ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
  509. }
  510. if (n & 2)
  511. {
  512. LD_DP2_INC(px, inc_x2, x0, x1);
  513. MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
  514. SHF_W2_DP(x0, x1, x0, x1, SHF_78);
  515. ST_DP2_INC(x0, x1, x, inc_x2);
  516. }
  517. if (n & 1)
  518. {
  519. LD_GP2_INC(px, 1, f0, f1);
  520. MUL2(f0, da_i, f1, -da_i, f0, f1);
  521. ST_GP2_INC(f1, f0, x, 1);
  522. }
  523. }
  524. }
  525. else if (0.0 == da_i)
  526. {
  527. da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
  528. for (i = (n >> 4); i--;)
  529. {
  530. LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
  531. x10, x11, x12, x13, x14, x15);
  532. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  533. x0, x1, x2, x3);
  534. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  535. x4, x5, x6, x7);
  536. MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
  537. x8, x9, x10, x11);
  538. MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
  539. x12, x13, x14, x15);
  540. ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  541. x12, x13, x14, x15, x, inc_x2);
  542. }
  543. if (n & 15)
  544. {
  545. if (n & 8)
  546. {
  547. LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
  548. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  549. x0, x1, x2, x3);
  550. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  551. x4, x5, x6, x7);
  552. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
  553. }
  554. if (n & 4)
  555. {
  556. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
  557. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  558. x0, x1, x2, x3);
  559. ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
  560. }
  561. if (n & 2)
  562. {
  563. LD_DP2_INC(px, inc_x2, x0, x1);
  564. MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
  565. ST_DP2_INC(x0, x1, x, inc_x2);
  566. }
  567. if (n & 1)
  568. {
  569. LD_GP2_INC(px, 1, f0, f1);
  570. MUL2(f0, da_r, f1, da_r, f0, f1);
  571. ST_GP2_INC(f0, f1, x, 1);
  572. }
  573. }
  574. }
  575. else
  576. {
  577. da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
  578. da_i_vec_neg = -da_i_vec;
  579. da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
  580. da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
  581. for (i = (n >> 4); i--;)
  582. {
  583. LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
  584. x10, x11, x12, x13, x14, x15);
  585. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  586. d0, d1, d2, d3);
  587. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  588. d4, d5, d6, d7);
  589. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  590. d8, d9, d10, d11);
  591. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  592. d12, d13, d14, d15);
  593. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  594. SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
  595. SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
  596. SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
  597. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  598. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  599. FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
  600. FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
  601. ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
  602. d12, d13, d14, d15, x, inc_x2);
  603. }
  604. if (n & 15)
  605. {
  606. if (n & 8)
  607. {
  608. LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
  609. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  610. d0, d1, d2, d3);
  611. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  612. d4, d5, d6, d7);
  613. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  614. SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
  615. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  616. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  617. ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2);
  618. }
  619. if (n & 4)
  620. {
  621. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
  622. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  623. d0, d1, d2, d3);
  624. SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
  625. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  626. ST_DP4_INC(d0, d1, d2, d3, x, inc_x2);
  627. }
  628. if (n & 2)
  629. {
  630. LD_DP2_INC(px, inc_x2, x0, x1);
  631. MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
  632. SHF_W2_DP(d0, d1, d0, d1, SHF_78);
  633. FMADD2(x0, x1, da_r_vec, d0, d1);
  634. ST_DP2_INC(d0, d1, x, inc_x2);
  635. }
  636. if (n & 1)
  637. {
  638. LD_GP2_INC(px, 1, f0, f1);
  639. tp0 = da_r * f0;
  640. tp0 -= da_i * f1;
  641. tp1 = da_r * f1;
  642. tp1 += da_i * f0;
  643. ST_GP2_INC(tp0, tp1, x, 1);
  644. }
  645. }
  646. }
  647. }
  648. return (0);
  649. }