You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cscal_msa.c 38 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012
  1. /*******************************************************************************
  2. Copyright (c) 2017, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. /* Immediate used with __msa_shf_w in this file. 177 == 0b10110001, i.e. the
  30. four 2-bit selectors 10 11 00 01, which swap adjacent 32-bit elements:
      0 1 2 3 => 1 0 3 2. Applied to interleaved complex data this exchanges the
      real and imaginary lanes of each element. */
  31. #define SHF_177 177
  /*
   * Scales the interleaved complex vector x in place by the complex scalar
   * (da_r + i*da_i), using MSA intrinsics/macros on 4-lane v4f32 vectors.
   *
   *   n      - number of complex elements (each element is 2 FLOATs: re, im).
   *            NOTE(review): there is no guard for n <= 0 in this function;
   *            presumably the caller guarantees n > 0 - confirm.
   *   da_r   - real part of the scalar.
   *   da_i   - imaginary part of the scalar.
   *   x      - data, read through px and written through x.
   *   inc_x  - stride between elements, in complex elements (the code steps by
   *            2 * inc_x FLOATs when inc_x != 1).
   *   dummy0, dummy1, y, inc_y, dummy, dummy2 - not referenced in this function.
   *
   * Returns 0 on every path.
   *
   * Both the contiguous (inc_x == 1) and strided paths special-case the scalar:
   * exactly zero, purely imaginary (da_r == 0), purely real (da_i == 0), and
   * the general case.
   */
  32. int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
  33. FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
  34. BLASLONG dummy2)
  35. {
  36. BLASLONG i, inc_x2;
  37. FLOAT *px;
  38. FLOAT tp0, tp1, tp2, tp3, f0, f1, f2, f3;
  39. v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
  40. v4f32 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
  41. v4f32 da_i_vec, da_i_vec_neg, da_r_vec;
  /* px is the read cursor, x the write cursor; both start at the same address. */
  42. px = x;
  /* Contiguous case: each v4f32 holds two adjacent complex elements. */
  43. if (1 == inc_x)
  44. {
  /* Scalar is exactly zero: x is overwritten with zeros and never read. */
  45. if ((0.0 == da_r) && (0.0 == da_i))
  46. {
  47. v4f32 zero_v = __msa_cast_to_vector_float(0);
  48. zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
  49. zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
  50. zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
  51. zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
  /* Main loop: 16 vector stores per iteration = 32 complex elements. */
  52. for (i = (n >> 5); i--;)
  53. {
  54. ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  55. zero_v, zero_v, x, 4);
  56. ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  57. zero_v, zero_v, x, 4);
  58. }
  /* Remainder: one step per set bit of (n & 31), from 16 elements down to 1. */
  59. if (n & 31)
  60. {
  61. if (n & 16)
  62. {
  63. ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
  64. zero_v, zero_v, x, 4);
  65. }
  66. if (n & 8)
  67. {
  68. ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
  69. }
  70. if (n & 4)
  71. {
  72. ST_SP2_INC(zero_v, zero_v, x, 4);
  73. }
  74. if (n & 2)
  75. {
  76. ST_SP(zero_v, x); x += 4;
  77. }
  78. if (n & 1)
  79. {
  80. *x = 0; x += 1;
  81. *x = 0;
  82. }
  83. }
  84. }
  /* Purely imaginary scalar: (i*da_i)*(re + i*im) = -da_i*im + i*(da_i*re). */
  85. else if (0.0 == da_r)
  86. {
  /* da_i_vec is rebuilt from +da_i and -da_i lanes; the scalar tails of this
     branch multiply re by da_i and im by -da_i, then swap the two, which is
     what multiply-by-da_i_vec followed by the SHF_177 shuffle does per vector. */
  87. da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
  88. da_i_vec_neg = -da_i_vec;
  89. da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
  90. if (n > 31)
  91. {
  92. FLOAT *x_pref;
  93. BLASLONG pref_offset;
  /* pref_offset = number of FLOATs from x up to the next L1_DATA_LINESIZE
     boundary (0 if x is already on one); the prefetch pointer starts 96
     FLOATs past that boundary. */
  94. pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
  95. if (pref_offset > 0)
  96. {
  97. pref_offset = L1_DATA_LINESIZE - pref_offset;
  98. pref_offset = pref_offset / sizeof(FLOAT);
  99. }
  100. x_pref = x + pref_offset + 64 + 32;
  /* Software pipeline: the first 8 vectors are loaded before the loop. Each
     iteration loads the other 8, multiplies/shuffles/stores all 16, and
     reloads x0..x7 for the next round. The loop runs (n >> 5) - 1 times; the
     final group of 32 elements is completed after the loop. Statement order
     interleaves loads, arithmetic and stores deliberately. */
  101. LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
  102. for (i = (n >> 5)- 1; i--;)
  103. {
  104. PREF_OFFSET(x_pref, 0);
  105. PREF_OFFSET(x_pref, 32);
  106. PREF_OFFSET(x_pref, 64);
  107. PREF_OFFSET(x_pref, 96);
  108. PREF_OFFSET(x_pref, 128);
  109. PREF_OFFSET(x_pref, 160);
  110. PREF_OFFSET(x_pref, 192);
  111. PREF_OFFSET(x_pref, 224);
  112. x_pref += 64;
  113. x8 = LD_SP(px); px += 4;
  114. x0 *= da_i_vec;
  115. x9 = LD_SP(px); px += 4;
  116. x1 *= da_i_vec;
  117. x10 = LD_SP(px); px += 4;
  118. x2 *= da_i_vec;
  119. x11 = LD_SP(px); px += 4;
  120. x3 *= da_i_vec;
  121. x12 = LD_SP(px); px += 4;
  122. x4 *= da_i_vec;
  123. x13 = LD_SP(px); px += 4;
  124. x5 *= da_i_vec;
  125. x0 = (v4f32) __msa_shf_w((v4i32) x0, SHF_177);
  126. x14 = LD_SP(px); px += 4;
  127. x6 *= da_i_vec;
  128. x1 = (v4f32) __msa_shf_w((v4i32) x1, SHF_177);
  129. x15 = LD_SP(px); px += 4;
  130. x7 *= da_i_vec;
  131. x2 = (v4f32) __msa_shf_w((v4i32) x2, SHF_177);
  132. x8 *= da_i_vec;
  133. x3 = (v4f32) __msa_shf_w((v4i32) x3, SHF_177);
  134. ST_SP(x0, x); x += 4;
  135. x9 *= da_i_vec;
  136. x4 = (v4f32) __msa_shf_w((v4i32) x4, SHF_177);
  137. ST_SP(x1, x); x += 4;
  138. x10 *= da_i_vec;
  139. x5 = (v4f32) __msa_shf_w((v4i32) x5, SHF_177);
  140. ST_SP(x2, x); x += 4;
  141. x11 *= da_i_vec;
  142. x6 = (v4f32) __msa_shf_w((v4i32) x6, SHF_177);
  143. ST_SP(x3, x); x += 4;
  144. x12 *= da_i_vec;
  145. x7 = (v4f32) __msa_shf_w((v4i32) x7, SHF_177);
  146. ST_SP(x4, x); x += 4;
  147. x13 *= da_i_vec;
  148. x8 = (v4f32) __msa_shf_w((v4i32) x8, SHF_177);
  149. ST_SP(x5, x); x += 4;
  150. x14 *= da_i_vec;
  151. x9 = (v4f32) __msa_shf_w((v4i32) x9, SHF_177);
  152. ST_SP(x6, x); x += 4;
  153. x15 *= da_i_vec;
  154. x10 = (v4f32) __msa_shf_w((v4i32) x10, SHF_177);
  155. ST_SP(x7, x); x += 4;
  156. x11 = (v4f32) __msa_shf_w((v4i32) x11, SHF_177);
  157. ST_SP(x8, x); x += 4;
  158. x0 = LD_SP(px); px += 4;
  159. x12 = (v4f32) __msa_shf_w((v4i32) x12, SHF_177);
  160. ST_SP(x9, x); x += 4;
  161. x1 = LD_SP(px); px += 4;
  162. x13 = (v4f32) __msa_shf_w((v4i32) x13, SHF_177);
  163. ST_SP(x10, x); x += 4;
  164. x2 = LD_SP(px); px += 4;
  165. x14 = (v4f32) __msa_shf_w((v4i32) x14, SHF_177);
  166. ST_SP(x11, x); x += 4;
  167. x3 = LD_SP(px); px += 4;
  168. x15 = (v4f32) __msa_shf_w((v4i32) x15, SHF_177);
  169. ST_SP(x12, x); x += 4;
  170. x4 = LD_SP(px); px += 4;
  171. ST_SP(x13, x); x += 4;
  172. x5 = LD_SP(px); px += 4;
  173. ST_SP(x14, x); x += 4;
  174. x6 = LD_SP(px); px += 4;
  175. ST_SP(x15, x); x += 4;
  176. x7 = LD_SP(px); px += 4;
  177. }
  /* Pipeline drain: x0..x7 are already loaded; fetch the remaining 8 vectors
     and finish the last group of 32 elements. */
  178. LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
  179. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  180. x0, x1, x2, x3);
  181. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  182. x4, x5, x6, x7);
  183. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  184. x8, x9, x10, x11);
  185. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  186. x12, x13, x14, x15);
  187. SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
  188. SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
  189. SHF_W4_SP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_177);
  190. SHF_W4_SP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_177);
  191. ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  192. x12, x13, x14, x15, x, 4);
  193. }
  /* Remainder (n & 31): 16, 8 and 4 elements with vectors, 2 and 1 with scalars. */
  194. if (n & 31)
  195. {
  196. if (n & 16)
  197. {
  198. LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
  199. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  200. x0, x1, x2, x3);
  201. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  202. x4, x5, x6, x7);
  203. SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
  204. SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
  205. ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
  206. }
  207. if (n & 8)
  208. {
  209. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  210. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  211. x0, x1, x2, x3);
  212. SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
  213. ST_SP4_INC(x0, x1, x2, x3, x, 4);
  214. }
  215. if (n & 4)
  216. {
  217. LD_SP2_INC(px, 4, x0, x1);
  218. MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
  219. SHF_W2_SP(x0, x1, x0, x1, SHF_177);
  220. ST_SP2_INC(x0, x1, x, 4);
  221. }
  222. if (n & 2)
  223. {
  /* Scalar form: re*da_i and im*(-da_i) are stored swapped (new re first). */
  224. LD_GP4_INC(px, 1, f0, f1, f2, f3);
  225. MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i,
  226. f0, f1, f2, f3);
  227. ST_GP4_INC(f1, f0, f3, f2, x, 1);
  228. }
  229. if (n & 1)
  230. {
  231. LD_GP2_INC(px, 1, f0, f1);
  232. MUL2(f0, da_i, f1, -da_i, f0, f1);
  233. ST_GP2_INC(f1, f0, x, 1);
  234. }
  235. }
  236. }
  /* Purely real scalar: every FLOAT is simply multiplied by da_r. */
  237. else if (0.0 == da_i)
  238. {
  239. da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
  240. if (n > 31)
  241. {
  242. FLOAT *x_pref;
  243. BLASLONG pref_offset;
  /* Same prefetch-pointer setup as in the purely imaginary branch. */
  244. pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
  245. if (pref_offset > 0)
  246. {
  247. pref_offset = L1_DATA_LINESIZE - pref_offset;
  248. pref_offset = pref_offset / sizeof(FLOAT);
  249. }
  250. x_pref = x + pref_offset + 64 + 32;
  /* Same software-pipelined shape as the purely imaginary branch, without the
     shuffle: preload 8 vectors, loop (n >> 5) - 1 times, drain after the loop. */
  251. LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
  252. for (i = (n >> 5)- 1; i--;)
  253. {
  254. PREF_OFFSET(x_pref, 0);
  255. PREF_OFFSET(x_pref, 32);
  256. PREF_OFFSET(x_pref, 64);
  257. PREF_OFFSET(x_pref, 96);
  258. PREF_OFFSET(x_pref, 128);
  259. PREF_OFFSET(x_pref, 160);
  260. PREF_OFFSET(x_pref, 192);
  261. PREF_OFFSET(x_pref, 224);
  262. x_pref += 64;
  263. x8 = LD_SP(px); px += 4;
  264. x0 *= da_r_vec;
  265. x9 = LD_SP(px); px += 4;
  266. x1 *= da_r_vec;
  267. x10 = LD_SP(px); px += 4;
  268. x2 *= da_r_vec;
  269. x11 = LD_SP(px); px += 4;
  270. x3 *= da_r_vec;
  271. x12 = LD_SP(px); px += 4;
  272. x4 *= da_r_vec;
  273. x13 = LD_SP(px); px += 4;
  274. x5 *= da_r_vec;
  275. ST_SP(x0, x); x += 4;
  276. x14 = LD_SP(px); px += 4;
  277. x6 *= da_r_vec;
  278. ST_SP(x1, x); x += 4;
  279. x15 = LD_SP(px); px += 4;
  280. x7 *= da_r_vec;
  281. ST_SP(x2, x); x += 4;
  282. x8 *= da_r_vec;
  283. ST_SP(x3, x); x += 4;
  284. x9 *= da_r_vec;
  285. ST_SP(x4, x); x += 4;
  286. x10 *= da_r_vec;
  287. ST_SP(x5, x); x += 4;
  288. x11 *= da_r_vec;
  289. ST_SP(x6, x); x += 4;
  290. x12 *= da_r_vec;
  291. ST_SP(x7, x); x += 4;
  292. x13 *= da_r_vec;
  293. ST_SP(x8, x); x += 4;
  294. x0 = LD_SP(px); px += 4;
  295. x14 *= da_r_vec;
  296. ST_SP(x9, x); x += 4;
  297. x1 = LD_SP(px); px += 4;
  298. x15 *= da_r_vec;
  299. ST_SP(x10, x); x += 4;
  300. x2 = LD_SP(px); px += 4;
  301. ST_SP(x11, x); x += 4;
  302. x3 = LD_SP(px); px += 4;
  303. ST_SP(x12, x); x += 4;
  304. x4 = LD_SP(px); px += 4;
  305. ST_SP(x13, x); x += 4;
  306. x5 = LD_SP(px); px += 4;
  307. ST_SP(x14, x); x += 4;
  308. x6 = LD_SP(px); px += 4;
  309. ST_SP(x15, x); x += 4;
  310. x7 = LD_SP(px); px += 4;
  311. }
  /* Pipeline drain for the last group of 32 elements. */
  312. LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
  313. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  314. x0, x1, x2, x3);
  315. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  316. x4, x5, x6, x7);
  317. MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
  318. x8, x9, x10, x11);
  319. MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
  320. x12, x13, x14, x15);
  321. ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
  322. x12, x13, x14, x15, x, 4);
  323. }
  /* Remainder (n & 31). */
  324. if (n & 31)
  325. {
  326. if (n & 16)
  327. {
  328. LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
  329. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  330. x0, x1, x2, x3);
  331. MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
  332. x4, x5, x6, x7);
  333. ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
  334. }
  335. if (n & 8)
  336. {
  337. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  338. MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
  339. x0, x1, x2, x3);
  340. ST_SP4_INC(x0, x1, x2, x3, x, 4);
  341. }
  342. if (n & 4)
  343. {
  344. LD_SP2_INC(px, 4, x0, x1);
  345. MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
  346. ST_SP2_INC(x0, x1, x, 4);
  347. }
  348. if (n & 2)
  349. {
  350. LD_GP4_INC(px, 1, f0, f1, f2, f3);
  351. MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);
  352. ST_GP4_INC(f0, f1, f2, f3, x, 1);
  353. }
  354. if (n & 1)
  355. {
  356. LD_GP2_INC(px, 1, f0, f1);
  357. MUL2(f0, da_r, f1, da_r, f0, f1);
  358. ST_GP2_INC(f0, f1, x, 1);
  359. }
  360. }
  361. }
  /* General scalar: result = x*da_r + swap(x * {da_i, -da_i, ...}), which per
     element is (da_r*re - da_i*im, da_r*im + da_i*re) as in the scalar tails. */
  362. else
  363. {
  364. FLOAT *x_pref;
  365. BLASLONG pref_offset;
  366. pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
  367. if (pref_offset > 0)
  368. {
  369. pref_offset = L1_DATA_LINESIZE - pref_offset;
  370. pref_offset = pref_offset / sizeof(FLOAT);
  371. }
  /* No load is issued ahead of this loop, and the prefetch pointer starts 64
     (not 96) FLOATs past the line boundary. */
  372. x_pref = x + pref_offset + 64;
  373. da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
  374. da_i_vec_neg = -da_i_vec;
  375. da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
  376. da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
  /* Straight (non-pipelined) loop: 32 complex elements per iteration. */
  377. for (i = (n >> 5); i--;)
  378. {
  379. PREF_OFFSET(x_pref, 0);
  380. PREF_OFFSET(x_pref, 32);
  381. PREF_OFFSET(x_pref, 64);
  382. PREF_OFFSET(x_pref, 96);
  383. PREF_OFFSET(x_pref, 128);
  384. PREF_OFFSET(x_pref, 160);
  385. PREF_OFFSET(x_pref, 192);
  386. PREF_OFFSET(x_pref, 224);
  387. x_pref += 64;
  388. LD_SP16_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
  389. x11, x12, x13, x14, x15);
  390. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  391. d0, d1, d2, d3);
  392. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  393. d4, d5, d6, d7);
  394. MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
  395. d8, d9, d10, d11);
  396. MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
  397. d12, d13, d14, d15);
  398. SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
  399. SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
  400. SHF_W4_SP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_177);
  401. SHF_W4_SP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_177);
  /* FMADD4 is defined outside this file; presumably d += x * da_r_vec - confirm. */
  402. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  403. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  404. FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
  405. FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
  406. ST_SP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
  407. d12, d13, d14, d15, x, 4);
  408. }
  /* Remainder (n & 31). */
  409. if (n & 31)
  410. {
  411. if (n & 16)
  412. {
  413. LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
  414. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  415. d0, d1, d2, d3);
  416. MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
  417. d4, d5, d6, d7);
  418. SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
  419. SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
  420. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  421. FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
  422. ST_SP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 4);
  423. }
  424. if (n & 8)
  425. {
  426. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  427. MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
  428. d0, d1, d2, d3);
  429. SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
  430. FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
  431. ST_SP4_INC(d0, d1, d2, d3, x, 4);
  432. }
  433. if (n & 4)
  434. {
  435. LD_SP2_INC(px, 4, x0, x1);
  436. MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
  437. SHF_W2_SP(d0, d1, d0, d1, SHF_177);
  438. FMADD2(x0, x1, da_r_vec, d0, d1);
  439. ST_SP2_INC(d0, d1, x, 4);
  440. }
  441. if (n & 2)
  442. {
  /* Scalar complex multiply: (tp0, tp1) and (tp2, tp3) are the two results. */
  443. LD_GP4_INC(px, 1, f0, f1, f2, f3);
  444. tp0 = da_r * f0;
  445. tp0 -= da_i * f1;
  446. tp1 = da_r * f1;
  447. tp1 += da_i * f0;
  448. tp2 = da_r * f2;
  449. tp2 -= da_i * f3;
  450. tp3 = da_r * f3;
  451. tp3 += da_i * f2;
  452. ST_GP4_INC(tp0, tp1, tp2, tp3, x, 1);
  453. }
  454. if (n & 1)
  455. {
  456. LD_GP2_INC(px, 1, f0, f1);
  457. tp0 = da_r * f0;
  458. tp0 -= da_i * f1;
  459. tp1 = da_r * f1;
  460. tp1 += da_i * f0;
  461. ST_GP2_INC(tp0, tp1, x, 1);
  462. }
  463. }
  464. }
  465. }
  /* Strided case: consecutive complex elements are inc_x2 = 2 * inc_x FLOATs
     apart. Elements are gathered with one vector load per element, combined in
     pairs with PCKEV_D*, processed, and written back two FLOATs at a time.
     NOTE(review): each LD_SP*_INC step presumably loads a full 4-FLOAT vector
     at px although only the first two FLOATs are used; for the last element
     that may read past it - confirm against the LD_SP definition. */
  466. else
  467. {
  468. inc_x2 = 2 * inc_x;
  /* Scalar is exactly zero: store zeros element by element without reading x. */
  469. if ((0.0 == da_r) && (0.0 == da_i))
  470. {
  471. for (i = n; i--;)
  472. {
  473. *x = 0;
  474. *(x + 1) = 0;
  475. x += inc_x2;
  476. }
  477. }
  /* Purely imaginary scalar, strided. */
  478. else if (0.0 == da_r)
  479. {
  480. da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
  481. da_i_vec_neg = -da_i_vec;
  482. da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
  /* 16 complex elements per iteration; d0..d7 each hold two elements. */
  483. for (i = (n >> 4); i--;)
  484. {
  485. LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
  486. x10, x11, x12, x13, x14, x15);
  487. PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
  488. PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
  489. MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
  490. d0, d1, d2, d3);
  491. MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
  492. d4, d5, d6, d7);
  /* The re/im swap is done by the store order (lane 1 then lane 0, lane 3 then
     lane 2) instead of a shuffle. */
  493. *x = d0[1];
  494. *(x + 1) = d0[0];
  495. x += inc_x2;
  496. *x = d0[3];
  497. *(x + 1) = d0[2];
  498. x += inc_x2;
  499. *x = d1[1];
  500. *(x + 1) = d1[0];
  501. x += inc_x2;
  502. *x = d1[3];
  503. *(x + 1) = d1[2];
  504. x += inc_x2;
  505. *x = d2[1];
  506. *(x + 1) = d2[0];
  507. x += inc_x2;
  508. *x = d2[3];
  509. *(x + 1) = d2[2];
  510. x += inc_x2;
  511. *x = d3[1];
  512. *(x + 1) = d3[0];
  513. x += inc_x2;
  514. *x = d3[3];
  515. *(x + 1) = d3[2];
  516. x += inc_x2;
  517. *x = d4[1];
  518. *(x + 1) = d4[0];
  519. x += inc_x2;
  520. *x = d4[3];
  521. *(x + 1) = d4[2];
  522. x += inc_x2;
  523. *x = d5[1];
  524. *(x + 1) = d5[0];
  525. x += inc_x2;
  526. *x = d5[3];
  527. *(x + 1) = d5[2];
  528. x += inc_x2;
  529. *x = d6[1];
  530. *(x + 1) = d6[0];
  531. x += inc_x2;
  532. *x = d6[3];
  533. *(x + 1) = d6[2];
  534. x += inc_x2;
  535. *x = d7[1];
  536. *(x + 1) = d7[0];
  537. x += inc_x2;
  538. *x = d7[3];
  539. *(x + 1) = d7[2];
  540. x += inc_x2;
  541. }
  /* Remainder (n & 15): 8 and 4 elements with vectors, 2 and 1 with scalars. */
  542. if (n & 15)
  543. {
  544. if (n & 8)
  545. {
  546. LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
  547. PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
  548. MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
  549. d0, d1, d2, d3);
  550. *x = d0[1];
  551. *(x + 1) = d0[0];
  552. x += inc_x2;
  553. *x = d0[3];
  554. *(x + 1) = d0[2];
  555. x += inc_x2;
  556. *x = d1[1];
  557. *(x + 1) = d1[0];
  558. x += inc_x2;
  559. *x = d1[3];
  560. *(x + 1) = d1[2];
  561. x += inc_x2;
  562. *x = d2[1];
  563. *(x + 1) = d2[0];
  564. x += inc_x2;
  565. *x = d2[3];
  566. *(x + 1) = d2[2];
  567. x += inc_x2;
  568. *x = d3[1];
  569. *(x + 1) = d3[0];
  570. x += inc_x2;
  571. *x = d3[3];
  572. *(x + 1) = d3[2];
  573. x += inc_x2;
  574. }
  575. if (n & 4)
  576. {
  577. LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
  578. PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
  579. MUL2(d0, da_i_vec, d1, da_i_vec, d0, d1);
  580. *x = d0[1];
  581. *(x + 1) = d0[0];
  582. x += inc_x2;
  583. *x = d0[3];
  584. *(x + 1) = d0[2];
  585. x += inc_x2;
  586. *x = d1[1];
  587. *(x + 1) = d1[0];
  588. x += inc_x2;
  589. *x = d1[3];
  590. *(x + 1) = d1[2];
  591. x += inc_x2;
  592. }
  593. if (n & 2)
  594. {
  595. f0 = *px;
  596. f1 = *(px + 1);
  597. px += inc_x2;
  598. f2 = *px;
  599. f3 = *(px + 1);
  600. px += inc_x2;
  601. MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i, f0, f1, f2, f3);
  602. *x = f1;
  603. *(x + 1) = f0;
  604. x += inc_x2;
  605. *x = f3;
  606. *(x + 1) = f2;
  607. x += inc_x2;
  608. }
  609. if (n & 1)
  610. {
  /* Reads through x rather than px; the two cursors have been advanced by the
     same amounts in this branch (assuming LD_SP*_INC steps px by inc_x2 per
     vector), so they address the same element here. */
  611. f0 = *x;
  612. f1 = *(x + 1);
  613. MUL2(f0, da_i, f1, -da_i, f0, f1);
  614. *x = f1;
  615. *(x + 1) = f0;
  616. }
  617. }
  618. }
  /* Purely real scalar, strided. */
  619. else if (0.0 == da_i)
  620. {
  621. da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
  /* 16 complex elements per iteration, stored back in natural lane order. */
  622. for (i = (n >> 4); i--;)
  623. {
  624. LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
  625. x10, x11, x12, x13, x14, x15);
  626. PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
  627. PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
  628. MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
  629. d0, d1, d2, d3);
  630. MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
  631. d4, d5, d6, d7);
  632. *x = d0[0];
  633. *(x + 1) = d0[1];
  634. x += inc_x2;
  635. *x = d0[2];
  636. *(x + 1) = d0[3];
  637. x += inc_x2;
  638. *x = d1[0];
  639. *(x + 1) = d1[1];
  640. x += inc_x2;
  641. *x = d1[2];
  642. *(x + 1) = d1[3];
  643. x += inc_x2;
  644. *x = d2[0];
  645. *(x + 1) = d2[1];
  646. x += inc_x2;
  647. *x = d2[2];
  648. *(x + 1) = d2[3];
  649. x += inc_x2;
  650. *x = d3[0];
  651. *(x + 1) = d3[1];
  652. x += inc_x2;
  653. *x = d3[2];
  654. *(x + 1) = d3[3];
  655. x += inc_x2;
  656. *x = d4[0];
  657. *(x + 1) = d4[1];
  658. x += inc_x2;
  659. *x = d4[2];
  660. *(x + 1) = d4[3];
  661. x += inc_x2;
  662. *x = d5[0];
  663. *(x + 1) = d5[1];
  664. x += inc_x2;
  665. *x = d5[2];
  666. *(x + 1) = d5[3];
  667. x += inc_x2;
  668. *x = d6[0];
  669. *(x + 1) = d6[1];
  670. x += inc_x2;
  671. *x = d6[2];
  672. *(x + 1) = d6[3];
  673. x += inc_x2;
  674. *x = d7[0];
  675. *(x + 1) = d7[1];
  676. x += inc_x2;
  677. *x = d7[2];
  678. *(x + 1) = d7[3];
  679. x += inc_x2;
  680. }
  /* Remainder (n & 15). */
  681. if (n & 15)
  682. {
  683. if (n & 8)
  684. {
  685. LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
  686. PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
  687. MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
  688. d0, d1, d2, d3);
  689. *x = d0[0];
  690. *(x + 1) = d0[1];
  691. x += inc_x2;
  692. *x = d0[2];
  693. *(x + 1) = d0[3];
  694. x += inc_x2;
  695. *x = d1[0];
  696. *(x + 1) = d1[1];
  697. x += inc_x2;
  698. *x = d1[2];
  699. *(x + 1) = d1[3];
  700. x += inc_x2;
  701. *x = d2[0];
  702. *(x + 1) = d2[1];
  703. x += inc_x2;
  704. *x = d2[2];
  705. *(x + 1) = d2[3];
  706. x += inc_x2;
  707. *x = d3[0];
  708. *(x + 1) = d3[1];
  709. x += inc_x2;
  710. *x = d3[2];
  711. *(x + 1) = d3[3];
  712. x += inc_x2;
  713. }
  714. if (n & 4)
  715. {
  716. LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
  717. PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
  718. MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);
  719. *x = d0[0];
  720. *(x + 1) = d0[1];
  721. x += inc_x2;
  722. *x = d0[2];
  723. *(x + 1) = d0[3];
  724. x += inc_x2;
  725. *x = d1[0];
  726. *(x + 1) = d1[1];
  727. x += inc_x2;
  728. *x = d1[2];
  729. *(x + 1) = d1[3];
  730. x += inc_x2;
  731. }
  732. if (n & 2)
  733. {
  734. f0 = *px;
  735. f1 = *(px + 1);
  736. px += inc_x2;
  737. f2 = *px;
  738. f3 = *(px + 1);
  739. px += inc_x2;
  740. MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);
  741. *x = f0;
  742. *(x + 1) = f1;
  743. x += inc_x2;
  744. *x = f2;
  745. *(x + 1) = f3;
  746. x += inc_x2;
  747. }
  748. if (n & 1)
  749. {
  /* Reads through x rather than px; both cursors address the same element here
     (same assumption as in the purely imaginary strided tail). */
  750. f0 = *x;
  751. f1 = *(x + 1);
  752. MUL2(f0, da_r, f1, da_r, f0, f1);
  753. *x = f0;
  754. *(x + 1) = f1;
  755. }
  756. }
  757. }
  /* General scalar, strided. */
  758. else
  759. {
  760. da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
  761. da_i_vec_neg = -da_i_vec;
  762. da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
  763. da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
  /* 16 complex elements per iteration. x0..x7 are reused as temporaries for
     the da_i products once their loaded values have been packed into d0..d7. */
  764. for (i = (n >> 4); i--;)
  765. {
  766. LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
  767. x10, x11, x12, x13, x14, x15);
  768. PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
  769. PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
  770. MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
  771. x0, x1, x2, x3);
  772. MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
  773. x4, x5, x6, x7);
  774. MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
  775. d0, d1, d2, d3);
  776. MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
  777. d4, d5, d6, d7);
  778. SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
  779. SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
  780. ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);
  781. ADD4(d4, x4, d5, x5, d6, x6, d7, x7, d4, d5, d6, d7);
  782. *x = d0[0];
  783. *(x + 1) = d0[1];
  784. x += inc_x2;
  785. *x = d0[2];
  786. *(x + 1) = d0[3];
  787. x += inc_x2;
  788. *x = d1[0];
  789. *(x + 1) = d1[1];
  790. x += inc_x2;
  791. *x = d1[2];
  792. *(x + 1) = d1[3];
  793. x += inc_x2;
  794. *x = d2[0];
  795. *(x + 1) = d2[1];
  796. x += inc_x2;
  797. *x = d2[2];
  798. *(x + 1) = d2[3];
  799. x += inc_x2;
  800. *x = d3[0];
  801. *(x + 1) = d3[1];
  802. x += inc_x2;
  803. *x = d3[2];
  804. *(x + 1) = d3[3];
  805. x += inc_x2;
  806. *x = d4[0];
  807. *(x + 1) = d4[1];
  808. x += inc_x2;
  809. *x = d4[2];
  810. *(x + 1) = d4[3];
  811. x += inc_x2;
  812. *x = d5[0];
  813. *(x + 1) = d5[1];
  814. x += inc_x2;
  815. *x = d5[2];
  816. *(x + 1) = d5[3];
  817. x += inc_x2;
  818. *x = d6[0];
  819. *(x + 1) = d6[1];
  820. x += inc_x2;
  821. *x = d6[2];
  822. *(x + 1) = d6[3];
  823. x += inc_x2;
  824. *x = d7[0];
  825. *(x + 1) = d7[1];
  826. x += inc_x2;
  827. *x = d7[2];
  828. *(x + 1) = d7[3];
  829. x += inc_x2;
  830. }
  /* Remainder (n & 15). */
  831. if (n & 15)
  832. {
  833. if (n & 8)
  834. {
  835. LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
  836. PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
  837. MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
  838. x0, x1, x2, x3);
  839. MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
  840. d0, d1, d2, d3);
  841. SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
  842. ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);
  843. *x = d0[0];
  844. *(x + 1) = d0[1];
  845. x += inc_x2;
  846. *x = d0[2];
  847. *(x + 1) = d0[3];
  848. x += inc_x2;
  849. *x = d1[0];
  850. *(x + 1) = d1[1];
  851. x += inc_x2;
  852. *x = d1[2];
  853. *(x + 1) = d1[3];
  854. x += inc_x2;
  855. *x = d2[0];
  856. *(x + 1) = d2[1];
  857. x += inc_x2;
  858. *x = d2[2];
  859. *(x + 1) = d2[3];
  860. x += inc_x2;
  861. *x = d3[0];
  862. *(x + 1) = d3[1];
  863. x += inc_x2;
  864. *x = d3[2];
  865. *(x + 1) = d3[3];
  866. x += inc_x2;
  867. }
  868. if (n & 4)
  869. {
  870. LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
  871. PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
  872. MUL2(d0, da_i_vec, d1, da_i_vec, x0, x1);
  873. MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);
  874. SHF_W2_SP(x0, x1, x0, x1, SHF_177);
  875. ADD2(d0, x0, d1, x1, d0, d1);
  876. *x = d0[0];
  877. *(x + 1) = d0[1];
  878. x += inc_x2;
  879. *x = d0[2];
  880. *(x + 1) = d0[3];
  881. x += inc_x2;
  882. *x = d1[0];
  883. *(x + 1) = d1[1];
  884. x += inc_x2;
  885. *x = d1[2];
  886. *(x + 1) = d1[3];
  887. x += inc_x2;
  888. }
  889. if (n & 2)
  890. {
  891. f0 = *px;;
  892. f1 = *(px + 1);
  893. px += inc_x2;
  894. f2 = *px;
  895. f3 = *(px + 1);
  896. px += inc_x2;
  897. tp0 = da_r * f0;
  898. tp0 -= da_i * f1;
  899. tp1 = da_r * f1;
  900. tp1 += da_i * f0;
  901. tp2 = da_r * f2;
  902. tp2 -= da_i * f3;
  903. tp3 = da_r * f3;
  904. tp3 += da_i * f2;
  905. *x = tp0;
  906. *(x + 1) = tp1;
  907. x += inc_x2;
  908. *x = tp2;
  909. *(x + 1) = tp3;
  910. x += inc_x2;
  911. }
  912. if (n & 1)
  913. {
  /* Last element: re and im are adjacent, so the cursors step by 1 here. */
  914. f0 = *px; px += 1;
  915. f1 = *px;
  916. tp0 = da_r * f0;
  917. tp0 -= da_i * f1;
  918. tp1 = da_r * f1;
  919. tp1 += da_i * f0;
  920. *x = tp0; x += 1;
  921. *x = tp1;
  922. }
  923. }
  924. }
  925. }
  926. return (0);
  927. }