You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

crot_msa.c 35 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
  29. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  30. FLOAT c, FLOAT s)
  31. {
  32. BLASLONG i, j;
  33. FLOAT *px, *py;
  34. FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
  35. FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
  36. BLASLONG inc_x2, inc_y2;
  37. v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
  38. v4f32 out0, out1, out2, out3, out4, out5, out6, out7;
  39. v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
  40. if (n <= 0) return (0);
  41. px = x;
  42. py = y;
  43. if ((1 == inc_x) && (1 == inc_y))
  44. {
  45. if ((0 == c) && (0 == s))
  46. {
  47. v4f32 zero = __msa_cast_to_vector_float(0);
  48. zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0);
  49. zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
  50. zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
  51. zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
  52. /* process 2 elements */
  53. for (j = (n >> 1); j--;)
  54. {
  55. ST_SP(zero, px);
  56. ST_SP(zero, py);
  57. px += 4;
  58. py += 4;
  59. }
  60. if (n & 1)
  61. {
  62. px[0] = 0;
  63. px[1] = 0;
  64. py[0] = 0;
  65. py[1] = 0;
  66. }
  67. }
  68. else if ((1 == c) && (1 == s))
  69. {
  70. if (n >> 4)
  71. {
  72. BLASLONG pref_offsetx, pref_offsety;
  73. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  74. if (pref_offsetx > 0)
  75. {
  76. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  77. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  78. }
  79. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  80. if (pref_offsety > 0)
  81. {
  82. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  83. pref_offsety = pref_offsety / sizeof(FLOAT);
  84. }
  85. x0 = LD_SP(px); px += 4;
  86. x1 = LD_SP(px); px += 4;
  87. x2 = LD_SP(px); px += 4;
  88. x3 = LD_SP(px); px += 4;
  89. y0 = LD_SP(py); py += 4;
  90. y1 = LD_SP(py); py += 4;
  91. y2 = LD_SP(py); py += 4;
  92. y3 = LD_SP(py); py += 4;
  93. for (j = (n >> 4) - 1; j--;)
  94. {
  95. PREFETCH(px + pref_offsetx + 32);
  96. PREFETCH(px + pref_offsetx + 40);
  97. PREFETCH(px + pref_offsetx + 48);
  98. PREFETCH(px + pref_offsetx + 56);
  99. PREFETCH(py + pref_offsety + 32);
  100. PREFETCH(py + pref_offsety + 40);
  101. PREFETCH(py + pref_offsety + 48);
  102. PREFETCH(py + pref_offsety + 56);
  103. out0 = x0 + y0;
  104. x4 = LD_SP(px); px += 4;
  105. out1 = y0 - x0;
  106. x5 = LD_SP(px); px += 4;
  107. out2 = x1 + y1;
  108. x6 = LD_SP(px); px += 4;
  109. out3 = y1 - x1;
  110. x7 = LD_SP(px); px += 4;
  111. out4 = x2 + y2;
  112. y4 = LD_SP(py); py += 4;
  113. out5 = y2 - x2;
  114. y5 = LD_SP(py); py += 4;
  115. out6 = x3 + y3;
  116. y6 = LD_SP(py); py += 4;
  117. out7 = y3 - x3;
  118. y7 = LD_SP(py); py += 4;
  119. ST_SP(out0, x); x += 4;
  120. out8 = x4 + y4;
  121. ST_SP(out1, y); y += 4;
  122. out9 = y4 - x4;
  123. ST_SP(out2, x); x += 4;
  124. out10 = x5 + y5;
  125. ST_SP(out3, y); y += 4;
  126. out11 = y5 - x5;
  127. ST_SP(out4, x); x += 4;
  128. out12 = x6 + y6;
  129. ST_SP(out5, y); y += 4;
  130. out13 = y6 - x6;
  131. ST_SP(out6, x); x += 4;
  132. out14 = x7 + y7;
  133. ST_SP(out7, y); y += 4;
  134. out15 = y7 - x7;
  135. x0 = LD_SP(px); px += 4;
  136. ST_SP(out8, x); x += 4;
  137. x1 = LD_SP(px); px += 4;
  138. ST_SP(out10, x); x += 4;
  139. x2 = LD_SP(px); px += 4;
  140. ST_SP(out12, x); x += 4;
  141. x3 = LD_SP(px); px += 4;
  142. ST_SP(out14, x); x += 4;
  143. y0 = LD_SP(py); py += 4;
  144. ST_SP(out9, y); y += 4;
  145. y1 = LD_SP(py); py += 4;
  146. ST_SP(out11, y); y += 4;
  147. y2 = LD_SP(py); py += 4;
  148. ST_SP(out13, y); y += 4;
  149. y3 = LD_SP(py); py += 4;
  150. ST_SP(out15, y); y += 4;
  151. }
  152. x4 = LD_SP(px); px += 4;
  153. x5 = LD_SP(px); px += 4;
  154. x6 = LD_SP(px); px += 4;
  155. x7 = LD_SP(px); px += 4;
  156. y4 = LD_SP(py); py += 4;
  157. y5 = LD_SP(py); py += 4;
  158. y6 = LD_SP(py); py += 4;
  159. y7 = LD_SP(py); py += 4;
  160. out0 = x0 + y0;
  161. out1 = y0 - x0;
  162. out2 = x1 + y1;
  163. out3 = y1 - x1;
  164. out4 = x2 + y2;
  165. out5 = y2 - x2;
  166. out6 = x3 + y3;
  167. out7 = y3 - x3;
  168. out8 = x4 + y4;
  169. out9 = y4 - x4;
  170. out10 = x5 + y5;
  171. out11 = y5 - x5;
  172. out12 = x6 + y6;
  173. out13 = y6 - x6;
  174. out14 = x7 + y7;
  175. out15 = y7 - x7;
  176. ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4);
  177. ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4);
  178. }
  179. if (n & 8)
  180. {
  181. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  182. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  183. out0 = x0 + y0;
  184. out1 = y0 - x0;
  185. out2 = x1 + y1;
  186. out3 = y1 - x1;
  187. out4 = x2 + y2;
  188. out5 = y2 - x2;
  189. out6 = x3 + y3;
  190. out7 = y3 - x3;
  191. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  192. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  193. }
  194. if (n & 4)
  195. {
  196. LD_SP2_INC(px, 4, x0, x1);
  197. LD_SP2_INC(py, 4, y0, y1);
  198. out0 = x0 + y0;
  199. out1 = y0 - x0;
  200. out2 = x1 + y1;
  201. out3 = y1 - x1;
  202. ST_SP2_INC(out0, out2, x, 4);
  203. ST_SP2_INC(out1, out3, y, 4);
  204. }
  205. if (n & 2)
  206. {
  207. x0 = LD_SP(px);
  208. y0 = LD_SP(py);
  209. px += 4;
  210. py += 4;
  211. out0 = x0 + y0;
  212. out1 = y0 - x0;
  213. ST_SP(out0, x);
  214. ST_SP(out1, y);
  215. x += 4;
  216. y += 4;
  217. }
  218. if (n & 1)
  219. {
  220. LD_GP2_INC(px, 1, fx0, fx1);
  221. LD_GP2_INC(py, 1, fy0, fy1);
  222. tp0 = fx0 + fy0;
  223. tp1 = fy0 - fx0;
  224. tp2 = fx1 + fy1;
  225. tp3 = fy1 - fx1;
  226. ST_GP2_INC(tp0, tp2, x, 1);
  227. ST_GP2_INC(tp1, tp3, y, 1);
  228. }
  229. }
  230. else if (0 == s)
  231. {
  232. c0 = COPY_FLOAT_TO_VECTOR(c);
  233. if (n >> 4)
  234. {
  235. BLASLONG pref_offsetx, pref_offsety;
  236. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  237. if (pref_offsetx > 0)
  238. {
  239. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  240. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  241. }
  242. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  243. if (pref_offsety > 0)
  244. {
  245. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  246. pref_offsety = pref_offsety / sizeof(FLOAT);
  247. }
  248. LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
  249. for (j = (n >> 4) - 1; j--;)
  250. {
  251. PREFETCH(px + pref_offsetx + 32);
  252. PREFETCH(px + pref_offsetx + 40);
  253. PREFETCH(px + pref_offsetx + 48);
  254. PREFETCH(px + pref_offsetx + 56);
  255. PREFETCH(py + pref_offsety + 32);
  256. PREFETCH(py + pref_offsety + 40);
  257. PREFETCH(py + pref_offsety + 48);
  258. PREFETCH(py + pref_offsety + 56);
  259. y0 = LD_SP(py); py += 4;
  260. x0 *= c0;
  261. y1 = LD_SP(py); py += 4;
  262. x1 *= c0;
  263. y2 = LD_SP(py); py += 4;
  264. x2 *= c0;
  265. y3 = LD_SP(py); py += 4;
  266. x3 *= c0;
  267. y4 = LD_SP(py); py += 4;
  268. x4 *= c0;
  269. y5 = LD_SP(py); py += 4;
  270. x5 *= c0;
  271. y6 = LD_SP(py); py += 4;
  272. x6 *= c0;
  273. y7 = LD_SP(py); py += 4;
  274. x7 *= c0;
  275. ST_SP(x0, x); x += 4;
  276. y0 *= c0;
  277. ST_SP(x1, x); x += 4;
  278. y1 *= c0;
  279. ST_SP(x2, x); x += 4;
  280. y2 *= c0;
  281. ST_SP(x3, x); x += 4;
  282. y3 *= c0;
  283. ST_SP(x4, x); x += 4;
  284. y4 *= c0;
  285. ST_SP(x5, x); x += 4;
  286. y5 *= c0;
  287. ST_SP(x6, x); x += 4;
  288. y6 *= c0;
  289. ST_SP(x7, x); x += 4;
  290. y7 *= c0;
  291. x0 = LD_SP(px); px += 4;
  292. ST_SP(y0, y); y += 4;
  293. x1 = LD_SP(px); px += 4;
  294. ST_SP(y1, y); y += 4;
  295. x2 = LD_SP(px); px += 4;
  296. ST_SP(y2, y); y += 4;
  297. x3 = LD_SP(px); px += 4;
  298. ST_SP(y3, y); y += 4;
  299. x4 = LD_SP(px); px += 4;
  300. ST_SP(y4, y); y += 4;
  301. x5 = LD_SP(px); px += 4;
  302. ST_SP(y5, y); y += 4;
  303. x6 = LD_SP(px); px += 4;
  304. ST_SP(y6, y); y += 4;
  305. x7 = LD_SP(px); px += 4;
  306. ST_SP(y7, y); y += 4;
  307. }
  308. LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
  309. x0 *= c0;
  310. y0 *= c0;
  311. x1 *= c0;
  312. y1 *= c0;
  313. x2 *= c0;
  314. y2 *= c0;
  315. x3 *= c0;
  316. y3 *= c0;
  317. x4 *= c0;
  318. y4 *= c0;
  319. x5 *= c0;
  320. y5 *= c0;
  321. x6 *= c0;
  322. y6 *= c0;
  323. x7 *= c0;
  324. y7 *= c0;
  325. ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
  326. ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
  327. }
  328. if (n & 8)
  329. {
  330. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  331. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  332. x0 *= c0;
  333. y0 *= c0;
  334. x1 *= c0;
  335. y1 *= c0;
  336. x2 *= c0;
  337. y2 *= c0;
  338. x3 *= c0;
  339. y3 *= c0;
  340. ST_SP4_INC(x0, x1, x2, x3, x, 4);
  341. ST_SP4_INC(y0, y1, y2, y3, y, 4);
  342. }
  343. if (n & 4)
  344. {
  345. LD_SP2_INC(px, 4, x0, x1);
  346. LD_SP2_INC(py, 4, y0, y1);
  347. x0 *= c0;
  348. y0 *= c0;
  349. x1 *= c0;
  350. y1 *= c0;
  351. ST_SP2_INC(x0, x1, x, 4);
  352. ST_SP2_INC(y0, y1, y, 4);
  353. }
  354. if (n & 2)
  355. {
  356. x0 = LD_SP(px);
  357. y0 = LD_SP(py);
  358. px += 4;
  359. py += 4;
  360. x0 *= c0;
  361. y0 *= c0;
  362. ST_SP(x0, x);
  363. ST_SP(y0, y);
  364. x += 4;
  365. y += 4;
  366. }
  367. if (n & 1)
  368. {
  369. LD_GP2_INC(px, 1, fx0, fx1);
  370. LD_GP2_INC(py, 1, fy0, fy1);
  371. tp0 = (c * fx0);
  372. tp1 = (c * fy0);
  373. tp2 = (c * fx1);
  374. tp3 = (c * fy1);
  375. ST_GP2_INC(tp0, tp2, x, 1);
  376. ST_GP2_INC(tp1, tp3, y, 1);
  377. }
  378. }
  379. else if (0 == c)
  380. {
  381. s0 = COPY_FLOAT_TO_VECTOR(s);
  382. /* process 16 floats */
  383. if (n >> 4)
  384. {
  385. BLASLONG pref_offsetx, pref_offsety;
  386. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  387. if (pref_offsetx > 0)
  388. {
  389. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  390. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  391. }
  392. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  393. if (pref_offsety > 0)
  394. {
  395. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  396. pref_offsety = pref_offsety / sizeof(FLOAT);
  397. }
  398. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  399. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  400. for (j = (n >> 4) - 1; j--;)
  401. {
  402. PREFETCH(px + pref_offsetx + 32);
  403. PREFETCH(px + pref_offsetx + 40);
  404. PREFETCH(px + pref_offsetx + 48);
  405. PREFETCH(px + pref_offsetx + 56);
  406. PREFETCH(py + pref_offsety + 32);
  407. PREFETCH(py + pref_offsety + 40);
  408. PREFETCH(py + pref_offsety + 48);
  409. PREFETCH(py + pref_offsety + 56);
  410. x4 = LD_SP(px); px += 4;
  411. out0 = s0 * y0;
  412. x5 = LD_SP(px); px += 4;
  413. out2 = s0 * y1;
  414. x6 = LD_SP(px); px += 4;
  415. out4 = s0 * y2;
  416. x7 = LD_SP(px); px += 4;
  417. out6 = s0 * y3;
  418. y4 = LD_SP(py); py += 4;
  419. out1 = -(s0 * x0);
  420. y5 = LD_SP(py); py += 4;
  421. out3 = -(s0 * x1);
  422. y6 = LD_SP(py); py += 4;
  423. out5 = -(s0 * x2);
  424. y7 = LD_SP(py); py += 4;
  425. out7 = -(s0 * x3);
  426. ST_SP(out0, x); x += 4;
  427. out0 = s0 * y4;
  428. ST_SP(out2, x); x += 4;
  429. out2 = s0 * y5;
  430. ST_SP(out4, x); x += 4;
  431. out4 = s0 * y6;
  432. ST_SP(out6, x); x += 4;
  433. out6 = s0 * y7;
  434. ST_SP(out1, y); y += 4;
  435. out1 = -(s0 * x4);
  436. ST_SP(out3, y); y += 4;
  437. out3 = -(s0 * x5);
  438. ST_SP(out5, y); y += 4;
  439. out5 = -(s0 * x6);
  440. ST_SP(out7, y); y += 4;
  441. out7 = -(s0 * x7);
  442. x0 = LD_SP(px); px += 4;
  443. ST_SP(out0, x); x += 4;
  444. x1 = LD_SP(px); px += 4;
  445. ST_SP(out2, x); x += 4;
  446. x2 = LD_SP(px); px += 4;
  447. ST_SP(out4, x); x += 4;
  448. x3 = LD_SP(px); px += 4;
  449. ST_SP(out6, x); x += 4;
  450. y0 = LD_SP(py); py += 4;
  451. ST_SP(out1, y); y += 4;
  452. y1 = LD_SP(py); py += 4;
  453. ST_SP(out3, y); y += 4;
  454. y2 = LD_SP(py); py += 4;
  455. ST_SP(out5, y); y += 4;
  456. y3 = LD_SP(py); py += 4;
  457. ST_SP(out7, y); y += 4;
  458. }
  459. out0 = s0 * y0;
  460. out2 = s0 * y1;
  461. out4 = s0 * y2;
  462. out6 = s0 * y3;
  463. out1 = -(s0 * x0);
  464. out3 = -(s0 * x1);
  465. out5 = -(s0 * x2);
  466. out7 = -(s0 * x3);
  467. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  468. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  469. LD_SP4_INC(px, 4, x4, x5, x6, x7);
  470. LD_SP4_INC(py, 4, y4, y5, y6, y7);
  471. out0 = s0 * y4;
  472. out2 = s0 * y5;
  473. out4 = s0 * y6;
  474. out6 = s0 * y7;
  475. out1 = -(s0 * x4);
  476. out3 = -(s0 * x5);
  477. out5 = -(s0 * x6);
  478. out7 = -(s0 * x7);
  479. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  480. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  481. }
  482. if (n & 8)
  483. {
  484. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  485. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  486. out0 = s0 * y0;
  487. out1 = - (s0 * x0);
  488. out2 = s0 * y1;
  489. out3 = - (s0 * x1);
  490. out4 = s0 * y2;
  491. out5 = - (s0 * x2);
  492. out6 = s0 * y3;
  493. out7 = - (s0 * x3);
  494. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  495. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  496. }
  497. if (n & 4)
  498. {
  499. LD_SP2_INC(px, 4, x0, x1);
  500. LD_SP2_INC(py, 4, y0, y1);
  501. out0 = s0 * y0;
  502. out1 = - (s0 * x0);
  503. out2 = s0 * y1;
  504. out3 = - (s0 * x1);
  505. ST_SP2_INC(out0, out2, x, 4);
  506. ST_SP2_INC(out1, out3, y, 4);
  507. }
  508. if (n & 2)
  509. {
  510. x0 = LD_SP(px); px += 4;
  511. y0 = LD_SP(py); py += 4;
  512. out0 = s0 * y0;
  513. out1 = - (s0 * x0);
  514. ST_SP(out0, x); x += 4;
  515. ST_SP(out1, y); y += 4;
  516. }
  517. if (n & 1)
  518. {
  519. LD_GP2_INC(px, 1, fx0, fx1);
  520. LD_GP2_INC(py, 1, fy0, fy1);
  521. tp0 = s * fy0;
  522. tp1 = - (s * fx0);
  523. tp2 = s * fy1;
  524. tp3 = - (s * fx1);
  525. ST_GP2_INC(tp0, tp2, x, 1);
  526. ST_GP2_INC(tp1, tp3, y, 1);
  527. }
  528. }
  529. else
  530. {
  531. c0 = COPY_FLOAT_TO_VECTOR(c);
  532. s0 = COPY_FLOAT_TO_VECTOR(s);
  533. if (n >> 4)
  534. {
  535. BLASLONG pref_offsetx, pref_offsety;
  536. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  537. if (pref_offsetx > 0)
  538. {
  539. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  540. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  541. }
  542. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  543. if (pref_offsety > 0)
  544. {
  545. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  546. pref_offsety = pref_offsety / sizeof(FLOAT);
  547. }
  548. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  549. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  550. for (j = (n >> 4) - 1; j--;)
  551. {
  552. PREFETCH(px + pref_offsetx + 32);
  553. PREFETCH(px + pref_offsetx + 40);
  554. PREFETCH(px + pref_offsetx + 48);
  555. PREFETCH(px + pref_offsetx + 56);
  556. PREFETCH(py + pref_offsety + 32);
  557. PREFETCH(py + pref_offsety + 40);
  558. PREFETCH(py + pref_offsety + 48);
  559. PREFETCH(py + pref_offsety + 56);
  560. x4 = LD_SP(px); px += 4;
  561. out0 = c0 * x0;
  562. x5 = LD_SP(px); px += 4;
  563. out1 = c0 * y0;
  564. x6 = LD_SP(px); px += 4;
  565. out2 = c0 * x1;
  566. x7 = LD_SP(px); px += 4;
  567. out3 = c0 * y1;
  568. y4 = LD_SP(py); py += 4;
  569. out4 = c0 * x2;
  570. y5 = LD_SP(py); py += 4;
  571. out5 = c0 * y2;
  572. y6 = LD_SP(py); py += 4;
  573. out6 = c0 * x3;
  574. y7 = LD_SP(py); py += 4;
  575. out7 = c0 * y3;
  576. out0 += s0 * y0;
  577. out1 -= s0 * x0;
  578. out2 += s0 * y1;
  579. out3 -= s0 * x1;
  580. out4 += s0 * y2;
  581. out5 -= s0 * x2;
  582. out6 += s0 * y3;
  583. out7 -= s0 * x3;
  584. ST_SP(out0, x); x += 4;
  585. out8 = c0 * x4;
  586. ST_SP(out2, x); x += 4;
  587. out9 = c0 * y4;
  588. ST_SP(out4, x); x += 4;
  589. out10 = c0 * x5;
  590. ST_SP(out6, x); x += 4;
  591. out11 = c0 * y5;
  592. ST_SP(out1, y); y += 4;
  593. out12 = c0 * x6;
  594. ST_SP(out3, y); y += 4;
  595. out13 = c0 * y6;
  596. ST_SP(out5, y); y += 4;
  597. out14 = c0 * x7;
  598. ST_SP(out7, y); y += 4;
  599. out15 = c0 * y7;
  600. x0 = LD_SP(px); px += 4;
  601. out8 += s0 * y4;
  602. x1 = LD_SP(px); px += 4;
  603. out9 -= s0 * x4;
  604. x2 = LD_SP(px); px += 4;
  605. out10 += s0 * y5;
  606. x3 = LD_SP(px); px += 4;
  607. out11 -= s0 * x5;
  608. y0 = LD_SP(py); py += 4;
  609. out12 += s0 * y6;
  610. y1 = LD_SP(py); py += 4;
  611. out13 -= s0 * x6;
  612. y2 = LD_SP(py); py += 4;
  613. out14 += s0 * y7;
  614. y3 = LD_SP(py); py += 4;
  615. out15 -= s0 * x7;
  616. ST_SP(out8, x); x += 4;
  617. ST_SP(out10, x); x += 4;
  618. ST_SP(out12, x); x += 4;
  619. ST_SP(out14, x); x += 4;
  620. ST_SP(out9, y); y += 4;
  621. ST_SP(out11, y); y += 4;
  622. ST_SP(out13, y); y += 4;
  623. ST_SP(out15, y); y += 4;
  624. }
  625. out0 = c0 * x0;
  626. out0 += s0 * y0;
  627. out1 = c0 * y0;
  628. out1 -= s0 * x0;
  629. out2 = c0 * x1;
  630. out2 += s0 * y1;
  631. out3 = c0 * y1;
  632. out3 -= s0 * x1;
  633. out4 = c0 * x2;
  634. out4 += s0 * y2;
  635. out5 = c0 * y2;
  636. out5 -= s0 * x2;
  637. out6 = c0 * x3;
  638. out6 += s0 * y3;
  639. out7 = c0 * y3;
  640. out7 -= s0 * x3;
  641. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  642. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  643. LD_SP4_INC(px, 4, x4, x5, x6, x7);
  644. LD_SP4_INC(py, 4, y4, y5, y6, y7);
  645. out8 = c0 * x4;
  646. out8 += s0 * y4;
  647. out9 = c0 * y4;
  648. out9 -= s0 * x4;
  649. out10 = c0 * x5;
  650. out10 += s0 * y5;
  651. out11 = c0 * y5;
  652. out11 -= s0 * x5;
  653. out12 = c0 * x6;
  654. out12 += s0 * y6;
  655. out13 = c0 * y6;
  656. out13 -= s0 * x6;
  657. out14 = c0 * x7;
  658. out14 += s0 * y7;
  659. out15 = c0 * y7;
  660. out15 -= s0 * x7;
  661. ST_SP4_INC(out8, out10, out12, out14, x, 4);
  662. ST_SP4_INC(out9, out11, out13, out15, y, 4);
  663. }
  664. if (n & 8)
  665. {
  666. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  667. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  668. out0 = (c0 * x0) + (s0 * y0);
  669. out1 = (c0 * y0) - (s0 * x0);
  670. out2 = (c0 * x1) + (s0 * y1);
  671. out3 = (c0 * y1) - (s0 * x1);
  672. out4 = (c0 * x2) + (s0 * y2);
  673. out5 = (c0 * y2) - (s0 * x2);
  674. out6 = (c0 * x3) + (s0 * y3);
  675. out7 = (c0 * y3) - (s0 * x3);
  676. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  677. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  678. }
  679. if (n & 4)
  680. {
  681. LD_SP2_INC(px, 4, x0, x1);
  682. LD_SP2_INC(py, 4, y0, y1);
  683. out0 = (c0 * x0) + (s0 * y0);
  684. out1 = (c0 * y0) - (s0 * x0);
  685. out2 = (c0 * x1) + (s0 * y1);
  686. out3 = (c0 * y1) - (s0 * x1);
  687. ST_SP2_INC(out0, out2, x, 4);
  688. ST_SP2_INC(out1, out3, y, 4);
  689. }
  690. if (n & 2)
  691. {
  692. x0 = LD_SP(px);
  693. y0 = LD_SP(py);
  694. px += 4;
  695. py += 4;
  696. out0 = (c0 * x0) + (s0 * y0);
  697. out1 = (c0 * y0) - (s0 * x0);
  698. ST_SP(out0, x);
  699. ST_SP(out1, y);
  700. x += 4;
  701. y += 4;
  702. }
  703. if (n & 1)
  704. {
  705. LD_GP2_INC(px, 1, fx0, fx1);
  706. LD_GP2_INC(py, 1, fy0, fy1);
  707. tp0 = (c * fx0) + (s * fy0);
  708. tp1 = (c * fy0) - (s * fx0);
  709. tp2 = (c * fx1) + (s * fy1);
  710. tp3 = (c * fy1) - (s * fx1);
  711. ST_GP2_INC(tp0, tp2, x, 1);
  712. ST_GP2_INC(tp1, tp3, y, 1);
  713. }
  714. }
  715. }
  716. else
  717. {
  718. inc_x2 = 2 * inc_x;
  719. inc_y2 = 2 * inc_y;
  720. if ((0 == c) && (0 == s))
  721. {
  722. for (i = n; i--;)
  723. {
  724. *x = 0;
  725. *(x + 1) = 0;
  726. *y = 0;
  727. *(y + 1) = 0;
  728. x += inc_x2;
  729. y += inc_y2;
  730. }
  731. }
  732. else if ((1 == c) && (1 == s))
  733. {
  734. if (n >> 1)
  735. {
  736. fx0 = *px;
  737. fx1 = *(px+1); px += inc_x2;
  738. fx2 = *px;
  739. fx3 = *(px+1); px += inc_x2;
  740. fy0 = *py;
  741. fy1 = *(py+1); py += inc_y2;
  742. fy2 = *py;
  743. fy3 = *(py+1); py += inc_y2;
  744. for (i = (n >> 1) - 1; i--;)
  745. {
  746. tp0 = fx0 + fy0;
  747. tp1 = fx1 + fy1;
  748. tp2 = fy0 - fx0;
  749. tp3 = fy1 - fx1;
  750. tp4 = fx2 + fy2;
  751. tp5 = fx3 + fy3;
  752. tp6 = fy2 - fx2;
  753. tp7 = fy3 - fx3;
  754. fx0 = *px;
  755. *x = tp0;
  756. fx1 = *(px+1); px += inc_x2;
  757. *(x+1) = tp1; x += inc_x2;
  758. fx2 = *px;
  759. *x = tp4;
  760. fx3 = *(px+1); px += inc_x2;
  761. *(x+1) = tp5; x += inc_x2;
  762. fy0 = *py;
  763. *y = tp2;
  764. fy1 = *(py+1); py += inc_y2;
  765. *(y+1) = tp3; y += inc_y2;
  766. fy2 = *py;
  767. *y = tp6;
  768. fy3 = *(py+1); py += inc_y2;
  769. *(y+1) = tp7; y += inc_y2;
  770. }
  771. tp0 = fx0 + fy0;
  772. tp1 = fx1 + fy1;
  773. tp2 = fy0 - fx0;
  774. tp3 = fy1 - fx1;
  775. tp4 = fx2 + fy2;
  776. tp5 = fx3 + fy3;
  777. tp6 = fy2 - fx2;
  778. tp7 = fy3 - fx3;
  779. *x = tp0;
  780. *(x+1) = tp1; x += inc_x2;
  781. *x = tp4;
  782. *(x+1) = tp5; x += inc_x2;
  783. *y = tp2;
  784. *(y+1) = tp3; y += inc_y2;
  785. *y = tp6;
  786. *(y+1) = tp7; y += inc_y2;
  787. }
  788. if (n & 1)
  789. {
  790. fx0 = *px;
  791. fx1 = *(px+1);
  792. fy0 = *py;
  793. fy1 = *(py+1);
  794. tp0 = fx0 + fy0;
  795. tp1 = fx1 + fy1;
  796. tp2 = fy0 - fx0;
  797. tp3 = fy1 - fx1;
  798. *x = tp0;
  799. *(x+1) = tp1;
  800. *y = tp2;
  801. *(y+1) = tp3;
  802. }
  803. }
  804. else if (0 == s)
  805. {
  806. if (n >> 1)
  807. {
  808. fx0 = *px;
  809. fx1 = *(px+1); px += inc_x2;
  810. fx2 = *px;
  811. fx3 = *(px+1); px += inc_x2;
  812. fy0 = *py;
  813. fy1 = *(py+1); py += inc_y2;
  814. fy2 = *py;
  815. fy3 = *(py+1); py += inc_y2;
  816. for (i = (n >> 1) - 1; i--;)
  817. {
  818. tp0 = c * fx0;
  819. tp1 = c * fx1;
  820. tp2 = c * fx2;
  821. tp3 = c * fx3;
  822. tp4 = c * fy0;
  823. tp5 = c * fy1;
  824. tp6 = c * fy2;
  825. tp7 = c * fy3;
  826. fx0 = *px;
  827. *x = tp0;
  828. fx1 = *(px+1); px += inc_x2;
  829. *(x+1) = tp1; x += inc_x2;
  830. fx2 = *px;
  831. *x = tp2;
  832. fx3 = *(px+1); px += inc_x2;
  833. *(x+1) = tp3; x += inc_x2;
  834. fy0 = *py;
  835. *y = tp4;
  836. fy1 = *(py+1); py += inc_y2;
  837. *(y+1) = tp5; y += inc_y2;
  838. fy2 = *py;
  839. *y = tp6;
  840. fy3 = *(py+1); py += inc_y2;
  841. *(y+1) = tp7; y += inc_y2;
  842. }
  843. tp0 = c * fx0;
  844. tp1 = c * fx1;
  845. tp2 = c * fx2;
  846. tp3 = c * fx3;
  847. tp4 = c * fy0;
  848. tp5 = c * fy1;
  849. tp6 = c * fy2;
  850. tp7 = c * fy3;
  851. *x = tp0;
  852. *(x+1) = tp1; x += inc_x2;
  853. *x = tp2;
  854. *(x+1) = tp3; x += inc_x2;
  855. *y = tp4;
  856. *(y+1) = tp5; y += inc_y2;
  857. *y = tp6;
  858. *(y+1) = tp7; y += inc_y2;
  859. }
  860. if (n & 1)
  861. {
  862. fx0 = *px;
  863. fx1 = *(px+1);
  864. fy0 = *py;
  865. fy1 = *(py+1);
  866. tp0 = c * fx0;
  867. tp1 = c * fx1;
  868. tp2 = c * fy0;
  869. tp3 = c * fy1;
  870. *x = tp0;
  871. *(x+1) = tp1;
  872. *y = tp2;
  873. *(y+1) = tp3;
  874. }
  875. }
  876. else
  877. {
  878. if (n >> 1)
  879. {
  880. fx0 = *px;
  881. fx1 = *(px+1); px += inc_x2;
  882. fx2 = *px;
  883. fx3 = *(px+1); px += inc_x2;
  884. fy0 = *py;
  885. fy1 = *(py+1); py += inc_y2;
  886. fy2 = *py;
  887. fy3 = *(py+1); py += inc_y2;
  888. for (i = (n >> 1) - 1; i--;)
  889. {
  890. tp0 = c * fx0 + s * fy0;
  891. tp1 = c * fx1 + s * fy1;
  892. tp2 = c * fy0 - s * fx0;
  893. tp3 = c * fy1 - s * fx1;
  894. tp4 = c * fx2 + s * fy2;
  895. tp5 = c * fx3 + s * fy3;
  896. tp6 = c * fy2 - s * fx2;
  897. tp7 = c * fy3 - s * fx3;
  898. fx0 = *px;
  899. *x = tp0;
  900. fx1 = *(px+1); px += inc_x2;
  901. *(x+1) = tp1; x += inc_x2;
  902. fx2 = *px;
  903. *x = tp4;
  904. fx3 = *(px+1); px += inc_x2;
  905. *(x+1) = tp5; x += inc_x2;
  906. fy0 = *py;
  907. *y = tp2;
  908. fy1 = *(py+1); py += inc_y2;
  909. *(y+1) = tp3; y += inc_y2;
  910. fy2 = *py;
  911. *y = tp6;
  912. fy3 = *(py+1); py += inc_y2;
  913. *(y+1) = tp7; y += inc_y2;
  914. }
  915. tp0 = c * fx0 + s * fy0;
  916. tp1 = c * fx1 + s * fy1;
  917. tp2 = c * fy0 - s * fx0;
  918. tp3 = c * fy1 - s * fx1;
  919. tp4 = c * fx2 + s * fy2;
  920. tp5 = c * fx3 + s * fy3;
  921. tp6 = c * fy2 - s * fx2;
  922. tp7 = c * fy3 - s * fx3;
  923. *x = tp0;
  924. *(x+1) = tp1; x += inc_x2;
  925. *x = tp4;
  926. *(x+1) = tp5; x += inc_x2;
  927. *y = tp2;
  928. *(y+1) = tp3; y += inc_y2;
  929. *y = tp6;
  930. *(y+1) = tp7; y += inc_y2;
  931. }
  932. if (n & 1)
  933. {
  934. fx0 = *px;
  935. fx1 = *(px+1);
  936. fy0 = *py;
  937. fy1 = *(py+1);
  938. tp0 = c * fx0 + s * fy0;
  939. tp1 = c * fx1 + s * fy1;
  940. tp2 = c * fy0 - s * fx0;
  941. tp3 = c * fy1 - s * fx1;
  942. *x = tp0;
  943. *(x+1) = tp1;
  944. *y = tp2;
  945. *(y+1) = tp3;
  946. }
  947. }
  948. }
  949. return 0;
  950. }