You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

crot_msa.c 35 kB


  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  /*
   * CNAME: apply a plane rotation in place to n elements of x and y, where
   * every element is a pair of consecutive FLOATs (interleaved complex data,
   * going by the crot file name) and both FLOATs of a pair are treated alike:
   *
   *     x[k] <- c * x[k] + s * y[k]
   *     y[k] <- c * y[k] - s * x[k]
   *
   * Parameters:
   *   n            number of two-FLOAT elements; n <= 0 returns immediately.
   *   x, y         vectors, overwritten with the rotated values.
   *   inc_x, inc_y strides counted in elements; they are doubled to FLOAT
   *                strides in the strided path.
   *   c, s         real rotation coefficients.
   *
   * Returns 0 on every path.
   *
   * Structure: a vectorised path for inc_x == inc_y == 1 and a scalar path
   * for any other stride.  Each path picks a specialised branch by exact
   * comparison of c and s against 0 and 1.  The v4f32 vectors hold four
   * FLOATs (two elements); LD_SP/ST_SP and the *_INC helpers come from
   * macros_msa.h and are presumably 4-FLOAT vector loads/stores (the
   * pointers advance by 4 FLOATs per vector) -- TODO confirm against that
   * header.
   *
   * px/py are the read cursors and x/y the write cursors over the same
   * arrays.  In the pipelined loops the loads run ahead of the stores, so
   * every FLOAT is read before the location is overwritten.
   */
  29. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  30. FLOAT c, FLOAT s)
  31. {
  32. BLASLONG i, j;
  33. FLOAT *px, *py;
  34. FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
  35. FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
  36. BLASLONG inc_x2, inc_y2;
  37. v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
  38. v4f32 out0, out1, out2, out3, out4, out5, out6, out7;
  39. v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
  40. if (n <= 0) return (0);
  41. px = x;
  42. py = y;
      /* ---- unit-stride path: vectorised ---- */
  43. if ((1 == inc_x) && (1 == inc_y))
  44. {
      /* c == 0 and s == 0: zeros are stored to x and y; the inputs are
         never read on this branch. */
  45. if ((0 == c) && (0 == s))
  46. {
  47. v4f32 zero = {0.0, 0.0, 0.0, 0.0};
  48. /* process 2 elements (4 FLOATs) of x and of y per iteration */
  49. for (j = (n >> 1); j--;)
  50. {
  51. ST_SP(zero, px);
  52. ST_SP(zero, py);
  53. px += 4;
  54. py += 4;
  55. }
      /* odd n: clear the one remaining element (2 FLOATs) of each vector */
  56. if (n & 1)
  57. {
  58. px[0] = 0;
  59. px[1] = 0;
  60. py[0] = 0;
  61. py[1] = 0;
  62. }
  63. }
      /* c == 1 and s == 1: no multiplies needed, x <- x + y and y <- y - x */
  64. else if ((1 == c) && (1 == s))
  65. {
      /* main part: blocks of 16 elements (8 vectors) from each of x and y */
  66. if (n >> 4)
  67. {
  68. BLASLONG pref_offsetx, pref_offsety;
      /* pref_offset*: distance in FLOATs from the cursor to the next
         L1_DATA_LINESIZE boundary (0 if already on one).  The mask form
         presumes L1_DATA_LINESIZE is a power of two -- TODO confirm. */
  69. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  70. if (pref_offsetx > 0)
  71. {
  72. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  73. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  74. }
  75. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  76. if (pref_offsety > 0)
  77. {
  78. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  79. pref_offsety = pref_offsety / sizeof(FLOAT);
  80. }
      /* prologue: load the first half (4 vectors each) of the first block */
  81. x0 = LD_SP(px); px += 4;
  82. x1 = LD_SP(px); px += 4;
  83. x2 = LD_SP(px); px += 4;
  84. x3 = LD_SP(px); px += 4;
  85. y0 = LD_SP(py); py += 4;
  86. y1 = LD_SP(py); py += 4;
  87. y2 = LD_SP(py); py += 4;
  88. y3 = LD_SP(py); py += 4;
      /* Software-pipelined loop, run for all blocks but the last.  Each
         pass finishes the current block and ends by loading the first half
         of the next one; arithmetic is interleaved with loads and stores. */
  89. for (j = (n >> 4) - 1; j--;)
  90. {
  91. PREFETCH(px + pref_offsetx + 32);
  92. PREFETCH(px + pref_offsetx + 40);
  93. PREFETCH(px + pref_offsetx + 48);
  94. PREFETCH(px + pref_offsetx + 56);
  95. PREFETCH(py + pref_offsety + 32);
  96. PREFETCH(py + pref_offsety + 40);
  97. PREFETCH(py + pref_offsety + 48);
  98. PREFETCH(py + pref_offsety + 56);
  99. out0 = x0 + y0;
  100. x4 = LD_SP(px); px += 4;
  101. out1 = y0 - x0;
  102. x5 = LD_SP(px); px += 4;
  103. out2 = x1 + y1;
  104. x6 = LD_SP(px); px += 4;
  105. out3 = y1 - x1;
  106. x7 = LD_SP(px); px += 4;
  107. out4 = x2 + y2;
  108. y4 = LD_SP(py); py += 4;
  109. out5 = y2 - x2;
  110. y5 = LD_SP(py); py += 4;
  111. out6 = x3 + y3;
  112. y6 = LD_SP(py); py += 4;
  113. out7 = y3 - x3;
  114. y7 = LD_SP(py); py += 4;
  115. ST_SP(out0, x); x += 4;
  116. out8 = x4 + y4;
  117. ST_SP(out1, y); y += 4;
  118. out9 = y4 - x4;
  119. ST_SP(out2, x); x += 4;
  120. out10 = x5 + y5;
  121. ST_SP(out3, y); y += 4;
  122. out11 = y5 - x5;
  123. ST_SP(out4, x); x += 4;
  124. out12 = x6 + y6;
  125. ST_SP(out5, y); y += 4;
  126. out13 = y6 - x6;
  127. ST_SP(out6, x); x += 4;
  128. out14 = x7 + y7;
  129. ST_SP(out7, y); y += 4;
  130. out15 = y7 - x7;
  131. x0 = LD_SP(px); px += 4;
  132. ST_SP(out8, x); x += 4;
  133. x1 = LD_SP(px); px += 4;
  134. ST_SP(out10, x); x += 4;
  135. x2 = LD_SP(px); px += 4;
  136. ST_SP(out12, x); x += 4;
  137. x3 = LD_SP(px); px += 4;
  138. ST_SP(out14, x); x += 4;
  139. y0 = LD_SP(py); py += 4;
  140. ST_SP(out9, y); y += 4;
  141. y1 = LD_SP(py); py += 4;
  142. ST_SP(out11, y); y += 4;
  143. y2 = LD_SP(py); py += 4;
  144. ST_SP(out13, y); y += 4;
  145. y3 = LD_SP(py); py += 4;
  146. ST_SP(out15, y); y += 4;
  147. }
       /* epilogue: load the second half of the last block and finish it,
          so nothing beyond the last full block is loaded here */
  148. x4 = LD_SP(px); px += 4;
  149. x5 = LD_SP(px); px += 4;
  150. x6 = LD_SP(px); px += 4;
  151. x7 = LD_SP(px); px += 4;
  152. y4 = LD_SP(py); py += 4;
  153. y5 = LD_SP(py); py += 4;
  154. y6 = LD_SP(py); py += 4;
  155. y7 = LD_SP(py); py += 4;
  156. out0 = x0 + y0;
  157. out1 = y0 - x0;
  158. out2 = x1 + y1;
  159. out3 = y1 - x1;
  160. out4 = x2 + y2;
  161. out5 = y2 - x2;
  162. out6 = x3 + y3;
  163. out7 = y3 - x3;
  164. out8 = x4 + y4;
  165. out9 = y4 - x4;
  166. out10 = x5 + y5;
  167. out11 = y5 - x5;
  168. out12 = x6 + y6;
  169. out13 = y6 - x6;
  170. out14 = x7 + y7;
  171. out15 = y7 - x7;
  172. ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4);
  173. ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4);
  174. }
       /* remainder of n modulo 16, handled by the set bits of n:
          8 elements (4 vectors), 4 elements (2 vectors), 2 elements
          (1 vector), and finally 1 element with scalar code */
  175. if (n & 8)
  176. {
  177. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  178. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  179. out0 = x0 + y0;
  180. out1 = y0 - x0;
  181. out2 = x1 + y1;
  182. out3 = y1 - x1;
  183. out4 = x2 + y2;
  184. out5 = y2 - x2;
  185. out6 = x3 + y3;
  186. out7 = y3 - x3;
  187. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  188. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  189. }
  190. if (n & 4)
  191. {
  192. LD_SP2_INC(px, 4, x0, x1);
  193. LD_SP2_INC(py, 4, y0, y1);
  194. out0 = x0 + y0;
  195. out1 = y0 - x0;
  196. out2 = x1 + y1;
  197. out3 = y1 - x1;
  198. ST_SP2_INC(out0, out2, x, 4);
  199. ST_SP2_INC(out1, out3, y, 4);
  200. }
  201. if (n & 2)
  202. {
  203. x0 = LD_SP(px);
  204. y0 = LD_SP(py);
  205. px += 4;
  206. py += 4;
  207. out0 = x0 + y0;
  208. out1 = y0 - x0;
  209. ST_SP(out0, x);
  210. ST_SP(out1, y);
  211. x += 4;
  212. y += 4;
  213. }
       /* last single element: fx0/fx1 and fy0/fy1 are its two FLOATs
          (LD_GP2_INC/ST_GP2_INC are presumably scalar pair load/store
          helpers from macros_msa.h -- TODO confirm) */
  214. if (n & 1)
  215. {
  216. LD_GP2_INC(px, 1, fx0, fx1);
  217. LD_GP2_INC(py, 1, fy0, fy1);
  218. tp0 = fx0 + fy0;
  219. tp1 = fy0 - fx0;
  220. tp2 = fx1 + fy1;
  221. tp3 = fy1 - fx1;
  222. ST_GP2_INC(tp0, tp2, x, 1);
  223. ST_GP2_INC(tp1, tp3, y, 1);
  224. }
  225. }
       /* s == 0 (c not 0/1 pair handled above): pure scaling,
          x <- c * x and y <- c * y */
  226. else if (0 == s)
  227. {
  228. c0 = COPY_FLOAT_TO_VECTOR(c);
  229. if (n >> 4)
  230. {
  231. BLASLONG pref_offsetx, pref_offsety;
  232. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  233. if (pref_offsetx > 0)
  234. {
  235. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  236. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  237. }
  238. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  239. if (pref_offsety > 0)
  240. {
  241. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  242. pref_offsety = pref_offsety / sizeof(FLOAT);
  243. }
       /* prologue: all 8 x vectors of the first block; y is loaded inside
          the loop (and once more after it for the last block) */
  244. LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
  245. for (j = (n >> 4) - 1; j--;)
  246. {
  247. PREFETCH(px + pref_offsetx + 32);
  248. PREFETCH(px + pref_offsetx + 40);
  249. PREFETCH(px + pref_offsetx + 48);
  250. PREFETCH(px + pref_offsetx + 56);
  251. PREFETCH(py + pref_offsety + 32);
  252. PREFETCH(py + pref_offsety + 40);
  253. PREFETCH(py + pref_offsety + 48);
  254. PREFETCH(py + pref_offsety + 56);
  255. y0 = LD_SP(py); py += 4;
  256. x0 *= c0;
  257. y1 = LD_SP(py); py += 4;
  258. x1 *= c0;
  259. y2 = LD_SP(py); py += 4;
  260. x2 *= c0;
  261. y3 = LD_SP(py); py += 4;
  262. x3 *= c0;
  263. y4 = LD_SP(py); py += 4;
  264. x4 *= c0;
  265. y5 = LD_SP(py); py += 4;
  266. x5 *= c0;
  267. y6 = LD_SP(py); py += 4;
  268. x6 *= c0;
  269. y7 = LD_SP(py); py += 4;
  270. x7 *= c0;
  271. ST_SP(x0, x); x += 4;
  272. y0 *= c0;
  273. ST_SP(x1, x); x += 4;
  274. y1 *= c0;
  275. ST_SP(x2, x); x += 4;
  276. y2 *= c0;
  277. ST_SP(x3, x); x += 4;
  278. y3 *= c0;
  279. ST_SP(x4, x); x += 4;
  280. y4 *= c0;
  281. ST_SP(x5, x); x += 4;
  282. y5 *= c0;
  283. ST_SP(x6, x); x += 4;
  284. y6 *= c0;
  285. ST_SP(x7, x); x += 4;
  286. y7 *= c0;
  287. x0 = LD_SP(px); px += 4;
  288. ST_SP(y0, y); y += 4;
  289. x1 = LD_SP(px); px += 4;
  290. ST_SP(y1, y); y += 4;
  291. x2 = LD_SP(px); px += 4;
  292. ST_SP(y2, y); y += 4;
  293. x3 = LD_SP(px); px += 4;
  294. ST_SP(y3, y); y += 4;
  295. x4 = LD_SP(px); px += 4;
  296. ST_SP(y4, y); y += 4;
  297. x5 = LD_SP(px); px += 4;
  298. ST_SP(y5, y); y += 4;
  299. x6 = LD_SP(px); px += 4;
  300. ST_SP(y6, y); y += 4;
  301. x7 = LD_SP(px); px += 4;
  302. ST_SP(y7, y); y += 4;
  303. }
       /* epilogue: x of the last block is already in registers */
  304. LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
  305. x0 *= c0;
  306. y0 *= c0;
  307. x1 *= c0;
  308. y1 *= c0;
  309. x2 *= c0;
  310. y2 *= c0;
  311. x3 *= c0;
  312. y3 *= c0;
  313. x4 *= c0;
  314. y4 *= c0;
  315. x5 *= c0;
  316. y5 *= c0;
  317. x6 *= c0;
  318. y6 *= c0;
  319. x7 *= c0;
  320. y7 *= c0;
  321. ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
  322. ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
  323. }
       /* remainder: 8, 4, 2, then 1 element(s) */
  324. if (n & 8)
  325. {
  326. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  327. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  328. x0 *= c0;
  329. y0 *= c0;
  330. x1 *= c0;
  331. y1 *= c0;
  332. x2 *= c0;
  333. y2 *= c0;
  334. x3 *= c0;
  335. y3 *= c0;
  336. ST_SP4_INC(x0, x1, x2, x3, x, 4);
  337. ST_SP4_INC(y0, y1, y2, y3, y, 4);
  338. }
  339. if (n & 4)
  340. {
  341. LD_SP2_INC(px, 4, x0, x1);
  342. LD_SP2_INC(py, 4, y0, y1);
  343. x0 *= c0;
  344. y0 *= c0;
  345. x1 *= c0;
  346. y1 *= c0;
  347. ST_SP2_INC(x0, x1, x, 4);
  348. ST_SP2_INC(y0, y1, y, 4);
  349. }
  350. if (n & 2)
  351. {
  352. x0 = LD_SP(px);
  353. y0 = LD_SP(py);
  354. px += 4;
  355. py += 4;
  356. x0 *= c0;
  357. y0 *= c0;
  358. ST_SP(x0, x);
  359. ST_SP(y0, y);
  360. x += 4;
  361. y += 4;
  362. }
  363. if (n & 1)
  364. {
  365. LD_GP2_INC(px, 1, fx0, fx1);
  366. LD_GP2_INC(py, 1, fy0, fy1);
  367. tp0 = (c * fx0);
  368. tp1 = (c * fy0);
  369. tp2 = (c * fx1);
  370. tp3 = (c * fy1);
  371. ST_GP2_INC(tp0, tp2, x, 1);
  372. ST_GP2_INC(tp1, tp3, y, 1);
  373. }
  374. }
       /* c == 0 (s != 0 here): x <- s * y and y <- -(s * x) */
  375. else if (0 == c)
  376. {
  377. s0 = COPY_FLOAT_TO_VECTOR(s);
  378. /* process 16 elements (32 FLOATs) of each vector per block */
  379. if (n >> 4)
  380. {
  381. BLASLONG pref_offsetx, pref_offsety;
  382. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  383. if (pref_offsetx > 0)
  384. {
  385. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  386. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  387. }
  388. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  389. if (pref_offsety > 0)
  390. {
  391. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  392. pref_offsety = pref_offsety / sizeof(FLOAT);
  393. }
       /* prologue: first half of the first block */
  394. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  395. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  396. for (j = (n >> 4) - 1; j--;)
  397. {
  398. PREFETCH(px + pref_offsetx + 32);
  399. PREFETCH(px + pref_offsetx + 40);
  400. PREFETCH(px + pref_offsetx + 48);
  401. PREFETCH(px + pref_offsetx + 56);
  402. PREFETCH(py + pref_offsety + 32);
  403. PREFETCH(py + pref_offsety + 40);
  404. PREFETCH(py + pref_offsety + 48);
  405. PREFETCH(py + pref_offsety + 56);
  406. x4 = LD_SP(px); px += 4;
  407. out0 = s0 * y0;
  408. x5 = LD_SP(px); px += 4;
  409. out2 = s0 * y1;
  410. x6 = LD_SP(px); px += 4;
  411. out4 = s0 * y2;
  412. x7 = LD_SP(px); px += 4;
  413. out6 = s0 * y3;
  414. y4 = LD_SP(py); py += 4;
  415. out1 = -(s0 * x0);
  416. y5 = LD_SP(py); py += 4;
  417. out3 = -(s0 * x1);
  418. y6 = LD_SP(py); py += 4;
  419. out5 = -(s0 * x2);
  420. y7 = LD_SP(py); py += 4;
  421. out7 = -(s0 * x3);
       /* out0..out7 are reused for the second half once each has been
          stored: even-numbered outs go to x, odd-numbered outs go to y */
  422. ST_SP(out0, x); x += 4;
  423. out0 = s0 * y4;
  424. ST_SP(out2, x); x += 4;
  425. out2 = s0 * y5;
  426. ST_SP(out4, x); x += 4;
  427. out4 = s0 * y6;
  428. ST_SP(out6, x); x += 4;
  429. out6 = s0 * y7;
  430. ST_SP(out1, y); y += 4;
  431. out1 = -(s0 * x4);
  432. ST_SP(out3, y); y += 4;
  433. out3 = -(s0 * x5);
  434. ST_SP(out5, y); y += 4;
  435. out5 = -(s0 * x6);
  436. ST_SP(out7, y); y += 4;
  437. out7 = -(s0 * x7);
  438. x0 = LD_SP(px); px += 4;
  439. ST_SP(out0, x); x += 4;
  440. x1 = LD_SP(px); px += 4;
  441. ST_SP(out2, x); x += 4;
  442. x2 = LD_SP(px); px += 4;
  443. ST_SP(out4, x); x += 4;
  444. x3 = LD_SP(px); px += 4;
  445. ST_SP(out6, x); x += 4;
  446. y0 = LD_SP(py); py += 4;
  447. ST_SP(out1, y); y += 4;
  448. y1 = LD_SP(py); py += 4;
  449. ST_SP(out3, y); y += 4;
  450. y2 = LD_SP(py); py += 4;
  451. ST_SP(out5, y); y += 4;
  452. y3 = LD_SP(py); py += 4;
  453. ST_SP(out7, y); y += 4;
  454. }
       /* epilogue: finish the first half held in registers, then load
          and process the second half of the last block */
  455. out0 = s0 * y0;
  456. out2 = s0 * y1;
  457. out4 = s0 * y2;
  458. out6 = s0 * y3;
  459. out1 = -(s0 * x0);
  460. out3 = -(s0 * x1);
  461. out5 = -(s0 * x2);
  462. out7 = -(s0 * x3);
  463. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  464. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  465. LD_SP4_INC(px, 4, x4, x5, x6, x7);
  466. LD_SP4_INC(py, 4, y4, y5, y6, y7);
  467. out0 = s0 * y4;
  468. out2 = s0 * y5;
  469. out4 = s0 * y6;
  470. out6 = s0 * y7;
  471. out1 = -(s0 * x4);
  472. out3 = -(s0 * x5);
  473. out5 = -(s0 * x6);
  474. out7 = -(s0 * x7);
  475. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  476. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  477. }
       /* remainder: 8, 4, 2, then 1 element(s) */
  478. if (n & 8)
  479. {
  480. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  481. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  482. out0 = s0 * y0;
  483. out1 = - (s0 * x0);
  484. out2 = s0 * y1;
  485. out3 = - (s0 * x1);
  486. out4 = s0 * y2;
  487. out5 = - (s0 * x2);
  488. out6 = s0 * y3;
  489. out7 = - (s0 * x3);
  490. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  491. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  492. }
  493. if (n & 4)
  494. {
  495. LD_SP2_INC(px, 4, x0, x1);
  496. LD_SP2_INC(py, 4, y0, y1);
  497. out0 = s0 * y0;
  498. out1 = - (s0 * x0);
  499. out2 = s0 * y1;
  500. out3 = - (s0 * x1);
  501. ST_SP2_INC(out0, out2, x, 4);
  502. ST_SP2_INC(out1, out3, y, 4);
  503. }
  504. if (n & 2)
  505. {
  506. x0 = LD_SP(px); px += 4;
  507. y0 = LD_SP(py); py += 4;
  508. out0 = s0 * y0;
  509. out1 = - (s0 * x0);
  510. ST_SP(out0, x); x += 4;
  511. ST_SP(out1, y); y += 4;
  512. }
  513. if (n & 1)
  514. {
  515. LD_GP2_INC(px, 1, fx0, fx1);
  516. LD_GP2_INC(py, 1, fy0, fy1);
  517. tp0 = s * fy0;
  518. tp1 = - (s * fx0);
  519. tp2 = s * fy1;
  520. tp3 = - (s * fx1);
  521. ST_GP2_INC(tp0, tp2, x, 1);
  522. ST_GP2_INC(tp1, tp3, y, 1);
  523. }
  524. }
       /* general case: x <- c * x + s * y and y <- c * y - s * x */
  525. else
  526. {
  527. c0 = COPY_FLOAT_TO_VECTOR(c);
  528. s0 = COPY_FLOAT_TO_VECTOR(s);
  529. if (n >> 4)
  530. {
  531. BLASLONG pref_offsetx, pref_offsety;
  532. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  533. if (pref_offsetx > 0)
  534. {
  535. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  536. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  537. }
  538. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  539. if (pref_offsety > 0)
  540. {
  541. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  542. pref_offsety = pref_offsety / sizeof(FLOAT);
  543. }
       /* prologue: first half of the first block */
  544. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  545. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  546. for (j = (n >> 4) - 1; j--;)
  547. {
  548. PREFETCH(px + pref_offsetx + 32);
  549. PREFETCH(px + pref_offsetx + 40);
  550. PREFETCH(px + pref_offsetx + 48);
  551. PREFETCH(px + pref_offsetx + 56);
  552. PREFETCH(py + pref_offsety + 32);
  553. PREFETCH(py + pref_offsety + 40);
  554. PREFETCH(py + pref_offsety + 48);
  555. PREFETCH(py + pref_offsety + 56);
       /* the c-products are formed while the second half is loading;
          the s-products are then added (x results) or subtracted
          (y results) */
  556. x4 = LD_SP(px); px += 4;
  557. out0 = c0 * x0;
  558. x5 = LD_SP(px); px += 4;
  559. out1 = c0 * y0;
  560. x6 = LD_SP(px); px += 4;
  561. out2 = c0 * x1;
  562. x7 = LD_SP(px); px += 4;
  563. out3 = c0 * y1;
  564. y4 = LD_SP(py); py += 4;
  565. out4 = c0 * x2;
  566. y5 = LD_SP(py); py += 4;
  567. out5 = c0 * y2;
  568. y6 = LD_SP(py); py += 4;
  569. out6 = c0 * x3;
  570. y7 = LD_SP(py); py += 4;
  571. out7 = c0 * y3;
  572. out0 += s0 * y0;
  573. out1 -= s0 * x0;
  574. out2 += s0 * y1;
  575. out3 -= s0 * x1;
  576. out4 += s0 * y2;
  577. out5 -= s0 * x2;
  578. out6 += s0 * y3;
  579. out7 -= s0 * x3;
  580. ST_SP(out0, x); x += 4;
  581. out8 = c0 * x4;
  582. ST_SP(out2, x); x += 4;
  583. out9 = c0 * y4;
  584. ST_SP(out4, x); x += 4;
  585. out10 = c0 * x5;
  586. ST_SP(out6, x); x += 4;
  587. out11 = c0 * y5;
  588. ST_SP(out1, y); y += 4;
  589. out12 = c0 * x6;
  590. ST_SP(out3, y); y += 4;
  591. out13 = c0 * y6;
  592. ST_SP(out5, y); y += 4;
  593. out14 = c0 * x7;
  594. ST_SP(out7, y); y += 4;
  595. out15 = c0 * y7;
  596. x0 = LD_SP(px); px += 4;
  597. out8 += s0 * y4;
  598. x1 = LD_SP(px); px += 4;
  599. out9 -= s0 * x4;
  600. x2 = LD_SP(px); px += 4;
  601. out10 += s0 * y5;
  602. x3 = LD_SP(px); px += 4;
  603. out11 -= s0 * x5;
  604. y0 = LD_SP(py); py += 4;
  605. out12 += s0 * y6;
  606. y1 = LD_SP(py); py += 4;
  607. out13 -= s0 * x6;
  608. y2 = LD_SP(py); py += 4;
  609. out14 += s0 * y7;
  610. y3 = LD_SP(py); py += 4;
  611. out15 -= s0 * x7;
  612. ST_SP(out8, x); x += 4;
  613. ST_SP(out10, x); x += 4;
  614. ST_SP(out12, x); x += 4;
  615. ST_SP(out14, x); x += 4;
  616. ST_SP(out9, y); y += 4;
  617. ST_SP(out11, y); y += 4;
  618. ST_SP(out13, y); y += 4;
  619. ST_SP(out15, y); y += 4;
  620. }
       /* epilogue: finish the first half held in registers, then load
          and process the second half of the last block */
  621. out0 = c0 * x0;
  622. out0 += s0 * y0;
  623. out1 = c0 * y0;
  624. out1 -= s0 * x0;
  625. out2 = c0 * x1;
  626. out2 += s0 * y1;
  627. out3 = c0 * y1;
  628. out3 -= s0 * x1;
  629. out4 = c0 * x2;
  630. out4 += s0 * y2;
  631. out5 = c0 * y2;
  632. out5 -= s0 * x2;
  633. out6 = c0 * x3;
  634. out6 += s0 * y3;
  635. out7 = c0 * y3;
  636. out7 -= s0 * x3;
  637. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  638. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  639. LD_SP4_INC(px, 4, x4, x5, x6, x7);
  640. LD_SP4_INC(py, 4, y4, y5, y6, y7);
  641. out8 = c0 * x4;
  642. out8 += s0 * y4;
  643. out9 = c0 * y4;
  644. out9 -= s0 * x4;
  645. out10 = c0 * x5;
  646. out10 += s0 * y5;
  647. out11 = c0 * y5;
  648. out11 -= s0 * x5;
  649. out12 = c0 * x6;
  650. out12 += s0 * y6;
  651. out13 = c0 * y6;
  652. out13 -= s0 * x6;
  653. out14 = c0 * x7;
  654. out14 += s0 * y7;
  655. out15 = c0 * y7;
  656. out15 -= s0 * x7;
  657. ST_SP4_INC(out8, out10, out12, out14, x, 4);
  658. ST_SP4_INC(out9, out11, out13, out15, y, 4);
  659. }
       /* remainder: 8, 4, 2, then 1 element(s) */
  660. if (n & 8)
  661. {
  662. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  663. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  664. out0 = (c0 * x0) + (s0 * y0);
  665. out1 = (c0 * y0) - (s0 * x0);
  666. out2 = (c0 * x1) + (s0 * y1);
  667. out3 = (c0 * y1) - (s0 * x1);
  668. out4 = (c0 * x2) + (s0 * y2);
  669. out5 = (c0 * y2) - (s0 * x2);
  670. out6 = (c0 * x3) + (s0 * y3);
  671. out7 = (c0 * y3) - (s0 * x3);
  672. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  673. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  674. }
  675. if (n & 4)
  676. {
  677. LD_SP2_INC(px, 4, x0, x1);
  678. LD_SP2_INC(py, 4, y0, y1);
  679. out0 = (c0 * x0) + (s0 * y0);
  680. out1 = (c0 * y0) - (s0 * x0);
  681. out2 = (c0 * x1) + (s0 * y1);
  682. out3 = (c0 * y1) - (s0 * x1);
  683. ST_SP2_INC(out0, out2, x, 4);
  684. ST_SP2_INC(out1, out3, y, 4);
  685. }
  686. if (n & 2)
  687. {
  688. x0 = LD_SP(px);
  689. y0 = LD_SP(py);
  690. px += 4;
  691. py += 4;
  692. out0 = (c0 * x0) + (s0 * y0);
  693. out1 = (c0 * y0) - (s0 * x0);
  694. ST_SP(out0, x);
  695. ST_SP(out1, y);
  696. x += 4;
  697. y += 4;
  698. }
  699. if (n & 1)
  700. {
  701. LD_GP2_INC(px, 1, fx0, fx1);
  702. LD_GP2_INC(py, 1, fy0, fy1);
  703. tp0 = (c * fx0) + (s * fy0);
  704. tp1 = (c * fy0) - (s * fx0);
  705. tp2 = (c * fx1) + (s * fy1);
  706. tp3 = (c * fy1) - (s * fx1);
  707. ST_GP2_INC(tp0, tp2, x, 1);
  708. ST_GP2_INC(tp1, tp3, y, 1);
  709. }
  710. }
  711. }
       /* ---- strided path: scalar code, two elements per loop pass ----
          There is no dedicated c == 0 branch here; that case is handled
          by the general branch at the end. */
  712. else
  713. {
       /* element strides converted to FLOAT strides (2 FLOATs per element) */
  714. inc_x2 = 2 * inc_x;
  715. inc_y2 = 2 * inc_y;
       /* c == 0 and s == 0: store zeros, one element per pass, no reads */
  716. if ((0 == c) && (0 == s))
  717. {
  718. for (i = n; i--;)
  719. {
  720. *x = 0;
  721. *(x + 1) = 0;
  722. *y = 0;
  723. *(y + 1) = 0;
  724. x += inc_x2;
  725. y += inc_y2;
  726. }
  727. }
       /* c == 1 and s == 1: x <- x + y and y <- y - x */
  728. else if ((1 == c) && (1 == s))
  729. {
  730. if (n >> 1)
  731. {
       /* prologue: read the first pair of elements from x and from y */
  732. fx0 = *px;
  733. fx1 = *(px+1); px += inc_x2;
  734. fx2 = *px;
  735. fx3 = *(px+1); px += inc_x2;
  736. fy0 = *py;
  737. fy1 = *(py+1); py += inc_y2;
  738. fy2 = *py;
  739. fy3 = *(py+1); py += inc_y2;
       /* each pass computes the current pair, then interleaves reading
          the next pair (via px/py) with writing the results (via x/y) */
  740. for (i = (n >> 1) - 1; i--;)
  741. {
  742. tp0 = fx0 + fy0;
  743. tp1 = fx1 + fy1;
  744. tp2 = fy0 - fx0;
  745. tp3 = fy1 - fx1;
  746. tp4 = fx2 + fy2;
  747. tp5 = fx3 + fy3;
  748. tp6 = fy2 - fx2;
  749. tp7 = fy3 - fx3;
  750. fx0 = *px;
  751. *x = tp0;
  752. fx1 = *(px+1); px += inc_x2;
  753. *(x+1) = tp1; x += inc_x2;
  754. fx2 = *px;
  755. *x = tp4;
  756. fx3 = *(px+1); px += inc_x2;
  757. *(x+1) = tp5; x += inc_x2;
  758. fy0 = *py;
  759. *y = tp2;
  760. fy1 = *(py+1); py += inc_y2;
  761. *(y+1) = tp3; y += inc_y2;
  762. fy2 = *py;
  763. *y = tp6;
  764. fy3 = *(py+1); py += inc_y2;
  765. *(y+1) = tp7; y += inc_y2;
  766. }
       /* epilogue: the last pair is already in fx0..fx3 / fy0..fy3 */
  767. tp0 = fx0 + fy0;
  768. tp1 = fx1 + fy1;
  769. tp2 = fy0 - fx0;
  770. tp3 = fy1 - fx1;
  771. tp4 = fx2 + fy2;
  772. tp5 = fx3 + fy3;
  773. tp6 = fy2 - fx2;
  774. tp7 = fy3 - fx3;
  775. *x = tp0;
  776. *(x+1) = tp1; x += inc_x2;
  777. *x = tp4;
  778. *(x+1) = tp5; x += inc_x2;
  779. *y = tp2;
  780. *(y+1) = tp3; y += inc_y2;
  781. *y = tp6;
  782. *(y+1) = tp7; y += inc_y2;
  783. }
       /* odd n: one element left */
  784. if (n & 1)
  785. {
  786. fx0 = *px;
  787. fx1 = *(px+1);
  788. fy0 = *py;
  789. fy1 = *(py+1);
  790. tp0 = fx0 + fy0;
  791. tp1 = fx1 + fy1;
  792. tp2 = fy0 - fx0;
  793. tp3 = fy1 - fx1;
  794. *x = tp0;
  795. *(x+1) = tp1;
  796. *y = tp2;
  797. *(y+1) = tp3;
  798. }
  799. }
       /* s == 0: x <- c * x and y <- c * y */
  800. else if (0 == s)
  801. {
  802. if (n >> 1)
  803. {
  804. fx0 = *px;
  805. fx1 = *(px+1); px += inc_x2;
  806. fx2 = *px;
  807. fx3 = *(px+1); px += inc_x2;
  808. fy0 = *py;
  809. fy1 = *(py+1); py += inc_y2;
  810. fy2 = *py;
  811. fy3 = *(py+1); py += inc_y2;
  812. for (i = (n >> 1) - 1; i--;)
  813. {
  814. tp0 = c * fx0;
  815. tp1 = c * fx1;
  816. tp2 = c * fx2;
  817. tp3 = c * fx3;
  818. tp4 = c * fy0;
  819. tp5 = c * fy1;
  820. tp6 = c * fy2;
  821. tp7 = c * fy3;
  822. fx0 = *px;
  823. *x = tp0;
  824. fx1 = *(px+1); px += inc_x2;
  825. *(x+1) = tp1; x += inc_x2;
  826. fx2 = *px;
  827. *x = tp2;
  828. fx3 = *(px+1); px += inc_x2;
  829. *(x+1) = tp3; x += inc_x2;
  830. fy0 = *py;
  831. *y = tp4;
  832. fy1 = *(py+1); py += inc_y2;
  833. *(y+1) = tp5; y += inc_y2;
  834. fy2 = *py;
  835. *y = tp6;
  836. fy3 = *(py+1); py += inc_y2;
  837. *(y+1) = tp7; y += inc_y2;
  838. }
  839. tp0 = c * fx0;
  840. tp1 = c * fx1;
  841. tp2 = c * fx2;
  842. tp3 = c * fx3;
  843. tp4 = c * fy0;
  844. tp5 = c * fy1;
  845. tp6 = c * fy2;
  846. tp7 = c * fy3;
  847. *x = tp0;
  848. *(x+1) = tp1; x += inc_x2;
  849. *x = tp2;
  850. *(x+1) = tp3; x += inc_x2;
  851. *y = tp4;
  852. *(y+1) = tp5; y += inc_y2;
  853. *y = tp6;
  854. *(y+1) = tp7; y += inc_y2;
  855. }
  856. if (n & 1)
  857. {
  858. fx0 = *px;
  859. fx1 = *(px+1);
  860. fy0 = *py;
  861. fy1 = *(py+1);
  862. tp0 = c * fx0;
  863. tp1 = c * fx1;
  864. tp2 = c * fy0;
  865. tp3 = c * fy1;
  866. *x = tp0;
  867. *(x+1) = tp1;
  868. *y = tp2;
  869. *(y+1) = tp3;
  870. }
  871. }
       /* general case (also reached when c == 0 and s != 0):
          x <- c * x + s * y and y <- c * y - s * x */
  872. else
  873. {
  874. if (n >> 1)
  875. {
  876. fx0 = *px;
  877. fx1 = *(px+1); px += inc_x2;
  878. fx2 = *px;
  879. fx3 = *(px+1); px += inc_x2;
  880. fy0 = *py;
  881. fy1 = *(py+1); py += inc_y2;
  882. fy2 = *py;
  883. fy3 = *(py+1); py += inc_y2;
  884. for (i = (n >> 1) - 1; i--;)
  885. {
  886. tp0 = c * fx0 + s * fy0;
  887. tp1 = c * fx1 + s * fy1;
  888. tp2 = c * fy0 - s * fx0;
  889. tp3 = c * fy1 - s * fx1;
  890. tp4 = c * fx2 + s * fy2;
  891. tp5 = c * fx3 + s * fy3;
  892. tp6 = c * fy2 - s * fx2;
  893. tp7 = c * fy3 - s * fx3;
  894. fx0 = *px;
  895. *x = tp0;
  896. fx1 = *(px+1); px += inc_x2;
  897. *(x+1) = tp1; x += inc_x2;
  898. fx2 = *px;
  899. *x = tp4;
  900. fx3 = *(px+1); px += inc_x2;
  901. *(x+1) = tp5; x += inc_x2;
  902. fy0 = *py;
  903. *y = tp2;
  904. fy1 = *(py+1); py += inc_y2;
  905. *(y+1) = tp3; y += inc_y2;
  906. fy2 = *py;
  907. *y = tp6;
  908. fy3 = *(py+1); py += inc_y2;
  909. *(y+1) = tp7; y += inc_y2;
  910. }
  911. tp0 = c * fx0 + s * fy0;
  912. tp1 = c * fx1 + s * fy1;
  913. tp2 = c * fy0 - s * fx0;
  914. tp3 = c * fy1 - s * fx1;
  915. tp4 = c * fx2 + s * fy2;
  916. tp5 = c * fx3 + s * fy3;
  917. tp6 = c * fy2 - s * fx2;
  918. tp7 = c * fy3 - s * fx3;
  919. *x = tp0;
  920. *(x+1) = tp1; x += inc_x2;
  921. *x = tp4;
  922. *(x+1) = tp5; x += inc_x2;
  923. *y = tp2;
  924. *(y+1) = tp3; y += inc_y2;
  925. *y = tp6;
  926. *(y+1) = tp7; y += inc_y2;
  927. }
  928. if (n & 1)
  929. {
  930. fx0 = *px;
  931. fx1 = *(px+1);
  932. fy0 = *py;
  933. fy1 = *(py+1);
  934. tp0 = c * fx0 + s * fy0;
  935. tp1 = c * fx1 + s * fy1;
  936. tp2 = c * fy0 - s * fx0;
  937. tp3 = c * fy1 - s * fx1;
  938. *x = tp0;
  939. *(x+1) = tp1;
  940. *y = tp2;
  941. *(y+1) = tp3;
  942. }
  943. }
  944. }
  945. return 0;
  946. }