You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

srot_msa.c 37 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  /*
   * Plane rotation over n element pairs, written with MIPS MSA v4f32 vectors.
   * As the general branches below show, every pair is updated as
   *     x[i] = c * x[i] + s * y[i]
   *     y[i] = c * y[i] - s * x[i]
   * with both results computed from the values read before either store.
   *
   *   n          number of element pairs; n <= 0 returns immediately.
   *   x, inc_x   first vector and its stride in elements; overwritten in place.
   *   y, inc_y   second vector and its stride in elements; overwritten in place.
   *   c, s       rotation coefficients.
   *
   * Always returns 0.
   *
   * Layout: a unit-stride path (inc_x == 1 && inc_y == 1) that moves four
   * FLOATs per vector, and a strided scalar path. Each path first dispatches
   * on special values of c and s so that multiplications can be skipped.
   *
   * NOTE(review): FLOAT is presumably float here, since the vector code moves
   * four FLOATs per v4f32 -- confirm against the build configuration.
   * NOTE(review): the c/s shortcut branches skip the multiplications they
   * replace, so non-finite inputs (NaN/Inf) are not propagated the way the
   * general formula would propagate them; e.g. c == 0 && s == 0 stores 0
   * unconditionally. Confirm this is acceptable to callers.
   */
  29. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  30. FLOAT c, FLOAT s)
  31. {
  32. BLASLONG i, j;
  33. FLOAT *px, *py;
  34. FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
  35. FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
  36. v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
  37. v4f32 out0, out1, out2, out3, out4, out5, out6, out7;
  38. v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
  39. if (n <= 0) return (0);
  /* px/py are the read cursors; x/y themselves are advanced as the write
     cursors. In the pipelined loops the reads run ahead of the writes. */
  40. px = x;
  41. py = y;
  /* ---- Unit-stride path: vectorised, four FLOATs per load/store. ---- */
  42. if ((1 == inc_x) && (1 == inc_y))
  43. {
  /* c == 0 and s == 0: both outputs are stored as zero without reading. */
  44. if ((0 == c) && (0 == s))
  45. {
  /* Build an all-zero vector by writing 0 into each of the four lanes. */
  46. v4f32 zero = __msa_cast_to_vector_float(0);
  47. zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0);
  48. zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
  49. zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
  50. zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
  51. /* process 4 floats */
  52. for (j = (n >> 2); j--;)
  53. {
  54. ST_SP(zero, px);
  55. ST_SP(zero, py);
  56. px += 4;
  57. py += 4;
  58. }
  /* Remaining 0..3 elements are zeroed with scalar stores. */
  59. if (n & 2)
  60. {
  61. px[0] = 0;
  62. py[0] = 0;
  63. px[1] = 0;
  64. py[1] = 0;
  65. px += 2;
  66. py += 2;
  67. }
  68. if (n & 1)
  69. {
  70. px[0] = 0;
  71. py[0] = 0;
  72. }
  73. }
  /* c == 1 and s == 1: the rotation reduces to x' = x + y, y' = y - x. */
  74. else if ((1 == c) && (1 == s))
  75. {
  /* Blocks of 32 floats, software pipelined across loop iterations. */
  76. if (n >> 5)
  77. {
  /* pref_offsetx/pref_offsety: number of FLOAT elements from the current
     pointer up to the next L1_DATA_LINESIZE boundary (0 when already on one),
     used to bias the prefetch addresses. The mask assumes L1_DATA_LINESIZE
     is a power of two -- TODO confirm. */
  78. BLASLONG pref_offsetx, pref_offsety;
  79. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  80. if (pref_offsetx > 0)
  81. {
  82. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  83. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  84. }
  85. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  86. if (pref_offsety > 0)
  87. {
  88. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  89. pref_offsety = pref_offsety / sizeof(FLOAT);
  90. }
  /* Prologue: load the first 16 floats of the first block. */
  91. x0 = LD_SP(px); px += 4;
  92. x1 = LD_SP(px); px += 4;
  93. x2 = LD_SP(px); px += 4;
  94. x3 = LD_SP(px); px += 4;
  95. y0 = LD_SP(py); py += 4;
  96. y1 = LD_SP(py); py += 4;
  97. y2 = LD_SP(py); py += 4;
  98. y3 = LD_SP(py); py += 4;
  /* Steady state: runs (n >> 5) - 1 times. Each pass loads the second half
     of the current block, stores all 32 results, and loads the first half of
     the next block; the last block is finished after the loop. Arithmetic is
     interleaved with the loads and stores on purpose. */
  99. for (j = (n >> 5) - 1; j--;)
  100. {
  101. PREFETCH(px + pref_offsetx + 32);
  102. PREFETCH(px + pref_offsetx + 40);
  103. PREFETCH(px + pref_offsetx + 48);
  104. PREFETCH(px + pref_offsetx + 56);
  105. PREFETCH(py + pref_offsety + 32);
  106. PREFETCH(py + pref_offsety + 40);
  107. PREFETCH(py + pref_offsety + 48);
  108. PREFETCH(py + pref_offsety + 56);
  109. out0 = x0 + y0;
  110. x4 = LD_SP(px); px += 4;
  111. out1 = y0 - x0;
  112. x5 = LD_SP(px); px += 4;
  113. out2 = x1 + y1;
  114. x6 = LD_SP(px); px += 4;
  115. out3 = y1 - x1;
  116. x7 = LD_SP(px); px += 4;
  117. out4 = x2 + y2;
  118. y4 = LD_SP(py); py += 4;
  119. out5 = y2 - x2;
  120. y5 = LD_SP(py); py += 4;
  121. out6 = x3 + y3;
  122. y6 = LD_SP(py); py += 4;
  123. out7 = y3 - x3;
  124. y7 = LD_SP(py); py += 4;
  125. ST_SP(out0, x); x += 4;
  126. out8 = x4 + y4;
  127. ST_SP(out1, y); y += 4;
  128. out9 = y4 - x4;
  129. ST_SP(out2, x); x += 4;
  130. out10 = x5 + y5;
  131. ST_SP(out3, y); y += 4;
  132. out11 = y5 - x5;
  133. ST_SP(out4, x); x += 4;
  134. out12 = x6 + y6;
  135. ST_SP(out5, y); y += 4;
  136. out13 = y6 - x6;
  137. ST_SP(out6, x); x += 4;
  138. out14 = x7 + y7;
  139. ST_SP(out7, y); y += 4;
  140. out15 = y7 - x7;
  141. x0 = LD_SP(px); px += 4;
  142. ST_SP(out8, x); x += 4;
  143. x1 = LD_SP(px); px += 4;
  144. ST_SP(out10, x); x += 4;
  145. x2 = LD_SP(px); px += 4;
  146. ST_SP(out12, x); x += 4;
  147. x3 = LD_SP(px); px += 4;
  148. ST_SP(out14, x); x += 4;
  149. y0 = LD_SP(py); py += 4;
  150. ST_SP(out9, y); y += 4;
  151. y1 = LD_SP(py); py += 4;
  152. ST_SP(out11, y); y += 4;
  153. y2 = LD_SP(py); py += 4;
  154. ST_SP(out13, y); y += 4;
  155. y3 = LD_SP(py); py += 4;
  156. ST_SP(out15, y); y += 4;
  157. }
  /* Epilogue: load the second half of the last block and store all 32
     results. */
  158. x4 = LD_SP(px); px += 4;
  159. x5 = LD_SP(px); px += 4;
  160. x6 = LD_SP(px); px += 4;
  161. x7 = LD_SP(px); px += 4;
  162. y4 = LD_SP(py); py += 4;
  163. y5 = LD_SP(py); py += 4;
  164. y6 = LD_SP(py); py += 4;
  165. y7 = LD_SP(py); py += 4;
  166. out0 = x0 + y0;
  167. out1 = y0 - x0;
  168. out2 = x1 + y1;
  169. out3 = y1 - x1;
  170. out4 = x2 + y2;
  171. out5 = y2 - x2;
  172. out6 = x3 + y3;
  173. out7 = y3 - x3;
  174. out8 = x4 + y4;
  175. out9 = y4 - x4;
  176. out10 = x5 + y5;
  177. out11 = y5 - x5;
  178. out12 = x6 + y6;
  179. out13 = y6 - x6;
  180. out14 = x7 + y7;
  181. out15 = y7 - x7;
  182. ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4);
  183. ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4);
  184. }
  /* Tail: the low five bits of n select vector groups of 16, 8 and 4 floats,
     then scalar groups of 2 and 1. */
  185. if (n & 16)
  186. {
  187. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  188. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  189. out0 = x0 + y0;
  190. out1 = y0 - x0;
  191. out2 = x1 + y1;
  192. out3 = y1 - x1;
  193. out4 = x2 + y2;
  194. out5 = y2 - x2;
  195. out6 = x3 + y3;
  196. out7 = y3 - x3;
  197. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  198. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  199. }
  200. if (n & 8)
  201. {
  202. LD_SP2_INC(px, 4, x0, x1);
  203. LD_SP2_INC(py, 4, y0, y1);
  204. out0 = x0 + y0;
  205. out1 = y0 - x0;
  206. out2 = x1 + y1;
  207. out3 = y1 - x1;
  208. ST_SP2_INC(out0, out2, x, 4);
  209. ST_SP2_INC(out1, out3, y, 4);
  210. }
  211. if (n & 4)
  212. {
  213. x0 = LD_SP(px);
  214. y0 = LD_SP(py);
  215. px += 4;
  216. py += 4;
  217. out0 = x0 + y0;
  218. out1 = y0 - x0;
  219. ST_SP(out0, x);
  220. ST_SP(out1, y);
  221. x += 4;
  222. y += 4;
  223. }
  224. if (n & 2)
  225. {
  226. LD_GP2_INC(px, 1, fx0, fx1);
  227. LD_GP2_INC(py, 1, fy0, fy1);
  228. tp0 = fx0 + fy0;
  229. tp1 = fy0 - fx0;
  230. tp2 = fx1 + fy1;
  231. tp3 = fy1 - fx1;
  232. ST_GP2_INC(tp0, tp2, x, 1);
  233. ST_GP2_INC(tp1, tp3, y, 1);
  234. }
  235. if (n & 1)
  236. {
  237. fx0 = *px;
  238. fy0 = *py;
  239. tp0 = fx0 + fy0;
  240. tp1 = fy0 - fx0;
  241. *x = tp0;
  242. *y = tp1;
  243. }
  244. }
  /* s == 0 (c not handled above): both vectors are simply scaled by c. */
  245. else if (0 == s)
  246. {
  247. c0 = COPY_FLOAT_TO_VECTOR(c);
  248. if (n >> 5)
  249. {
  /* Prefetch bias in FLOAT elements, computed as in the branch above. */
  250. BLASLONG pref_offsetx, pref_offsety;
  251. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  252. if (pref_offsetx > 0)
  253. {
  254. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  255. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  256. }
  257. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  258. if (pref_offsety > 0)
  259. {
  260. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  261. pref_offsety = pref_offsety / sizeof(FLOAT);
  262. }
  /* Prologue: all 32 x values of the first block are loaded up front; the
     y values of each block are loaded inside the loop / epilogue. */
  263. LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
  264. for (j = (n >> 5) - 1; j--;)
  265. {
  266. PREFETCH(px + pref_offsetx + 32);
  267. PREFETCH(px + pref_offsetx + 40);
  268. PREFETCH(px + pref_offsetx + 48);
  269. PREFETCH(px + pref_offsetx + 56);
  270. PREFETCH(py + pref_offsety + 32);
  271. PREFETCH(py + pref_offsety + 40);
  272. PREFETCH(py + pref_offsety + 48);
  273. PREFETCH(py + pref_offsety + 56);
  274. y0 = LD_SP(py); py += 4;
  275. x0 *= c0;
  276. y1 = LD_SP(py); py += 4;
  277. x1 *= c0;
  278. y2 = LD_SP(py); py += 4;
  279. x2 *= c0;
  280. y3 = LD_SP(py); py += 4;
  281. x3 *= c0;
  282. y4 = LD_SP(py); py += 4;
  283. x4 *= c0;
  284. y5 = LD_SP(py); py += 4;
  285. x5 *= c0;
  286. y6 = LD_SP(py); py += 4;
  287. x6 *= c0;
  288. y7 = LD_SP(py); py += 4;
  289. x7 *= c0;
  290. ST_SP(x0, x); x += 4;
  291. y0 *= c0;
  292. ST_SP(x1, x); x += 4;
  293. y1 *= c0;
  294. ST_SP(x2, x); x += 4;
  295. y2 *= c0;
  296. ST_SP(x3, x); x += 4;
  297. y3 *= c0;
  298. ST_SP(x4, x); x += 4;
  299. y4 *= c0;
  300. ST_SP(x5, x); x += 4;
  301. y5 *= c0;
  302. ST_SP(x6, x); x += 4;
  303. y6 *= c0;
  304. ST_SP(x7, x); x += 4;
  305. y7 *= c0;
  306. x0 = LD_SP(px); px += 4;
  307. ST_SP(y0, y); y += 4;
  308. x1 = LD_SP(px); px += 4;
  309. ST_SP(y1, y); y += 4;
  310. x2 = LD_SP(px); px += 4;
  311. ST_SP(y2, y); y += 4;
  312. x3 = LD_SP(px); px += 4;
  313. ST_SP(y3, y); y += 4;
  314. x4 = LD_SP(px); px += 4;
  315. ST_SP(y4, y); y += 4;
  316. x5 = LD_SP(px); px += 4;
  317. ST_SP(y5, y); y += 4;
  318. x6 = LD_SP(px); px += 4;
  319. ST_SP(y6, y); y += 4;
  320. x7 = LD_SP(px); px += 4;
  321. ST_SP(y7, y); y += 4;
  322. }
  /* Epilogue: x of the last block is already in registers; load its y,
     scale both and store. */
  323. LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
  324. x0 *= c0;
  325. y0 *= c0;
  326. x1 *= c0;
  327. y1 *= c0;
  328. x2 *= c0;
  329. y2 *= c0;
  330. x3 *= c0;
  331. y3 *= c0;
  332. x4 *= c0;
  333. y4 *= c0;
  334. x5 *= c0;
  335. y5 *= c0;
  336. x6 *= c0;
  337. y6 *= c0;
  338. x7 *= c0;
  339. y7 *= c0;
  340. ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
  341. ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
  342. }
  /* Tail groups of 16, 8 and 4 floats (vector), then 2 and 1 (scalar). */
  343. if (n & 16)
  344. {
  345. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  346. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  347. x0 *= c0;
  348. y0 *= c0;
  349. x1 *= c0;
  350. y1 *= c0;
  351. x2 *= c0;
  352. y2 *= c0;
  353. x3 *= c0;
  354. y3 *= c0;
  355. ST_SP4_INC(x0, x1, x2, x3, x, 4);
  356. ST_SP4_INC(y0, y1, y2, y3, y, 4);
  357. }
  358. if (n & 8)
  359. {
  360. LD_SP2_INC(px, 4, x0, x1);
  361. LD_SP2_INC(py, 4, y0, y1);
  362. x0 *= c0;
  363. y0 *= c0;
  364. x1 *= c0;
  365. y1 *= c0;
  366. ST_SP2_INC(x0, x1, x, 4);
  367. ST_SP2_INC(y0, y1, y, 4);
  368. }
  369. if (n & 4)
  370. {
  371. x0 = LD_SP(px);
  372. y0 = LD_SP(py);
  373. px += 4;
  374. py += 4;
  375. x0 *= c0;
  376. y0 *= c0;
  377. ST_SP(x0, x);
  378. ST_SP(y0, y);
  379. x += 4;
  380. y += 4;
  381. }
  382. if (n & 2)
  383. {
  384. LD_GP2_INC(px, 1, fx0, fx1);
  385. LD_GP2_INC(py, 1, fy0, fy1);
  386. tp0 = (c * fx0);
  387. tp1 = (c * fy0);
  388. tp2 = (c * fx1);
  389. tp3 = (c * fy1);
  390. ST_GP2_INC(tp0, tp2, x, 1);
  391. ST_GP2_INC(tp1, tp3, y, 1);
  392. }
  393. if (n & 1)
  394. {
  395. fx0 = *px;
  396. fy0 = *py;
  397. tp0 = (c * fx0);
  398. tp1 = (c * fy0);
  399. *x = tp0;
  400. *y = tp1;
  401. }
  402. }
  /* c == 0 (s non-zero): x' = s * y and y' = -(s * x). */
  403. else if (0 == c)
  404. {
  405. s0 = COPY_FLOAT_TO_VECTOR(s);
  406. /* process 32 floats per block (16 loaded ahead, 16 loaded inside the loop) */
  407. if (n >> 5)
  408. {
  /* Prefetch bias in FLOAT elements, computed as in the branches above. */
  409. BLASLONG pref_offsetx, pref_offsety;
  410. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  411. if (pref_offsetx > 0)
  412. {
  413. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  414. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  415. }
  416. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  417. if (pref_offsety > 0)
  418. {
  419. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  420. pref_offsety = pref_offsety / sizeof(FLOAT);
  421. }
  /* Prologue: first 16 floats of x and y for the first block. */
  422. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  423. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  424. for (j = (n >> 5) - 1; j--;)
  425. {
  426. PREFETCH(px + pref_offsetx + 32);
  427. PREFETCH(px + pref_offsetx + 40);
  428. PREFETCH(px + pref_offsetx + 48);
  429. PREFETCH(px + pref_offsetx + 56);
  430. PREFETCH(py + pref_offsety + 32);
  431. PREFETCH(py + pref_offsety + 40);
  432. PREFETCH(py + pref_offsety + 48);
  433. PREFETCH(py + pref_offsety + 56);
  434. x4 = LD_SP(px); px += 4;
  435. out0 = s0 * y0;
  436. x5 = LD_SP(px); px += 4;
  437. out2 = s0 * y1;
  438. x6 = LD_SP(px); px += 4;
  439. out4 = s0 * y2;
  440. x7 = LD_SP(px); px += 4;
  441. out6 = s0 * y3;
  442. y4 = LD_SP(py); py += 4;
  443. out1 = -(s0 * x0);
  444. y5 = LD_SP(py); py += 4;
  445. out3 = -(s0 * x1);
  446. y6 = LD_SP(py); py += 4;
  447. out5 = -(s0 * x2);
  448. y7 = LD_SP(py); py += 4;
  449. out7 = -(s0 * x3);
  /* out0..out7 are reused for the second half once the first half is stored. */
  450. ST_SP(out0, x); x += 4;
  451. out0 = s0 * y4;
  452. ST_SP(out2, x); x += 4;
  453. out2 = s0 * y5;
  454. ST_SP(out4, x); x += 4;
  455. out4 = s0 * y6;
  456. ST_SP(out6, x); x += 4;
  457. out6 = s0 * y7;
  458. ST_SP(out1, y); y += 4;
  459. out1 = -(s0 * x4);
  460. ST_SP(out3, y); y += 4;
  461. out3 = -(s0 * x5);
  462. ST_SP(out5, y); y += 4;
  463. out5 = -(s0 * x6);
  464. ST_SP(out7, y); y += 4;
  465. out7 = -(s0 * x7);
  466. x0 = LD_SP(px); px += 4;
  467. ST_SP(out0, x); x += 4;
  468. x1 = LD_SP(px); px += 4;
  469. ST_SP(out2, x); x += 4;
  470. x2 = LD_SP(px); px += 4;
  471. ST_SP(out4, x); x += 4;
  472. x3 = LD_SP(px); px += 4;
  473. ST_SP(out6, x); x += 4;
  474. y0 = LD_SP(py); py += 4;
  475. ST_SP(out1, y); y += 4;
  476. y1 = LD_SP(py); py += 4;
  477. ST_SP(out3, y); y += 4;
  478. y2 = LD_SP(py); py += 4;
  479. ST_SP(out5, y); y += 4;
  480. y3 = LD_SP(py); py += 4;
  481. ST_SP(out7, y); y += 4;
  482. }
  /* Epilogue: finish the last block as two groups of 16 floats. */
  483. out0 = s0 * y0;
  484. out2 = s0 * y1;
  485. out4 = s0 * y2;
  486. out6 = s0 * y3;
  487. out1 = -(s0 * x0);
  488. out3 = -(s0 * x1);
  489. out5 = -(s0 * x2);
  490. out7 = -(s0 * x3);
  491. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  492. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  493. LD_SP4_INC(px, 4, x4, x5, x6, x7);
  494. LD_SP4_INC(py, 4, y4, y5, y6, y7);
  495. out0 = s0 * y4;
  496. out2 = s0 * y5;
  497. out4 = s0 * y6;
  498. out6 = s0 * y7;
  499. out1 = -(s0 * x4);
  500. out3 = -(s0 * x5);
  501. out5 = -(s0 * x6);
  502. out7 = -(s0 * x7);
  503. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  504. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  505. }
  /* Tail groups of 16, 8 and 4 floats (vector), then 2 and 1 (scalar). */
  506. if (n & 16)
  507. {
  508. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  509. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  510. out0 = s0 * y0;
  511. out1 = - (s0 * x0);
  512. out2 = s0 * y1;
  513. out3 = - (s0 * x1);
  514. out4 = s0 * y2;
  515. out5 = - (s0 * x2);
  516. out6 = s0 * y3;
  517. out7 = - (s0 * x3);
  518. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  519. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  520. }
  521. if (n & 8)
  522. {
  523. LD_SP2_INC(px, 4, x0, x1);
  524. LD_SP2_INC(py, 4, y0, y1);
  525. out0 = s0 * y0;
  526. out1 = - (s0 * x0);
  527. out2 = s0 * y1;
  528. out3 = - (s0 * x1);
  529. ST_SP2_INC(out0, out2, x, 4);
  530. ST_SP2_INC(out1, out3, y, 4);
  531. }
  532. if (n & 4)
  533. {
  534. x0 = LD_SP(px); px += 4;
  535. y0 = LD_SP(py); py += 4;
  536. out0 = s0 * y0;
  537. out1 = - (s0 * x0);
  538. ST_SP(out0, x); x += 4;
  539. ST_SP(out1, y); y += 4;
  540. }
  541. if (n & 2)
  542. {
  543. LD_GP2_INC(px, 1, fx0, fx1);
  544. LD_GP2_INC(py, 1, fy0, fy1);
  545. tp0 = s * fy0;
  546. tp1 = - (s * fx0);
  547. tp2 = s * fy1;
  548. tp3 = - (s * fx1);
  549. ST_GP2_INC(tp0, tp2, x, 1);
  550. ST_GP2_INC(tp1, tp3, y, 1);
  551. }
  552. if (n & 1)
  553. {
  554. fx0 = *px;
  555. fy0 = *py;
  556. tp0 = s * fy0;
  557. tp1 = - (s * fx0);
  558. *x = tp0;
  559. *y = tp1;
  560. }
  561. }
  /* General case: x' = c * x + s * y, y' = c * y - s * x. */
  562. else
  563. {
  564. c0 = COPY_FLOAT_TO_VECTOR(c);
  565. s0 = COPY_FLOAT_TO_VECTOR(s);
  566. /* process 32 floats per block (16 loaded ahead, 16 loaded inside the loop) */
  567. if (n >> 5)
  568. {
  /* Prefetch bias in FLOAT elements, computed as in the branches above. */
  569. BLASLONG pref_offsetx, pref_offsety;
  570. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  571. if (pref_offsetx > 0)
  572. {
  573. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  574. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  575. }
  576. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  577. if (pref_offsety > 0)
  578. {
  579. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  580. pref_offsety = pref_offsety / sizeof(FLOAT);
  581. }
  /* Prologue: first 16 floats of x and y for the first block. */
  582. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  583. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  584. for (j = (n >> 5) - 1; j--;)
  585. {
  586. PREFETCH(px + pref_offsetx + 32);
  587. PREFETCH(px + pref_offsetx + 40);
  588. PREFETCH(px + pref_offsetx + 48);
  589. PREFETCH(px + pref_offsetx + 56);
  590. PREFETCH(py + pref_offsety + 32);
  591. PREFETCH(py + pref_offsety + 40);
  592. PREFETCH(py + pref_offsety + 48);
  593. PREFETCH(py + pref_offsety + 56);
  /* Each result is built in two steps: the c term first, then the s term
     is added (x outputs) or subtracted (y outputs). */
  594. x4 = LD_SP(px); px += 4;
  595. out0 = c0 * x0;
  596. x5 = LD_SP(px); px += 4;
  597. out2 = c0 * x1;
  598. x6 = LD_SP(px); px += 4;
  599. out4 = c0 * x2;
  600. x7 = LD_SP(px); px += 4;
  601. out6 = c0 * x3;
  602. y4 = LD_SP(py); py += 4;
  603. out1 = c0 * y0;
  604. y5 = LD_SP(py); py += 4;
  605. out3 = c0 * y1;
  606. y6 = LD_SP(py); py += 4;
  607. out5 = c0 * y2;
  608. y7 = LD_SP(py); py += 4;
  609. out7 = c0 * y3;
  610. out0 += s0 * y0;
  611. out2 += s0 * y1;
  612. out4 += s0 * y2;
  613. out6 += s0 * y3;
  614. out1 -= s0 * x0;
  615. out3 -= s0 * x1;
  616. out5 -= s0 * x2;
  617. out7 -= s0 * x3;
  618. ST_SP(out0, x); x += 4;
  619. out0 = c0 * x4;
  620. ST_SP(out2, x); x += 4;
  621. out2 = c0 * x5;
  622. ST_SP(out4, x); x += 4;
  623. out4 = c0 * x6;
  624. ST_SP(out6, x); x += 4;
  625. out6 = c0 * x7;
  626. ST_SP(out1, y); y += 4;
  627. out1 = c0 * y4;
  628. ST_SP(out3, y); y += 4;
  629. out3 = c0 * y5;
  630. ST_SP(out5, y); y += 4;
  631. out5 = c0 * y6;
  632. ST_SP(out7, y); y += 4;
  633. out7 = c0 * y7;
  634. x0 = LD_SP(px); px += 4;
  635. out0 += s0 * y4;
  636. x1 = LD_SP(px); px += 4;
  637. out2 += s0 * y5;
  638. x2 = LD_SP(px); px += 4;
  639. out4 += s0 * y6;
  640. x3 = LD_SP(px); px += 4;
  641. out6 += s0 * y7;
  642. y0 = LD_SP(py); py += 4;
  643. out1 -= s0 * x4;
  644. y1 = LD_SP(py); py += 4;
  645. out3 -= s0 * x5;
  646. y2 = LD_SP(py); py += 4;
  647. out5 -= s0 * x6;
  648. y3 = LD_SP(py); py += 4;
  649. out7 -= s0 * x7;
  650. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  651. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  652. }
  /* Epilogue: finish the last block as two groups of 16 floats. */
  653. out0 = c0 * x0;
  654. out2 = c0 * x1;
  655. out4 = c0 * x2;
  656. out6 = c0 * x3;
  657. out1 = c0 * y0;
  658. out3 = c0 * y1;
  659. out5 = c0 * y2;
  660. out7 = c0 * y3;
  661. out0 += s0 * y0;
  662. out2 += s0 * y1;
  663. out4 += s0 * y2;
  664. out6 += s0 * y3;
  665. out1 -= s0 * x0;
  666. out3 -= s0 * x1;
  667. out5 -= s0 * x2;
  668. out7 -= s0 * x3;
  669. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  670. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  671. LD_SP4_INC(px, 4, x4, x5, x6, x7);
  672. LD_SP4_INC(py, 4, y4, y5, y6, y7);
  673. out0 = c0 * x4;
  674. out2 = c0 * x5;
  675. out4 = c0 * x6;
  676. out6 = c0 * x7;
  677. out1 = c0 * y4;
  678. out3 = c0 * y5;
  679. out5 = c0 * y6;
  680. out7 = c0 * y7;
  681. out0 += s0 * y4;
  682. out2 += s0 * y5;
  683. out4 += s0 * y6;
  684. out6 += s0 * y7;
  685. out1 -= s0 * x4;
  686. out3 -= s0 * x5;
  687. out5 -= s0 * x6;
  688. out7 -= s0 * x7;
  689. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  690. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  691. }
  /* Tail groups of 16, 8 and 4 floats (vector), then 2 and 1 (scalar). */
  692. if (n & 16)
  693. {
  694. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  695. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  696. out0 = (c0 * x0) + (s0 * y0);
  697. out1 = (c0 * y0) - (s0 * x0);
  698. out2 = (c0 * x1) + (s0 * y1);
  699. out3 = (c0 * y1) - (s0 * x1);
  700. out4 = (c0 * x2) + (s0 * y2);
  701. out5 = (c0 * y2) - (s0 * x2);
  702. out6 = (c0 * x3) + (s0 * y3);
  703. out7 = (c0 * y3) - (s0 * x3);
  704. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  705. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  706. }
  707. if (n & 8)
  708. {
  709. LD_SP2_INC(px, 4, x0, x1);
  710. LD_SP2_INC(py, 4, y0, y1);
  711. out0 = (c0 * x0) + (s0 * y0);
  712. out1 = (c0 * y0) - (s0 * x0);
  713. out2 = (c0 * x1) + (s0 * y1);
  714. out3 = (c0 * y1) - (s0 * x1);
  715. ST_SP2_INC(out0, out2, x, 4);
  716. ST_SP2_INC(out1, out3, y, 4);
  717. }
  718. if (n & 4)
  719. {
  720. x0 = LD_SP(px);
  721. y0 = LD_SP(py);
  722. px += 4;
  723. py += 4;
  724. out0 = (c0 * x0) + (s0 * y0);
  725. out1 = (c0 * y0) - (s0 * x0);
  726. ST_SP(out0, x);
  727. ST_SP(out1, y);
  728. x += 4;
  729. y += 4;
  730. }
  731. if (n & 2)
  732. {
  733. LD_GP2_INC(px, 1, fx0, fx1);
  734. LD_GP2_INC(py, 1, fy0, fy1);
  735. tp0 = (c * fx0) + (s * fy0);
  736. tp1 = (c * fy0) - (s * fx0);
  737. tp2 = (c * fx1) + (s * fy1);
  738. tp3 = (c * fy1) - (s * fx1);
  739. ST_GP2_INC(tp0, tp2, x, 1);
  740. ST_GP2_INC(tp1, tp3, y, 1);
  741. }
  742. if (n & 1)
  743. {
  744. fx0 = *px;
  745. fy0 = *py;
  746. tp0 = (c * fx0) + (s * fy0);
  747. tp1 = (c * fy0) - (s * fx0);
  748. *x = tp0;
  749. *y = tp1;
  750. }
  751. }
  752. }
  /* ---- Strided path: scalar code, pointers advance by inc_x / inc_y. ----
     There is no dedicated c == 0 branch here; that case falls through to the
     general formula in the final else. */
  753. else
  754. {
  /* c == 0 and s == 0: store zero to every element of both vectors. */
  755. if ((0 == c) && (0 == s))
  756. {
  757. for (i = n; i--;)
  758. {
  759. *x = 0;
  760. *y = 0;
  761. x += inc_x;
  762. y += inc_y;
  763. }
  764. }
  /* c == 1 and s == 1: x' = x + y, y' = y - x. */
  765. else if ((1 == c) && (1 == s))
  766. {
  /* Groups of four pairs, pipelined: four pairs are loaded ahead, the loop
     runs (n >> 2) - 1 times, and the last group is finished after it. */
  767. if (n >> 2)
  768. {
  769. fx0 = *px; px += inc_x;
  770. fx1 = *px; px += inc_x;
  771. fx2 = *px; px += inc_x;
  772. fx3 = *px; px += inc_x;
  773. fy0 = *py; py += inc_y;
  774. fy1 = *py; py += inc_y;
  775. fy2 = *py; py += inc_y;
  776. fy3 = *py; py += inc_y;
  777. for (i = (n >> 2) -1; i--;)
  778. {
  779. tp0 = fx0 + fy0;
  780. tp1 = fy0 - fx0;
  781. tp2 = fx1 + fy1;
  782. tp3 = fy1 - fx1;
  783. tp4 = fx2 + fy2;
  784. tp5 = fy2 - fx2;
  785. tp6 = fx3 + fy3;
  786. tp7 = fy3 - fx3;
  787. fx0 = *px; px += inc_x;
  788. *x = tp0; x += inc_x;
  789. fx1 = *px; px += inc_x;
  790. *x = tp2; x += inc_x;
  791. fx2 = *px; px += inc_x;
  792. *x = tp4; x += inc_x;
  793. fx3 = *px; px += inc_x;
  794. *x = tp6; x += inc_x;
  795. fy0 = *py; py += inc_y;
  796. *y = tp1; y += inc_y;
  797. fy1 = *py; py += inc_y;
  798. *y = tp3; y += inc_y;
  799. fy2 = *py; py += inc_y;
  800. *y = tp5; y += inc_y;
  801. fy3 = *py; py += inc_y;
  802. *y = tp7; y += inc_y;
  803. }
  804. tp0 = fx0 + fy0;
  805. tp1 = fy0 - fx0;
  806. tp2 = fx1 + fy1;
  807. tp3 = fy1 - fx1;
  808. tp4 = fx2 + fy2;
  809. tp5 = fy2 - fx2;
  810. tp6 = fx3 + fy3;
  811. tp7 = fy3 - fx3;
  812. *x = tp0; x += inc_x;
  813. *x = tp2; x += inc_x;
  814. *x = tp4; x += inc_x;
  815. *x = tp6; x += inc_x;
  816. *y = tp1; y += inc_y;
  817. *y = tp3; y += inc_y;
  818. *y = tp5; y += inc_y;
  819. *y = tp7; y += inc_y;
  820. }
  /* Remaining 0..3 pairs. */
  821. if (n & 2)
  822. {
  823. LD_GP2_INC(px, inc_x, fx0, fx1);
  824. LD_GP2_INC(py, inc_y, fy0, fy1);
  825. tp0 = fx0 + fy0;
  826. tp1 = fy0 - fx0;
  827. tp2 = fx1 + fy1;
  828. tp3 = fy1 - fx1;
  829. ST_GP2_INC(tp0, tp2, x, inc_x);
  830. ST_GP2_INC(tp1, tp3, y, inc_y);
  831. }
  832. if (n & 1)
  833. {
  834. fx0 = *px;
  835. fy0 = *py;
  836. tp0 = fx0 + fy0;
  837. tp1 = fy0 - fx0;
  838. *x = tp0;
  839. *y = tp1;
  840. }
  841. }
  /* s == 0: both vectors are scaled by c. */
  842. else if (0 == s)
  843. {
  /* Same four-pair pipelined structure as the branch above. */
  844. if (n >> 2)
  845. {
  846. fx0 = *px; px += inc_x;
  847. fx1 = *px; px += inc_x;
  848. fx2 = *px; px += inc_x;
  849. fx3 = *px; px += inc_x;
  850. fy0 = *py; py += inc_y;
  851. fy1 = *py; py += inc_y;
  852. fy2 = *py; py += inc_y;
  853. fy3 = *py; py += inc_y;
  854. for (i = (n >> 2) - 1; i--;)
  855. {
  856. tp0 = c * fx0;
  857. tp1 = c * fy0;
  858. tp2 = c * fx1;
  859. tp3 = c * fy1;
  860. tp4 = c * fx2;
  861. tp5 = c * fy2;
  862. tp6 = c * fx3;
  863. tp7 = c * fy3;
  864. fx0 = *px; px += inc_x;
  865. *x = tp0; x += inc_x;
  866. fx1 = *px; px += inc_x;
  867. *x = tp2; x += inc_x;
  868. fx2 = *px; px += inc_x;
  869. *x = tp4; x += inc_x;
  870. fx3 = *px; px += inc_x;
  871. *x = tp6; x += inc_x;
  872. fy0 = *py; py += inc_y;
  873. *y = tp1; y += inc_y;
  874. fy1 = *py; py += inc_y;
  875. *y = tp3; y += inc_y;
  876. fy2 = *py; py += inc_y;
  877. *y = tp5; y += inc_y;
  878. fy3 = *py; py += inc_y;
  879. *y = tp7; y += inc_y;
  880. }
  881. tp0 = c * fx0;
  882. tp1 = c * fy0;
  883. tp2 = c * fx1;
  884. tp3 = c * fy1;
  885. tp4 = c * fx2;
  886. tp5 = c * fy2;
  887. tp6 = c * fx3;
  888. tp7 = c * fy3;
  889. *x = tp0; x += inc_x;
  890. *x = tp2; x += inc_x;
  891. *x = tp4; x += inc_x;
  892. *x = tp6; x += inc_x;
  893. *y = tp1; y += inc_y;
  894. *y = tp3; y += inc_y;
  895. *y = tp5; y += inc_y;
  896. *y = tp7; y += inc_y;
  897. }
  /* Remaining 0..3 pairs. */
  898. if (n & 2)
  899. {
  900. LD_GP2_INC(px, inc_x, fx0, fx1);
  901. LD_GP2_INC(py, inc_y, fy0, fy1);
  902. tp0 = c * fx0;
  903. tp1 = c * fy0;
  904. tp2 = c * fx1;
  905. tp3 = c * fy1;
  906. ST_GP2_INC(tp0, tp2, x, inc_x);
  907. ST_GP2_INC(tp1, tp3, y, inc_y);
  908. }
  909. if (n & 1)
  910. {
  911. fx0 = *px;
  912. fy0 = *py;
  913. tp0 = c * fx0;
  914. tp1 = c * fy0;
  915. *x = tp0;
  916. *y = tp1;
  917. }
  918. }
  /* General case (also reached when only c == 0):
     x' = c * x + s * y, y' = c * y - s * x. */
  919. else
  920. {
  /* Same four-pair pipelined structure as the branches above. */
  921. if (n >> 2)
  922. {
  923. fx0 = *px; px += inc_x;
  924. fx1 = *px; px += inc_x;
  925. fx2 = *px; px += inc_x;
  926. fx3 = *px; px += inc_x;
  927. fy0 = *py; py += inc_y;
  928. fy1 = *py; py += inc_y;
  929. fy2 = *py; py += inc_y;
  930. fy3 = *py; py += inc_y;
  931. for (i = (n >> 2) - 1; i--;)
  932. {
  933. tp0 = c * fx0 + s * fy0;
  934. tp1 = c * fy0 - s * fx0;
  935. tp2 = c * fx1 + s * fy1;
  936. tp3 = c * fy1 - s * fx1;
  937. tp4 = c * fx2 + s * fy2;
  938. tp5 = c * fy2 - s * fx2;
  939. tp6 = c * fx3 + s * fy3;
  940. tp7 = c * fy3 - s * fx3;
  941. fx0 = *px; px += inc_x;
  942. *x = tp0; x += inc_x;
  943. fx1 = *px; px += inc_x;
  944. *x = tp2; x += inc_x;
  945. fx2 = *px; px += inc_x;
  946. *x = tp4; x += inc_x;
  947. fx3 = *px; px += inc_x;
  948. *x = tp6; x += inc_x;
  949. fy0 = *py; py += inc_y;
  950. *y = tp1; y += inc_y;
  951. fy1 = *py; py += inc_y;
  952. *y = tp3; y += inc_y;
  953. fy2 = *py; py += inc_y;
  954. *y = tp5; y += inc_y;
  955. fy3 = *py; py += inc_y;
  956. *y = tp7; y += inc_y;
  957. }
  958. tp0 = c * fx0 + s * fy0;
  959. tp1 = c * fy0 - s * fx0;
  960. tp2 = c * fx1 + s * fy1;
  961. tp3 = c * fy1 - s * fx1;
  962. tp4 = c * fx2 + s * fy2;
  963. tp5 = c * fy2 - s * fx2;
  964. tp6 = c * fx3 + s * fy3;
  965. tp7 = c * fy3 - s * fx3;
  966. *x = tp0; x += inc_x;
  967. *x = tp2; x += inc_x;
  968. *x = tp4; x += inc_x;
  969. *x = tp6; x += inc_x;
  970. *y = tp1; y += inc_y;
  971. *y = tp3; y += inc_y;
  972. *y = tp5; y += inc_y;
  973. *y = tp7; y += inc_y;
  974. }
  /* Remaining 0..3 pairs. */
  975. if (n & 2)
  976. {
  977. LD_GP2_INC(px, inc_x, fx0, fx1);
  978. LD_GP2_INC(py, inc_y, fy0, fy1);
  979. tp0 = c * fx0 + s * fy0;
  980. tp1 = c * fy0 - s * fx0;
  981. tp2 = c * fx1 + s * fy1;
  982. tp3 = c * fy1 - s * fx1;
  983. ST_GP2_INC(tp0, tp2, x, inc_x);
  984. ST_GP2_INC(tp1, tp3, y, inc_y);
  985. }
  986. if (n & 1)
  987. {
  988. fx0 = *px;
  989. fy0 = *py;
  990. tp0 = c * fx0 + s * fy0;
  991. tp1 = c * fy0 - s * fx0;
  992. *x = tp0;
  993. *y = tp1;
  994. }
  995. }
  996. }
  997. return 0;
  998. }