You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

srot_msa.c 36 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  /*
   * Apply a plane rotation to the vectors x and y in place.  For each of the
   * n element pairs the general formula is
   *
   *     x[i] = c * x[i] + s * y[i]
   *     y[i] = c * y[i] - s * x[i]
   *
   * Parameters:
   *   n             number of element pairs; n <= 0 returns immediately
   *   x, y          vectors, both read and overwritten
   *   inc_x, inc_y  element strides of x and y
   *   c, s          rotation coefficients
   *
   * Returns 0 on every path.
   *
   * Structure:
   *   - inc_x == 1 and inc_y == 1: vector path working on v4f32 values
   *     (4 elements each), 32 elements of each vector per main-loop pass,
   *     then tails selected by the bits of n (16, 8, 4 as vectors; 2 and 1
   *     as scalars).
   *   - any other stride: scalar path, 4 elements per main-loop pass, then
   *     tails of 2 and 1.
   *   - Inside each path, exact comparisons of c and s against 0 and 1 pick
   *     a reduced formula: (0,0) stores zeros, (1,1) is x+y / y-x, s == 0 is
   *     a scale by c, c == 0 (vector path only) is s*y / -(s*x).
   *
   * NOTE(review): LD_SP*, ST_SP*, LD_GP2_INC, ST_GP2_INC, PREFETCH and
   * COPY_FLOAT_TO_VECTOR are not defined in this file (presumably they come
   * from macros_msa.h); the comments below assume the *_INC forms advance
   * their pointer argument by the given step after each access -- confirm.
   */
  29. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  30. FLOAT c, FLOAT s)
  31. {
  32. BLASLONG i, j;
  33. FLOAT *px, *py;
  34. FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
  35. FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
  36. v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
  37. v4f32 out0, out1, out2, out3, out4, out5, out6, out7;
  38. v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
  39. if (n <= 0) return (0);
  /* px/py are the read cursors; x/y stay behind as the write cursors.  Both
     pairs start at the same addresses, and every element is loaded through
     px/py before it is overwritten through x/y. */
  40. px = x;
  41. py = y;
  /* ---- unit-stride path (vector code) ---- */
  42. if ((1 == inc_x) && (1 == inc_y))
  43. {
  /* c == 0 and s == 0: store zeros; the old contents are never read. */
  44. if ((0 == c) && (0 == s))
  45. {
  46. v4f32 zero = {0.0, 0.0, 0.0, 0.0};
  47. /* process 4 floats of x and 4 floats of y per iteration */
  48. for (j = (n >> 2); j--;)
  49. {
  50. ST_SP(zero, px);
  51. ST_SP(zero, py);
  52. px += 4;
  53. py += 4;
  54. }
  /* scalar tail: remaining 2 and/or 1 elements */
  55. if (n & 2)
  56. {
  57. px[0] = 0;
  58. py[0] = 0;
  59. px[1] = 0;
  60. py[1] = 0;
  61. px += 2;
  62. py += 2;
  63. }
  64. if (n & 1)
  65. {
  66. px[0] = 0;
  67. py[0] = 0;
  68. }
  69. }
  /* c == 1 and s == 1: x = x + y, y = y - x (no multiplications). */
  70. else if ((1 == c) && (1 == s))
  71. {
  /* at least one full group of 32 elements */
  72. if (n >> 5)
  73. {
  74. BLASLONG pref_offsetx, pref_offsety;
  /* Number of FLOAT elements from px (resp. py) up to the next
     L1_DATA_LINESIZE boundary; 0 when already on a boundary. */
  75. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  76. if (pref_offsetx > 0)
  77. {
  78. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  79. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  80. }
  81. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  82. if (pref_offsety > 0)
  83. {
  84. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  85. pref_offsety = pref_offsety / sizeof(FLOAT);
  86. }
  /* pipeline prologue: preload the first 16 elements of x and of y */
  87. x0 = LD_SP(px); px += 4;
  88. x1 = LD_SP(px); px += 4;
  89. x2 = LD_SP(px); px += 4;
  90. x3 = LD_SP(px); px += 4;
  91. y0 = LD_SP(py); py += 4;
  92. y1 = LD_SP(py); py += 4;
  93. y2 = LD_SP(py); py += 4;
  94. y3 = LD_SP(py); py += 4;
  /* One pass fewer than the number of 32-element groups: the last group is
     finished by the epilogue after the loop.  Each pass computes and stores
     32 elements while loading the first half of the next group. */
  95. for (j = (n >> 5) - 1; j--;)
  96. {
  /* addresses 8 elements apart, starting 32 elements past the read cursor
     plus the alignment offset -- presumably cache prefetch hints */
  97. PREFETCH(px + pref_offsetx + 32);
  98. PREFETCH(px + pref_offsetx + 40);
  99. PREFETCH(px + pref_offsetx + 48);
  100. PREFETCH(px + pref_offsetx + 56);
  101. PREFETCH(py + pref_offsety + 32);
  102. PREFETCH(py + pref_offsety + 40);
  103. PREFETCH(py + pref_offsety + 48);
  104. PREFETCH(py + pref_offsety + 56);
  /* arithmetic on the first half interleaved with loads of the second half */
  105. out0 = x0 + y0;
  106. x4 = LD_SP(px); px += 4;
  107. out1 = y0 - x0;
  108. x5 = LD_SP(px); px += 4;
  109. out2 = x1 + y1;
  110. x6 = LD_SP(px); px += 4;
  111. out3 = y1 - x1;
  112. x7 = LD_SP(px); px += 4;
  113. out4 = x2 + y2;
  114. y4 = LD_SP(py); py += 4;
  115. out5 = y2 - x2;
  116. y5 = LD_SP(py); py += 4;
  117. out6 = x3 + y3;
  118. y6 = LD_SP(py); py += 4;
  119. out7 = y3 - x3;
  120. y7 = LD_SP(py); py += 4;
  /* stores of the first half interleaved with arithmetic on the second half */
  121. ST_SP(out0, x); x += 4;
  122. out8 = x4 + y4;
  123. ST_SP(out1, y); y += 4;
  124. out9 = y4 - x4;
  125. ST_SP(out2, x); x += 4;
  126. out10 = x5 + y5;
  127. ST_SP(out3, y); y += 4;
  128. out11 = y5 - x5;
  129. ST_SP(out4, x); x += 4;
  130. out12 = x6 + y6;
  131. ST_SP(out5, y); y += 4;
  132. out13 = y6 - x6;
  133. ST_SP(out6, x); x += 4;
  134. out14 = x7 + y7;
  135. ST_SP(out7, y); y += 4;
  136. out15 = y7 - x7;
  /* loads for the next pass interleaved with stores of the second half */
  137. x0 = LD_SP(px); px += 4;
  138. ST_SP(out8, x); x += 4;
  139. x1 = LD_SP(px); px += 4;
  140. ST_SP(out10, x); x += 4;
  141. x2 = LD_SP(px); px += 4;
  142. ST_SP(out12, x); x += 4;
  143. x3 = LD_SP(px); px += 4;
  144. ST_SP(out14, x); x += 4;
  145. y0 = LD_SP(py); py += 4;
  146. ST_SP(out9, y); y += 4;
  147. y1 = LD_SP(py); py += 4;
  148. ST_SP(out11, y); y += 4;
  149. y2 = LD_SP(py); py += 4;
  150. ST_SP(out13, y); y += 4;
  151. y3 = LD_SP(py); py += 4;
  152. ST_SP(out15, y); y += 4;
  153. }
  /* pipeline epilogue: load the second half of the last group, then compute
     and store all 32 elements of that group */
  154. x4 = LD_SP(px); px += 4;
  155. x5 = LD_SP(px); px += 4;
  156. x6 = LD_SP(px); px += 4;
  157. x7 = LD_SP(px); px += 4;
  158. y4 = LD_SP(py); py += 4;
  159. y5 = LD_SP(py); py += 4;
  160. y6 = LD_SP(py); py += 4;
  161. y7 = LD_SP(py); py += 4;
  162. out0 = x0 + y0;
  163. out1 = y0 - x0;
  164. out2 = x1 + y1;
  165. out3 = y1 - x1;
  166. out4 = x2 + y2;
  167. out5 = y2 - x2;
  168. out6 = x3 + y3;
  169. out7 = y3 - x3;
  170. out8 = x4 + y4;
  171. out9 = y4 - x4;
  172. out10 = x5 + y5;
  173. out11 = y5 - x5;
  174. out12 = x6 + y6;
  175. out13 = y6 - x6;
  176. out14 = x7 + y7;
  177. out15 = y7 - x7;
  /* even-numbered outputs are the new x values, odd-numbered the new y */
  178. ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4);
  179. ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4);
  180. }
  /* tails chosen by the low bits of n: 16, 8, 4 (vectors), then 2, 1 (scalars) */
  181. if (n & 16)
  182. {
  183. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  184. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  185. out0 = x0 + y0;
  186. out1 = y0 - x0;
  187. out2 = x1 + y1;
  188. out3 = y1 - x1;
  189. out4 = x2 + y2;
  190. out5 = y2 - x2;
  191. out6 = x3 + y3;
  192. out7 = y3 - x3;
  193. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  194. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  195. }
  196. if (n & 8)
  197. {
  198. LD_SP2_INC(px, 4, x0, x1);
  199. LD_SP2_INC(py, 4, y0, y1);
  200. out0 = x0 + y0;
  201. out1 = y0 - x0;
  202. out2 = x1 + y1;
  203. out3 = y1 - x1;
  204. ST_SP2_INC(out0, out2, x, 4);
  205. ST_SP2_INC(out1, out3, y, 4);
  206. }
  207. if (n & 4)
  208. {
  209. x0 = LD_SP(px);
  210. y0 = LD_SP(py);
  211. px += 4;
  212. py += 4;
  213. out0 = x0 + y0;
  214. out1 = y0 - x0;
  215. ST_SP(out0, x);
  216. ST_SP(out1, y);
  217. x += 4;
  218. y += 4;
  219. }
  220. if (n & 2)
  221. {
  222. LD_GP2_INC(px, 1, fx0, fx1);
  223. LD_GP2_INC(py, 1, fy0, fy1);
  224. tp0 = fx0 + fy0;
  225. tp1 = fy0 - fx0;
  226. tp2 = fx1 + fy1;
  227. tp3 = fy1 - fx1;
  228. ST_GP2_INC(tp0, tp2, x, 1);
  229. ST_GP2_INC(tp1, tp3, y, 1);
  230. }
  231. if (n & 1)
  232. {
  233. fx0 = *px;
  234. fy0 = *py;
  235. tp0 = fx0 + fy0;
  236. tp1 = fy0 - fx0;
  237. *x = tp0;
  238. *y = tp1;
  239. }
  240. }
  /* s == 0: both vectors are simply scaled by c (x = c*x, y = c*y). */
  241. else if (0 == s)
  242. {
  /* c0: vector built from the scalar c (presumably c in every lane) */
  243. c0 = COPY_FLOAT_TO_VECTOR(c);
  /* at least one full group of 32 elements */
  244. if (n >> 5)
  245. {
  246. BLASLONG pref_offsetx, pref_offsety;
  /* elements from px / py up to the next L1_DATA_LINESIZE boundary */
  247. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  248. if (pref_offsetx > 0)
  249. {
  250. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  251. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  252. }
  253. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  254. if (pref_offsety > 0)
  255. {
  256. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  257. pref_offsety = pref_offsety / sizeof(FLOAT);
  258. }
  /* prologue: preload 32 elements of x; y is loaded inside the loop because
     x and y are independent in this case */
  259. LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
  260. for (j = (n >> 5) - 1; j--;)
  261. {
  262. PREFETCH(px + pref_offsetx + 32);
  263. PREFETCH(px + pref_offsetx + 40);
  264. PREFETCH(px + pref_offsetx + 48);
  265. PREFETCH(px + pref_offsetx + 56);
  266. PREFETCH(py + pref_offsety + 32);
  267. PREFETCH(py + pref_offsety + 40);
  268. PREFETCH(py + pref_offsety + 48);
  269. PREFETCH(py + pref_offsety + 56);
  /* load y for this group while scaling the preloaded x */
  270. y0 = LD_SP(py); py += 4;
  271. x0 *= c0;
  272. y1 = LD_SP(py); py += 4;
  273. x1 *= c0;
  274. y2 = LD_SP(py); py += 4;
  275. x2 *= c0;
  276. y3 = LD_SP(py); py += 4;
  277. x3 *= c0;
  278. y4 = LD_SP(py); py += 4;
  279. x4 *= c0;
  280. y5 = LD_SP(py); py += 4;
  281. x5 *= c0;
  282. y6 = LD_SP(py); py += 4;
  283. x6 *= c0;
  284. y7 = LD_SP(py); py += 4;
  285. x7 *= c0;
  /* store scaled x while scaling y */
  286. ST_SP(x0, x); x += 4;
  287. y0 *= c0;
  288. ST_SP(x1, x); x += 4;
  289. y1 *= c0;
  290. ST_SP(x2, x); x += 4;
  291. y2 *= c0;
  292. ST_SP(x3, x); x += 4;
  293. y3 *= c0;
  294. ST_SP(x4, x); x += 4;
  295. y4 *= c0;
  296. ST_SP(x5, x); x += 4;
  297. y5 *= c0;
  298. ST_SP(x6, x); x += 4;
  299. y6 *= c0;
  300. ST_SP(x7, x); x += 4;
  301. y7 *= c0;
  /* load x for the next pass while storing scaled y */
  302. x0 = LD_SP(px); px += 4;
  303. ST_SP(y0, y); y += 4;
  304. x1 = LD_SP(px); px += 4;
  305. ST_SP(y1, y); y += 4;
  306. x2 = LD_SP(px); px += 4;
  307. ST_SP(y2, y); y += 4;
  308. x3 = LD_SP(px); px += 4;
  309. ST_SP(y3, y); y += 4;
  310. x4 = LD_SP(px); px += 4;
  311. ST_SP(y4, y); y += 4;
  312. x5 = LD_SP(px); px += 4;
  313. ST_SP(y5, y); y += 4;
  314. x6 = LD_SP(px); px += 4;
  315. ST_SP(y6, y); y += 4;
  316. x7 = LD_SP(px); px += 4;
  317. ST_SP(y7, y); y += 4;
  318. }
  /* epilogue: x of the last group is already in registers; load its y,
     scale both and store */
  319. LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
  320. x0 *= c0;
  321. y0 *= c0;
  322. x1 *= c0;
  323. y1 *= c0;
  324. x2 *= c0;
  325. y2 *= c0;
  326. x3 *= c0;
  327. y3 *= c0;
  328. x4 *= c0;
  329. y4 *= c0;
  330. x5 *= c0;
  331. y5 *= c0;
  332. x6 *= c0;
  333. y6 *= c0;
  334. x7 *= c0;
  335. y7 *= c0;
  336. ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
  337. ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
  338. }
  /* tails: 16, 8, 4 elements as vectors, then 2 and 1 as scalars */
  339. if (n & 16)
  340. {
  341. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  342. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  343. x0 *= c0;
  344. y0 *= c0;
  345. x1 *= c0;
  346. y1 *= c0;
  347. x2 *= c0;
  348. y2 *= c0;
  349. x3 *= c0;
  350. y3 *= c0;
  351. ST_SP4_INC(x0, x1, x2, x3, x, 4);
  352. ST_SP4_INC(y0, y1, y2, y3, y, 4);
  353. }
  354. if (n & 8)
  355. {
  356. LD_SP2_INC(px, 4, x0, x1);
  357. LD_SP2_INC(py, 4, y0, y1);
  358. x0 *= c0;
  359. y0 *= c0;
  360. x1 *= c0;
  361. y1 *= c0;
  362. ST_SP2_INC(x0, x1, x, 4);
  363. ST_SP2_INC(y0, y1, y, 4);
  364. }
  365. if (n & 4)
  366. {
  367. x0 = LD_SP(px);
  368. y0 = LD_SP(py);
  369. px += 4;
  370. py += 4;
  371. x0 *= c0;
  372. y0 *= c0;
  373. ST_SP(x0, x);
  374. ST_SP(y0, y);
  375. x += 4;
  376. y += 4;
  377. }
  378. if (n & 2)
  379. {
  380. LD_GP2_INC(px, 1, fx0, fx1);
  381. LD_GP2_INC(py, 1, fy0, fy1);
  382. tp0 = (c * fx0);
  383. tp1 = (c * fy0);
  384. tp2 = (c * fx1);
  385. tp3 = (c * fy1);
  386. ST_GP2_INC(tp0, tp2, x, 1);
  387. ST_GP2_INC(tp1, tp3, y, 1);
  388. }
  389. if (n & 1)
  390. {
  391. fx0 = *px;
  392. fy0 = *py;
  393. tp0 = (c * fx0);
  394. tp1 = (c * fy0);
  395. *x = tp0;
  396. *y = tp1;
  397. }
  398. }
  /* c == 0: x = s*y, y = -(s*x). */
  399. else if (0 == c)
  400. {
  /* s0: vector built from the scalar s (presumably s in every lane) */
  401. s0 = COPY_FLOAT_TO_VECTOR(s);
  402. /* process 32 floats of x and 32 floats of y per group */
  403. if (n >> 5)
  404. {
  405. BLASLONG pref_offsetx, pref_offsety;
  /* elements from px / py up to the next L1_DATA_LINESIZE boundary */
  406. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  407. if (pref_offsetx > 0)
  408. {
  409. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  410. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  411. }
  412. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  413. if (pref_offsety > 0)
  414. {
  415. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  416. pref_offsety = pref_offsety / sizeof(FLOAT);
  417. }
  /* prologue: preload the first 16 elements of x and of y */
  418. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  419. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  420. for (j = (n >> 5) - 1; j--;)
  421. {
  422. PREFETCH(px + pref_offsetx + 32);
  423. PREFETCH(px + pref_offsetx + 40);
  424. PREFETCH(px + pref_offsetx + 48);
  425. PREFETCH(px + pref_offsetx + 56);
  426. PREFETCH(py + pref_offsety + 32);
  427. PREFETCH(py + pref_offsety + 40);
  428. PREFETCH(py + pref_offsety + 48);
  429. PREFETCH(py + pref_offsety + 56);
  /* first half: compute while loading the second half */
  430. x4 = LD_SP(px); px += 4;
  431. out0 = s0 * y0;
  432. x5 = LD_SP(px); px += 4;
  433. out2 = s0 * y1;
  434. x6 = LD_SP(px); px += 4;
  435. out4 = s0 * y2;
  436. x7 = LD_SP(px); px += 4;
  437. out6 = s0 * y3;
  438. y4 = LD_SP(py); py += 4;
  439. out1 = -(s0 * x0);
  440. y5 = LD_SP(py); py += 4;
  441. out3 = -(s0 * x1);
  442. y6 = LD_SP(py); py += 4;
  443. out5 = -(s0 * x2);
  444. y7 = LD_SP(py); py += 4;
  445. out7 = -(s0 * x3);
  /* store the first half; out0..out7 are reused right away for the second */
  446. ST_SP(out0, x); x += 4;
  447. out0 = s0 * y4;
  448. ST_SP(out2, x); x += 4;
  449. out2 = s0 * y5;
  450. ST_SP(out4, x); x += 4;
  451. out4 = s0 * y6;
  452. ST_SP(out6, x); x += 4;
  453. out6 = s0 * y7;
  454. ST_SP(out1, y); y += 4;
  455. out1 = -(s0 * x4);
  456. ST_SP(out3, y); y += 4;
  457. out3 = -(s0 * x5);
  458. ST_SP(out5, y); y += 4;
  459. out5 = -(s0 * x6);
  460. ST_SP(out7, y); y += 4;
  461. out7 = -(s0 * x7);
  /* store the second half while loading the next pass's first half */
  462. x0 = LD_SP(px); px += 4;
  463. ST_SP(out0, x); x += 4;
  464. x1 = LD_SP(px); px += 4;
  465. ST_SP(out2, x); x += 4;
  466. x2 = LD_SP(px); px += 4;
  467. ST_SP(out4, x); x += 4;
  468. x3 = LD_SP(px); px += 4;
  469. ST_SP(out6, x); x += 4;
  470. y0 = LD_SP(py); py += 4;
  471. ST_SP(out1, y); y += 4;
  472. y1 = LD_SP(py); py += 4;
  473. ST_SP(out3, y); y += 4;
  474. y2 = LD_SP(py); py += 4;
  475. ST_SP(out5, y); y += 4;
  476. y3 = LD_SP(py); py += 4;
  477. ST_SP(out7, y); y += 4;
  478. }
  /* epilogue: finish the preloaded first half of the last group ... */
  479. out0 = s0 * y0;
  480. out2 = s0 * y1;
  481. out4 = s0 * y2;
  482. out6 = s0 * y3;
  483. out1 = -(s0 * x0);
  484. out3 = -(s0 * x1);
  485. out5 = -(s0 * x2);
  486. out7 = -(s0 * x3);
  487. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  488. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  /* ... then load and process its second half */
  489. LD_SP4_INC(px, 4, x4, x5, x6, x7);
  490. LD_SP4_INC(py, 4, y4, y5, y6, y7);
  491. out0 = s0 * y4;
  492. out2 = s0 * y5;
  493. out4 = s0 * y6;
  494. out6 = s0 * y7;
  495. out1 = -(s0 * x4);
  496. out3 = -(s0 * x5);
  497. out5 = -(s0 * x6);
  498. out7 = -(s0 * x7);
  499. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  500. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  501. }
  /* tails: 16, 8, 4 elements as vectors, then 2 and 1 as scalars */
  502. if (n & 16)
  503. {
  504. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  505. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  506. out0 = s0 * y0;
  507. out1 = - (s0 * x0);
  508. out2 = s0 * y1;
  509. out3 = - (s0 * x1);
  510. out4 = s0 * y2;
  511. out5 = - (s0 * x2);
  512. out6 = s0 * y3;
  513. out7 = - (s0 * x3);
  514. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  515. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  516. }
  517. if (n & 8)
  518. {
  519. LD_SP2_INC(px, 4, x0, x1);
  520. LD_SP2_INC(py, 4, y0, y1);
  521. out0 = s0 * y0;
  522. out1 = - (s0 * x0);
  523. out2 = s0 * y1;
  524. out3 = - (s0 * x1);
  525. ST_SP2_INC(out0, out2, x, 4);
  526. ST_SP2_INC(out1, out3, y, 4);
  527. }
  528. if (n & 4)
  529. {
  530. x0 = LD_SP(px); px += 4;
  531. y0 = LD_SP(py); py += 4;
  532. out0 = s0 * y0;
  533. out1 = - (s0 * x0);
  534. ST_SP(out0, x); x += 4;
  535. ST_SP(out1, y); y += 4;
  536. }
  537. if (n & 2)
  538. {
  539. LD_GP2_INC(px, 1, fx0, fx1);
  540. LD_GP2_INC(py, 1, fy0, fy1);
  541. tp0 = s * fy0;
  542. tp1 = - (s * fx0);
  543. tp2 = s * fy1;
  544. tp3 = - (s * fx1);
  545. ST_GP2_INC(tp0, tp2, x, 1);
  546. ST_GP2_INC(tp1, tp3, y, 1);
  547. }
  548. if (n & 1)
  549. {
  550. fx0 = *px;
  551. fy0 = *py;
  552. tp0 = s * fy0;
  553. tp1 = - (s * fx0);
  554. *x = tp0;
  555. *y = tp1;
  556. }
  557. }
  /* general case: x = c*x + s*y, y = c*y - s*x. */
  558. else
  559. {
  560. c0 = COPY_FLOAT_TO_VECTOR(c);
  561. s0 = COPY_FLOAT_TO_VECTOR(s);
  562. /* process 32 floats of x and 32 floats of y per group */
  563. if (n >> 5)
  564. {
  565. BLASLONG pref_offsetx, pref_offsety;
  /* elements from px / py up to the next L1_DATA_LINESIZE boundary */
  566. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  567. if (pref_offsetx > 0)
  568. {
  569. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  570. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  571. }
  572. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  573. if (pref_offsety > 0)
  574. {
  575. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  576. pref_offsety = pref_offsety / sizeof(FLOAT);
  577. }
  /* prologue: preload the first 16 elements of x and of y */
  578. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  579. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  580. for (j = (n >> 5) - 1; j--;)
  581. {
  582. PREFETCH(px + pref_offsetx + 32);
  583. PREFETCH(px + pref_offsetx + 40);
  584. PREFETCH(px + pref_offsetx + 48);
  585. PREFETCH(px + pref_offsetx + 56);
  586. PREFETCH(py + pref_offsety + 32);
  587. PREFETCH(py + pref_offsety + 40);
  588. PREFETCH(py + pref_offsety + 48);
  589. PREFETCH(py + pref_offsety + 56);
  /* first half: c-products computed while the second half is loaded */
  590. x4 = LD_SP(px); px += 4;
  591. out0 = c0 * x0;
  592. x5 = LD_SP(px); px += 4;
  593. out2 = c0 * x1;
  594. x6 = LD_SP(px); px += 4;
  595. out4 = c0 * x2;
  596. x7 = LD_SP(px); px += 4;
  597. out6 = c0 * x3;
  598. y4 = LD_SP(py); py += 4;
  599. out1 = c0 * y0;
  600. y5 = LD_SP(py); py += 4;
  601. out3 = c0 * y1;
  602. y6 = LD_SP(py); py += 4;
  603. out5 = c0 * y2;
  604. y7 = LD_SP(py); py += 4;
  605. out7 = c0 * y3;
  /* add / subtract the s-products to complete the first half */
  606. out0 += s0 * y0;
  607. out2 += s0 * y1;
  608. out4 += s0 * y2;
  609. out6 += s0 * y3;
  610. out1 -= s0 * x0;
  611. out3 -= s0 * x1;
  612. out5 -= s0 * x2;
  613. out7 -= s0 * x3;
  /* store the first half; out0..out7 are reused for the second half */
  614. ST_SP(out0, x); x += 4;
  615. out0 = c0 * x4;
  616. ST_SP(out2, x); x += 4;
  617. out2 = c0 * x5;
  618. ST_SP(out4, x); x += 4;
  619. out4 = c0 * x6;
  620. ST_SP(out6, x); x += 4;
  621. out6 = c0 * x7;
  622. ST_SP(out1, y); y += 4;
  623. out1 = c0 * y4;
  624. ST_SP(out3, y); y += 4;
  625. out3 = c0 * y5;
  626. ST_SP(out5, y); y += 4;
  627. out5 = c0 * y6;
  628. ST_SP(out7, y); y += 4;
  629. out7 = c0 * y7;
  /* complete the second half while loading the next pass's first half */
  630. x0 = LD_SP(px); px += 4;
  631. out0 += s0 * y4;
  632. x1 = LD_SP(px); px += 4;
  633. out2 += s0 * y5;
  634. x2 = LD_SP(px); px += 4;
  635. out4 += s0 * y6;
  636. x3 = LD_SP(px); px += 4;
  637. out6 += s0 * y7;
  638. y0 = LD_SP(py); py += 4;
  639. out1 -= s0 * x4;
  640. y1 = LD_SP(py); py += 4;
  641. out3 -= s0 * x5;
  642. y2 = LD_SP(py); py += 4;
  643. out5 -= s0 * x6;
  644. y3 = LD_SP(py); py += 4;
  645. out7 -= s0 * x7;
  646. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  647. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  648. }
  /* epilogue: finish the preloaded first half of the last group ... */
  649. out0 = c0 * x0;
  650. out2 = c0 * x1;
  651. out4 = c0 * x2;
  652. out6 = c0 * x3;
  653. out1 = c0 * y0;
  654. out3 = c0 * y1;
  655. out5 = c0 * y2;
  656. out7 = c0 * y3;
  657. out0 += s0 * y0;
  658. out2 += s0 * y1;
  659. out4 += s0 * y2;
  660. out6 += s0 * y3;
  661. out1 -= s0 * x0;
  662. out3 -= s0 * x1;
  663. out5 -= s0 * x2;
  664. out7 -= s0 * x3;
  665. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  666. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  /* ... then load and process its second half */
  667. LD_SP4_INC(px, 4, x4, x5, x6, x7);
  668. LD_SP4_INC(py, 4, y4, y5, y6, y7);
  669. out0 = c0 * x4;
  670. out2 = c0 * x5;
  671. out4 = c0 * x6;
  672. out6 = c0 * x7;
  673. out1 = c0 * y4;
  674. out3 = c0 * y5;
  675. out5 = c0 * y6;
  676. out7 = c0 * y7;
  677. out0 += s0 * y4;
  678. out2 += s0 * y5;
  679. out4 += s0 * y6;
  680. out6 += s0 * y7;
  681. out1 -= s0 * x4;
  682. out3 -= s0 * x5;
  683. out5 -= s0 * x6;
  684. out7 -= s0 * x7;
  685. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  686. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  687. }
  /* tails: 16, 8, 4 elements as vectors, then 2 and 1 as scalars */
  688. if (n & 16)
  689. {
  690. LD_SP4_INC(px, 4, x0, x1, x2, x3);
  691. LD_SP4_INC(py, 4, y0, y1, y2, y3);
  692. out0 = (c0 * x0) + (s0 * y0);
  693. out1 = (c0 * y0) - (s0 * x0);
  694. out2 = (c0 * x1) + (s0 * y1);
  695. out3 = (c0 * y1) - (s0 * x1);
  696. out4 = (c0 * x2) + (s0 * y2);
  697. out5 = (c0 * y2) - (s0 * x2);
  698. out6 = (c0 * x3) + (s0 * y3);
  699. out7 = (c0 * y3) - (s0 * x3);
  700. ST_SP4_INC(out0, out2, out4, out6, x, 4);
  701. ST_SP4_INC(out1, out3, out5, out7, y, 4);
  702. }
  703. if (n & 8)
  704. {
  705. LD_SP2_INC(px, 4, x0, x1);
  706. LD_SP2_INC(py, 4, y0, y1);
  707. out0 = (c0 * x0) + (s0 * y0);
  708. out1 = (c0 * y0) - (s0 * x0);
  709. out2 = (c0 * x1) + (s0 * y1);
  710. out3 = (c0 * y1) - (s0 * x1);
  711. ST_SP2_INC(out0, out2, x, 4);
  712. ST_SP2_INC(out1, out3, y, 4);
  713. }
  714. if (n & 4)
  715. {
  716. x0 = LD_SP(px);
  717. y0 = LD_SP(py);
  718. px += 4;
  719. py += 4;
  720. out0 = (c0 * x0) + (s0 * y0);
  721. out1 = (c0 * y0) - (s0 * x0);
  722. ST_SP(out0, x);
  723. ST_SP(out1, y);
  724. x += 4;
  725. y += 4;
  726. }
  727. if (n & 2)
  728. {
  729. LD_GP2_INC(px, 1, fx0, fx1);
  730. LD_GP2_INC(py, 1, fy0, fy1);
  731. tp0 = (c * fx0) + (s * fy0);
  732. tp1 = (c * fy0) - (s * fx0);
  733. tp2 = (c * fx1) + (s * fy1);
  734. tp3 = (c * fy1) - (s * fx1);
  735. ST_GP2_INC(tp0, tp2, x, 1);
  736. ST_GP2_INC(tp1, tp3, y, 1);
  737. }
  738. if (n & 1)
  739. {
  740. fx0 = *px;
  741. fy0 = *py;
  742. tp0 = (c * fx0) + (s * fy0);
  743. tp1 = (c * fy0) - (s * fx0);
  744. *x = tp0;
  745. *y = tp1;
  746. }
  747. }
  748. }
  /* ---- strided path (scalar code): inc_x != 1 or inc_y != 1 ---- */
  749. else
  750. {
  /* c == 0 and s == 0: store zeros; the old contents are never read. */
  751. if ((0 == c) && (0 == s))
  752. {
  753. for (i = n; i--;)
  754. {
  755. *x = 0;
  756. *y = 0;
  757. x += inc_x;
  758. y += inc_y;
  759. }
  760. }
  /* c == 1 and s == 1: x = x + y, y = y - x. */
  761. else if ((1 == c) && (1 == s))
  762. {
  /* at least one full group of 4 elements */
  763. if (n >> 2)
  764. {
  /* prologue: preload the first 4 elements of x and of y */
  765. fx0 = *px; px += inc_x;
  766. fx1 = *px; px += inc_x;
  767. fx2 = *px; px += inc_x;
  768. fx3 = *px; px += inc_x;
  769. fy0 = *py; py += inc_y;
  770. fy1 = *py; py += inc_y;
  771. fy2 = *py; py += inc_y;
  772. fy3 = *py; py += inc_y;
  /* each pass computes the current group, then stores it while loading the
     next one; the last group is finished after the loop */
  773. for (i = (n >> 2) -1; i--;)
  774. {
  775. tp0 = fx0 + fy0;
  776. tp1 = fy0 - fx0;
  777. tp2 = fx1 + fy1;
  778. tp3 = fy1 - fx1;
  779. tp4 = fx2 + fy2;
  780. tp5 = fy2 - fx2;
  781. tp6 = fx3 + fy3;
  782. tp7 = fy3 - fx3;
  783. fx0 = *px; px += inc_x;
  784. *x = tp0; x += inc_x;
  785. fx1 = *px; px += inc_x;
  786. *x = tp2; x += inc_x;
  787. fx2 = *px; px += inc_x;
  788. *x = tp4; x += inc_x;
  789. fx3 = *px; px += inc_x;
  790. *x = tp6; x += inc_x;
  791. fy0 = *py; py += inc_y;
  792. *y = tp1; y += inc_y;
  793. fy1 = *py; py += inc_y;
  794. *y = tp3; y += inc_y;
  795. fy2 = *py; py += inc_y;
  796. *y = tp5; y += inc_y;
  797. fy3 = *py; py += inc_y;
  798. *y = tp7; y += inc_y;
  799. }
  /* epilogue: compute and store the last preloaded group */
  800. tp0 = fx0 + fy0;
  801. tp1 = fy0 - fx0;
  802. tp2 = fx1 + fy1;
  803. tp3 = fy1 - fx1;
  804. tp4 = fx2 + fy2;
  805. tp5 = fy2 - fx2;
  806. tp6 = fx3 + fy3;
  807. tp7 = fy3 - fx3;
  808. *x = tp0; x += inc_x;
  809. *x = tp2; x += inc_x;
  810. *x = tp4; x += inc_x;
  811. *x = tp6; x += inc_x;
  812. *y = tp1; y += inc_y;
  813. *y = tp3; y += inc_y;
  814. *y = tp5; y += inc_y;
  815. *y = tp7; y += inc_y;
  816. }
  /* tails: remaining 2 and/or 1 elements */
  817. if (n & 2)
  818. {
  819. LD_GP2_INC(px, inc_x, fx0, fx1);
  820. LD_GP2_INC(py, inc_y, fy0, fy1);
  821. tp0 = fx0 + fy0;
  822. tp1 = fy0 - fx0;
  823. tp2 = fx1 + fy1;
  824. tp3 = fy1 - fx1;
  825. ST_GP2_INC(tp0, tp2, x, inc_x);
  826. ST_GP2_INC(tp1, tp3, y, inc_y);
  827. }
  828. if (n & 1)
  829. {
  830. fx0 = *px;
  831. fy0 = *py;
  832. tp0 = fx0 + fy0;
  833. tp1 = fy0 - fx0;
  834. *x = tp0;
  835. *y = tp1;
  836. }
  837. }
  /* s == 0: x = c*x, y = c*y. */
  838. else if (0 == s)
  839. {
  /* at least one full group of 4 elements; same prologue / loop / epilogue
     shape as the (1, 1) case above */
  840. if (n >> 2)
  841. {
  842. fx0 = *px; px += inc_x;
  843. fx1 = *px; px += inc_x;
  844. fx2 = *px; px += inc_x;
  845. fx3 = *px; px += inc_x;
  846. fy0 = *py; py += inc_y;
  847. fy1 = *py; py += inc_y;
  848. fy2 = *py; py += inc_y;
  849. fy3 = *py; py += inc_y;
  850. for (i = (n >> 2) - 1; i--;)
  851. {
  852. tp0 = c * fx0;
  853. tp1 = c * fy0;
  854. tp2 = c * fx1;
  855. tp3 = c * fy1;
  856. tp4 = c * fx2;
  857. tp5 = c * fy2;
  858. tp6 = c * fx3;
  859. tp7 = c * fy3;
  860. fx0 = *px; px += inc_x;
  861. *x = tp0; x += inc_x;
  862. fx1 = *px; px += inc_x;
  863. *x = tp2; x += inc_x;
  864. fx2 = *px; px += inc_x;
  865. *x = tp4; x += inc_x;
  866. fx3 = *px; px += inc_x;
  867. *x = tp6; x += inc_x;
  868. fy0 = *py; py += inc_y;
  869. *y = tp1; y += inc_y;
  870. fy1 = *py; py += inc_y;
  871. *y = tp3; y += inc_y;
  872. fy2 = *py; py += inc_y;
  873. *y = tp5; y += inc_y;
  874. fy3 = *py; py += inc_y;
  875. *y = tp7; y += inc_y;
  876. }
  /* epilogue: compute and store the last preloaded group */
  877. tp0 = c * fx0;
  878. tp1 = c * fy0;
  879. tp2 = c * fx1;
  880. tp3 = c * fy1;
  881. tp4 = c * fx2;
  882. tp5 = c * fy2;
  883. tp6 = c * fx3;
  884. tp7 = c * fy3;
  885. *x = tp0; x += inc_x;
  886. *x = tp2; x += inc_x;
  887. *x = tp4; x += inc_x;
  888. *x = tp6; x += inc_x;
  889. *y = tp1; y += inc_y;
  890. *y = tp3; y += inc_y;
  891. *y = tp5; y += inc_y;
  892. *y = tp7; y += inc_y;
  893. }
  /* tails: remaining 2 and/or 1 elements */
  894. if (n & 2)
  895. {
  896. LD_GP2_INC(px, inc_x, fx0, fx1);
  897. LD_GP2_INC(py, inc_y, fy0, fy1);
  898. tp0 = c * fx0;
  899. tp1 = c * fy0;
  900. tp2 = c * fx1;
  901. tp3 = c * fy1;
  902. ST_GP2_INC(tp0, tp2, x, inc_x);
  903. ST_GP2_INC(tp1, tp3, y, inc_y);
  904. }
  905. if (n & 1)
  906. {
  907. fx0 = *px;
  908. fy0 = *py;
  909. tp0 = c * fx0;
  910. tp1 = c * fy0;
  911. *x = tp0;
  912. *y = tp1;
  913. }
  914. }
  /* general case; unlike the unit-stride path there is no dedicated c == 0
     branch here, so that case also uses the full formula. */
  915. else
  916. {
  /* at least one full group of 4 elements; same prologue / loop / epilogue
     shape as the cases above */
  917. if (n >> 2)
  918. {
  919. fx0 = *px; px += inc_x;
  920. fx1 = *px; px += inc_x;
  921. fx2 = *px; px += inc_x;
  922. fx3 = *px; px += inc_x;
  923. fy0 = *py; py += inc_y;
  924. fy1 = *py; py += inc_y;
  925. fy2 = *py; py += inc_y;
  926. fy3 = *py; py += inc_y;
  927. for (i = (n >> 2) - 1; i--;)
  928. {
  929. tp0 = c * fx0 + s * fy0;
  930. tp1 = c * fy0 - s * fx0;
  931. tp2 = c * fx1 + s * fy1;
  932. tp3 = c * fy1 - s * fx1;
  933. tp4 = c * fx2 + s * fy2;
  934. tp5 = c * fy2 - s * fx2;
  935. tp6 = c * fx3 + s * fy3;
  936. tp7 = c * fy3 - s * fx3;
  937. fx0 = *px; px += inc_x;
  938. *x = tp0; x += inc_x;
  939. fx1 = *px; px += inc_x;
  940. *x = tp2; x += inc_x;
  941. fx2 = *px; px += inc_x;
  942. *x = tp4; x += inc_x;
  943. fx3 = *px; px += inc_x;
  944. *x = tp6; x += inc_x;
  945. fy0 = *py; py += inc_y;
  946. *y = tp1; y += inc_y;
  947. fy1 = *py; py += inc_y;
  948. *y = tp3; y += inc_y;
  949. fy2 = *py; py += inc_y;
  950. *y = tp5; y += inc_y;
  951. fy3 = *py; py += inc_y;
  952. *y = tp7; y += inc_y;
  953. }
  /* epilogue: compute and store the last preloaded group */
  954. tp0 = c * fx0 + s * fy0;
  955. tp1 = c * fy0 - s * fx0;
  956. tp2 = c * fx1 + s * fy1;
  957. tp3 = c * fy1 - s * fx1;
  958. tp4 = c * fx2 + s * fy2;
  959. tp5 = c * fy2 - s * fx2;
  960. tp6 = c * fx3 + s * fy3;
  961. tp7 = c * fy3 - s * fx3;
  962. *x = tp0; x += inc_x;
  963. *x = tp2; x += inc_x;
  964. *x = tp4; x += inc_x;
  965. *x = tp6; x += inc_x;
  966. *y = tp1; y += inc_y;
  967. *y = tp3; y += inc_y;
  968. *y = tp5; y += inc_y;
  969. *y = tp7; y += inc_y;
  970. }
  /* tails: remaining 2 and/or 1 elements */
  971. if (n & 2)
  972. {
  973. LD_GP2_INC(px, inc_x, fx0, fx1);
  974. LD_GP2_INC(py, inc_y, fy0, fy1);
  975. tp0 = c * fx0 + s * fy0;
  976. tp1 = c * fy0 - s * fx0;
  977. tp2 = c * fx1 + s * fy1;
  978. tp3 = c * fy1 - s * fx1;
  979. ST_GP2_INC(tp0, tp2, x, inc_x);
  980. ST_GP2_INC(tp1, tp3, y, inc_y);
  981. }
  982. if (n & 1)
  983. {
  984. fx0 = *px;
  985. fy0 = *py;
  986. tp0 = c * fx0 + s * fy0;
  987. tp1 = c * fy0 - s * fx0;
  988. *x = tp0;
  989. *y = tp1;
  990. }
  991. }
  992. }
  993. return 0;
  994. }