You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zrot_msa.c 63 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. #define PROCESS_ZROT(inc_x2, inc_y2) \
  30. if ((0 == c) && (0 == s)) \
  31. { \
  32. v2f64 zero = {0, 0}; \
  33. zero = (v2f64) __msa_insert_d((v2i64) zero, 0, 0.0); \
  34. zero = (v2f64) __msa_insert_d((v2i64) zero, 1, 0.0); \
  35. \
  36. /* process 4 floats */ \
  37. for (j = (n >> 1); j--;) \
  38. { \
  39. ST_DP2_INC(zero, zero, px, inc_x2); \
  40. ST_DP2_INC(zero, zero, py, inc_y2); \
  41. } \
  42. \
  43. if (n & 1) \
  44. { \
  45. ST_DP(zero, px); \
  46. ST_DP(zero, py); \
  47. } \
  48. } \
  49. else if ((1 == c) && (1 == s)) \
  50. { \
  51. /* process 8 elements */ \
  52. if (n >> 3) \
  53. { \
  54. BLASLONG pref_offsetx, pref_offsety; \
  55. \
  56. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \
  57. if (pref_offsetx > 0) \
  58. { \
  59. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \
  60. pref_offsetx = pref_offsetx / sizeof(FLOAT); \
  61. } \
  62. \
  63. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \
  64. if (pref_offsety > 0) \
  65. { \
  66. pref_offsety = L1_DATA_LINESIZE - pref_offsety; \
  67. pref_offsety = pref_offsety / sizeof(FLOAT); \
  68. } \
  69. \
  70. x0 = LD_DP(px); px += inc_x2; \
  71. x1 = LD_DP(px); px += inc_x2; \
  72. x2 = LD_DP(px); px += inc_x2; \
  73. x3 = LD_DP(px); px += inc_x2; \
  74. y0 = LD_DP(py); py += inc_y2; \
  75. y1 = LD_DP(py); py += inc_y2; \
  76. y2 = LD_DP(py); py += inc_y2; \
  77. y3 = LD_DP(py); py += inc_y2; \
  78. \
  79. for (j = (n >> 3) - 1; j--;) \
  80. { \
  81. PREFETCH(px + pref_offsetx + 16); \
  82. PREFETCH(px + pref_offsetx + 20); \
  83. PREFETCH(px + pref_offsetx + 24); \
  84. PREFETCH(px + pref_offsetx + 28); \
  85. PREFETCH(py + pref_offsety + 16); \
  86. PREFETCH(py + pref_offsety + 20); \
  87. PREFETCH(py + pref_offsety + 24); \
  88. PREFETCH(py + pref_offsety + 28); \
  89. \
  90. out0 = x0 + y0; \
  91. x4 = LD_DP(px); px += inc_x2; \
  92. out1 = y0 - x0; \
  93. x5 = LD_DP(px); px += inc_x2; \
  94. out2 = x1 + y1; \
  95. x6 = LD_DP(px); px += inc_x2; \
  96. out3 = y1 - x1; \
  97. x7 = LD_DP(px); px += inc_x2; \
  98. out4 = x2 + y2; \
  99. y4 = LD_DP(py); py += inc_y2; \
  100. out5 = y2 - x2; \
  101. y5 = LD_DP(py); py += inc_y2; \
  102. out6 = x3 + y3; \
  103. y6 = LD_DP(py); py += inc_y2; \
  104. out7 = y3 - x3; \
  105. y7 = LD_DP(py); py += inc_y2; \
  106. \
  107. ST_DP(out0, x); x += inc_x2; \
  108. out8 = x4 + y4; \
  109. ST_DP(out1, y); y += inc_y2; \
  110. out9 = y4 - x4; \
  111. ST_DP(out2, x); x += inc_x2; \
  112. out10 = x5 + y5; \
  113. ST_DP(out3, y); y += inc_y2; \
  114. out11 = y5 - x5; \
  115. ST_DP(out4, x); x += inc_x2; \
  116. out12 = x6 + y6; \
  117. ST_DP(out5, y); y += inc_y2; \
  118. out13 = y6 - x6; \
  119. ST_DP(out6, x); x += inc_x2; \
  120. out14 = x7 + y7; \
  121. ST_DP(out7, y); y += inc_y2; \
  122. out15 = y7 - x7; \
  123. \
  124. x0 = LD_DP(px); px += inc_x2; \
  125. ST_DP(out8, x); x += inc_x2; \
  126. x1 = LD_DP(px); px += inc_x2; \
  127. ST_DP(out10, x); x += inc_x2; \
  128. x2 = LD_DP(px); px += inc_x2; \
  129. ST_DP(out12, x); x += inc_x2; \
  130. x3 = LD_DP(px); px += inc_x2; \
  131. ST_DP(out14, x); x += inc_x2; \
  132. \
  133. y0 = LD_DP(py); py += inc_y2; \
  134. ST_DP(out9, y); y += inc_y2; \
  135. y1 = LD_DP(py); py += inc_y2; \
  136. ST_DP(out11, y); y += inc_y2; \
  137. y2 = LD_DP(py); py += inc_y2; \
  138. ST_DP(out13, y); y += inc_y2; \
  139. y3 = LD_DP(py); py += inc_y2; \
  140. ST_DP(out15, y); y += inc_y2; \
  141. } \
  142. \
  143. x4 = LD_DP(px); px += inc_x2; \
  144. x5 = LD_DP(px); px += inc_x2; \
  145. x6 = LD_DP(px); px += inc_x2; \
  146. x7 = LD_DP(px); px += inc_x2; \
  147. y4 = LD_DP(py); py += inc_y2; \
  148. y5 = LD_DP(py); py += inc_y2; \
  149. y6 = LD_DP(py); py += inc_y2; \
  150. y7 = LD_DP(py); py += inc_y2; \
  151. \
  152. out0 = x0 + y0; \
  153. out1 = y0 - x0; \
  154. out2 = x1 + y1; \
  155. out3 = y1 - x1; \
  156. out4 = x2 + y2; \
  157. out5 = y2 - x2; \
  158. out6 = x3 + y3; \
  159. out7 = y3 - x3; \
  160. out8 = x4 + y4; \
  161. out9 = y4 - x4; \
  162. out10 = x5 + y5; \
  163. out11 = y5 - x5; \
  164. out12 = x6 + y6; \
  165. out13 = y6 - x6; \
  166. out14 = x7 + y7; \
  167. out15 = y7 - x7; \
  168. \
  169. ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, inc_x2); \
  170. ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, inc_y2); \
  171. } \
  172. if (n & 4) \
  173. { \
  174. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
  175. LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
  176. \
  177. out0 = x0 + y0; \
  178. out1 = y0 - x0; \
  179. out2 = x1 + y1; \
  180. out3 = y1 - x1; \
  181. out4 = x2 + y2; \
  182. out5 = y2 - x2; \
  183. out6 = x3 + y3; \
  184. out7 = y3 - x3; \
  185. \
  186. ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
  187. ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
  188. } \
  189. if (n & 2) \
  190. { \
  191. LD_DP2_INC(px, inc_x2, x0, x1); \
  192. LD_DP2_INC(py, inc_y2, y0, y1); \
  193. \
  194. out0 = x0 + y0; \
  195. out1 = y0 - x0; \
  196. out2 = x1 + y1; \
  197. out3 = y1 - x1; \
  198. \
  199. ST_DP2_INC(out0, out2, x, inc_x2); \
  200. ST_DP2_INC(out1, out3, y, inc_y2); \
  201. } \
  202. if (n & 1) \
  203. { \
  204. x0 = LD_DP(px); \
  205. y0 = LD_DP(py); \
  206. \
  207. out0 = x0 + y0; \
  208. out1 = y0 - x0; \
  209. \
  210. ST_DP(out0, px); \
  211. ST_DP(out1, py); \
  212. } \
  213. } \
  214. else if (0 == s) \
  215. { \
  216. c0 = COPY_DOUBLE_TO_VECTOR(c); \
  217. \
  218. if (n >> 3) \
  219. { \
  220. BLASLONG pref_offsetx, pref_offsety; \
  221. \
  222. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \
  223. if (pref_offsetx > 0) \
  224. { \
  225. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \
  226. pref_offsetx = pref_offsetx / sizeof(FLOAT); \
  227. } \
  228. \
  229. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \
  230. if (pref_offsety > 0) \
  231. { \
  232. pref_offsety = L1_DATA_LINESIZE - pref_offsety; \
  233. pref_offsety = pref_offsety / sizeof(FLOAT); \
  234. } \
  235. \
  236. LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); \
  237. \
  238. for (j = (n >> 3) - 1; j--;) \
  239. { \
  240. PREFETCH(px + pref_offsetx + 16); \
  241. PREFETCH(px + pref_offsetx + 20); \
  242. PREFETCH(px + pref_offsetx + 24); \
  243. PREFETCH(px + pref_offsetx + 28); \
  244. PREFETCH(py + pref_offsety + 16); \
  245. PREFETCH(py + pref_offsety + 20); \
  246. PREFETCH(py + pref_offsety + 24); \
  247. PREFETCH(py + pref_offsety + 28); \
  248. \
  249. y0 = LD_DP(py); py += inc_y2; \
  250. x0 *= c0; \
  251. y1 = LD_DP(py); py += inc_y2; \
  252. x1 *= c0; \
  253. y2 = LD_DP(py); py += inc_y2; \
  254. x2 *= c0; \
  255. y3 = LD_DP(py); py += inc_y2; \
  256. x3 *= c0; \
  257. y4 = LD_DP(py); py += inc_y2; \
  258. x4 *= c0; \
  259. y5 = LD_DP(py); py += inc_y2; \
  260. x5 *= c0; \
  261. y6 = LD_DP(py); py += inc_y2; \
  262. x6 *= c0; \
  263. y7 = LD_DP(py); py += inc_y2; \
  264. x7 *= c0; \
  265. \
  266. ST_DP(x0, x); x += inc_x2; \
  267. y0 *= c0; \
  268. ST_DP(x1, x); x += inc_x2; \
  269. y1 *= c0; \
  270. ST_DP(x2, x); x += inc_x2; \
  271. y2 *= c0; \
  272. ST_DP(x3, x); x += inc_x2; \
  273. y3 *= c0; \
  274. ST_DP(x4, x); x += inc_x2; \
  275. y4 *= c0; \
  276. ST_DP(x5, x); x += inc_x2; \
  277. y5 *= c0; \
  278. ST_DP(x6, x); x += inc_x2; \
  279. y6 *= c0; \
  280. ST_DP(x7, x); x += inc_x2; \
  281. y7 *= c0; \
  282. \
  283. x0 = LD_DP(px); px += inc_x2; \
  284. ST_DP(y0, y); y += inc_y2; \
  285. x1 = LD_DP(px); px += inc_x2; \
  286. ST_DP(y1, y); y += inc_y2; \
  287. x2 = LD_DP(px); px += inc_x2; \
  288. ST_DP(y2, y); y += inc_y2; \
  289. x3 = LD_DP(px); px += inc_x2; \
  290. ST_DP(y3, y); y += inc_y2; \
  291. x4 = LD_DP(px); px += inc_x2; \
  292. ST_DP(y4, y); y += inc_y2; \
  293. x5 = LD_DP(px); px += inc_x2; \
  294. ST_DP(y5, y); y += inc_y2; \
  295. x6 = LD_DP(px); px += inc_x2; \
  296. ST_DP(y6, y); y += inc_y2; \
  297. x7 = LD_DP(px); px += inc_x2; \
  298. ST_DP(y7, y); y += inc_y2; \
  299. } \
  300. \
  301. LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); \
  302. \
  303. x0 *= c0; \
  304. y0 *= c0; \
  305. x1 *= c0; \
  306. y1 *= c0; \
  307. x2 *= c0; \
  308. y2 *= c0; \
  309. x3 *= c0; \
  310. y3 *= c0; \
  311. x4 *= c0; \
  312. y4 *= c0; \
  313. x5 *= c0; \
  314. y5 *= c0; \
  315. x6 *= c0; \
  316. y6 *= c0; \
  317. x7 *= c0; \
  318. y7 *= c0; \
  319. \
  320. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); \
  321. ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); \
  322. } \
  323. \
  324. if (n & 4) \
  325. { \
  326. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
  327. LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
  328. \
  329. out0 = c0 * x0; \
  330. out1 = c0 * y0; \
  331. out2 = c0 * x1; \
  332. out3 = c0 * y1; \
  333. out4 = c0 * x2; \
  334. out5 = c0 * y2; \
  335. out6 = c0 * x3; \
  336. out7 = c0 * y3; \
  337. \
  338. ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
  339. ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
  340. } \
  341. if (n & 2) \
  342. { \
  343. LD_DP2_INC(px, inc_x2, x0, x1); \
  344. LD_DP2_INC(py, inc_y2, y0, y1); \
  345. \
  346. out0 = c0 * x0; \
  347. out1 = c0 * y0; \
  348. out2 = c0 * x1; \
  349. out3 = c0 * y1; \
  350. \
  351. ST_DP2_INC(out0, out2, x, inc_x2); \
  352. ST_DP2_INC(out1, out3, y, inc_y2); \
  353. } \
  354. if (n & 1) \
  355. { \
  356. x0 = LD_DP(px); \
  357. y0 = LD_DP(py); \
  358. \
  359. out0 = c0 * x0; \
  360. out1 = c0 * y0; \
  361. \
  362. ST_DP(out0, px); \
  363. ST_DP(out1, py); \
  364. } \
  365. } \
  366. else if (0 == c) \
  367. { \
  368. s0 = COPY_DOUBLE_TO_VECTOR(s); \
  369. \
  370. /* process 16 floats */ \
  371. if (n >> 3) \
  372. { \
  373. BLASLONG pref_offsetx, pref_offsety; \
  374. \
  375. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \
  376. if (pref_offsetx > 0) \
  377. { \
  378. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \
  379. pref_offsetx = pref_offsetx / sizeof(FLOAT); \
  380. } \
  381. \
  382. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \
  383. if (pref_offsety > 0) \
  384. { \
  385. pref_offsety = L1_DATA_LINESIZE - pref_offsety; \
  386. pref_offsety = pref_offsety / sizeof(FLOAT); \
  387. } \
  388. \
  389. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
  390. LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
  391. \
  392. for (j = (n >> 3) - 1; j--;) \
  393. { \
  394. PREFETCH(px + pref_offsetx + 16); \
  395. PREFETCH(px + pref_offsetx + 20); \
  396. PREFETCH(px + pref_offsetx + 24); \
  397. PREFETCH(px + pref_offsetx + 28); \
  398. PREFETCH(py + pref_offsety + 16); \
  399. PREFETCH(py + pref_offsety + 20); \
  400. PREFETCH(py + pref_offsety + 24); \
  401. PREFETCH(py + pref_offsety + 28); \
  402. \
  403. x4 = LD_DP(px); px += inc_x2; \
  404. out0 = s0 * y0; \
  405. x5 = LD_DP(px); px += inc_x2; \
  406. out2 = s0 * y1; \
  407. x6 = LD_DP(px); px += inc_x2; \
  408. out4 = s0 * y2; \
  409. x7 = LD_DP(px); px += inc_x2; \
  410. out6 = s0 * y3; \
  411. y4 = LD_DP(py); py += inc_y2; \
  412. out1 = -(s0 * x0); \
  413. y5 = LD_DP(py); py += inc_y2; \
  414. out3 = -(s0 * x1); \
  415. y6 = LD_DP(py); py += inc_y2; \
  416. out5 = -(s0 * x2); \
  417. y7 = LD_DP(py); py += inc_y2; \
  418. out7 = -(s0 * x3); \
  419. \
  420. ST_DP(out0, x); x += inc_y2; \
  421. out0 = s0 * y4; \
  422. ST_DP(out2, x); x += inc_y2; \
  423. out2 = s0 * y5; \
  424. ST_DP(out4, x); x += inc_y2; \
  425. out4 = s0 * y6; \
  426. ST_DP(out6, x); x += inc_y2; \
  427. out6 = s0 * y7; \
  428. ST_DP(out1, y); y += inc_y2; \
  429. out1 = -(s0 * x4); \
  430. ST_DP(out3, y); y += inc_y2; \
  431. out3 = -(s0 * x5); \
  432. ST_DP(out5, y); y += inc_y2; \
  433. out5 = -(s0 * x6); \
  434. ST_DP(out7, y); y += inc_y2; \
  435. out7 = -(s0 * x7); \
  436. \
  437. x0 = LD_DP(px); px += inc_x2; \
  438. ST_DP(out0, x); x += inc_y2; \
  439. x1 = LD_DP(px); px += inc_x2; \
  440. ST_DP(out2, x); x += inc_y2; \
  441. x2 = LD_DP(px); px += inc_x2; \
  442. ST_DP(out4, x); x += inc_y2; \
  443. x3 = LD_DP(px); px += inc_x2; \
  444. ST_DP(out6, x); x += inc_y2; \
  445. y0 = LD_DP(py); py += inc_y2; \
  446. ST_DP(out1, y); y += inc_y2; \
  447. y1 = LD_DP(py); py += inc_y2; \
  448. ST_DP(out3, y); y += inc_y2; \
  449. y2 = LD_DP(py); py += inc_y2; \
  450. ST_DP(out5, y); y += inc_y2; \
  451. y3 = LD_DP(py); py += inc_y2; \
  452. ST_DP(out7, y); y += inc_y2; \
  453. } \
  454. \
  455. out0 = s0 * y0; \
  456. out2 = s0 * y1; \
  457. out4 = s0 * y2; \
  458. out6 = s0 * y3; \
  459. out1 = -(s0 * x0); \
  460. out3 = -(s0 * x1); \
  461. out5 = -(s0 * x2); \
  462. out7 = -(s0 * x3); \
  463. \
  464. ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
  465. ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
  466. \
  467. LD_DP4_INC(px, inc_x2, x4, x5, x6, x7); \
  468. LD_DP4_INC(py, inc_y2, y4, y5, y6, y7); \
  469. \
  470. out0 = s0 * y4; \
  471. out2 = s0 * y5; \
  472. out4 = s0 * y6; \
  473. out6 = s0 * y7; \
  474. out1 = -(s0 * x4); \
  475. out3 = -(s0 * x5); \
  476. out5 = -(s0 * x6); \
  477. out7 = -(s0 * x7); \
  478. \
  479. ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
  480. ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
  481. } \
  482. if (n & 4) \
  483. { \
  484. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
  485. LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
  486. \
  487. out0 = s0 * y0; \
  488. out1 = - (s0 * x0); \
  489. out2 = s0 * y1; \
  490. out3 = - (s0 * x1); \
  491. out4 = s0 * y2; \
  492. out5 = - (s0 * x2); \
  493. out6 = s0 * y3; \
  494. out7 = - (s0 * x3); \
  495. \
  496. ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
  497. ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
  498. } \
  499. if (n & 2) \
  500. { \
  501. LD_DP2_INC(px, inc_x2, x0, x1); \
  502. LD_DP2_INC(py, inc_y2, y0, y1); \
  503. \
  504. out0 = s0 * y0; \
  505. out1 = - (s0 * x0); \
  506. out2 = s0 * y1; \
  507. out3 = - (s0 * x1); \
  508. \
  509. ST_DP2_INC(out0, out2, x, inc_x2); \
  510. ST_DP2_INC(out1, out3, y, inc_y2); \
  511. } \
  512. if (n & 1) \
  513. { \
  514. x0 = LD_DP(px); px += inc_x2; \
  515. y0 = LD_DP(py); py += inc_y2; \
  516. \
  517. out0 = s0 * y0; \
  518. out1 = - (s0 * x0); \
  519. \
  520. ST_DP(out0, x); x += inc_x2; \
  521. ST_DP(out1, y); y += inc_y2; \
  522. } \
  523. } \
  524. else \
  525. { \
  526. c0 = COPY_DOUBLE_TO_VECTOR(c); \
  527. s0 = COPY_DOUBLE_TO_VECTOR(s); \
  528. \
  529. if (n >> 3) \
  530. { \
  531. BLASLONG pref_offsetx, pref_offsety; \
  532. \
  533. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \
  534. if (pref_offsetx > 0) \
  535. { \
  536. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \
  537. pref_offsetx = pref_offsetx / sizeof(FLOAT); \
  538. } \
  539. \
  540. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \
  541. if (pref_offsety > 0) \
  542. { \
  543. pref_offsety = L1_DATA_LINESIZE - pref_offsety; \
  544. pref_offsety = pref_offsety / sizeof(FLOAT); \
  545. } \
  546. \
  547. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
  548. LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
  549. \
  550. for (j = (n >> 3) - 1; j--;) \
  551. { \
  552. PREFETCH(px + pref_offsetx + 16); \
  553. PREFETCH(px + pref_offsetx + 20); \
  554. PREFETCH(px + pref_offsetx + 24); \
  555. PREFETCH(px + pref_offsetx + 28); \
  556. PREFETCH(py + pref_offsety + 16); \
  557. PREFETCH(py + pref_offsety + 20); \
  558. PREFETCH(py + pref_offsety + 24); \
  559. PREFETCH(py + pref_offsety + 28); \
  560. \
  561. x4 = LD_DP(px); px += inc_x2; \
  562. out0 = c0 * x0; \
  563. x5 = LD_DP(px); px += inc_x2; \
  564. out2 = c0 * x1; \
  565. x6 = LD_DP(px); px += inc_x2; \
  566. out4 = c0 * x2; \
  567. x7 = LD_DP(px); px += inc_x2; \
  568. out6 = c0 * x3; \
  569. y4 = LD_DP(py); py += inc_y2; \
  570. out1 = c0 * y0; \
  571. y5 = LD_DP(py); py += inc_y2; \
  572. out3 = c0 * y1; \
  573. y6 = LD_DP(py); py += inc_y2; \
  574. out5 = c0 * y2; \
  575. y7 = LD_DP(py); py += inc_y2; \
  576. out7 = c0 * y3; \
  577. \
  578. out0 += s0 * y0; \
  579. out2 += s0 * y1; \
  580. out4 += s0 * y2; \
  581. out6 += s0 * y3; \
  582. out1 -= s0 * x0; \
  583. out3 -= s0 * x1; \
  584. out5 -= s0 * x2; \
  585. out7 -= s0 * x3; \
  586. \
  587. ST_DP(out0, x); x += inc_x2; \
  588. out0 = c0 * x4; \
  589. ST_DP(out2, x); x += inc_x2; \
  590. out2 = c0 * x5; \
  591. ST_DP(out4, x); x += inc_x2; \
  592. out4 = c0 * x6; \
  593. ST_DP(out6, x); x += inc_x2; \
  594. out6 = c0 * x7; \
  595. ST_DP(out1, y); y += inc_y2; \
  596. out1 = c0 * y4; \
  597. ST_DP(out3, y); y += inc_y2; \
  598. out3 = c0 * y5; \
  599. ST_DP(out5, y); y += inc_y2; \
  600. out5 = c0 * y6; \
  601. ST_DP(out7, y); y += inc_y2; \
  602. out7 = c0 * y7; \
  603. \
  604. x0 = LD_DP(px); px += inc_x2; \
  605. out0 += s0 * y4; \
  606. x1 = LD_DP(px); px += inc_x2; \
  607. out2 += s0 * y5; \
  608. x2 = LD_DP(px); px += inc_x2; \
  609. out4 += s0 * y6; \
  610. x3 = LD_DP(px); px += inc_x2; \
  611. out6 += s0 * y7; \
  612. y0 = LD_DP(py); py += inc_y2; \
  613. out1 -= s0 * x4; \
  614. y1 = LD_DP(py); py += inc_y2; \
  615. out3 -= s0 * x5; \
  616. y2 = LD_DP(py); py += inc_y2; \
  617. out5 -= s0 * x6; \
  618. y3 = LD_DP(py); py += inc_y2; \
  619. out7 -= s0 * x7; \
  620. \
  621. ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
  622. ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
  623. } \
  624. \
  625. out0 = c0 * x0; \
  626. out0 += s0 * y0; \
  627. out1 = c0 * y0; \
  628. out1 -= s0 * x0; \
  629. out2 = c0 * x1; \
  630. out2 += s0 * y1; \
  631. out3 = c0 * y1; \
  632. out3 -= s0 * x1; \
  633. out4 = c0 * x2; \
  634. out4 += s0 * y2; \
  635. out5 = c0 * y2; \
  636. out5 -= s0 * x2; \
  637. out6 = c0 * x3; \
  638. out6 += s0 * y3; \
  639. out7 = c0 * y3; \
  640. out7 -= s0 * x3; \
  641. \
  642. ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
  643. ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
  644. \
  645. LD_DP4_INC(px, inc_x2, x4, x5, x6, x7); \
  646. LD_DP4_INC(py, inc_y2, y4, y5, y6, y7); \
  647. \
  648. out8 = c0 * x4; \
  649. out8 += s0 * y4; \
  650. out9 = c0 * y4; \
  651. out9 -= s0 * x4; \
  652. out10 = c0 * x5; \
  653. out10 += s0 * y5; \
  654. out11 = c0 * y5; \
  655. out11 -= s0 * x5; \
  656. out12 = c0 * x6; \
  657. out12 += s0 * y6; \
  658. out13 = c0 * y6; \
  659. out13 -= s0 * x6; \
  660. out14 = c0 * x7; \
  661. out14 += s0 * y7; \
  662. out15 = c0 * y7; \
  663. out15 -= s0 * x7; \
  664. \
  665. ST_DP4_INC(out8, out10, out12, out14, x, inc_x2); \
  666. ST_DP4_INC(out9, out11, out13, out15, y, inc_y2); \
  667. } \
  668. if (n & 4) \
  669. { \
  670. LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
  671. LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
  672. \
  673. out0 = (c0 * x0) + (s0 * y0); \
  674. out1 = (c0 * y0) - (s0 * x0); \
  675. out2 = (c0 * x1) + (s0 * y1); \
  676. out3 = (c0 * y1) - (s0 * x1); \
  677. out4 = (c0 * x2) + (s0 * y2); \
  678. out5 = (c0 * y2) - (s0 * x2); \
  679. out6 = (c0 * x3) + (s0 * y3); \
  680. out7 = (c0 * y3) - (s0 * x3); \
  681. \
  682. ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
  683. ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
  684. } \
  685. if (n & 2) \
  686. { \
  687. LD_DP2_INC(px, inc_x2, x0, x1); \
  688. LD_DP2_INC(py, inc_y2, y0, y1); \
  689. \
  690. out0 = (c0 * x0) + (s0 * y0); \
  691. out1 = (c0 * y0) - (s0 * x0); \
  692. out2 = (c0 * x1) + (s0 * y1); \
  693. out3 = (c0 * y1) - (s0 * x1); \
  694. \
  695. ST_DP2_INC(out0, out2, x, inc_x2); \
  696. ST_DP2_INC(out1, out3, y, inc_y2); \
  697. } \
  698. if (n & 1) \
  699. { \
  700. x0 = LD_DP(px); \
  701. y0 = LD_DP(py); \
  702. \
  703. out0 = (c0 * x0) + (s0 * y0); \
  704. out1 = (c0 * y0) - (s0 * x0); \
  705. \
  706. ST_DP(out0, px); \
  707. ST_DP(out1, py); \
  708. } \
  709. }
  710. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  711. FLOAT c, FLOAT s)
  712. {
  713. BLASLONG j;
  714. FLOAT *px, *py;
  715. v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
  716. v2f64 out0, out1, out2, out3, out4, out5, out6, out7, c0, s0;
  717. v2f64 out8, out9, out10, out11, out12, out13, out14, out15;
  718. px = x;
  719. py = y;
  720. if ((1 == inc_x) && (1 == inc_y))
  721. {
  722. PROCESS_ZROT(2, 2);
  723. }
  724. else
  725. {
  726. inc_x *= 2;
  727. inc_y *= 2;
  728. PROCESS_ZROT(inc_x, inc_y);
  729. }
  730. return 0;
  731. }