You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

drot_msa.c 35 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  30. FLOAT c, FLOAT s)
  31. {
  32. BLASLONG i, j;
  33. FLOAT *px, *py;
  34. FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
  35. FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
  36. v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
  37. v2f64 out0, out1, out2, out3, out4, out5, out6, out7;
  38. v2f64 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
  39. if (n <= 0) return (0);
  40. px = x;
  41. py = y;
  42. if ((1 == inc_x) && (1 == inc_y))
  43. {
  44. if ((0 == c) && (0 == s))
  45. {
  46. v2f64 zero = {0, 0};
  47. zero = (v2f64) __msa_insert_d((v2i64) zero, 0, 0.0);
  48. zero = (v2f64) __msa_insert_d((v2i64) zero, 1, 0.0);
  49. /* process 4 floats */
  50. for (j = (n >> 2); j--;)
  51. {
  52. ST_DP(zero, px);
  53. ST_DP(zero, py);
  54. px += 2;
  55. py += 2;
  56. ST_DP(zero, px);
  57. ST_DP(zero, py);
  58. px += 2;
  59. py += 2;
  60. }
  61. if (n & 2)
  62. {
  63. ST_DP(zero, px);
  64. ST_DP(zero, py);
  65. px += 2;
  66. py += 2;
  67. }
  68. if (n & 1)
  69. {
  70. px[0] = 0;
  71. py[0] = 0;
  72. }
  73. }
  74. else if ((1 == c) && (1 == s))
  75. {
  76. if (n >> 4)
  77. {
  78. BLASLONG pref_offsetx, pref_offsety;
  79. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  80. if (pref_offsetx > 0)
  81. {
  82. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  83. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  84. }
  85. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  86. if (pref_offsety > 0)
  87. {
  88. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  89. pref_offsety = pref_offsety / sizeof(FLOAT);
  90. }
  91. x0 = LD_DP(px); px += 2;
  92. x1 = LD_DP(px); px += 2;
  93. x2 = LD_DP(px); px += 2;
  94. x3 = LD_DP(px); px += 2;
  95. y0 = LD_DP(py); py += 2;
  96. y1 = LD_DP(py); py += 2;
  97. y2 = LD_DP(py); py += 2;
  98. y3 = LD_DP(py); py += 2;
  99. for (j = (n >> 4) - 1; j--;)
  100. {
  101. PREFETCH(px + pref_offsetx + 16);
  102. PREFETCH(px + pref_offsetx + 20);
  103. PREFETCH(px + pref_offsetx + 24);
  104. PREFETCH(px + pref_offsetx + 28);
  105. PREFETCH(py + pref_offsety + 16);
  106. PREFETCH(py + pref_offsety + 20);
  107. PREFETCH(py + pref_offsety + 24);
  108. PREFETCH(py + pref_offsety + 28);
  109. out0 = x0 + y0;
  110. x4 = LD_DP(px); px += 2;
  111. out1 = y0 - x0;
  112. x5 = LD_DP(px); px += 2;
  113. out2 = x1 + y1;
  114. x6 = LD_DP(px); px += 2;
  115. out3 = y1 - x1;
  116. x7 = LD_DP(px); px += 2;
  117. out4 = x2 + y2;
  118. y4 = LD_DP(py); py += 2;
  119. out5 = y2 - x2;
  120. y5 = LD_DP(py); py += 2;
  121. out6 = x3 + y3;
  122. y6 = LD_DP(py); py += 2;
  123. out7 = y3 - x3;
  124. y7 = LD_DP(py); py += 2;
  125. ST_DP(out0, x); x += 2;
  126. out8 = x4 + y4;
  127. ST_DP(out1, y); y += 2;
  128. out9 = y4 - x4;
  129. ST_DP(out2, x); x += 2;
  130. out10 = x5 + y5;
  131. ST_DP(out3, y); y += 2;
  132. out11 = y5 - x5;
  133. ST_DP(out4, x); x += 2;
  134. out12 = x6 + y6;
  135. ST_DP(out5, y); y += 2;
  136. out13 = y6 - x6;
  137. ST_DP(out6, x); x += 2;
  138. out14 = x7 + y7;
  139. ST_DP(out7, y); y += 2;
  140. out15 = y7 - x7;
  141. x0 = LD_DP(px); px += 2;
  142. ST_DP(out8, x); x += 2;
  143. x1 = LD_DP(px); px += 2;
  144. ST_DP(out10, x); x += 2;
  145. x2 = LD_DP(px); px += 2;
  146. ST_DP(out12, x); x += 2;
  147. x3 = LD_DP(px); px += 2;
  148. ST_DP(out14, x); x += 2;
  149. y0 = LD_DP(py); py += 2;
  150. ST_DP(out9, y); y += 2;
  151. y1 = LD_DP(py); py += 2;
  152. ST_DP(out11, y); y += 2;
  153. y2 = LD_DP(py); py += 2;
  154. ST_DP(out13, y); y += 2;
  155. y3 = LD_DP(py); py += 2;
  156. ST_DP(out15, y); y += 2;
  157. }
  158. x4 = LD_DP(px); px += 2;
  159. x5 = LD_DP(px); px += 2;
  160. x6 = LD_DP(px); px += 2;
  161. x7 = LD_DP(px); px += 2;
  162. y4 = LD_DP(py); py += 2;
  163. y5 = LD_DP(py); py += 2;
  164. y6 = LD_DP(py); py += 2;
  165. y7 = LD_DP(py); py += 2;
  166. out0 = x0 + y0;
  167. out1 = y0 - x0;
  168. out2 = x1 + y1;
  169. out3 = y1 - x1;
  170. out4 = x2 + y2;
  171. out5 = y2 - x2;
  172. out6 = x3 + y3;
  173. out7 = y3 - x3;
  174. out8 = x4 + y4;
  175. out9 = y4 - x4;
  176. out10 = x5 + y5;
  177. out11 = y5 - x5;
  178. out12 = x6 + y6;
  179. out13 = y6 - x6;
  180. out14 = x7 + y7;
  181. out15 = y7 - x7;
  182. ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 2);
  183. ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 2);
  184. }
  185. if (n & 8)
  186. {
  187. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  188. LD_DP4_INC(py, 2, y0, y1, y2, y3);
  189. out0 = x0 + y0;
  190. out1 = y0 - x0;
  191. out2 = x1 + y1;
  192. out3 = y1 - x1;
  193. out4 = x2 + y2;
  194. out5 = y2 - x2;
  195. out6 = x3 + y3;
  196. out7 = y3 - x3;
  197. ST_DP4_INC(out0, out2, out4, out6, x, 2);
  198. ST_DP4_INC(out1, out3, out5, out7, y, 2);
  199. }
  200. if (n & 4)
  201. {
  202. LD_DP2_INC(px, 2, x0, x1);
  203. LD_DP2_INC(py, 2, y0, y1);
  204. out0 = x0 + y0;
  205. out1 = y0 - x0;
  206. out2 = x1 + y1;
  207. out3 = y1 - x1;
  208. ST_DP2_INC(out0, out2, x, 2);
  209. ST_DP2_INC(out1, out3, y, 2);
  210. }
  211. if (n & 2)
  212. {
  213. x0 = LD_DP(px);
  214. y0 = LD_DP(py);
  215. px += 2;
  216. py += 2;
  217. out0 = x0 + y0;
  218. out1 = y0 - x0;
  219. ST_DP(out0, x);
  220. ST_DP(out1, y);
  221. x += 2;
  222. y += 2;
  223. }
  224. if (n & 1)
  225. {
  226. tp0 = *x + *y;
  227. *y = *y - *x;
  228. *x = tp0;
  229. }
  230. }
  231. else if (0 == s)
  232. {
  233. c0 = COPY_DOUBLE_TO_VECTOR(c);
  234. if (n >> 4)
  235. {
  236. BLASLONG pref_offsetx, pref_offsety;
  237. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  238. if (pref_offsetx > 0)
  239. {
  240. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  241. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  242. }
  243. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  244. if (pref_offsety > 0)
  245. {
  246. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  247. pref_offsety = pref_offsety / sizeof(FLOAT);
  248. }
  249. LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
  250. for (j = (n >> 4) - 1; j--;)
  251. {
  252. PREFETCH(px + pref_offsetx + 16);
  253. PREFETCH(px + pref_offsetx + 20);
  254. PREFETCH(px + pref_offsetx + 24);
  255. PREFETCH(px + pref_offsetx + 28);
  256. PREFETCH(py + pref_offsety + 16);
  257. PREFETCH(py + pref_offsety + 20);
  258. PREFETCH(py + pref_offsety + 24);
  259. PREFETCH(py + pref_offsety + 28);
  260. y0 = LD_DP(py); py += 2;
  261. x0 *= c0;
  262. y1 = LD_DP(py); py += 2;
  263. x1 *= c0;
  264. y2 = LD_DP(py); py += 2;
  265. x2 *= c0;
  266. y3 = LD_DP(py); py += 2;
  267. x3 *= c0;
  268. y4 = LD_DP(py); py += 2;
  269. x4 *= c0;
  270. y5 = LD_DP(py); py += 2;
  271. x5 *= c0;
  272. y6 = LD_DP(py); py += 2;
  273. x6 *= c0;
  274. y7 = LD_DP(py); py += 2;
  275. x7 *= c0;
  276. ST_DP(x0, x); x += 2;
  277. y0 *= c0;
  278. ST_DP(x1, x); x += 2;
  279. y1 *= c0;
  280. ST_DP(x2, x); x += 2;
  281. y2 *= c0;
  282. ST_DP(x3, x); x += 2;
  283. y3 *= c0;
  284. ST_DP(x4, x); x += 2;
  285. y4 *= c0;
  286. ST_DP(x5, x); x += 2;
  287. y5 *= c0;
  288. ST_DP(x6, x); x += 2;
  289. y6 *= c0;
  290. ST_DP(x7, x); x += 2;
  291. y7 *= c0;
  292. x0 = LD_DP(px); px += 2;
  293. ST_DP(y0, y); y += 2;
  294. x1 = LD_DP(px); px += 2;
  295. ST_DP(y1, y); y += 2;
  296. x2 = LD_DP(px); px += 2;
  297. ST_DP(y2, y); y += 2;
  298. x3 = LD_DP(px); px += 2;
  299. ST_DP(y3, y); y += 2;
  300. x4 = LD_DP(px); px += 2;
  301. ST_DP(y4, y); y += 2;
  302. x5 = LD_DP(px); px += 2;
  303. ST_DP(y5, y); y += 2;
  304. x6 = LD_DP(px); px += 2;
  305. ST_DP(y6, y); y += 2;
  306. x7 = LD_DP(px); px += 2;
  307. ST_DP(y7, y); y += 2;
  308. }
  309. LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
  310. x0 *= c0;
  311. y0 *= c0;
  312. x1 *= c0;
  313. y1 *= c0;
  314. x2 *= c0;
  315. y2 *= c0;
  316. x3 *= c0;
  317. y3 *= c0;
  318. x4 *= c0;
  319. y4 *= c0;
  320. x5 *= c0;
  321. y5 *= c0;
  322. x6 *= c0;
  323. y6 *= c0;
  324. x7 *= c0;
  325. y7 *= c0;
  326. ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
  327. ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
  328. }
  329. if (n & 8)
  330. {
  331. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  332. LD_DP4_INC(py, 2, y0, y1, y2, y3);
  333. out0 = c0 * x0;
  334. out1 = c0 * y0;
  335. out2 = c0 * x1;
  336. out3 = c0 * y1;
  337. out4 = c0 * x2;
  338. out5 = c0 * y2;
  339. out6 = c0 * x3;
  340. out7 = c0 * y3;
  341. ST_DP4_INC(out0, out2, out4, out6, x, 2);
  342. ST_DP4_INC(out1, out3, out5, out7, y, 2);
  343. }
  344. if (n & 4)
  345. {
  346. LD_DP2_INC(px, 2, x0, x1);
  347. LD_DP2_INC(py, 2, y0, y1);
  348. out0 = c0 * x0;
  349. out1 = c0 * y0;
  350. out2 = c0 * x1;
  351. out3 = c0 * y1;
  352. ST_DP2_INC(out0, out2, x, 2);
  353. ST_DP2_INC(out1, out3, y, 2);
  354. }
  355. if (n & 2)
  356. {
  357. x0 = LD_DP(px);
  358. y0 = LD_DP(py);
  359. px += 2;
  360. py += 2;
  361. out0 = c0 * x0;
  362. out1 = c0 * y0;
  363. ST_DP(out0, x);
  364. ST_DP(out1, y);
  365. x += 2;
  366. y += 2;
  367. }
  368. if (n & 1)
  369. {
  370. *x *= c;
  371. *y *= c;
  372. }
  373. }
  374. else if (0 == c)
  375. {
  376. s0 = COPY_DOUBLE_TO_VECTOR(s);
  377. /* process 16 floats */
  378. if (n >> 4)
  379. {
  380. BLASLONG pref_offsetx, pref_offsety;
  381. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  382. if (pref_offsetx > 0)
  383. {
  384. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  385. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  386. }
  387. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  388. if (pref_offsety > 0)
  389. {
  390. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  391. pref_offsety = pref_offsety / sizeof(FLOAT);
  392. }
  393. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  394. LD_DP4_INC(py, 2, y0, y1, y2, y3);
  395. for (j = (n >> 4) - 1; j--;)
  396. {
  397. PREFETCH(px + pref_offsetx + 16);
  398. PREFETCH(px + pref_offsetx + 20);
  399. PREFETCH(px + pref_offsetx + 24);
  400. PREFETCH(px + pref_offsetx + 28);
  401. PREFETCH(py + pref_offsety + 16);
  402. PREFETCH(py + pref_offsety + 20);
  403. PREFETCH(py + pref_offsety + 24);
  404. PREFETCH(py + pref_offsety + 28);
  405. x4 = LD_DP(px); px += 2;
  406. out0 = s0 * y0;
  407. x5 = LD_DP(px); px += 2;
  408. out2 = s0 * y1;
  409. x6 = LD_DP(px); px += 2;
  410. out4 = s0 * y2;
  411. x7 = LD_DP(px); px += 2;
  412. out6 = s0 * y3;
  413. y4 = LD_DP(py); py += 2;
  414. out1 = -(s0 * x0);
  415. y5 = LD_DP(py); py += 2;
  416. out3 = -(s0 * x1);
  417. y6 = LD_DP(py); py += 2;
  418. out5 = -(s0 * x2);
  419. y7 = LD_DP(py); py += 2;
  420. out7 = -(s0 * x3);
  421. ST_DP(out0, x); x += 2;
  422. out0 = s0 * y4;
  423. ST_DP(out2, x); x += 2;
  424. out2 = s0 * y5;
  425. ST_DP(out4, x); x += 2;
  426. out4 = s0 * y6;
  427. ST_DP(out6, x); x += 2;
  428. out6 = s0 * y7;
  429. ST_DP(out1, y); y += 2;
  430. out1 = -(s0 * x4);
  431. ST_DP(out3, y); y += 2;
  432. out3 = -(s0 * x5);
  433. ST_DP(out5, y); y += 2;
  434. out5 = -(s0 * x6);
  435. ST_DP(out7, y); y += 2;
  436. out7 = -(s0 * x7);
  437. x0 = LD_DP(px); px += 2;
  438. ST_DP(out0, x); x += 2;
  439. x1 = LD_DP(px); px += 2;
  440. ST_DP(out2, x); x += 2;
  441. x2 = LD_DP(px); px += 2;
  442. ST_DP(out4, x); x += 2;
  443. x3 = LD_DP(px); px += 2;
  444. ST_DP(out6, x); x += 2;
  445. y0 = LD_DP(py); py += 2;
  446. ST_DP(out1, y); y += 2;
  447. y1 = LD_DP(py); py += 2;
  448. ST_DP(out3, y); y += 2;
  449. y2 = LD_DP(py); py += 2;
  450. ST_DP(out5, y); y += 2;
  451. y3 = LD_DP(py); py += 2;
  452. ST_DP(out7, y); y += 2;
  453. }
  454. out0 = s0 * y0;
  455. out2 = s0 * y1;
  456. out4 = s0 * y2;
  457. out6 = s0 * y3;
  458. out1 = -(s0 * x0);
  459. out3 = -(s0 * x1);
  460. out5 = -(s0 * x2);
  461. out7 = -(s0 * x3);
  462. ST_DP4_INC(out0, out2, out4, out6, x, 2);
  463. ST_DP4_INC(out1, out3, out5, out7, y, 2);
  464. LD_DP4_INC(px, 2, x4, x5, x6, x7);
  465. LD_DP4_INC(py, 2, y4, y5, y6, y7);
  466. out0 = s0 * y4;
  467. out2 = s0 * y5;
  468. out4 = s0 * y6;
  469. out6 = s0 * y7;
  470. out1 = -(s0 * x4);
  471. out3 = -(s0 * x5);
  472. out5 = -(s0 * x6);
  473. out7 = -(s0 * x7);
  474. ST_DP4_INC(out0, out2, out4, out6, x, 2);
  475. ST_DP4_INC(out1, out3, out5, out7, y, 2);
  476. }
  477. if (n & 8)
  478. {
  479. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  480. LD_DP4_INC(py, 2, y0, y1, y2, y3);
  481. out0 = s0 * y0;
  482. out1 = - (s0 * x0);
  483. out2 = s0 * y1;
  484. out3 = - (s0 * x1);
  485. out4 = s0 * y2;
  486. out5 = - (s0 * x2);
  487. out6 = s0 * y3;
  488. out7 = - (s0 * x3);
  489. ST_DP4_INC(out0, out2, out4, out6, x, 2);
  490. ST_DP4_INC(out1, out3, out5, out7, y, 2);
  491. }
  492. if (n & 4)
  493. {
  494. LD_DP2_INC(px, 2, x0, x1);
  495. LD_DP2_INC(py, 2, y0, y1);
  496. out0 = s0 * y0;
  497. out1 = - (s0 * x0);
  498. out2 = s0 * y1;
  499. out3 = - (s0 * x1);
  500. ST_DP2_INC(out0, out2, x, 2);
  501. ST_DP2_INC(out1, out3, y, 2);
  502. }
  503. if (n & 2)
  504. {
  505. x0 = LD_DP(px); px += 2;
  506. y0 = LD_DP(py); py += 2;
  507. out0 = s0 * y0;
  508. out1 = - (s0 * x0);
  509. ST_DP(out0, x); x += 2;
  510. ST_DP(out1, y); y += 2;
  511. }
  512. if (n & 1)
  513. {
  514. LD_GP2_INC(px, 1, fx0, fx1);
  515. LD_GP2_INC(py, 1, fy0, fy1);
  516. tp0 = s * fy0;
  517. tp1 = - (s * fx0);
  518. tp2 = s * fy1;
  519. tp3 = - (s * fx1);
  520. ST_GP2_INC(tp0, tp2, x, 1);
  521. ST_GP2_INC(tp1, tp3, y, 1);
  522. }
  523. }
  524. else
  525. {
  526. c0 = COPY_DOUBLE_TO_VECTOR(c);
  527. s0 = COPY_DOUBLE_TO_VECTOR(s);
  528. /* process 14 doubles */
  529. if (n >> 4)
  530. {
  531. BLASLONG pref_offsetx, pref_offsety;
  532. pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
  533. if (pref_offsetx > 0)
  534. {
  535. pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
  536. pref_offsetx = pref_offsetx / sizeof(FLOAT);
  537. }
  538. pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
  539. if (pref_offsety > 0)
  540. {
  541. pref_offsety = L1_DATA_LINESIZE - pref_offsety;
  542. pref_offsety = pref_offsety / sizeof(FLOAT);
  543. }
  544. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  545. LD_DP4_INC(py, 2, y0, y1, y2, y3);
  546. for (j = (n >> 4) - 1; j--;)
  547. {
  548. PREFETCH(px + pref_offsetx + 16);
  549. PREFETCH(px + pref_offsetx + 20);
  550. PREFETCH(px + pref_offsetx + 24);
  551. PREFETCH(px + pref_offsetx + 28);
  552. PREFETCH(py + pref_offsety + 16);
  553. PREFETCH(py + pref_offsety + 20);
  554. PREFETCH(py + pref_offsety + 24);
  555. PREFETCH(py + pref_offsety + 28);
  556. x4 = LD_DP(px); px += 2;
  557. out0 = c0 * x0;
  558. x5 = LD_DP(px); px += 2;
  559. out2 = c0 * x1;
  560. x6 = LD_DP(px); px += 2;
  561. out4 = c0 * x2;
  562. x7 = LD_DP(px); px += 2;
  563. out6 = c0 * x3;
  564. y4 = LD_DP(py); py += 2;
  565. out1 = c0 * y0;
  566. y5 = LD_DP(py); py += 2;
  567. out3 = c0 * y1;
  568. y6 = LD_DP(py); py += 2;
  569. out5 = c0 * y2;
  570. y7 = LD_DP(py); py += 2;
  571. out7 = c0 * y3;
  572. out0 += s0 * y0;
  573. out2 += s0 * y1;
  574. out4 += s0 * y2;
  575. out6 += s0 * y3;
  576. out1 -= s0 * x0;
  577. out3 -= s0 * x1;
  578. out5 -= s0 * x2;
  579. out7 -= s0 * x3;
  580. ST_DP(out0, x); x += 2;
  581. out0 = c0 * x4;
  582. ST_DP(out2, x); x += 2;
  583. out2 = c0 * x5;
  584. ST_DP(out4, x); x += 2;
  585. out4 = c0 * x6;
  586. ST_DP(out6, x); x += 2;
  587. out6 = c0 * x7;
  588. ST_DP(out1, y); y += 2;
  589. out1 = c0 * y4;
  590. ST_DP(out3, y); y += 2;
  591. out3 = c0 * y5;
  592. ST_DP(out5, y); y += 2;
  593. out5 = c0 * y6;
  594. ST_DP(out7, y); y += 2;
  595. out7 = c0 * y7;
  596. x0 = LD_DP(px); px += 2;
  597. out0 += s0 * y4;
  598. x1 = LD_DP(px); px += 2;
  599. out2 += s0 * y5;
  600. x2 = LD_DP(px); px += 2;
  601. out4 += s0 * y6;
  602. x3 = LD_DP(px); px += 2;
  603. out6 += s0 * y7;
  604. y0 = LD_DP(py); py += 2;
  605. out1 -= s0 * x4;
  606. y1 = LD_DP(py); py += 2;
  607. out3 -= s0 * x5;
  608. y2 = LD_DP(py); py += 2;
  609. out5 -= s0 * x6;
  610. y3 = LD_DP(py); py += 2;
  611. out7 -= s0 * x7;
  612. ST_DP4_INC(out0, out2, out4, out6, x, 2);
  613. ST_DP4_INC(out1, out3, out5, out7, y, 2);
  614. }
  615. out0 = c0 * x0;
  616. out0 += s0 * y0;
  617. out1 = c0 * y0;
  618. out1 -= s0 * x0;
  619. out2 = c0 * x1;
  620. out2 += s0 * y1;
  621. out3 = c0 * y1;
  622. out3 -= s0 * x1;
  623. out4 = c0 * x2;
  624. out4 += s0 * y2;
  625. out5 = c0 * y2;
  626. out5 -= s0 * x2;
  627. out6 = c0 * x3;
  628. out6 += s0 * y3;
  629. out7 = c0 * y3;
  630. out7 -= s0 * x3;
  631. ST_DP4_INC(out0, out2, out4, out6, x, 2);
  632. ST_DP4_INC(out1, out3, out5, out7, y, 2);
  633. LD_DP4_INC(px, 2, x4, x5, x6, x7);
  634. LD_DP4_INC(py, 2, y4, y5, y6, y7);
  635. out8 = c0 * x4;
  636. out8 += s0 * y4;
  637. out9 = c0 * y4;
  638. out9 -= s0 * x4;
  639. out10 = c0 * x5;
  640. out10 += s0 * y5;
  641. out11 = c0 * y5;
  642. out11 -= s0 * x5;
  643. out12 = c0 * x6;
  644. out12 += s0 * y6;
  645. out13 = c0 * y6;
  646. out13 -= s0 * x6;
  647. out14 = c0 * x7;
  648. out14 += s0 * y7;
  649. out15 = c0 * y7;
  650. out15 -= s0 * x7;
  651. ST_DP4_INC(out8, out10, out12, out14, x, 2);
  652. ST_DP4_INC(out9, out11, out13, out15, y, 2);
  653. }
  654. if (n & 8)
  655. {
  656. LD_DP4_INC(px, 2, x0, x1, x2, x3);
  657. LD_DP4_INC(py, 2, y0, y1, y2, y3);
  658. out0 = (c0 * x0) + (s0 * y0);
  659. out1 = (c0 * y0) - (s0 * x0);
  660. out2 = (c0 * x1) + (s0 * y1);
  661. out3 = (c0 * y1) - (s0 * x1);
  662. out4 = (c0 * x2) + (s0 * y2);
  663. out5 = (c0 * y2) - (s0 * x2);
  664. out6 = (c0 * x3) + (s0 * y3);
  665. out7 = (c0 * y3) - (s0 * x3);
  666. ST_DP4_INC(out0, out2, out4, out6, x, 2);
  667. ST_DP4_INC(out1, out3, out5, out7, y, 2);
  668. }
  669. if (n & 4)
  670. {
  671. LD_DP2_INC(px, 2, x0, x1);
  672. LD_DP2_INC(py, 2, y0, y1);
  673. out0 = (c0 * x0) + (s0 * y0);
  674. out1 = (c0 * y0) - (s0 * x0);
  675. out2 = (c0 * x1) + (s0 * y1);
  676. out3 = (c0 * y1) - (s0 * x1);
  677. ST_DP2_INC(out0, out2, x, 2);
  678. ST_DP2_INC(out1, out3, y, 2);
  679. }
  680. if (n & 2)
  681. {
  682. x0 = LD_DP(px);
  683. y0 = LD_DP(py);
  684. px += 2;
  685. py += 2;
  686. out0 = (c0 * x0) + (s0 * y0);
  687. out1 = (c0 * y0) - (s0 * x0);
  688. ST_DP(out0, x);
  689. ST_DP(out1, y);
  690. x += 2;
  691. y += 2;
  692. }
  693. if (n & 1)
  694. {
  695. tp0 = c * *x + s * *y;
  696. *y = c * *y - s * *x;
  697. *x = tp0;
  698. }
  699. }
  700. }
  701. else
  702. {
  703. if ((0 == c) && (0 == s))
  704. {
  705. for (i = n; i--;)
  706. {
  707. *x = 0;
  708. *y = 0;
  709. x += inc_x;
  710. y += inc_y;
  711. }
  712. }
  713. else if ((1 == c) && (1 == s))
  714. {
  715. if (n >> 2)
  716. {
  717. fx0 = *px; px += inc_x;
  718. fx1 = *px; px += inc_x;
  719. fx2 = *px; px += inc_x;
  720. fx3 = *px; px += inc_x;
  721. fy0 = *py; py += inc_y;
  722. fy1 = *py; py += inc_y;
  723. fy2 = *py; py += inc_y;
  724. fy3 = *py; py += inc_y;
  725. for (i = (n >> 2) -1; i--;)
  726. {
  727. tp0 = fx0 + fy0;
  728. tp1 = fy0 - fx0;
  729. tp2 = fx1 + fy1;
  730. tp3 = fy1 - fx1;
  731. tp4 = fx2 + fy2;
  732. tp5 = fy2 - fx2;
  733. tp6 = fx3 + fy3;
  734. tp7 = fy3 - fx3;
  735. fx0 = *px; px += inc_x;
  736. *x = tp0; x += inc_x;
  737. fx1 = *px; px += inc_x;
  738. *x = tp2; x += inc_x;
  739. fx2 = *px; px += inc_x;
  740. *x = tp4; x += inc_x;
  741. fx3 = *px; px += inc_x;
  742. *x = tp6; x += inc_x;
  743. fy0 = *py; py += inc_y;
  744. *y = tp1; y += inc_y;
  745. fy1 = *py; py += inc_y;
  746. *y = tp3; y += inc_y;
  747. fy2 = *py; py += inc_y;
  748. *y = tp5; y += inc_y;
  749. fy3 = *py; py += inc_y;
  750. *y = tp7; y += inc_y;
  751. }
  752. tp0 = fx0 + fy0;
  753. tp1 = fy0 - fx0;
  754. tp2 = fx1 + fy1;
  755. tp3 = fy1 - fx1;
  756. tp4 = fx2 + fy2;
  757. tp5 = fy2 - fx2;
  758. tp6 = fx3 + fy3;
  759. tp7 = fy3 - fx3;
  760. *x = tp0; x += inc_x;
  761. *x = tp2; x += inc_x;
  762. *x = tp4; x += inc_x;
  763. *x = tp6; x += inc_x;
  764. *y = tp1; y += inc_y;
  765. *y = tp3; y += inc_y;
  766. *y = tp5; y += inc_y;
  767. *y = tp7; y += inc_y;
  768. }
  769. if (n & 2)
  770. {
  771. LD_GP2_INC(px, inc_x, fx0, fx1);
  772. LD_GP2_INC(py, inc_y, fy0, fy1);
  773. tp0 = fx0 + fy0;
  774. tp1 = fy0 - fx0;
  775. tp2 = fx1 + fy1;
  776. tp3 = fy1 - fx1;
  777. ST_GP2_INC(tp0, tp2, x, inc_x);
  778. ST_GP2_INC(tp1, tp3, y, inc_y);
  779. }
  780. if (n & 1)
  781. {
  782. fx0 = *px;
  783. fy0 = *py;
  784. tp0 = fx0 + fy0;
  785. tp1 = fy0 - fx0;
  786. *x = tp0;
  787. *y = tp1;
  788. }
  789. }
  790. else if (0 == s)
  791. {
  792. if (n >> 2)
  793. {
  794. fx0 = *px; px += inc_x;
  795. fx1 = *px; px += inc_x;
  796. fx2 = *px; px += inc_x;
  797. fx3 = *px; px += inc_x;
  798. fy0 = *py; py += inc_y;
  799. fy1 = *py; py += inc_y;
  800. fy2 = *py; py += inc_y;
  801. fy3 = *py; py += inc_y;
  802. for (i = (n >> 2) - 1; i--;)
  803. {
  804. tp0 = c * fx0;
  805. tp1 = c * fy0;
  806. tp2 = c * fx1;
  807. tp3 = c * fy1;
  808. tp4 = c * fx2;
  809. tp5 = c * fy2;
  810. tp6 = c * fx3;
  811. tp7 = c * fy3;
  812. fx0 = *px; px += inc_x;
  813. *x = tp0; x += inc_x;
  814. fx1 = *px; px += inc_x;
  815. *x = tp2; x += inc_x;
  816. fx2 = *px; px += inc_x;
  817. *x = tp4; x += inc_x;
  818. fx3 = *px; px += inc_x;
  819. *x = tp6; x += inc_x;
  820. fy0 = *py; py += inc_y;
  821. *y = tp1; y += inc_y;
  822. fy1 = *py; py += inc_y;
  823. *y = tp3; y += inc_y;
  824. fy2 = *py; py += inc_y;
  825. *y = tp5; y += inc_y;
  826. fy3 = *py; py += inc_y;
  827. *y = tp7; y += inc_y;
  828. }
  829. tp0 = c * fx0;
  830. tp1 = c * fy0;
  831. tp2 = c * fx1;
  832. tp3 = c * fy1;
  833. tp4 = c * fx2;
  834. tp5 = c * fy2;
  835. tp6 = c * fx3;
  836. tp7 = c * fy3;
  837. *x = tp0; x += inc_x;
  838. *x = tp2; x += inc_x;
  839. *x = tp4; x += inc_x;
  840. *x = tp6; x += inc_x;
  841. *y = tp1; y += inc_y;
  842. *y = tp3; y += inc_y;
  843. *y = tp5; y += inc_y;
  844. *y = tp7; y += inc_y;
  845. }
  846. if (n & 2)
  847. {
  848. LD_GP2_INC(px, inc_x, fx0, fx1);
  849. LD_GP2_INC(py, inc_y, fy0, fy1);
  850. tp0 = c * fx0;
  851. tp1 = c * fy0;
  852. tp2 = c * fx1;
  853. tp3 = c * fy1;
  854. ST_GP2_INC(tp0, tp2, x, inc_x);
  855. ST_GP2_INC(tp1, tp3, y, inc_y);
  856. }
  857. if (n & 1)
  858. {
  859. fx0 = *px;
  860. fy0 = *py;
  861. tp0 = c * fx0;
  862. tp1 = c * fy0;
  863. *x = tp0;
  864. *y = tp1;
  865. }
  866. }
  867. else
  868. {
  869. if (n >> 2)
  870. {
  871. fx0 = *px; px += inc_x;
  872. fx1 = *px; px += inc_x;
  873. fx2 = *px; px += inc_x;
  874. fx3 = *px; px += inc_x;
  875. fy0 = *py; py += inc_y;
  876. fy1 = *py; py += inc_y;
  877. fy2 = *py; py += inc_y;
  878. fy3 = *py; py += inc_y;
  879. for (i = (n >> 2) - 1; i--;)
  880. {
  881. tp0 = c * fx0 + s * fy0;
  882. tp1 = c * fy0 - s * fx0;
  883. tp2 = c * fx1 + s * fy1;
  884. tp3 = c * fy1 - s * fx1;
  885. tp4 = c * fx2 + s * fy2;
  886. tp5 = c * fy2 - s * fx2;
  887. tp6 = c * fx3 + s * fy3;
  888. tp7 = c * fy3 - s * fx3;
  889. fx0 = *px; px += inc_x;
  890. *x = tp0; x += inc_x;
  891. fx1 = *px; px += inc_x;
  892. *x = tp2; x += inc_x;
  893. fx2 = *px; px += inc_x;
  894. *x = tp4; x += inc_x;
  895. fx3 = *px; px += inc_x;
  896. *x = tp6; x += inc_x;
  897. fy0 = *py; py += inc_y;
  898. *y = tp1; y += inc_y;
  899. fy1 = *py; py += inc_y;
  900. *y = tp3; y += inc_y;
  901. fy2 = *py; py += inc_y;
  902. *y = tp5; y += inc_y;
  903. fy3 = *py; py += inc_y;
  904. *y = tp7; y += inc_y;
  905. }
  906. tp0 = c * fx0 + s * fy0;
  907. tp1 = c * fy0 - s * fx0;
  908. tp2 = c * fx1 + s * fy1;
  909. tp3 = c * fy1 - s * fx1;
  910. tp4 = c * fx2 + s * fy2;
  911. tp5 = c * fy2 - s * fx2;
  912. tp6 = c * fx3 + s * fy3;
  913. tp7 = c * fy3 - s * fx3;
  914. *x = tp0; x += inc_x;
  915. *x = tp2; x += inc_x;
  916. *x = tp4; x += inc_x;
  917. *x = tp6; x += inc_x;
  918. *y = tp1; y += inc_y;
  919. *y = tp3; y += inc_y;
  920. *y = tp5; y += inc_y;
  921. *y = tp7; y += inc_y;
  922. }
  923. if (n & 2)
  924. {
  925. LD_GP2_INC(px, inc_x, fx0, fx1);
  926. LD_GP2_INC(py, inc_y, fy0, fy1);
  927. tp0 = (c * fx0) + (s * fy0);
  928. tp1 = (c * fy0) - (s * fx0);
  929. tp2 = (c * fx1) + (s * fy1);
  930. tp3 = (c * fy1) - (s * fx1);
  931. ST_GP2_INC(tp0, tp2, x, inc_x);
  932. ST_GP2_INC(tp1, tp3, y, inc_y);
  933. }
  934. if (n & 1)
  935. {
  936. fx0 = *px;
  937. fy0 = *py;
  938. tp0 = (c * fx0) + (s * fy0);
  939. tp1 = (c * fy0) - (s * fx0);
  940. *x = tp0;
  941. *y = tp1;
  942. }
  943. }
  944. }
  945. return 0;
  946. }