|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072 |
- /*******************************************************************************
- Copyright (c) 2016, The OpenBLAS Project
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *******************************************************************************/
-
- #include "common.h"
- #include "macros_msa.h"
- /*
-  * Applies the transform
-  *     x[k] <- c * x[k] + s * y[k]
-  *     y[k] <- c * y[k] - s * x[k]
-  * in place to n elements of x and y. Every element is a pair of
-  * consecutive FLOATs and both FLOATs of the pair are transformed with the
-  * same scalars c and s (presumably the interleaved real/imaginary parts of
-  * a complex vector -- confirm against the build that defines CNAME).
-  *
-  * Parameters:
-  *   n      number of two-FLOAT elements; the function returns at once
-  *          when n <= 0
-  *   x, y   vectors that are read and overwritten with the results
-  *   inc_x  stride of x in elements (doubled below to get a FLOAT stride)
-  *   inc_y  stride of y in elements (doubled below to get a FLOAT stride)
-  *   c, s   scalar coefficients
-  *
-  * Returns 0 on every path.
-  *
-  * Layout of the code:
-  *   - inc_x == 1 && inc_y == 1: v4f32 vector code, one vector access per
-  *     4 FLOATs (2 elements), with dedicated branches for
-  *     c == 0 && s == 0, c == 1 && s == 1, s == 0, c == 0 and the general
-  *     case. Apart from the zero-fill branch, each branch handles blocks of
-  *     16 elements, then 8, 4, 2 and 1 leftover elements selected by the
-  *     low bits of n.
-  *   - any other stride: scalar code, two elements per loop pass, with
-  *     dedicated branches for c == 0 && s == 0, c == 1 && s == 1, s == 0
-  *     and the general case.
-  *
-  * In the pipelined loops px/py are the read cursors and run ahead of x/y,
-  * which are used as the write cursors.
-  */
- int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
- FLOAT c, FLOAT s)
- {
- BLASLONG i, j;
- FLOAT *px, *py;
- FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
- FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
- BLASLONG inc_x2, inc_y2;
- v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
- v4f32 out0, out1, out2, out3, out4, out5, out6, out7;
- v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
-
- if (n <= 0) return (0); /* nothing to do */
- /* px/py start at the same place as x/y; they advance independently below */
- px = x;
- py = y;
-
- if ((1 == inc_x) && (1 == inc_y)) /* contiguous vectors: MSA vector path */
- {
- if ((0 == c) && (0 == s)) /* both results are zero: store zeros, x and y are never read */
- {
- v4f32 zero = __msa_cast_to_vector_float(0);
- zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); /* lanes 0..3 are each set to 0 explicitly */
- zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
- zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
- zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
-
- /* process 2 elements */
- for (j = (n >> 1); j--;)
- {
- ST_SP(zero, px);
- ST_SP(zero, py);
-
- px += 4;
- py += 4;
- }
- if (n & 1) /* odd n: zero the two FLOATs of the last element */
- {
- px[0] = 0;
- px[1] = 0;
- py[0] = 0;
- py[1] = 0;
- }
- }
- else if ((1 == c) && (1 == s)) /* x <- x + y, y <- y - x; no multiplications needed */
- {
- if (n >> 4) /* at least one block of 16 elements (32 FLOATs of x and of y) */
- {
- BLASLONG pref_offsetx, pref_offsety;
- /* pref_offsetx/pref_offsety: distance in FLOATs from px/py up to the
-  * next L1_DATA_LINESIZE boundary, or 0 when already on one (the mask
-  * assumes L1_DATA_LINESIZE is a power of two -- TODO confirm). They
-  * are used only to bias the PREFETCH addresses below. */
- pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
- if (pref_offsetx > 0)
- {
- pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
- pref_offsetx = pref_offsetx / sizeof(FLOAT);
- }
-
- pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
- if (pref_offsety > 0)
- {
- pref_offsety = L1_DATA_LINESIZE - pref_offsety;
- pref_offsety = pref_offsety / sizeof(FLOAT);
- }
- /* pipeline prologue: preload the first 8 elements of x and of y */
- x0 = LD_SP(px); px += 4;
- x1 = LD_SP(px); px += 4;
- x2 = LD_SP(px); px += 4;
- x3 = LD_SP(px); px += 4;
- y0 = LD_SP(py); py += 4;
- y1 = LD_SP(py); py += 4;
- y2 = LD_SP(py); py += 4;
- y3 = LD_SP(py); py += 4;
- /* steady state, run (n >> 4) - 1 times; the final block is finished after the loop */
- for (j = (n >> 4) - 1; j--;)
- {
- PREFETCH(px + pref_offsetx + 32);
- PREFETCH(px + pref_offsetx + 40);
- PREFETCH(px + pref_offsetx + 48);
- PREFETCH(px + pref_offsetx + 56);
- PREFETCH(py + pref_offsety + 32);
- PREFETCH(py + pref_offsety + 40);
- PREFETCH(py + pref_offsety + 48);
- PREFETCH(py + pref_offsety + 56);
- /* even-numbered outN go to x, odd-numbered outN go to y */
- out0 = x0 + y0;
- x4 = LD_SP(px); px += 4;
- out1 = y0 - x0;
- x5 = LD_SP(px); px += 4;
- out2 = x1 + y1;
- x6 = LD_SP(px); px += 4;
- out3 = y1 - x1;
- x7 = LD_SP(px); px += 4;
- out4 = x2 + y2;
- y4 = LD_SP(py); py += 4;
- out5 = y2 - x2;
- y5 = LD_SP(py); py += 4;
- out6 = x3 + y3;
- y6 = LD_SP(py); py += 4;
- out7 = y3 - x3;
- y7 = LD_SP(py); py += 4;
-
- ST_SP(out0, x); x += 4;
- out8 = x4 + y4;
- ST_SP(out1, y); y += 4;
- out9 = y4 - x4;
- ST_SP(out2, x); x += 4;
- out10 = x5 + y5;
- ST_SP(out3, y); y += 4;
- out11 = y5 - x5;
- ST_SP(out4, x); x += 4;
- out12 = x6 + y6;
- ST_SP(out5, y); y += 4;
- out13 = y6 - x6;
- ST_SP(out6, x); x += 4;
- out14 = x7 + y7;
- ST_SP(out7, y); y += 4;
- out15 = y7 - x7;
- /* preload the first half of the next block while storing the second half of this one */
- x0 = LD_SP(px); px += 4;
- ST_SP(out8, x); x += 4;
- x1 = LD_SP(px); px += 4;
- ST_SP(out10, x); x += 4;
- x2 = LD_SP(px); px += 4;
- ST_SP(out12, x); x += 4;
- x3 = LD_SP(px); px += 4;
- ST_SP(out14, x); x += 4;
-
- y0 = LD_SP(py); py += 4;
- ST_SP(out9, y); y += 4;
- y1 = LD_SP(py); py += 4;
- ST_SP(out11, y); y += 4;
- y2 = LD_SP(py); py += 4;
- ST_SP(out13, y); y += 4;
- y3 = LD_SP(py); py += 4;
- ST_SP(out15, y); y += 4;
- }
- /* pipeline epilogue: load the second half of the last block and finish it */
- x4 = LD_SP(px); px += 4;
- x5 = LD_SP(px); px += 4;
- x6 = LD_SP(px); px += 4;
- x7 = LD_SP(px); px += 4;
- y4 = LD_SP(py); py += 4;
- y5 = LD_SP(py); py += 4;
- y6 = LD_SP(py); py += 4;
- y7 = LD_SP(py); py += 4;
-
- out0 = x0 + y0;
- out1 = y0 - x0;
- out2 = x1 + y1;
- out3 = y1 - x1;
- out4 = x2 + y2;
- out5 = y2 - x2;
- out6 = x3 + y3;
- out7 = y3 - x3;
- out8 = x4 + y4;
- out9 = y4 - x4;
- out10 = x5 + y5;
- out11 = y5 - x5;
- out12 = x6 + y6;
- out13 = y6 - x6;
- out14 = x7 + y7;
- out15 = y7 - x7;
-
- ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4);
- ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4);
- }
- if (n & 8) /* 8 leftover elements */
- {
- LD_SP4_INC(px, 4, x0, x1, x2, x3);
- LD_SP4_INC(py, 4, y0, y1, y2, y3);
-
- out0 = x0 + y0;
- out1 = y0 - x0;
- out2 = x1 + y1;
- out3 = y1 - x1;
- out4 = x2 + y2;
- out5 = y2 - x2;
- out6 = x3 + y3;
- out7 = y3 - x3;
-
- ST_SP4_INC(out0, out2, out4, out6, x, 4);
- ST_SP4_INC(out1, out3, out5, out7, y, 4);
- }
- if (n & 4) /* 4 leftover elements */
- {
- LD_SP2_INC(px, 4, x0, x1);
- LD_SP2_INC(py, 4, y0, y1);
-
- out0 = x0 + y0;
- out1 = y0 - x0;
- out2 = x1 + y1;
- out3 = y1 - x1;
-
- ST_SP2_INC(out0, out2, x, 4);
- ST_SP2_INC(out1, out3, y, 4);
- }
- if (n & 2) /* 2 leftover elements */
- {
- x0 = LD_SP(px);
- y0 = LD_SP(py);
- px += 4;
- py += 4;
-
- out0 = x0 + y0;
- out1 = y0 - x0;
-
- ST_SP(out0, x);
- ST_SP(out1, y);
- x += 4;
- y += 4;
- }
- if (n & 1) /* last single element, handled with scalar FLOATs */
- {
- LD_GP2_INC(px, 1, fx0, fx1);
- LD_GP2_INC(py, 1, fy0, fy1);
-
- tp0 = fx0 + fy0;
- tp1 = fy0 - fx0;
- tp2 = fx1 + fy1;
- tp3 = fy1 - fx1;
-
- ST_GP2_INC(tp0, tp2, x, 1);
- ST_GP2_INC(tp1, tp3, y, 1);
- }
- }
- else if (0 == s) /* x <- c * x, y <- c * y */
- {
- /* c0: presumably c replicated into every lane -- the macro is defined outside this file */
- c0 = COPY_FLOAT_TO_VECTOR(c);
-
- if (n >> 4) /* at least one block of 16 elements */
- {
- BLASLONG pref_offsetx, pref_offsety;
- /* FLOAT distance to the next L1_DATA_LINESIZE boundary, used to bias PREFETCH */
- pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
- if (pref_offsetx > 0)
- {
- pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
- pref_offsetx = pref_offsetx / sizeof(FLOAT);
- }
-
- pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
- if (pref_offsety > 0)
- {
- pref_offsety = L1_DATA_LINESIZE - pref_offsety;
- pref_offsety = pref_offsety / sizeof(FLOAT);
- }
- /* prologue: preload a whole 16-element block of x; y is loaded inside the loop */
- LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
- /* steady state, run (n >> 4) - 1 times */
- for (j = (n >> 4) - 1; j--;)
- {
- PREFETCH(px + pref_offsetx + 32);
- PREFETCH(px + pref_offsetx + 40);
- PREFETCH(px + pref_offsetx + 48);
- PREFETCH(px + pref_offsetx + 56);
- PREFETCH(py + pref_offsety + 32);
- PREFETCH(py + pref_offsety + 40);
- PREFETCH(py + pref_offsety + 48);
- PREFETCH(py + pref_offsety + 56);
- /* load this block of y while scaling the preloaded x */
- y0 = LD_SP(py); py += 4;
- x0 *= c0;
- y1 = LD_SP(py); py += 4;
- x1 *= c0;
- y2 = LD_SP(py); py += 4;
- x2 *= c0;
- y3 = LD_SP(py); py += 4;
- x3 *= c0;
- y4 = LD_SP(py); py += 4;
- x4 *= c0;
- y5 = LD_SP(py); py += 4;
- x5 *= c0;
- y6 = LD_SP(py); py += 4;
- x6 *= c0;
- y7 = LD_SP(py); py += 4;
- x7 *= c0;
- /* store the scaled x while scaling y */
- ST_SP(x0, x); x += 4;
- y0 *= c0;
- ST_SP(x1, x); x += 4;
- y1 *= c0;
- ST_SP(x2, x); x += 4;
- y2 *= c0;
- ST_SP(x3, x); x += 4;
- y3 *= c0;
- ST_SP(x4, x); x += 4;
- y4 *= c0;
- ST_SP(x5, x); x += 4;
- y5 *= c0;
- ST_SP(x6, x); x += 4;
- y6 *= c0;
- ST_SP(x7, x); x += 4;
- y7 *= c0;
- /* preload the next block of x while storing the scaled y */
- x0 = LD_SP(px); px += 4;
- ST_SP(y0, y); y += 4;
- x1 = LD_SP(px); px += 4;
- ST_SP(y1, y); y += 4;
- x2 = LD_SP(px); px += 4;
- ST_SP(y2, y); y += 4;
- x3 = LD_SP(px); px += 4;
- ST_SP(y3, y); y += 4;
- x4 = LD_SP(px); px += 4;
- ST_SP(y4, y); y += 4;
- x5 = LD_SP(px); px += 4;
- ST_SP(y5, y); y += 4;
- x6 = LD_SP(px); px += 4;
- ST_SP(y6, y); y += 4;
- x7 = LD_SP(px); px += 4;
- ST_SP(y7, y); y += 4;
- }
- /* epilogue: the last block of x is already in registers, only y is still to be loaded */
- LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
-
- x0 *= c0;
- y0 *= c0;
- x1 *= c0;
- y1 *= c0;
- x2 *= c0;
- y2 *= c0;
- x3 *= c0;
- y3 *= c0;
- x4 *= c0;
- y4 *= c0;
- x5 *= c0;
- y5 *= c0;
- x6 *= c0;
- y6 *= c0;
- x7 *= c0;
- y7 *= c0;
-
- ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
- ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
- }
- if (n & 8) /* 8 leftover elements */
- {
- LD_SP4_INC(px, 4, x0, x1, x2, x3);
- LD_SP4_INC(py, 4, y0, y1, y2, y3);
-
- x0 *= c0;
- y0 *= c0;
- x1 *= c0;
- y1 *= c0;
- x2 *= c0;
- y2 *= c0;
- x3 *= c0;
- y3 *= c0;
-
- ST_SP4_INC(x0, x1, x2, x3, x, 4);
- ST_SP4_INC(y0, y1, y2, y3, y, 4);
- }
- if (n & 4) /* 4 leftover elements */
- {
- LD_SP2_INC(px, 4, x0, x1);
- LD_SP2_INC(py, 4, y0, y1);
-
- x0 *= c0;
- y0 *= c0;
- x1 *= c0;
- y1 *= c0;
-
- ST_SP2_INC(x0, x1, x, 4);
- ST_SP2_INC(y0, y1, y, 4);
- }
- if (n & 2) /* 2 leftover elements */
- {
- x0 = LD_SP(px);
- y0 = LD_SP(py);
- px += 4;
- py += 4;
-
- x0 *= c0;
- y0 *= c0;
-
- ST_SP(x0, x);
- ST_SP(y0, y);
- x += 4;
- y += 4;
- }
- if (n & 1) /* last single element, handled with scalar FLOATs */
- {
- LD_GP2_INC(px, 1, fx0, fx1);
- LD_GP2_INC(py, 1, fy0, fy1);
-
- tp0 = (c * fx0);
- tp1 = (c * fy0);
- tp2 = (c * fx1);
- tp3 = (c * fy1);
-
- ST_GP2_INC(tp0, tp2, x, 1);
- ST_GP2_INC(tp1, tp3, y, 1);
- }
- }
- else if (0 == c) /* x <- s * y, y <- -(s * x) */
- {
- s0 = COPY_FLOAT_TO_VECTOR(s);
-
- /* process 16 elements (32 FLOATs of x and of y) per block */
- if (n >> 4)
- {
- BLASLONG pref_offsetx, pref_offsety;
- /* FLOAT distance to the next L1_DATA_LINESIZE boundary, used to bias PREFETCH */
- pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
- if (pref_offsetx > 0)
- {
- pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
- pref_offsetx = pref_offsetx / sizeof(FLOAT);
- }
-
- pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
- if (pref_offsety > 0)
- {
- pref_offsety = L1_DATA_LINESIZE - pref_offsety;
- pref_offsety = pref_offsety / sizeof(FLOAT);
- }
- /* prologue: preload the first 8 elements of x and of y */
- LD_SP4_INC(px, 4, x0, x1, x2, x3);
- LD_SP4_INC(py, 4, y0, y1, y2, y3);
- /* steady state, run (n >> 4) - 1 times; out0..out7 are reused for both halves of a block */
- for (j = (n >> 4) - 1; j--;)
- {
- PREFETCH(px + pref_offsetx + 32);
- PREFETCH(px + pref_offsetx + 40);
- PREFETCH(px + pref_offsetx + 48);
- PREFETCH(px + pref_offsetx + 56);
- PREFETCH(py + pref_offsety + 32);
- PREFETCH(py + pref_offsety + 40);
- PREFETCH(py + pref_offsety + 48);
- PREFETCH(py + pref_offsety + 56);
- /* even-numbered outN hold new x values, odd-numbered outN hold new y values */
- x4 = LD_SP(px); px += 4;
- out0 = s0 * y0;
- x5 = LD_SP(px); px += 4;
- out2 = s0 * y1;
- x6 = LD_SP(px); px += 4;
- out4 = s0 * y2;
- x7 = LD_SP(px); px += 4;
- out6 = s0 * y3;
- y4 = LD_SP(py); py += 4;
- out1 = -(s0 * x0);
- y5 = LD_SP(py); py += 4;
- out3 = -(s0 * x1);
- y6 = LD_SP(py); py += 4;
- out5 = -(s0 * x2);
- y7 = LD_SP(py); py += 4;
- out7 = -(s0 * x3);
- /* each outN is stored first and only then overwritten with the second-half result */
- ST_SP(out0, x); x += 4;
- out0 = s0 * y4;
- ST_SP(out2, x); x += 4;
- out2 = s0 * y5;
- ST_SP(out4, x); x += 4;
- out4 = s0 * y6;
- ST_SP(out6, x); x += 4;
- out6 = s0 * y7;
- ST_SP(out1, y); y += 4;
- out1 = -(s0 * x4);
- ST_SP(out3, y); y += 4;
- out3 = -(s0 * x5);
- ST_SP(out5, y); y += 4;
- out5 = -(s0 * x6);
- ST_SP(out7, y); y += 4;
- out7 = -(s0 * x7);
- /* preload the first half of the next block while storing the second half of this one */
- x0 = LD_SP(px); px += 4;
- ST_SP(out0, x); x += 4;
- x1 = LD_SP(px); px += 4;
- ST_SP(out2, x); x += 4;
- x2 = LD_SP(px); px += 4;
- ST_SP(out4, x); x += 4;
- x3 = LD_SP(px); px += 4;
- ST_SP(out6, x); x += 4;
- y0 = LD_SP(py); py += 4;
- ST_SP(out1, y); y += 4;
- y1 = LD_SP(py); py += 4;
- ST_SP(out3, y); y += 4;
- y2 = LD_SP(py); py += 4;
- ST_SP(out5, y); y += 4;
- y3 = LD_SP(py); py += 4;
- ST_SP(out7, y); y += 4;
- }
- /* epilogue, first half: operands were preloaded by the prologue or by the last loop pass */
- out0 = s0 * y0;
- out2 = s0 * y1;
- out4 = s0 * y2;
- out6 = s0 * y3;
- out1 = -(s0 * x0);
- out3 = -(s0 * x1);
- out5 = -(s0 * x2);
- out7 = -(s0 * x3);
-
- ST_SP4_INC(out0, out2, out4, out6, x, 4);
- ST_SP4_INC(out1, out3, out5, out7, y, 4);
- /* epilogue, second half: load and process the remaining 8 elements of the block */
- LD_SP4_INC(px, 4, x4, x5, x6, x7);
- LD_SP4_INC(py, 4, y4, y5, y6, y7);
-
- out0 = s0 * y4;
- out2 = s0 * y5;
- out4 = s0 * y6;
- out6 = s0 * y7;
- out1 = -(s0 * x4);
- out3 = -(s0 * x5);
- out5 = -(s0 * x6);
- out7 = -(s0 * x7);
-
- ST_SP4_INC(out0, out2, out4, out6, x, 4);
- ST_SP4_INC(out1, out3, out5, out7, y, 4);
- }
- if (n & 8) /* 8 leftover elements */
- {
- LD_SP4_INC(px, 4, x0, x1, x2, x3);
- LD_SP4_INC(py, 4, y0, y1, y2, y3);
-
- out0 = s0 * y0;
- out1 = - (s0 * x0);
- out2 = s0 * y1;
- out3 = - (s0 * x1);
- out4 = s0 * y2;
- out5 = - (s0 * x2);
- out6 = s0 * y3;
- out7 = - (s0 * x3);
-
- ST_SP4_INC(out0, out2, out4, out6, x, 4);
- ST_SP4_INC(out1, out3, out5, out7, y, 4);
- }
- if (n & 4) /* 4 leftover elements */
- {
- LD_SP2_INC(px, 4, x0, x1);
- LD_SP2_INC(py, 4, y0, y1);
-
- out0 = s0 * y0;
- out1 = - (s0 * x0);
- out2 = s0 * y1;
- out3 = - (s0 * x1);
-
- ST_SP2_INC(out0, out2, x, 4);
- ST_SP2_INC(out1, out3, y, 4);
- }
- if (n & 2) /* 2 leftover elements */
- {
- x0 = LD_SP(px); px += 4;
- y0 = LD_SP(py); py += 4;
-
- out0 = s0 * y0;
- out1 = - (s0 * x0);
-
- ST_SP(out0, x); x += 4;
- ST_SP(out1, y); y += 4;
- }
- if (n & 1) /* last single element, handled with scalar FLOATs */
- {
- LD_GP2_INC(px, 1, fx0, fx1);
- LD_GP2_INC(py, 1, fy0, fy1);
-
- tp0 = s * fy0;
- tp1 = - (s * fx0);
- tp2 = s * fy1;
- tp3 = - (s * fx1);
-
- ST_GP2_INC(tp0, tp2, x, 1);
- ST_GP2_INC(tp1, tp3, y, 1);
- }
- }
- else /* general case: x <- c * x + s * y, y <- c * y - s * x */
- {
- c0 = COPY_FLOAT_TO_VECTOR(c);
- s0 = COPY_FLOAT_TO_VECTOR(s);
-
- if (n >> 4) /* at least one block of 16 elements */
- {
- BLASLONG pref_offsetx, pref_offsety;
- /* FLOAT distance to the next L1_DATA_LINESIZE boundary, used to bias PREFETCH */
- pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
- if (pref_offsetx > 0)
- {
- pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
- pref_offsetx = pref_offsetx / sizeof(FLOAT);
- }
-
- pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
- if (pref_offsety > 0)
- {
- pref_offsety = L1_DATA_LINESIZE - pref_offsety;
- pref_offsety = pref_offsety / sizeof(FLOAT);
- }
- /* prologue: preload the first 8 elements of x and of y */
- LD_SP4_INC(px, 4, x0, x1, x2, x3);
- LD_SP4_INC(py, 4, y0, y1, y2, y3);
- /* steady state, run (n >> 4) - 1 times; the final block is finished after the loop */
- for (j = (n >> 4) - 1; j--;)
- {
- PREFETCH(px + pref_offsetx + 32);
- PREFETCH(px + pref_offsetx + 40);
- PREFETCH(px + pref_offsetx + 48);
- PREFETCH(px + pref_offsetx + 56);
- PREFETCH(py + pref_offsety + 32);
- PREFETCH(py + pref_offsety + 40);
- PREFETCH(py + pref_offsety + 48);
- PREFETCH(py + pref_offsety + 56);
- /* first half: c products, interleaved with loads of the second half */
- x4 = LD_SP(px); px += 4;
- out0 = c0 * x0;
- x5 = LD_SP(px); px += 4;
- out1 = c0 * y0;
- x6 = LD_SP(px); px += 4;
- out2 = c0 * x1;
- x7 = LD_SP(px); px += 4;
- out3 = c0 * y1;
- y4 = LD_SP(py); py += 4;
- out4 = c0 * x2;
- y5 = LD_SP(py); py += 4;
- out5 = c0 * y2;
- y6 = LD_SP(py); py += 4;
- out6 = c0 * x3;
- y7 = LD_SP(py); py += 4;
- out7 = c0 * y3;
- /* add the s terms: "+ s * y" for x results (even outN), "- s * x" for y results (odd outN) */
- out0 += s0 * y0;
- out1 -= s0 * x0;
- out2 += s0 * y1;
- out3 -= s0 * x1;
- out4 += s0 * y2;
- out5 -= s0 * x2;
- out6 += s0 * y3;
- out7 -= s0 * x3;
- /* store the first half while starting the c products of the second half */
- ST_SP(out0, x); x += 4;
- out8 = c0 * x4;
- ST_SP(out2, x); x += 4;
- out9 = c0 * y4;
- ST_SP(out4, x); x += 4;
- out10 = c0 * x5;
- ST_SP(out6, x); x += 4;
- out11 = c0 * y5;
- ST_SP(out1, y); y += 4;
- out12 = c0 * x6;
- ST_SP(out3, y); y += 4;
- out13 = c0 * y6;
- ST_SP(out5, y); y += 4;
- out14 = c0 * x7;
- ST_SP(out7, y); y += 4;
- out15 = c0 * y7;
- /* preload the first half of the next block while finishing the second half */
- x0 = LD_SP(px); px += 4;
- out8 += s0 * y4;
- x1 = LD_SP(px); px += 4;
- out9 -= s0 * x4;
- x2 = LD_SP(px); px += 4;
- out10 += s0 * y5;
- x3 = LD_SP(px); px += 4;
- out11 -= s0 * x5;
- y0 = LD_SP(py); py += 4;
- out12 += s0 * y6;
- y1 = LD_SP(py); py += 4;
- out13 -= s0 * x6;
- y2 = LD_SP(py); py += 4;
- out14 += s0 * y7;
- y3 = LD_SP(py); py += 4;
- out15 -= s0 * x7;
-
- ST_SP(out8, x); x += 4;
- ST_SP(out10, x); x += 4;
- ST_SP(out12, x); x += 4;
- ST_SP(out14, x); x += 4;
- ST_SP(out9, y); y += 4;
- ST_SP(out11, y); y += 4;
- ST_SP(out13, y); y += 4;
- ST_SP(out15, y); y += 4;
- }
- /* epilogue, first half: operands were preloaded by the prologue or by the last loop pass */
- out0 = c0 * x0;
- out0 += s0 * y0;
- out1 = c0 * y0;
- out1 -= s0 * x0;
- out2 = c0 * x1;
- out2 += s0 * y1;
- out3 = c0 * y1;
- out3 -= s0 * x1;
- out4 = c0 * x2;
- out4 += s0 * y2;
- out5 = c0 * y2;
- out5 -= s0 * x2;
- out6 = c0 * x3;
- out6 += s0 * y3;
- out7 = c0 * y3;
- out7 -= s0 * x3;
-
- ST_SP4_INC(out0, out2, out4, out6, x, 4);
- ST_SP4_INC(out1, out3, out5, out7, y, 4);
- /* epilogue, second half: load and process the remaining 8 elements of the block */
- LD_SP4_INC(px, 4, x4, x5, x6, x7);
- LD_SP4_INC(py, 4, y4, y5, y6, y7);
-
- out8 = c0 * x4;
- out8 += s0 * y4;
- out9 = c0 * y4;
- out9 -= s0 * x4;
- out10 = c0 * x5;
- out10 += s0 * y5;
- out11 = c0 * y5;
- out11 -= s0 * x5;
- out12 = c0 * x6;
- out12 += s0 * y6;
- out13 = c0 * y6;
- out13 -= s0 * x6;
- out14 = c0 * x7;
- out14 += s0 * y7;
- out15 = c0 * y7;
- out15 -= s0 * x7;
-
- ST_SP4_INC(out8, out10, out12, out14, x, 4);
- ST_SP4_INC(out9, out11, out13, out15, y, 4);
- }
- if (n & 8) /* 8 leftover elements */
- {
- LD_SP4_INC(px, 4, x0, x1, x2, x3);
- LD_SP4_INC(py, 4, y0, y1, y2, y3);
-
- out0 = (c0 * x0) + (s0 * y0);
- out1 = (c0 * y0) - (s0 * x0);
- out2 = (c0 * x1) + (s0 * y1);
- out3 = (c0 * y1) - (s0 * x1);
- out4 = (c0 * x2) + (s0 * y2);
- out5 = (c0 * y2) - (s0 * x2);
- out6 = (c0 * x3) + (s0 * y3);
- out7 = (c0 * y3) - (s0 * x3);
-
- ST_SP4_INC(out0, out2, out4, out6, x, 4);
- ST_SP4_INC(out1, out3, out5, out7, y, 4);
- }
- if (n & 4) /* 4 leftover elements */
- {
- LD_SP2_INC(px, 4, x0, x1);
- LD_SP2_INC(py, 4, y0, y1);
-
- out0 = (c0 * x0) + (s0 * y0);
- out1 = (c0 * y0) - (s0 * x0);
- out2 = (c0 * x1) + (s0 * y1);
- out3 = (c0 * y1) - (s0 * x1);
-
- ST_SP2_INC(out0, out2, x, 4);
- ST_SP2_INC(out1, out3, y, 4);
- }
- if (n & 2) /* 2 leftover elements */
- {
- x0 = LD_SP(px);
- y0 = LD_SP(py);
- px += 4;
- py += 4;
-
- out0 = (c0 * x0) + (s0 * y0);
- out1 = (c0 * y0) - (s0 * x0);
-
- ST_SP(out0, x);
- ST_SP(out1, y);
- x += 4;
- y += 4;
- }
- if (n & 1) /* last single element, handled with scalar FLOATs */
- {
- LD_GP2_INC(px, 1, fx0, fx1);
- LD_GP2_INC(py, 1, fy0, fy1);
-
- tp0 = (c * fx0) + (s * fy0);
- tp1 = (c * fy0) - (s * fx0);
- tp2 = (c * fx1) + (s * fy1);
- tp3 = (c * fy1) - (s * fx1);
-
- ST_GP2_INC(tp0, tp2, x, 1);
- ST_GP2_INC(tp1, tp3, y, 1);
- }
- }
- }
- else /* at least one stride differs from 1: scalar path */
- {
- inc_x2 = 2 * inc_x; /* element stride -> FLOAT stride (two FLOATs per element) */
- inc_y2 = 2 * inc_y;
-
- if ((0 == c) && (0 == s)) /* both results are zero: store zeros, x and y are never read */
- {
- for (i = n; i--;)
- {
- *x = 0;
- *(x + 1) = 0;
- *y = 0;
- *(y + 1) = 0;
-
- x += inc_x2;
- y += inc_y2;
- }
- }
- else if ((1 == c) && (1 == s)) /* x <- x + y, y <- y - x */
- {
- if (n >> 1) /* at least one pair of elements */
- {
- fx0 = *px; /* prologue: preload two elements of x and of y */
- fx1 = *(px+1); px += inc_x2;
- fx2 = *px;
- fx3 = *(px+1); px += inc_x2;
-
- fy0 = *py;
- fy1 = *(py+1); py += inc_y2;
- fy2 = *py;
- fy3 = *(py+1); py += inc_y2;
- /* steady state, run (n >> 1) - 1 times; loads of the next pair are interleaved with stores of this one */
- for (i = (n >> 1) - 1; i--;)
- {
- tp0 = fx0 + fy0;
- tp1 = fx1 + fy1;
- tp2 = fy0 - fx0;
- tp3 = fy1 - fx1;
- tp4 = fx2 + fy2;
- tp5 = fx3 + fy3;
- tp6 = fy2 - fx2;
- tp7 = fy3 - fx3;
-
- fx0 = *px;
- *x = tp0;
- fx1 = *(px+1); px += inc_x2;
- *(x+1) = tp1; x += inc_x2;
- fx2 = *px;
- *x = tp4;
- fx3 = *(px+1); px += inc_x2;
- *(x+1) = tp5; x += inc_x2;
-
- fy0 = *py;
- *y = tp2;
- fy1 = *(py+1); py += inc_y2;
- *(y+1) = tp3; y += inc_y2;
- fy2 = *py;
- *y = tp6;
- fy3 = *(py+1); py += inc_y2;
- *(y+1) = tp7; y += inc_y2;
- }
- /* epilogue: finish the pair that is still held in fx0..fx3 / fy0..fy3 */
- tp0 = fx0 + fy0;
- tp1 = fx1 + fy1;
- tp2 = fy0 - fx0;
- tp3 = fy1 - fx1;
- tp4 = fx2 + fy2;
- tp5 = fx3 + fy3;
- tp6 = fy2 - fx2;
- tp7 = fy3 - fx3;
-
- *x = tp0;
- *(x+1) = tp1; x += inc_x2;
- *x = tp4;
- *(x+1) = tp5; x += inc_x2;
-
- *y = tp2;
- *(y+1) = tp3; y += inc_y2;
- *y = tp6;
- *(y+1) = tp7; y += inc_y2;
- }
- if (n & 1) /* odd n: last element */
- {
- fx0 = *px;
- fx1 = *(px+1);
-
- fy0 = *py;
- fy1 = *(py+1);
-
- tp0 = fx0 + fy0;
- tp1 = fx1 + fy1;
- tp2 = fy0 - fx0;
- tp3 = fy1 - fx1;
-
- *x = tp0;
- *(x+1) = tp1;
-
- *y = tp2;
- *(y+1) = tp3;
- }
- }
- else if (0 == s) /* x <- c * x, y <- c * y */
- {
- if (n >> 1) /* at least one pair of elements */
- {
- fx0 = *px; /* prologue: preload two elements of x and of y */
- fx1 = *(px+1); px += inc_x2;
- fx2 = *px;
- fx3 = *(px+1); px += inc_x2;
-
- fy0 = *py;
- fy1 = *(py+1); py += inc_y2;
- fy2 = *py;
- fy3 = *(py+1); py += inc_y2;
- /* steady state, run (n >> 1) - 1 times */
- for (i = (n >> 1) - 1; i--;)
- {
- tp0 = c * fx0; /* tp0..tp3 are the new x values, tp4..tp7 the new y values */
- tp1 = c * fx1;
- tp2 = c * fx2;
- tp3 = c * fx3;
- tp4 = c * fy0;
- tp5 = c * fy1;
- tp6 = c * fy2;
- tp7 = c * fy3;
-
- fx0 = *px;
- *x = tp0;
- fx1 = *(px+1); px += inc_x2;
- *(x+1) = tp1; x += inc_x2;
- fx2 = *px;
- *x = tp2;
- fx3 = *(px+1); px += inc_x2;
- *(x+1) = tp3; x += inc_x2;
- fy0 = *py;
- *y = tp4;
- fy1 = *(py+1); py += inc_y2;
- *(y+1) = tp5; y += inc_y2;
- fy2 = *py;
- *y = tp6;
- fy3 = *(py+1); py += inc_y2;
- *(y+1) = tp7; y += inc_y2;
- }
- /* epilogue: finish the pair that is still held in fx0..fx3 / fy0..fy3 */
- tp0 = c * fx0;
- tp1 = c * fx1;
- tp2 = c * fx2;
- tp3 = c * fx3;
- tp4 = c * fy0;
- tp5 = c * fy1;
- tp6 = c * fy2;
- tp7 = c * fy3;
-
- *x = tp0;
- *(x+1) = tp1; x += inc_x2;
- *x = tp2;
- *(x+1) = tp3; x += inc_x2;
-
- *y = tp4;
- *(y+1) = tp5; y += inc_y2;
- *y = tp6;
- *(y+1) = tp7; y += inc_y2;
- }
- if (n & 1) /* odd n: last element */
- {
- fx0 = *px;
- fx1 = *(px+1);
-
- fy0 = *py;
- fy1 = *(py+1);
-
- tp0 = c * fx0;
- tp1 = c * fx1;
- tp2 = c * fy0;
- tp3 = c * fy1;
-
- *x = tp0;
- *(x+1) = tp1;
-
- *y = tp2;
- *(y+1) = tp3;
- }
- }
- else /* general case; unlike the unit-stride path there is no separate c == 0 branch here */
- {
- if (n >> 1) /* at least one pair of elements */
- {
- fx0 = *px; /* prologue: preload two elements of x and of y */
- fx1 = *(px+1); px += inc_x2;
- fx2 = *px;
- fx3 = *(px+1); px += inc_x2;
- fy0 = *py;
- fy1 = *(py+1); py += inc_y2;
- fy2 = *py;
- fy3 = *(py+1); py += inc_y2;
- /* steady state, run (n >> 1) - 1 times */
- for (i = (n >> 1) - 1; i--;)
- {
- tp0 = c * fx0 + s * fy0; /* tp0, tp1, tp4, tp5 are new x values; tp2, tp3, tp6, tp7 are new y values */
- tp1 = c * fx1 + s * fy1;
- tp2 = c * fy0 - s * fx0;
- tp3 = c * fy1 - s * fx1;
- tp4 = c * fx2 + s * fy2;
- tp5 = c * fx3 + s * fy3;
- tp6 = c * fy2 - s * fx2;
- tp7 = c * fy3 - s * fx3;
-
- fx0 = *px;
- *x = tp0;
- fx1 = *(px+1); px += inc_x2;
- *(x+1) = tp1; x += inc_x2;
- fx2 = *px;
- *x = tp4;
- fx3 = *(px+1); px += inc_x2;
- *(x+1) = tp5; x += inc_x2;
- fy0 = *py;
- *y = tp2;
- fy1 = *(py+1); py += inc_y2;
- *(y+1) = tp3; y += inc_y2;
- fy2 = *py;
- *y = tp6;
- fy3 = *(py+1); py += inc_y2;
- *(y+1) = tp7; y += inc_y2;
- }
- /* epilogue: finish the pair that is still held in fx0..fx3 / fy0..fy3 */
- tp0 = c * fx0 + s * fy0;
- tp1 = c * fx1 + s * fy1;
- tp2 = c * fy0 - s * fx0;
- tp3 = c * fy1 - s * fx1;
- tp4 = c * fx2 + s * fy2;
- tp5 = c * fx3 + s * fy3;
- tp6 = c * fy2 - s * fx2;
- tp7 = c * fy3 - s * fx3;
-
- *x = tp0;
- *(x+1) = tp1; x += inc_x2;
- *x = tp4;
- *(x+1) = tp5; x += inc_x2;
- *y = tp2;
- *(y+1) = tp3; y += inc_y2;
- *y = tp6;
- *(y+1) = tp7; y += inc_y2;
- }
- if (n & 1) /* odd n: last element */
- {
- fx0 = *px;
- fx1 = *(px+1);
-
- fy0 = *py;
- fy1 = *(py+1);
-
- tp0 = c * fx0 + s * fy0;
- tp1 = c * fx1 + s * fy1;
- tp2 = c * fy0 - s * fx0;
- tp3 = c * fy1 - s * fx1;
-
- *x = tp0;
- *(x+1) = tp1;
-
- *y = tp2;
- *(y+1) = tp3;
- }
- }
- }
-
- return 0;
- }
|