|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923 |
- #include "common.h"
- /********************************
- ADD1 a*c
- ADD2 b*c
- ADD3 a*d
- ADD4 b*d
- *********************************/
- int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
- FLOAT* C,BLASLONG ldc, BLASLONG offset)
- {
- BLASLONG i,j,k;
- FLOAT *C0,*C1,*ptrba,*ptrbb;
- FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
- BLASLONG off, temp;
-
- #if defined(TRMMKERNEL) && !defined(LEFT)
- off = -offset;
- #endif
- for (j=0; j<bn/2; j+=1)
- {
- #if defined(TRMMKERNEL) && defined(LEFT)
- off = offset;
- #endif
- C0 = C;
- C1 = C0+2*ldc;
- ptrba = ba;
- for (i=0; i<bm/2; i+=1)
- {
- #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- ptrbb = bb;
- #else
- ptrba += off*2*2;
- ptrbb = bb+off*2*2;
- #endif
- res0 = 0;
- res1 = 0;
- res2 = 0;
- res3 = 0;
- res4 = 0;
- res5 = 0;
- res6 = 0;
- res7 = 0;
- #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
- temp = bk - off;
- #elif defined(LEFT)
- temp = off + 2;
- #else
- temp = off + 2;
- #endif
- for (k=0; k<temp/4; k+=1)
- {
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0-load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3+load5*load1;
- res2 = res2-load5*load3;
- res3 = res3+load4*load3;
- load6 = ptrbb[4*0+2];
- res4 = res4+load0*load6;
- res5 = res5+load2*load6;
- load7 = ptrbb[4*0+3];
- res4 = res4-load2*load7;
- res5 = res5+load0*load7;
- res6 = res6+load4*load6;
- res7 = res7+load5*load6;
- res6 = res6-load5*load7;
- res7 = res7+load4*load7;
- load8 = ptrba[4*1+0];
- load9 = ptrbb[4*1+0];
- res0 = res0+load8*load9;
- load10 = ptrba[4*1+1];
- res1 = res1+load10*load9;
- load11 = ptrbb[4*1+1];
- res0 = res0-load10*load11;
- res1 = res1+load8*load11;
- load12 = ptrba[4*1+2];
- res2 = res2+load12*load9;
- load13 = ptrba[4*1+3];
- res3 = res3+load13*load9;
- res2 = res2-load13*load11;
- res3 = res3+load12*load11;
- load14 = ptrbb[4*1+2];
- res4 = res4+load8*load14;
- res5 = res5+load10*load14;
- load15 = ptrbb[4*1+3];
- res4 = res4-load10*load15;
- res5 = res5+load8*load15;
- res6 = res6+load12*load14;
- res7 = res7+load13*load14;
- res6 = res6-load13*load15;
- res7 = res7+load12*load15;
- load0 = ptrba[4*2+0];
- load1 = ptrbb[4*2+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*2+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[4*2+1];
- res0 = res0-load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrba[4*2+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*2+3];
- res3 = res3+load5*load1;
- res2 = res2-load5*load3;
- res3 = res3+load4*load3;
- load6 = ptrbb[4*2+2];
- res4 = res4+load0*load6;
- res5 = res5+load2*load6;
- load7 = ptrbb[4*2+3];
- res4 = res4-load2*load7;
- res5 = res5+load0*load7;
- res6 = res6+load4*load6;
- res7 = res7+load5*load6;
- res6 = res6-load5*load7;
- res7 = res7+load4*load7;
- load8 = ptrba[4*3+0];
- load9 = ptrbb[4*3+0];
- res0 = res0+load8*load9;
- load10 = ptrba[4*3+1];
- res1 = res1+load10*load9;
- load11 = ptrbb[4*3+1];
- res0 = res0-load10*load11;
- res1 = res1+load8*load11;
- load12 = ptrba[4*3+2];
- res2 = res2+load12*load9;
- load13 = ptrba[4*3+3];
- res3 = res3+load13*load9;
- res2 = res2-load13*load11;
- res3 = res3+load12*load11;
- load14 = ptrbb[4*3+2];
- res4 = res4+load8*load14;
- res5 = res5+load10*load14;
- load15 = ptrbb[4*3+3];
- res4 = res4-load10*load15;
- res5 = res5+load8*load15;
- res6 = res6+load12*load14;
- res7 = res7+load13*load14;
- res6 = res6-load13*load15;
- res7 = res7+load12*load15;
- #endif
- #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0+load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3+load5*load1;
- res2 = res2+load5*load3;
- res3 = res3-load4*load3;
- load6 = ptrbb[4*0+2];
- res4 = res4+load0*load6;
- res5 = res5+load2*load6;
- load7 = ptrbb[4*0+3];
- res4 = res4+load2*load7;
- res5 = res5-load0*load7;
- res6 = res6+load4*load6;
- res7 = res7+load5*load6;
- res6 = res6+load5*load7;
- res7 = res7-load4*load7;
- load8 = ptrba[4*1+0];
- load9 = ptrbb[4*1+0];
- res0 = res0+load8*load9;
- load10 = ptrba[4*1+1];
- res1 = res1+load10*load9;
- load11 = ptrbb[4*1+1];
- res0 = res0+load10*load11;
- res1 = res1-load8*load11;
- load12 = ptrba[4*1+2];
- res2 = res2+load12*load9;
- load13 = ptrba[4*1+3];
- res3 = res3+load13*load9;
- res2 = res2+load13*load11;
- res3 = res3-load12*load11;
- load14 = ptrbb[4*1+2];
- res4 = res4+load8*load14;
- res5 = res5+load10*load14;
- load15 = ptrbb[4*1+3];
- res4 = res4+load10*load15;
- res5 = res5-load8*load15;
- res6 = res6+load12*load14;
- res7 = res7+load13*load14;
- res6 = res6+load13*load15;
- res7 = res7-load12*load15;
- load0 = ptrba[4*2+0];
- load1 = ptrbb[4*2+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*2+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[4*2+1];
- res0 = res0+load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrba[4*2+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*2+3];
- res3 = res3+load5*load1;
- res2 = res2+load5*load3;
- res3 = res3-load4*load3;
- load6 = ptrbb[4*2+2];
- res4 = res4+load0*load6;
- res5 = res5+load2*load6;
- load7 = ptrbb[4*2+3];
- res4 = res4+load2*load7;
- res5 = res5-load0*load7;
- res6 = res6+load4*load6;
- res7 = res7+load5*load6;
- res6 = res6+load5*load7;
- res7 = res7-load4*load7;
- load8 = ptrba[4*3+0];
- load9 = ptrbb[4*3+0];
- res0 = res0+load8*load9;
- load10 = ptrba[4*3+1];
- res1 = res1+load10*load9;
- load11 = ptrbb[4*3+1];
- res0 = res0+load10*load11;
- res1 = res1-load8*load11;
- load12 = ptrba[4*3+2];
- res2 = res2+load12*load9;
- load13 = ptrba[4*3+3];
- res3 = res3+load13*load9;
- res2 = res2+load13*load11;
- res3 = res3-load12*load11;
- load14 = ptrbb[4*3+2];
- res4 = res4+load8*load14;
- res5 = res5+load10*load14;
- load15 = ptrbb[4*3+3];
- res4 = res4+load10*load15;
- res5 = res5-load8*load15;
- res6 = res6+load12*load14;
- res7 = res7+load13*load14;
- res6 = res6+load13*load15;
- res7 = res7-load12*load15;
- #endif
- #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0+load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3-load5*load1;
- res2 = res2+load5*load3;
- res3 = res3+load4*load3;
- load6 = ptrbb[4*0+2];
- res4 = res4+load0*load6;
- res5 = res5-load2*load6;
- load7 = ptrbb[4*0+3];
- res4 = res4+load2*load7;
- res5 = res5+load0*load7;
- res6 = res6+load4*load6;
- res7 = res7-load5*load6;
- res6 = res6+load5*load7;
- res7 = res7+load4*load7;
- load8 = ptrba[4*1+0];
- load9 = ptrbb[4*1+0];
- res0 = res0+load8*load9;
- load10 = ptrba[4*1+1];
- res1 = res1-load10*load9;
- load11 = ptrbb[4*1+1];
- res0 = res0+load10*load11;
- res1 = res1+load8*load11;
- load12 = ptrba[4*1+2];
- res2 = res2+load12*load9;
- load13 = ptrba[4*1+3];
- res3 = res3-load13*load9;
- res2 = res2+load13*load11;
- res3 = res3+load12*load11;
- load14 = ptrbb[4*1+2];
- res4 = res4+load8*load14;
- res5 = res5-load10*load14;
- load15 = ptrbb[4*1+3];
- res4 = res4+load10*load15;
- res5 = res5+load8*load15;
- res6 = res6+load12*load14;
- res7 = res7-load13*load14;
- res6 = res6+load13*load15;
- res7 = res7+load12*load15;
- load0 = ptrba[4*2+0];
- load1 = ptrbb[4*2+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*2+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[4*2+1];
- res0 = res0+load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrba[4*2+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*2+3];
- res3 = res3-load5*load1;
- res2 = res2+load5*load3;
- res3 = res3+load4*load3;
- load6 = ptrbb[4*2+2];
- res4 = res4+load0*load6;
- res5 = res5-load2*load6;
- load7 = ptrbb[4*2+3];
- res4 = res4+load2*load7;
- res5 = res5+load0*load7;
- res6 = res6+load4*load6;
- res7 = res7-load5*load6;
- res6 = res6+load5*load7;
- res7 = res7+load4*load7;
- load8 = ptrba[4*3+0];
- load9 = ptrbb[4*3+0];
- res0 = res0+load8*load9;
- load10 = ptrba[4*3+1];
- res1 = res1-load10*load9;
- load11 = ptrbb[4*3+1];
- res0 = res0+load10*load11;
- res1 = res1+load8*load11;
- load12 = ptrba[4*3+2];
- res2 = res2+load12*load9;
- load13 = ptrba[4*3+3];
- res3 = res3-load13*load9;
- res2 = res2+load13*load11;
- res3 = res3+load12*load11;
- load14 = ptrbb[4*3+2];
- res4 = res4+load8*load14;
- res5 = res5-load10*load14;
- load15 = ptrbb[4*3+3];
- res4 = res4+load10*load15;
- res5 = res5+load8*load15;
- res6 = res6+load12*load14;
- res7 = res7-load13*load14;
- res6 = res6+load13*load15;
- res7 = res7+load12*load15;
- #endif
- #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0-load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3-load5*load1;
- res2 = res2-load5*load3;
- res3 = res3-load4*load3;
- load6 = ptrbb[4*0+2];
- res4 = res4+load0*load6;
- res5 = res5-load2*load6;
- load7 = ptrbb[4*0+3];
- res4 = res4-load2*load7;
- res5 = res5-load0*load7;
- res6 = res6+load4*load6;
- res7 = res7-load5*load6;
- res6 = res6-load5*load7;
- res7 = res7-load4*load7;
- load8 = ptrba[4*1+0];
- load9 = ptrbb[4*1+0];
- res0 = res0+load8*load9;
- load10 = ptrba[4*1+1];
- res1 = res1-load10*load9;
- load11 = ptrbb[4*1+1];
- res0 = res0-load10*load11;
- res1 = res1-load8*load11;
- load12 = ptrba[4*1+2];
- res2 = res2+load12*load9;
- load13 = ptrba[4*1+3];
- res3 = res3-load13*load9;
- res2 = res2-load13*load11;
- res3 = res3-load12*load11;
- load14 = ptrbb[4*1+2];
- res4 = res4+load8*load14;
- res5 = res5-load10*load14;
- load15 = ptrbb[4*1+3];
- res4 = res4-load10*load15;
- res5 = res5-load8*load15;
- res6 = res6+load12*load14;
- res7 = res7-load13*load14;
- res6 = res6-load13*load15;
- res7 = res7-load12*load15;
- load0 = ptrba[4*2+0];
- load1 = ptrbb[4*2+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*2+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[4*2+1];
- res0 = res0-load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrba[4*2+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*2+3];
- res3 = res3-load5*load1;
- res2 = res2-load5*load3;
- res3 = res3-load4*load3;
- load6 = ptrbb[4*2+2];
- res4 = res4+load0*load6;
- res5 = res5-load2*load6;
- load7 = ptrbb[4*2+3];
- res4 = res4-load2*load7;
- res5 = res5-load0*load7;
- res6 = res6+load4*load6;
- res7 = res7-load5*load6;
- res6 = res6-load5*load7;
- res7 = res7-load4*load7;
- load8 = ptrba[4*3+0];
- load9 = ptrbb[4*3+0];
- res0 = res0+load8*load9;
- load10 = ptrba[4*3+1];
- res1 = res1-load10*load9;
- load11 = ptrbb[4*3+1];
- res0 = res0-load10*load11;
- res1 = res1-load8*load11;
- load12 = ptrba[4*3+2];
- res2 = res2+load12*load9;
- load13 = ptrba[4*3+3];
- res3 = res3-load13*load9;
- res2 = res2-load13*load11;
- res3 = res3-load12*load11;
- load14 = ptrbb[4*3+2];
- res4 = res4+load8*load14;
- res5 = res5-load10*load14;
- load15 = ptrbb[4*3+3];
- res4 = res4-load10*load15;
- res5 = res5-load8*load15;
- res6 = res6+load12*load14;
- res7 = res7-load13*load14;
- res6 = res6-load13*load15;
- res7 = res7-load12*load15;
- #endif
- ptrba = ptrba+16;
- ptrbb = ptrbb+16;
- }
- for (k=0; k<(temp&3); k+=1)
- {
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0-load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3+load5*load1;
- res2 = res2-load5*load3;
- res3 = res3+load4*load3;
- load6 = ptrbb[4*0+2];
- res4 = res4+load0*load6;
- res5 = res5+load2*load6;
- load7 = ptrbb[4*0+3];
- res4 = res4-load2*load7;
- res5 = res5+load0*load7;
- res6 = res6+load4*load6;
- res7 = res7+load5*load6;
- res6 = res6-load5*load7;
- res7 = res7+load4*load7;
- #endif
- #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0+load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3+load5*load1;
- res2 = res2+load5*load3;
- res3 = res3-load4*load3;
- load6 = ptrbb[4*0+2];
- res4 = res4+load0*load6;
- res5 = res5+load2*load6;
- load7 = ptrbb[4*0+3];
- res4 = res4+load2*load7;
- res5 = res5-load0*load7;
- res6 = res6+load4*load6;
- res7 = res7+load5*load6;
- res6 = res6+load5*load7;
- res7 = res7-load4*load7;
- #endif
- #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0+load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3-load5*load1;
- res2 = res2+load5*load3;
- res3 = res3+load4*load3;
- load6 = ptrbb[4*0+2];
- res4 = res4+load0*load6;
- res5 = res5-load2*load6;
- load7 = ptrbb[4*0+3];
- res4 = res4+load2*load7;
- res5 = res5+load0*load7;
- res6 = res6+load4*load6;
- res7 = res7-load5*load6;
- res6 = res6+load5*load7;
- res7 = res7+load4*load7;
- #endif
- #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0-load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3-load5*load1;
- res2 = res2-load5*load3;
- res3 = res3-load4*load3;
- load6 = ptrbb[4*0+2];
- res4 = res4+load0*load6;
- res5 = res5-load2*load6;
- load7 = ptrbb[4*0+3];
- res4 = res4-load2*load7;
- res5 = res5-load0*load7;
- res6 = res6+load4*load6;
- res7 = res7-load5*load6;
- res6 = res6-load5*load7;
- res7 = res7-load4*load7;
- #endif
- ptrba = ptrba+4;
- ptrbb = ptrbb+4;
- }
- load0 = res0*alphar-res1*alphai;
- load1 = res1*alphar+res0*alphai;
- C0[0] = load0;
- C0[1] = load1;
-
- load2 = res2*alphar-res3*alphai;
- load3 = res3*alphar+res2*alphai;
- C0[2] = load2;
- C0[3] = load3;
-
- load4 = res4*alphar-res5*alphai;
- load5 = res5*alphar+res4*alphai;
- C1[0] = load4;
- C1[1] = load5;
-
- load6 = res6*alphar-res7*alphai;
- load7 = res7*alphar+res6*alphai;
- C1[2] = load6;
- C1[3] = load7;
- #if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
- temp = bk - off;
- #ifdef LEFT
- temp -= 2;
- #else
- temp -= 2;
- #endif
- ptrba += temp*2*2;
- ptrbb += temp*2*2;
- #endif
- #ifdef LEFT
- off += 2;
- #endif
-
- C0 = C0+4;
- C1 = C1+4;
- }
- for (i=0; i<(bm&1); i+=1)
- {
- #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- ptrbb = bb;
- #else
- ptrba += off*2;
- ptrbb = bb + off*2*2;
- #endif
- res0 = 0;
- res1 = 0;
- res2 = 0;
- res3 = 0;
- #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
- temp = bk - off;
- #elif defined(LEFT)
- temp = off+1;
- #else
- temp = off+2;
- #endif
- for (k=0; k<temp; k+=1)
- {
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- load0 = ptrba[2*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[2*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0-load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrbb[4*0+2];
- res2 = res2+load0*load4;
- res3 = res3+load2*load4;
- load5 = ptrbb[4*0+3];
- res2 = res2-load2*load5;
- res3 = res3+load0*load5;
- #endif
- #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
- load0 = ptrba[2*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[2*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0+load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrbb[4*0+2];
- res2 = res2+load0*load4;
- res3 = res3+load2*load4;
- load5 = ptrbb[4*0+3];
- res2 = res2+load2*load5;
- res3 = res3-load0*load5;
- #endif
- #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
- load0 = ptrba[2*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[2*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0+load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrbb[4*0+2];
- res2 = res2+load0*load4;
- res3 = res3-load2*load4;
- load5 = ptrbb[4*0+3];
- res2 = res2+load2*load5;
- res3 = res3+load0*load5;
- #endif
- #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
- load0 = ptrba[2*0+0];
- load1 = ptrbb[4*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[2*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[4*0+1];
- res0 = res0-load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrbb[4*0+2];
- res2 = res2+load0*load4;
- res3 = res3-load2*load4;
- load5 = ptrbb[4*0+3];
- res2 = res2-load2*load5;
- res3 = res3-load0*load5;
- #endif
- ptrba = ptrba+2;
- ptrbb = ptrbb+4;
- }
- load0 = res0*alphar-res1*alphai;
- load1 = res1*alphar+res0*alphai;
- C0[0] = load0;
- C0[1] = load1;
-
- load2 = res2*alphar-res3*alphai;
- load3 = res3*alphar+res2*alphai;
- C1[0] = load2;
- C1[1] = load3;
- #if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
- temp = bk - off;
- #ifdef LEFT
- temp -= 1;
- #else
- temp -= 2;
- #endif
- ptrba += temp*2;
- ptrbb += temp*2*2;
- #endif
- #ifdef LEFT
- off += 1;
- #endif
- C0 = C0+2;
- C1 = C1+2;
- }
- #if defined(TRMMKERNEL) && !defined(LEFT)
- off += 2;
- #endif
- k = (bk<<2);
- bb = bb+k;
- i = (ldc<<2);
- C = C+i;
- }
- for (j=0; j<(bn&1); j+=1)
- {
- C0 = C;
- #if defined(TRMMKERNEL) && defined(LEFT)
- off = offset;
- #endif
- ptrba = ba;
- for (i=0; i<bm/2; i+=1)
- {
- #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- ptrbb = bb;
- #else
- ptrba += off*2*2;
- ptrbb = bb+off*2;
- #endif
- res0 = 0;
- res1 = 0;
- res2 = 0;
- res3 = 0;
- #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
- temp = bk - off;
- #elif defined(LEFT)
- temp = off + 2;
- #else
- temp = off + 1;
- #endif
- for (k=0; k<temp; k+=1)
- {
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[2*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[2*0+1];
- res0 = res0-load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3+load5*load1;
- res2 = res2-load5*load3;
- res3 = res3+load4*load3;
- #endif
- #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[2*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[2*0+1];
- res0 = res0+load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3+load5*load1;
- res2 = res2+load5*load3;
- res3 = res3-load4*load3;
- #endif
- #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[2*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[2*0+1];
- res0 = res0+load2*load3;
- res1 = res1+load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3-load5*load1;
- res2 = res2+load5*load3;
- res3 = res3+load4*load3;
- #endif
- #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
- load0 = ptrba[4*0+0];
- load1 = ptrbb[2*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[4*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[2*0+1];
- res0 = res0-load2*load3;
- res1 = res1-load0*load3;
- load4 = ptrba[4*0+2];
- res2 = res2+load4*load1;
- load5 = ptrba[4*0+3];
- res3 = res3-load5*load1;
- res2 = res2-load5*load3;
- res3 = res3-load4*load3;
- #endif
- ptrba = ptrba+4;
- ptrbb = ptrbb+2;
- }
- load0 = res0*alphar-res1*alphai;
- load1 = res1*alphar+res0*alphai;
- C0[0] = load0;
- C0[1] = load1;
-
- load2 = res2*alphar-res3*alphai;
- load3 = res3*alphar+res2*alphai;
- C0[2] = load2;
- C0[3] = load3;
- #if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
- temp = bk-off;
- #ifdef LEFT
- temp -= 2;
- #else
- temp -= 1;
- #endif
- ptrba += temp*2*2;
- ptrbb += temp*2;
- #endif
- #ifdef LEFT
- off += 2;
- #endif
- C0 = C0+4;
- }
- for (i=0; i<(bm&1); i+=1)
- {
- #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- ptrbb = bb;
- #else
- ptrba += off*2;
- ptrbb = bb + off*2;
- #endif
- res0 = 0;
- res1 = 0;
- #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
- temp = bk-off;
- #elif defined(LEFT)
- temp = off + 1;
- #else
- temp = off + 1;
- #endif
- for (k=0; k<temp; k+=1)
- {
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- load0 = ptrba[2*0+0];
- load1 = ptrbb[2*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[2*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[2*0+1];
- res0 = res0-load2*load3;
- res1 = res1+load0*load3;
- #endif
- #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
- load0 = ptrba[2*0+0];
- load1 = ptrbb[2*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[2*0+1];
- res1 = res1+load2*load1;
- load3 = ptrbb[2*0+1];
- res0 = res0+load2*load3;
- res1 = res1-load0*load3;
- #endif
- #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
- load0 = ptrba[2*0+0];
- load1 = ptrbb[2*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[2*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[2*0+1];
- res0 = res0+load2*load3;
- res1 = res1+load0*load3;
- #endif
- #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
- load0 = ptrba[2*0+0];
- load1 = ptrbb[2*0+0];
- res0 = res0+load0*load1;
- load2 = ptrba[2*0+1];
- res1 = res1-load2*load1;
- load3 = ptrbb[2*0+1];
- res0 = res0-load2*load3;
- res1 = res1-load0*load3;
- #endif
- ptrba = ptrba+2;
- ptrbb = ptrbb+2;
- }
- load0 = res0*alphar-res1*alphai;
- load1 = res1*alphar+res0*alphai;
- C0[0] = load0;
- C0[1] = load1;
- #if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
- temp = bk - off;
- #ifdef LEFT
- temp -= 1;
- #else
- temp -= 1;
- #endif
- ptrba += temp*2;
- ptrbb += temp*2;
- #endif
- #ifdef LEFT
- off += 1;
- #endif
- C0 = C0+2;
- }
- k = (bk<<1);
- bb = bb+k;
- i = (ldc<<1);
- C = C+i;
- }
- return 0;
- }
|