@@ -34,8 +34,10 @@ Please read GotoBLAS_01Readme.txt | |||
Additional support CPU: | |||
x86_64: | |||
Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. | |||
Intel Sandy Bridge | |||
MIPS64: | |||
ICT Loongson 3A //Level 3 BLAS subroutines are optimized. | |||
ICT Loongson 3A | |||
ICT Loongson 3B (Experimental) | |||
4.Usages | |||
Link with libopenblas.a or -lopenblas for shared library. | |||
@@ -70,10 +72,10 @@ OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | |||
8.ChangeLog | |||
Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version. | |||
9.Known Issues | |||
* The number of CPUs/cores should be less than or equal to 8*sizeof(unsigned long). On 64-bit systems, the limit | |||
is 64. On 32-bit systems, it is 32. | |||
* On Loongson 3A, "make test" may fail because of a pthread_create error (the error code is EAGAIN). However, the same test case runs correctly when started from the shell. We do not believe this is a bug in OpenBLAS. | |||
9.Troubleshooting | |||
* Please use Clang version 3.1 or above to compile the library on the Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code. | |||
* The number of CPUs/cores should be less than or equal to 256. | |||
* On Loongson 3A, "make test" may fail because of a pthread_create error (the error code is EAGAIN). However, the same test case runs correctly when started from the shell. | |||
10. Specification of Git Branches | |||
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||
@@ -425,6 +425,7 @@ REALNAME: | |||
#define ALIGN_2 .align 2 | |||
#define ALIGN_3 .align 3 | |||
#define ALIGN_4 .align 4 | |||
#define ALIGN_5 .align 5 | |||
#define ffreep fstp | |||
#endif | |||
@@ -0,0 +1,235 @@ | |||
/***************************************************************************** | |||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the ISCAS nor the names of its contributors may | |||
be used to endorse or promote products derived from this software | |||
without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
**********************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Pack a source matrix into dest, interleaving panels of 4, 2 and 1 columns.
 *
 * FLOAT values are always moved in adjacent pairs (presumably the real and
 * imaginary parts of one complex element -- the routine itself only copies).
 *
 *   row    - number of element pairs taken from each source column
 *   col    - number of source columns
 *   src    - first source column; consecutive columns are 2*srcdim FLOATs apart
 *   srcdim - source leading dimension, counted in element pairs
 *   dest   - packed output, written sequentially
 *
 * Columns are consumed as col/4 panels of four, then one panel of two when
 * (col & 2) is set, then one single column when (col & 1) is set.  Inside a
 * panel the output is row-major: for each row, one pair from every column of
 * the panel.  Each panel occupies 2*width*row FLOATs of dest.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row, BLASLONG col, FLOAT *src, BLASLONG srcdim, FLOAT *dest)
{
    const BLASLONG col_stride = 2 * srcdim;
    BLASLONG width, height, panels, groups, p, r, c;

    for (width = 4; width >= 1; width >>= 1) {
        /* Only the widest panel can repeat; narrower ones occur at most once. */
        panels = (width == 4) ? col / 4 : ((col & width) ? 1 : 0);

        for (p = 0; p < panels; p++) {
            const FLOAT *in = src;
            FLOAT *out = dest;

            /* Rows come in tiers: row/4 groups of four, then an optional
               pair (row & 2), then an optional single row (row & 1). */
            for (height = 4; height >= 1; height >>= 1) {
                groups = (height == 4) ? row / 4 : ((row & height) ? 1 : 0);

                for (r = 0; r < groups * height; r++) {
                    for (c = 0; c < width; c++) {
                        *out++ = in[c * col_stride];
                        *out++ = in[c * col_stride + 1];
                    }
                    in += 2;
                }
            }

            src += width * col_stride;
            dest += 2 * width * row;
        }
    }

    return 0;
}
@@ -0,0 +1,401 @@ | |||
/***************************************************************************** | |||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the ISCAS nor the names of its contributors may | |||
be used to endorse or promote products derived from this software | |||
without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
**********************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Pack a source matrix into dest, interleaving panels of 8, 4, 2 and 1
 * columns.
 *
 * FLOAT values are always moved in adjacent pairs (presumably the real and
 * imaginary parts of one complex element -- the routine itself only copies).
 *
 *   row    - number of element pairs taken from each source column
 *   col    - number of source columns
 *   src    - first source column; consecutive columns are 2*srcdim FLOATs apart
 *   srcdim - source leading dimension, counted in element pairs
 *   dest   - packed output, written sequentially
 *
 * Columns are consumed as col/8 panels of eight, then one panel each of four,
 * two and one column when the matching bit of col is set.  Inside a panel the
 * output is row-major: for each row, one pair from every column of the panel.
 * Each panel occupies 2*width*row FLOATs of dest.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row, BLASLONG col, FLOAT *src, BLASLONG srcdim, FLOAT *dest)
{
    const BLASLONG col_stride = 2 * srcdim;
    BLASLONG width, height, panels, groups, p, r, c;

    for (width = 8; width >= 1; width >>= 1) {
        /* Only the widest panel can repeat; narrower ones occur at most once. */
        panels = (width == 8) ? col / 8 : ((col & width) ? 1 : 0);

        for (p = 0; p < panels; p++) {
            const FLOAT *in = src;
            FLOAT *out = dest;

            /* Rows come in tiers: row/4 groups of four, then an optional
               pair (row & 2), then an optional single row (row & 1). */
            for (height = 4; height >= 1; height >>= 1) {
                groups = (height == 4) ? row / 4 : ((row & height) ? 1 : 0);

                for (r = 0; r < groups * height; r++) {
                    for (c = 0; c < width; c++) {
                        *out++ = in[c * col_stride];
                        *out++ = in[c * col_stride + 1];
                    }
                    in += 2;
                }
            }

            src += width * col_stride;
            dest += 2 * width * row;
        }
    }

    return 0;
}
@@ -0,0 +1,237 @@ | |||
/***************************************************************************** | |||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the ISCAS nor the names of its contributors may | |||
be used to endorse or promote products derived from this software | |||
without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
**********************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Pack a source matrix into dest in blocks of 4 element pairs per source
 * line, with the leftover 2-pair and 1-pair tails stored in separate regions.
 *
 * FLOAT values are always moved in adjacent pairs (presumably the real and
 * imaginary parts of one complex element -- the routine itself only copies).
 *
 *   row    - number of source lines; consecutive lines are 2*srcdim FLOATs apart
 *   col    - number of element pairs read from each source line
 *   src    - first source line
 *   srcdim - source leading dimension, counted in element pairs
 *   dest   - packed output
 *
 * Output layout:
 *   - full blocks: block p of a line goes to dest + 8*line + p*(8*row);
 *   - the 2-pair tail (col & 2) region starts at dest + (col & -4)*(2*row);
 *   - the 1-pair tail (col & 1) region starts at dest + (col & -2)*(2*row).
 *
 * Source lines are consumed in tiers: row/4 groups of four, then one pair of
 * lines when (row & 2) is set, then a single line when (row & 1) is set.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row, BLASLONG col, FLOAT *src, BLASLONG srcdim, FLOAT *dest)
{
    const BLASLONG row_stride = 2 * srcdim;
    FLOAT *tail[2];          /* write cursors for the 4-FLOAT and 2-FLOAT tails */
    BLASLONG height, groups, chunk, off;
    BLASLONG g, p, k, e, t;

    for (t = 0; t < 2; t++) {
        chunk = 4 >> t;      /* FLOATs per line in this tail: 4, then 2 */
        tail[t] = dest + (col & -chunk) * (2 * row);
    }

    for (height = 4; height >= 1; height >>= 1) {
        groups = (height == 4) ? row / 4 : ((row & height) ? 1 : 0);

        for (g = 0; g < groups; g++) {
            FLOAT *out = dest;

            off = 0;         /* FLOAT offset already consumed from each line */

            /* Full blocks of 8 FLOATs per line; successive blocks are 8*row apart. */
            for (p = 0; p < col / 4; p++) {
                for (k = 0; k < height; k++) {
                    for (e = 0; e < 8; e++) {
                        out[8 * k + e] = src[k * row_stride + off + e];
                    }
                }
                off += 8;
                out += 8 * row;
            }

            /* Tails: 2 pairs when (col & 2), then 1 pair when (col & 1). */
            for (t = 0; t < 2; t++) {
                chunk = 4 >> t;
                if (col & (chunk >> 1)) {
                    for (k = 0; k < height; k++) {
                        for (e = 0; e < chunk; e++) {
                            tail[t][chunk * k + e] = src[k * row_stride + off + e];
                        }
                    }
                    off += chunk;
                    tail[t] += chunk * height;
                }
            }

            src += height * row_stride;
            dest += 8 * height;
        }
    }

    return 0;
}
@@ -0,0 +1,370 @@ | |||
/***************************************************************************** | |||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the ISCAS nor the names of its contributors may | |||
be used to endorse or promote products derived from this software | |||
without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
**********************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Pack a source matrix into dest in blocks of 8 element pairs per source
 * line, with the leftover 4-, 2- and 1-pair tails stored in separate regions.
 *
 * FLOAT values are always moved in adjacent pairs (presumably the real and
 * imaginary parts of one complex element -- the routine itself only copies).
 *
 *   row    - number of source lines; consecutive lines are 2*srcdim FLOATs apart
 *   col    - number of element pairs read from each source line
 *   src    - first source line
 *   srcdim - source leading dimension, counted in element pairs
 *   dest   - packed output
 *
 * Output layout:
 *   - full blocks: block p of a line goes to dest + 16*line + p*(16*row);
 *   - the 4-pair tail (col & 4) region starts at dest + (col & -8)*(2*row);
 *   - the 2-pair tail (col & 2) region starts at dest + (col & -4)*(2*row);
 *   - the 1-pair tail (col & 1) region starts at dest + (col & -2)*(2*row).
 *
 * Source lines are consumed in tiers: row/4 groups of four, then one pair of
 * lines when (row & 2) is set, then a single line when (row & 1) is set.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row, BLASLONG col, FLOAT *src, BLASLONG srcdim, FLOAT *dest)
{
    const BLASLONG row_stride = 2 * srcdim;
    FLOAT *tail[3];          /* write cursors for the 8-, 4- and 2-FLOAT tails */
    BLASLONG height, groups, chunk, off;
    BLASLONG g, p, k, e, t;

    for (t = 0; t < 3; t++) {
        chunk = 8 >> t;      /* FLOATs per line in this tail: 8, 4, then 2 */
        tail[t] = dest + (col & -chunk) * (2 * row);
    }

    for (height = 4; height >= 1; height >>= 1) {
        groups = (height == 4) ? row / 4 : ((row & height) ? 1 : 0);

        for (g = 0; g < groups; g++) {
            FLOAT *out = dest;

            off = 0;         /* FLOAT offset already consumed from each line */

            /* Full blocks of 16 FLOATs per line; successive blocks are 16*row apart. */
            for (p = 0; p < col / 8; p++) {
                for (k = 0; k < height; k++) {
                    for (e = 0; e < 16; e++) {
                        out[16 * k + e] = src[k * row_stride + off + e];
                    }
                }
                off += 16;
                out += 16 * row;
            }

            /* Tails: 4 pairs when (col & 4), 2 pairs when (col & 2), 1 pair when (col & 1). */
            for (t = 0; t < 3; t++) {
                chunk = 8 >> t;
                if (col & (chunk >> 1)) {
                    for (k = 0; k < height; k++) {
                        for (e = 0; e < chunk; e++) {
                            tail[t][chunk * k + e] = src[k * row_stride + off + e];
                        }
                    }
                    off += chunk;
                    tail[t] += chunk * height;
                }
            }

            src += height * row_stride;
            dest += 16 * height;
        }
    }

    return 0;
}
@@ -1,59 +1,84 @@ | |||
# Kernel/copy-routine selection: each variable names the source file (or, for
# the *OBJ variables, the object file) used for one GEMM/TRSM building block.
# NOTE(review): many variables in this span are assigned twice (e.g.
# SGEMMKERNEL just below). The preceding hunk header (-1,59 +1,84) suggests this
# is a diff whose removed and added lines are interleaved with the +/- markers
# lost. If both lines really coexist, make uses the later assignment -- confirm
# against the real file before relying on any single line.

# SGEMM: Nehalem 4x8 kernel with assembly copy routines, followed by the 8x8
# _sandy kernel with the INCOPY/ITCOPY sources and objects left empty.
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||
SGEMMINCOPY = gemm_ncopy_4.S | |||
SGEMMITCOPY = gemm_tcopy_4.S | |||
SGEMMKERNEL = sgemm_kernel_8x8_sandy.S | |||
SGEMMINCOPY = | |||
SGEMMITCOPY = | |||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMINCOPYOBJ = | |||
SGEMMITCOPYOBJ = | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# DGEMM: Nehalem 2x8 kernel entries, then the 4x8 _sandy kernel using the
# generic C copy routines; the assembly gemm_ncopy_4.S / gemm_tcopy_4.S
# alternatives for ONCOPY/OTCOPY are commented out.
DGEMMKERNEL = gemm_kernel_2x8_nehalem.S | |||
DGEMMINCOPY = dgemm_ncopy_2.S | |||
DGEMMITCOPY = dgemm_tcopy_2.S | |||
DGEMMONCOPY = dgemm_ncopy_8.S | |||
DGEMMOTCOPY = dgemm_tcopy_8.S | |||
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S | |||
DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
#DGEMMONCOPY = gemm_ncopy_4.S | |||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
#DGEMMOTCOPY = gemm_tcopy_4.S | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# CGEMM: Nehalem 2x4 kernel entries (also kept once more as a comment), then
# the 4x8 _sandy kernel with the *_sandy.c copy routines from ../generic.
CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||
CGEMMINCOPY = zgemm_ncopy_2.S | |||
CGEMMITCOPY = zgemm_tcopy_2.S | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||
CGEMMKERNEL = cgemm_kernel_4x8_sandy.S | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# ZGEMM: Nehalem 1x4 kernel entries, then the 4x4 _sandy kernel; the later
# INCOPY/ITCOPY source and object assignments are empty.
ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||
ZGEMMINCOPY = zgemm_ncopy_1.S | |||
ZGEMMITCOPY = zgemm_tcopy_1.S | |||
#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S | |||
ZGEMMINCOPY = | |||
ZGEMMITCOPY = | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMINCOPYOBJ = | |||
ZGEMMITCOPYOBJ = | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# TRSM kernels, Nehalem assembly entries. Every *_RN entry, and also
# ZTRSMKERNEL_LN, names an LT source file rather than one of its own.
STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||
STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||
STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||
STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||
DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||
DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||
DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||
DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||
# The same Nehalem TRSM entries again, this time commented out.
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||
#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||
#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||
#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||
#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||
#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||
#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||
#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||
#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||
#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||
#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||
#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||
#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||
#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||
# Generic C TRSM kernels: the S, D, C and Z variants all name the same four
# source files under ../generic (one each for LN, LT, RN and RT).
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
# 3M GEMM kernels: both entries name Nehalem assembly sources.
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | |||
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S |
@@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
#if defined(BARCELONA) || defined(SHANGHAI) | |||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
#define SNUMOPT 8 | |||
#define DNUMOPT 4 | |||
@@ -208,68 +208,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
#define SGEMM_DEFAULT_R sgemm_r | |||
#define QGEMM_DEFAULT_R qgemm_r | |||
#define DGEMM_DEFAULT_R dgemm_r | |||
#define CGEMM_DEFAULT_R cgemm_r | |||
#define ZGEMM_DEFAULT_R zgemm_r | |||
#define XGEMM_DEFAULT_R xgemm_r | |||
#define SYMV_P 16 | |||
#define HAVE_EXCLUSIVE_CACHE | |||
#define GEMM_THREAD gemm_thread_mn | |||
#endif | |||
#if defined(BOBCAT) | |||
#define SNUMOPT 8 | |||
#define DNUMOPT 4 | |||
#define GEMM_DEFAULT_OFFSET_A 64 | |||
#define GEMM_DEFAULT_OFFSET_B 832 | |||
#define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||
#define QGEMM_DEFAULT_UNROLL_N 2 | |||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||
#define XGEMM_DEFAULT_UNROLL_N 1 | |||
#ifdef ARCH_X86 | |||
#define SGEMM_DEFAULT_UNROLL_M 4 | |||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||
#define ZGEMM_DEFAULT_UNROLL_M 1 | |||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||
#else | |||
#define SGEMM_DEFAULT_UNROLL_M 8 | |||
#define DGEMM_DEFAULT_UNROLL_M 4 | |||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 4 | |||
#define ZGEMM_DEFAULT_UNROLL_M 2 | |||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||
#endif | |||
#define SGEMM_DEFAULT_P 448 | |||
#define DGEMM_DEFAULT_P 224 | |||
#define QGEMM_DEFAULT_P 112 | |||
#define CGEMM_DEFAULT_P 224 | |||
#define ZGEMM_DEFAULT_P 112 | |||
#define XGEMM_DEFAULT_P 56 | |||
#define SGEMM_DEFAULT_Q 224 | |||
#define DGEMM_DEFAULT_Q 224 | |||
#define QGEMM_DEFAULT_Q 224 | |||
#define CGEMM_DEFAULT_Q 224 | |||
#define ZGEMM_DEFAULT_Q 224 | |||
#define XGEMM_DEFAULT_Q 224 | |||
#define SGEMM_DEFAULT_R sgemm_r | |||
#define QGEMM_DEFAULT_R qgemm_r | |||
#define DGEMM_DEFAULT_R dgemm_r | |||
@@ -980,7 +918,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define SNUMOPT 8 | |||
#define DNUMOPT 4 | |||
#define GEMM_DEFAULT_OFFSET_A 32 | |||
#define GEMM_DEFAULT_OFFSET_A 0 | |||
#define GEMM_DEFAULT_OFFSET_B 0 | |||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
@@ -990,57 +928,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#ifdef ARCH_X86 | |||
#define SGEMM_DEFAULT_UNROLL_M 4 | |||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||
#define DGEMM_DEFAULT_UNROLL_M 8 | |||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||
#define ZGEMM_DEFAULT_UNROLL_M 1 | |||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||
#define DGEMM_DEFAULT_UNROLL_N 8 | |||
#define QGEMM_DEFAULT_UNROLL_N 2 | |||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||
#define XGEMM_DEFAULT_UNROLL_N 1 | |||
#else | |||
#define SGEMM_DEFAULT_UNROLL_M 4 | |||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||
#define SGEMM_DEFAULT_UNROLL_M 8 | |||
#define DGEMM_DEFAULT_UNROLL_M 8 | |||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||
#define ZGEMM_DEFAULT_UNROLL_M 1 | |||
#define CGEMM_DEFAULT_UNROLL_M 8 | |||
#define ZGEMM_DEFAULT_UNROLL_M 4 | |||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||
#define SGEMM_DEFAULT_UNROLL_N 8 | |||
#define DGEMM_DEFAULT_UNROLL_N 8 | |||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||
#define QGEMM_DEFAULT_UNROLL_N 2 | |||
#define CGEMM_DEFAULT_UNROLL_N 4 | |||
#define ZGEMM_DEFAULT_UNROLL_N 4 | |||
#define XGEMM_DEFAULT_UNROLL_N 1 | |||
#endif | |||
#define SGEMM_DEFAULT_P 504 | |||
#define SGEMM_DEFAULT_P 512 | |||
#define SGEMM_DEFAULT_R sgemm_r | |||
//#define SGEMM_DEFAULT_R 1024 | |||
#define DGEMM_DEFAULT_P 504 | |||
#define DGEMM_DEFAULT_P 512 | |||
#define DGEMM_DEFAULT_R dgemm_r | |||
//#define DGEMM_DEFAULT_R 1024 | |||
#define QGEMM_DEFAULT_P 504 | |||
#define QGEMM_DEFAULT_R qgemm_r | |||
#define CGEMM_DEFAULT_P 252 | |||
#define CGEMM_DEFAULT_R cgemm_r | |||
#define CGEMM_DEFAULT_P 128 | |||
//#define CGEMM_DEFAULT_R cgemm_r | |||
#define CGEMM_DEFAULT_R 1024 | |||
#define ZGEMM_DEFAULT_P 252 | |||
#define ZGEMM_DEFAULT_P 512 | |||
#define ZGEMM_DEFAULT_R zgemm_r | |||
//#define ZGEMM_DEFAULT_R 1024 | |||
#define XGEMM_DEFAULT_P 252 | |||
#define XGEMM_DEFAULT_R xgemm_r | |||
#define SGEMM_DEFAULT_Q 512 | |||
#define SGEMM_DEFAULT_Q 256 | |||
#define DGEMM_DEFAULT_Q 256 | |||
#define QGEMM_DEFAULT_Q 128 | |||
#define CGEMM_DEFAULT_Q 512 | |||
#define ZGEMM_DEFAULT_Q 256 | |||
#define CGEMM_DEFAULT_Q 256 | |||
#define ZGEMM_DEFAULT_Q 192 | |||
#define XGEMM_DEFAULT_Q 128 | |||
#define GETRF_FACTOR 0.72 | |||