Browse Source

STRSM optimization for MIPS P5600 and I6400 using MSA

Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
tags/v0.2.19^2
Kaustubh Raste 9 years ago
parent
commit
ad9f317870
11 changed files with 8549 additions and 12 deletions
  1. +1
    -0
      CONTRIBUTORS.md
  2. +4
    -4
      kernel/mips/KERNEL.P5600
  3. +3
    -3
      kernel/mips/dtrsm_kernel_LN_8x4_msa.c
  4. +3
    -3
      kernel/mips/dtrsm_kernel_LT_8x4_msa.c
  5. +1
    -1
      kernel/mips/dtrsm_kernel_RN_8x4_msa.c
  6. +1
    -1
      kernel/mips/dtrsm_kernel_RT_8x4_msa.c
  7. +24
    -0
      kernel/mips/macros_msa.h
  8. +2133
    -0
      kernel/mips/strsm_kernel_LN_8x8_msa.c
  9. +2099
    -0
      kernel/mips/strsm_kernel_LT_8x8_msa.c
  10. +2162
    -0
      kernel/mips/strsm_kernel_RN_8x8_msa.c
  11. +2118
    -0
      kernel/mips/strsm_kernel_RT_8x8_msa.c

+ 1
- 0
CONTRIBUTORS.md View File

@@ -160,3 +160,4 @@ In chronological order:

* Kaustubh Raste <https://github.com/ksraste/>
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA

+ 4
- 4
kernel/mips/KERNEL.P5600 View File

@@ -113,10 +113,10 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c

DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c


+ 3
- 3
kernel/mips/dtrsm_kernel_LN_8x4_msa.c View File

@@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,

for (j = (n >> 2); j--;)
{
kk = m;
kk = m + offset;

if (m & 7)
{
@@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{
if (n & 2)
{
kk = m;
kk = m + offset;

if (m & 7)
{
@@ -1291,7 +1291,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,

if (n & 1)
{
kk = m;
kk = m + offset;

if (m & 7)
{


+ 3
- 3
kernel/mips/dtrsm_kernel_LT_8x4_msa.c View File

@@ -1182,7 +1182,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,

for (j = (n >> 2); j--;)
{
kk = 0;
kk = offset;
aa = a;
cc = c;

@@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{
if (n & 2)
{
kk = 0;
kk = offset;
aa = a;
cc = c;

@@ -1282,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,

if (n & 1)
{
kk = 0;
kk = offset;
aa = a;
cc = c;



+ 1
- 1
kernel/mips/dtrsm_kernel_RN_8x4_msa.c View File

@@ -809,7 +809,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
BLASLONG i, j, kk;
FLOAT *aa, *cc;

kk = 0;
kk = -offset;

for (j = (n >> 2); j--;)
{


+ 1
- 1
kernel/mips/dtrsm_kernel_RT_8x4_msa.c View File

@@ -865,7 +865,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
BLASLONG i, j, kk;
FLOAT *aa, *cc, *bb;

kk = n;
kk = n - offset;
c += n * ldc;
b += n * k;



+ 24
- 0
kernel/mips/macros_msa.h View File

@@ -137,6 +137,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}
/* Convenience wrapper: invokes ILVRL_D2 with v2f64 as its first argument and
   forwards the remaining arguments unchanged.
   NOTE(review): the first argument is presumably the RTYPE result type, as in
   the other RTYPE-first macros in this header - ILVRL_D2 itself is defined
   outside this hunk, confirm there. */
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)

/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for word operation is 0-3; because 'out1'
                 uses 'stidx + 1', 'stidx' itself must be in the range 0-2
   Notes       : 'in' is expanded (and therefore evaluated) twice, so it must
                 not have side effects
                 'in' and 'stidx' are parenthesized in the expansion so that
                 expression arguments are not split by the cast or the '+ 1'
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                     \
{                                                                   \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));           \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));     \
}

/* Description : Each of the four word elements of the input vector is
                 replicated to all elements of its own output vector
   Arguments   : Inputs  - in
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Word elements 0 and 1 of 'in' are splatted into 'out0' and
                 'out1', word elements 2 and 3 into 'out2' and 'out3'
                 (two SPLATI_W2 invocations with start index 0 and 2)
   Notes       : 'in' is forwarded in parentheses so that an expression
                 argument is cast as a whole; it is expanded more than once,
                 so it must not have side effects
*/
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, (in), 0, out0, out1);            \
    SPLATI_W2(RTYPE, (in), 2, out2, out3);            \
}
/* Single-precision variant: results are produced as v4f32 vectors */
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)

/* Description : Transpose 4x4 block with word elements in vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1, out2, out3


+ 2133
- 0
kernel/mips/strsm_kernel_LN_8x8_msa.c
File diff suppressed because it is too large
View File


+ 2099
- 0
kernel/mips/strsm_kernel_LT_8x8_msa.c
File diff suppressed because it is too large
View File


+ 2162
- 0
kernel/mips/strsm_kernel_RN_8x8_msa.c
File diff suppressed because it is too large
View File


+ 2118
- 0
kernel/mips/strsm_kernel_RT_8x8_msa.c
File diff suppressed because it is too large
View File


Loading…
Cancel
Save