Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
tags/v0.2.19^2
| @@ -160,3 +160,4 @@ In chronological order: | |||||
| * Kaustubh Raste <https://github.com/ksraste/> | * Kaustubh Raste <https://github.com/ksraste/> | ||||
| * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA | * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA | ||||
| * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA | |||||
| @@ -113,10 +113,10 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | ZGEMMONCOPYOBJ = zgemm_oncopy.o | ||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | ZGEMMOTCOPYOBJ = zgemm_otcopy.o | ||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | |||||
| STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | |||||
| STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | |||||
| STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c | |||||
| DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | ||||
| DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | ||||
| @@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, | |||||
| for (j = (n >> 2); j--;) | for (j = (n >> 2); j--;) | ||||
| { | { | ||||
| kk = m; | |||||
| kk = m + offset; | |||||
| if (m & 7) | if (m & 7) | ||||
| { | { | ||||
| @@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, | |||||
| { | { | ||||
| if (n & 2) | if (n & 2) | ||||
| { | { | ||||
| kk = m; | |||||
| kk = m + offset; | |||||
| if (m & 7) | if (m & 7) | ||||
| { | { | ||||
| @@ -1291,7 +1291,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, | |||||
| if (n & 1) | if (n & 1) | ||||
| { | { | ||||
| kk = m; | |||||
| kk = m + offset; | |||||
| if (m & 7) | if (m & 7) | ||||
| { | { | ||||
| @@ -1182,7 +1182,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, | |||||
| for (j = (n >> 2); j--;) | for (j = (n >> 2); j--;) | ||||
| { | { | ||||
| kk = 0; | |||||
| kk = offset; | |||||
| aa = a; | aa = a; | ||||
| cc = c; | cc = c; | ||||
| @@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, | |||||
| { | { | ||||
| if (n & 2) | if (n & 2) | ||||
| { | { | ||||
| kk = 0; | |||||
| kk = offset; | |||||
| aa = a; | aa = a; | ||||
| cc = c; | cc = c; | ||||
| @@ -1282,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, | |||||
| if (n & 1) | if (n & 1) | ||||
| { | { | ||||
| kk = 0; | |||||
| kk = offset; | |||||
| aa = a; | aa = a; | ||||
| cc = c; | cc = c; | ||||
| @@ -809,7 +809,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, | |||||
| BLASLONG i, j, kk; | BLASLONG i, j, kk; | ||||
| FLOAT *aa, *cc; | FLOAT *aa, *cc; | ||||
| kk = 0; | |||||
| kk = -offset; | |||||
| for (j = (n >> 2); j--;) | for (j = (n >> 2); j--;) | ||||
| { | { | ||||
| @@ -865,7 +865,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, | |||||
| BLASLONG i, j, kk; | BLASLONG i, j, kk; | ||||
| FLOAT *aa, *cc, *bb; | FLOAT *aa, *cc, *bb; | ||||
| kk = n; | |||||
| kk = n - offset; | |||||
| c += n * ldc; | c += n * ldc; | ||||
| b += n * k; | b += n * k; | ||||
| @@ -137,6 +137,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| } | } | ||||
| #define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__) | #define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__) | ||||
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for word operation is 0-3; because 'out1'
                 uses 'stidx + 1', 'stidx' itself must be in the range 0-2
   Notes       : 'in' is expanded twice, so it should be free of side effects
                 Every argument is parenthesized inside the expansion so that
                 compound expressions are cast and incremented as a whole
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));        \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));  \
}

/* Description : All four word elements of a vector are replicated, one per
                 output vector
   Arguments   : Inputs  - in
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Word elements 0, 1, 2 and 3 of 'in' are replicated to all
                 elements of 'out0', 'out1', 'out2' and 'out3' respectively,
                 using two SPLATI_W2 expansions (start indices 0 and 2)
*/
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}

/* SPLATI_W4 with the return type fixed to v4f32 */
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
| /* Description : Transpose 4x4 block with word elements in vectors | /* Description : Transpose 4x4 block with word elements in vectors | ||||
| Arguments : Inputs - in0, in1, in2, in3 | Arguments : Inputs - in0, in1, in2, in3 | ||||
| Outputs - out0, out1, out2, out3 | Outputs - out0, out1, out2, out3 | ||||