|
|
@@ -219,6 +219,7 @@ CNAME(BLASLONG M, |
|
|
|
|
|
|
|
const BLASLONG v_m2 = M & -v_size2; |
|
|
|
const BLASLONG v_m1 = M & -v_size; |
|
|
|
const BLASLONG n8 = N & -8; |
|
|
|
const BLASLONG n4 = N & -4; |
|
|
|
|
|
|
|
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; |
|
|
@@ -238,23 +239,35 @@ CNAME(BLASLONG M, |
|
|
|
CREATE_A_POINTER(1, v_size); |
|
|
|
|
|
|
|
BLASLONG j = 0; |
|
|
|
for (; j < n4; j += 4) { |
|
|
|
for (; j < n8; j += 8) { |
|
|
|
|
|
|
|
CREATE_B_POINTER(0, 0); |
|
|
|
CREATE_B_POINTER(1, 1); |
|
|
|
CREATE_B_POINTER(2, 2); |
|
|
|
CREATE_B_POINTER(3, 3); |
|
|
|
UPDATE_B_POINTER(4); |
|
|
|
CREATE_B_POINTER(4, 4); |
|
|
|
CREATE_B_POINTER(5, 5); |
|
|
|
CREATE_B_POINTER(6, 6); |
|
|
|
CREATE_B_POINTER(7, 7); |
|
|
|
UPDATE_B_POINTER(8); |
|
|
|
|
|
|
|
BLASLONG k = 0; |
|
|
|
DECLARE_RESULT_VECTOR(0, 0); |
|
|
|
DECLARE_RESULT_VECTOR(0, 1); |
|
|
|
DECLARE_RESULT_VECTOR(0, 2); |
|
|
|
DECLARE_RESULT_VECTOR(0, 3); |
|
|
|
DECLARE_RESULT_VECTOR(0, 4); |
|
|
|
DECLARE_RESULT_VECTOR(0, 5); |
|
|
|
DECLARE_RESULT_VECTOR(0, 6); |
|
|
|
DECLARE_RESULT_VECTOR(0, 7); |
|
|
|
DECLARE_RESULT_VECTOR(1, 0); |
|
|
|
DECLARE_RESULT_VECTOR(1, 1); |
|
|
|
DECLARE_RESULT_VECTOR(1, 2); |
|
|
|
DECLARE_RESULT_VECTOR(1, 3); |
|
|
|
DECLARE_RESULT_VECTOR(1, 4); |
|
|
|
DECLARE_RESULT_VECTOR(1, 5); |
|
|
|
DECLARE_RESULT_VECTOR(1, 6); |
|
|
|
DECLARE_RESULT_VECTOR(1, 7); |
|
|
|
|
|
|
|
if (LIKELY(packed_a != NULL)) { |
|
|
|
if (j == 0) { |
|
|
@@ -267,12 +280,21 @@ CNAME(BLASLONG M, |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); |
|
|
|
QUADWORD_LOAD_B(4, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); |
|
|
|
GATHER_LOAD_A(pg_true, 1, 0); |
|
|
|
VECTOR_PACK_A(1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 4, 4, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 5, 4, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 6, 4, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 7, 4, 3, 0); |
|
|
|
} |
|
|
|
} else { |
|
|
|
for (; k < K; k++) { |
|
|
@@ -283,16 +305,102 @@ CNAME(BLASLONG M, |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); |
|
|
|
QUADWORD_LOAD_B(4, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); |
|
|
|
UNPACK_VECTOR_A(1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 4, 4, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 5, 4, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 6, 4, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 7, 4, 3, 0); |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
for (; k < K; k++) { |
|
|
|
|
|
|
|
QUADWORD_LOAD_B(0, 0); |
|
|
|
GATHER_LOAD_A(pg_true, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); |
|
|
|
QUADWORD_LOAD_B(4, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); |
|
|
|
GATHER_LOAD_A(pg_true, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 4, 4, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 5, 4, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 6, 4, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 7, 4, 3, 0); |
|
|
|
} |
|
|
|
} |
|
|
|
VECTOR_STORE(pg_true, 0, 0); |
|
|
|
VECTOR_STORE(pg_true, 0, 1); |
|
|
|
VECTOR_STORE(pg_true, 0, 2); |
|
|
|
VECTOR_STORE(pg_true, 0, 3); |
|
|
|
VECTOR_STORE(pg_true, 0, 4); |
|
|
|
VECTOR_STORE(pg_true, 0, 5); |
|
|
|
VECTOR_STORE(pg_true, 0, 6); |
|
|
|
VECTOR_STORE(pg_true, 0, 7); |
|
|
|
VECTOR_STORE(pg_true, 1, 0); |
|
|
|
VECTOR_STORE(pg_true, 1, 1); |
|
|
|
VECTOR_STORE(pg_true, 1, 2); |
|
|
|
VECTOR_STORE(pg_true, 1, 3); |
|
|
|
VECTOR_STORE(pg_true, 1, 4); |
|
|
|
VECTOR_STORE(pg_true, 1, 5); |
|
|
|
VECTOR_STORE(pg_true, 1, 6); |
|
|
|
VECTOR_STORE(pg_true, 1, 7); |
|
|
|
INCR_C_POINTER(0, 8); |
|
|
|
INCR_C_POINTER(1, 8); |
|
|
|
} |
|
|
|
for (; j < n4; j += 4) { |
|
|
|
|
|
|
|
CREATE_B_POINTER(0, 0); |
|
|
|
CREATE_B_POINTER(1, 1); |
|
|
|
CREATE_B_POINTER(2, 2); |
|
|
|
CREATE_B_POINTER(3, 3); |
|
|
|
UPDATE_B_POINTER(4); |
|
|
|
|
|
|
|
BLASLONG k = 0; |
|
|
|
DECLARE_RESULT_VECTOR(0, 0); |
|
|
|
DECLARE_RESULT_VECTOR(0, 1); |
|
|
|
DECLARE_RESULT_VECTOR(0, 2); |
|
|
|
DECLARE_RESULT_VECTOR(0, 3); |
|
|
|
DECLARE_RESULT_VECTOR(1, 0); |
|
|
|
DECLARE_RESULT_VECTOR(1, 1); |
|
|
|
DECLARE_RESULT_VECTOR(1, 2); |
|
|
|
DECLARE_RESULT_VECTOR(1, 3); |
|
|
|
|
|
|
|
if (LIKELY(packed_a != NULL)) { |
|
|
|
for (; k < K; k++) { |
|
|
|
|
|
|
|
QUADWORD_LOAD_B(0, 0); |
|
|
|
UNPACK_VECTOR_A(0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); |
|
|
|
UNPACK_VECTOR_A(1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); |
|
|
|
} |
|
|
|
} else { |
|
|
|
for (; k < K; k++) { |
|
|
|
|
|
|
|
QUADWORD_LOAD_B(0, 0); |
|
|
|
GATHER_LOAD_A(pg_true, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); |
|
|
@@ -361,6 +469,52 @@ CNAME(BLASLONG M, |
|
|
|
CREATE_A_POINTER(0, 0); |
|
|
|
|
|
|
|
BLASLONG j = 0; |
|
|
|
for (; j < n8; j += 8) { |
|
|
|
|
|
|
|
CREATE_B_POINTER(0, 0); |
|
|
|
CREATE_B_POINTER(1, 1); |
|
|
|
CREATE_B_POINTER(2, 2); |
|
|
|
CREATE_B_POINTER(3, 3); |
|
|
|
CREATE_B_POINTER(4, 4); |
|
|
|
CREATE_B_POINTER(5, 5); |
|
|
|
CREATE_B_POINTER(6, 6); |
|
|
|
CREATE_B_POINTER(7, 7); |
|
|
|
UPDATE_B_POINTER(8); |
|
|
|
|
|
|
|
BLASLONG k = 0; |
|
|
|
DECLARE_RESULT_VECTOR(0, 0); |
|
|
|
DECLARE_RESULT_VECTOR(0, 1); |
|
|
|
DECLARE_RESULT_VECTOR(0, 2); |
|
|
|
DECLARE_RESULT_VECTOR(0, 3); |
|
|
|
DECLARE_RESULT_VECTOR(0, 4); |
|
|
|
DECLARE_RESULT_VECTOR(0, 5); |
|
|
|
DECLARE_RESULT_VECTOR(0, 6); |
|
|
|
DECLARE_RESULT_VECTOR(0, 7); |
|
|
|
|
|
|
|
for (; k < K; k++) { |
|
|
|
|
|
|
|
QUADWORD_LOAD_B(0, 0); |
|
|
|
GATHER_LOAD_A(pg_true, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); |
|
|
|
QUADWORD_LOAD_B(4, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); |
|
|
|
} |
|
|
|
VECTOR_STORE(pg_true, 0, 0); |
|
|
|
VECTOR_STORE(pg_true, 0, 1); |
|
|
|
VECTOR_STORE(pg_true, 0, 2); |
|
|
|
VECTOR_STORE(pg_true, 0, 3); |
|
|
|
VECTOR_STORE(pg_true, 0, 4); |
|
|
|
VECTOR_STORE(pg_true, 0, 5); |
|
|
|
VECTOR_STORE(pg_true, 0, 6); |
|
|
|
VECTOR_STORE(pg_true, 0, 7); |
|
|
|
INCR_C_POINTER(0, 8); |
|
|
|
} |
|
|
|
for (; j < n4; j += 4) { |
|
|
|
|
|
|
|
CREATE_B_POINTER(0, 0); |
|
|
@@ -418,6 +572,52 @@ CNAME(BLASLONG M, |
|
|
|
CREATE_A_POINTER(0, 0); |
|
|
|
|
|
|
|
BLASLONG j = 0; |
|
|
|
for (; j < n8; j += 8) { |
|
|
|
|
|
|
|
CREATE_B_POINTER(0, 0); |
|
|
|
CREATE_B_POINTER(1, 1); |
|
|
|
CREATE_B_POINTER(2, 2); |
|
|
|
CREATE_B_POINTER(3, 3); |
|
|
|
CREATE_B_POINTER(4, 4); |
|
|
|
CREATE_B_POINTER(5, 5); |
|
|
|
CREATE_B_POINTER(6, 6); |
|
|
|
CREATE_B_POINTER(7, 7); |
|
|
|
UPDATE_B_POINTER(8); |
|
|
|
|
|
|
|
BLASLONG k = 0; |
|
|
|
DECLARE_RESULT_VECTOR(0, 0); |
|
|
|
DECLARE_RESULT_VECTOR(0, 1); |
|
|
|
DECLARE_RESULT_VECTOR(0, 2); |
|
|
|
DECLARE_RESULT_VECTOR(0, 3); |
|
|
|
DECLARE_RESULT_VECTOR(0, 4); |
|
|
|
DECLARE_RESULT_VECTOR(0, 5); |
|
|
|
DECLARE_RESULT_VECTOR(0, 6); |
|
|
|
DECLARE_RESULT_VECTOR(0, 7); |
|
|
|
|
|
|
|
for (; k < K; k++) { |
|
|
|
|
|
|
|
QUADWORD_LOAD_B(0, 0); |
|
|
|
GATHER_LOAD_A(pg_tail, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); |
|
|
|
QUADWORD_LOAD_B(4, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); |
|
|
|
UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); |
|
|
|
} |
|
|
|
VECTOR_STORE(pg_tail, 0, 0); |
|
|
|
VECTOR_STORE(pg_tail, 0, 1); |
|
|
|
VECTOR_STORE(pg_tail, 0, 2); |
|
|
|
VECTOR_STORE(pg_tail, 0, 3); |
|
|
|
VECTOR_STORE(pg_tail, 0, 4); |
|
|
|
VECTOR_STORE(pg_tail, 0, 5); |
|
|
|
VECTOR_STORE(pg_tail, 0, 6); |
|
|
|
VECTOR_STORE(pg_tail, 0, 7); |
|
|
|
INCR_C_POINTER(0, 8); |
|
|
|
} |
|
|
|
for (; j < n4; j += 4) { |
|
|
|
|
|
|
|
CREATE_B_POINTER(0, 0); |
|
|
|