@@ -84,6 +84,11 @@ DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
CGEMVTKERNEL = zgemv_t.S | |||
ZGEMVTKERNEL = zgemv_t.S | |||
SSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
SSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
DSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
DSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
SASUMKERNEL = sasum_thunderx2t99.c | |||
DASUMKERNEL = dasum_thunderx2t99.c | |||
CASUMKERNEL = casum_thunderx2t99.c | |||
@@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S | |||
CGEMVTKERNEL = zgemv_t.S | |||
ZGEMVTKERNEL = zgemv_t.S | |||
SSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
SSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
DSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
DSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
SASUMKERNEL = sasum_thunderx2t99.c | |||
DASUMKERNEL = dasum_thunderx2t99.c | |||
@@ -0,0 +1,113 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "symv_microk_asimd_4x4.c" | |||
/*
 * SYMV driver for lower-triangular storage (ASIMD, four columns at a time).
 *
 * For every column c in [0, offset) of the column-major matrix `a` (leading
 * dimension `lda`) only the entries a[r, c] with r >= c are read.  Each such
 * entry contributes alpha * x[c] * a[r, c] to y[r], and the entries strictly
 * below the diagonal additionally contribute alpha * a[r, c] * x[r] to y[c].
 *
 * m       length passed to COPY_K for x and y; also the row bound
 * offset  number of leading columns to process
 * buffer  scratch space used when inc_x and/or inc_y differ from 1
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
  BLASLONG row, col, k;
  FLOAT xs, acc;
  FLOAT scaled_x[4];
  FLOAT dot[4];
  FLOAT *c0, *c1, *c2, *c3, *cur;
  FLOAT x0, x1, x2, x3;
  FLOAT *xv = x;
  FLOAT *yv = y;

  /* Strided y: operate on a unit-stride copy placed at the start of buffer. */
  if (inc_y != 1) {
    yv = buffer;
    COPY_K(m, y, inc_y, yv, 1);
  }

  /* Strided x: pack it right after the y copy if one exists, else at buffer. */
  if (inc_x != 1) {
    xv = (inc_y != 1) ? yv + m : buffer;
    COPY_K(m, x, inc_x, xv, 1);
  }

  const BLASLONG blocked_cols = (offset / 4) * 4;
  /* Last row index (exclusive) that is handed to the 4-row vector kernel. */
  const BLASLONG vec_rows_end = (m / 4) * 4;

  for (col = 0; col < blocked_cols; col += 4) {
    c0 = &a[col * lda];
    c1 = c0 + lda;
    c2 = c1 + lda;
    c3 = c2 + lda;

    x0 = xv[col];
    x1 = xv[col + 1];
    x2 = xv[col + 2];
    x3 = xv[col + 3];

    /* 4x4 diagonal block, symmetrised from its lower triangle. */
    dot[0] = c0[col    ]*x0 + c0[col + 1]*x1 + c0[col + 2]*x2 + c0[col + 3]*x3;
    dot[1] = c0[col + 1]*x0 + c1[col + 1]*x1 + c1[col + 2]*x2 + c1[col + 3]*x3;
    dot[2] = c0[col + 2]*x0 + c1[col + 2]*x1 + c2[col + 2]*x2 + c2[col + 3]*x3;
    dot[3] = c0[col + 3]*x0 + c1[col + 3]*x1 + c2[col + 3]*x2 + c3[col + 3]*x3;

    scaled_x[0] = alpha * x0;
    scaled_x[1] = alpha * x1;
    scaled_x[2] = alpha * x2;
    scaled_x[3] = alpha * x3;

    /* col + 4 and vec_rows_end are both multiples of 4, so the range given
       to symv_kernel_4x4 is a whole number of 4-row groups. */
    if (vec_rows_end > col + 4) {
      symv_kernel_4x4(col + 4, vec_rows_end, c0, c1, c2, c3, xv, yv, scaled_x, dot);
    }

    /* Leftover rows past the last full group of four. */
    for (row = vec_rows_end; row < m; row++) {
      yv[row] += scaled_x[0] * c0[row];
      dot[0]  += c0[row] * xv[row];
      yv[row] += scaled_x[1] * c1[row];
      dot[1]  += c1[row] * xv[row];
      yv[row] += scaled_x[2] * c2[row];
      dot[2]  += c2[row] * xv[row];
      yv[row] += scaled_x[3] * c3[row];
      dot[3]  += c3[row] * xv[row];
    }

    for (k = 0; k < 4; k++) {
      yv[col + k] += alpha * dot[k];
    }
  }

  /* Remaining (offset % 4) columns, one at a time. */
  for (col = blocked_cols; col < offset; col++) {
    cur = &a[col * lda];
    xs  = alpha * xv[col];
    acc = 0.0;
    yv[col] += xs * cur[col];
    for (row = col + 1; row < m; row++) {
      yv[row] += xs * cur[row];
      acc     += cur[row] * xv[row];
    }
    yv[col] += alpha * acc;
  }

  /* Scatter the unit-stride result back into the caller's strided y. */
  if (inc_y != 1) {
    COPY_K(m, yv, 1, y, inc_y);
  }
  return 0;
}
@@ -0,0 +1,103 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "symv_microk_sve_v1x4.c" | |||
/*
 * SYMV driver for lower-triangular storage (SVE, four columns at a time).
 *
 * For every column c in [0, offset) of the column-major matrix `a` (leading
 * dimension `lda`) only the entries a[r, c] with r >= c are read.  Each such
 * entry contributes alpha * x[c] * a[r, c] to y[r], and the entries strictly
 * below the diagonal additionally contribute alpha * a[r, c] * x[r] to y[c].
 *
 * m       length passed to COPY_K for x and y; also the row bound
 * offset  number of leading columns to process
 * buffer  scratch space used when inc_x and/or inc_y differ from 1
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
  BLASLONG row, col, k;
  FLOAT xs, acc;
  FLOAT scaled_x[4];
  FLOAT dot[4];
  FLOAT *c0, *c1, *c2, *c3, *cur;
  FLOAT x0, x1, x2, x3;
  FLOAT *xv = x;
  FLOAT *yv = y;

  /* Strided y: operate on a unit-stride copy placed at the start of buffer. */
  if (inc_y != 1) {
    yv = buffer;
    COPY_K(m, y, inc_y, yv, 1);
  }

  /* Strided x: pack it right after the y copy if one exists, else at buffer. */
  if (inc_x != 1) {
    xv = (inc_y != 1) ? yv + m : buffer;
    COPY_K(m, x, inc_x, xv, 1);
  }

  const BLASLONG blocked_cols = (offset / 4) * 4;

  for (col = 0; col < blocked_cols; col += 4) {
    c0 = &a[col * lda];
    c1 = c0 + lda;
    c2 = c1 + lda;
    c3 = c2 + lda;

    x0 = xv[col];
    x1 = xv[col + 1];
    x2 = xv[col + 2];
    x3 = xv[col + 3];

    /* 4x4 diagonal block, symmetrised from its lower triangle. */
    dot[0] = c0[col    ]*x0 + c0[col + 1]*x1 + c0[col + 2]*x2 + c0[col + 3]*x3;
    dot[1] = c0[col + 1]*x0 + c1[col + 1]*x1 + c1[col + 2]*x2 + c1[col + 3]*x3;
    dot[2] = c0[col + 2]*x0 + c1[col + 2]*x1 + c2[col + 2]*x2 + c2[col + 3]*x3;
    dot[3] = c0[col + 3]*x0 + c1[col + 3]*x1 + c2[col + 3]*x2 + c3[col + 3]*x3;

    scaled_x[0] = alpha * x0;
    scaled_x[1] = alpha * x1;
    scaled_x[2] = alpha * x2;
    scaled_x[3] = alpha * x3;

    /* All rows below the diagonal block, [col + 4, m), go to the SVE kernel. */
    symv_kernel_v1x4(col + 4, m, c0, c1, c2, c3, xv, yv, scaled_x, dot);

    for (k = 0; k < 4; k++) {
      yv[col + k] += alpha * dot[k];
    }
  }

  /* Remaining (offset % 4) columns, one at a time. */
  for (col = blocked_cols; col < offset; col++) {
    xs  = alpha * xv[col];
    acc = 0.0;
    cur = &a[col * lda];
    yv[col] += xs * cur[col];
    for (row = col + 1; row < m; row++) {
      yv[row] += xs * cur[row];
      acc     += cur[row] * xv[row];
    }
    yv[col] += alpha * acc;
  }

  /* Scatter the unit-stride result back into the caller's strided y. */
  if (inc_y != 1) {
    COPY_K(m, yv, 1, y, inc_y);
  }
  return 0;
}
@@ -0,0 +1,106 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "symv_microk_asimd_4x4.c" | |||
/*
 * SYMV driver for upper-triangular storage (ASIMD, four columns at a time).
 *
 * Walks columns c in [m - offset, m) of the column-major matrix `a` (leading
 * dimension `lda`), reading only entries a[r, c] with r <= c.  Each such
 * entry contributes alpha * x[c] * a[r, c] to y[r], and the entries strictly
 * above the diagonal additionally contribute alpha * a[r, c] * x[r] to y[c].
 *
 * m       length passed to COPY_K for x and y; also the column bound
 * offset  number of trailing columns to process
 * buffer  scratch space used when inc_x and/or inc_y differ from 1
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
  BLASLONG i, j, j1, j2, m2;
  FLOAT temp1, temp2;
  FLOAT tmp1[4];
  FLOAT tmp2[4];
  FLOAT *a0, *a1, *a2, *a3;
  FLOAT *X = x;
  FLOAT *Y = y;
  BLASLONG m1 = m - offset;

  /* Strided y: operate on a unit-stride copy placed at the start of buffer. */
  if (inc_y != 1) {
    Y = buffer;
    COPY_K(m, y, inc_y, Y, 1);
  }

  /* Strided x: pack it right after the y copy if one exists, else at buffer. */
  if (inc_x != 1) {
    if (inc_y != 1) {
      X = Y + m;
    } else {
      X = buffer;
    }
    COPY_K(m, x, inc_x, X, 1);
  }

  /* Columns [m1, m2) are processed in groups of four; m2 - m1 is a multiple
     of four by construction. */
  m2 = m - (offset % 4);
  for (j = m1; j < m2; j += 4) {
    tmp1[0] = alpha * X[j];
    tmp1[1] = alpha * X[j+1];
    tmp1[2] = alpha * X[j+2];
    tmp1[3] = alpha * X[j+3];
    tmp2[0] = 0.0;
    tmp2[1] = 0.0;
    tmp2[2] = 0.0;
    tmp2[3] = 0.0;

    a0 = &a[j*lda];
    a1 = a0 + lda;
    a2 = a1 + lda;
    a3 = a2 + lda;

    /* symv_kernel_4x4 consumes rows four at a time, so it only gets the
       4-aligned prefix [0, j1) of the rows above the diagonal block. */
    j1 = (j / 4) * 4;
    if ( j1 )
      symv_kernel_4x4(0, j1, a0, a1, a2, a3, X, Y, tmp1, tmp2);

    /* Rows [j1, j): non-empty only when j (i.e. m - offset) is not a
       multiple of four.  Without this loop those rows would be skipped by
       both the vector kernel above and the diagonal-block loop below. */
    for (i = j1; i < j; i++) {
      Y[i]    += tmp1[0] * a0[i];
      tmp2[0] += a0[i] * X[i];
      Y[i]    += tmp1[1] * a1[i];
      tmp2[1] += a1[i] * X[i];
      Y[i]    += tmp1[2] * a2[i];
      tmp2[2] += a2[i] * X[i];
      Y[i]    += tmp1[3] * a3[i];
      tmp2[3] += a3[i] * X[i];
    }

    /* 4x4 diagonal block: for each of its columns, finish the strictly
       upper entries inside the block, then add the diagonal term together
       with the accumulated dot product. */
    j2 = 0;
    for (j1 = j ; j1 < j+4 ; j1++) {
      temp1 = tmp1[j2];
      temp2 = tmp2[j2];
      a0    = &a[j1*lda];
      for (i=j ; i<j1; i++) {
        Y[i]  += temp1 * a0[i];
        temp2 += a0[i] * X[i];
      }
      Y[j1] += temp1 * a0[j1] + alpha * temp2;
      j2++;
    }
  }

  /* Remaining (offset % 4) columns, one at a time over rows [0, j]. */
  for ( ; j < m; j++) {
    temp1 = alpha * X[j];
    temp2 = 0.0;
    a0    = &a[j*lda];
    for (i = 0 ; i < j; i++) {
      Y[i]  += temp1 * a0[i];
      temp2 += a0[i] * X[i];
    }
    Y[j] += temp1 * a0[j] + alpha * temp2;
  }

  /* Scatter the unit-stride result back into the caller's strided y. */
  if (inc_y != 1) {
    COPY_K(m, Y, 1, y, inc_y);
  }
  return(0);
}
@@ -0,0 +1,104 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "symv_microk_sve_v1x4.c" | |||
/*
 * SYMV driver for upper-triangular storage (SVE, four columns at a time).
 *
 * Walks columns c in [m - offset, m) of the column-major matrix `a` (leading
 * dimension `lda`), reading only entries a[r, c] with r <= c.  Each such
 * entry contributes alpha * x[c] * a[r, c] to y[r], and the entries strictly
 * above the diagonal additionally contribute alpha * a[r, c] * x[r] to y[c].
 *
 * m       length passed to COPY_K for x and y; also the column bound
 * offset  number of trailing columns to process
 * buffer  scratch space used when inc_x and/or inc_y differ from 1
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
  BLASLONG row, col, diag, k;
  FLOAT xs, acc;
  FLOAT scaled_x[4];
  FLOAT dot[4];
  FLOAT *c0, *c1, *c2, *c3, *cur;
  FLOAT *xv = x;
  FLOAT *yv = y;
  const BLASLONG first_col = m - offset;

  /* Strided y: operate on a unit-stride copy placed at the start of buffer. */
  if (inc_y != 1) {
    yv = buffer;
    COPY_K(m, y, inc_y, yv, 1);
  }

  /* Strided x: pack it right after the y copy if one exists, else at buffer. */
  if (inc_x != 1) {
    xv = (inc_y != 1) ? yv + m : buffer;
    COPY_K(m, x, inc_x, xv, 1);
  }

  /* Columns [first_col, blocked_end) are processed in groups of four. */
  const BLASLONG blocked_end = m - (offset % 4);

  for (col = first_col; col < blocked_end; col += 4) {
    for (k = 0; k < 4; k++) {
      scaled_x[k] = alpha * xv[col + k];
      dot[k]      = 0.0;
    }

    c0 = &a[col * lda];
    c1 = c0 + lda;
    c2 = c1 + lda;
    c3 = c2 + lda;

    /* All rows above the diagonal block, [0, col), go to the SVE kernel. */
    symv_kernel_v1x4(0, col, c0, c1, c2, c3, xv, yv, scaled_x, dot);

    /* 4x4 diagonal block: for each of its columns, finish the strictly
       upper entries inside the block, then add the diagonal term together
       with the accumulated dot product. */
    for (k = 0; k < 4; k++) {
      diag = col + k;
      xs   = scaled_x[k];
      acc  = dot[k];
      cur  = &a[diag * lda];
      for (row = col; row < diag; row++) {
        yv[row] += xs * cur[row];
        acc     += cur[row] * xv[row];
      }
      yv[diag] += xs * cur[diag] + alpha * acc;
    }
  }

  /* Remaining (offset % 4) columns, one at a time over rows [0, col]. */
  for ( ; col < m; col++) {
    xs  = alpha * xv[col];
    acc = 0.0;
    cur = &a[col * lda];
    for (row = 0; row < col; row++) {
      yv[row] += xs * cur[row];
      acc     += cur[row] * xv[row];
    }
    yv[col] += xs * cur[col] + alpha * acc;
  }

  /* Scatter the unit-stride result back into the caller's strided y. */
  if (inc_y != 1) {
    COPY_K(m, yv, 1, y, inc_y);
  }
  return 0;
}
@@ -0,0 +1,120 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
/*
 * ASIMD inner kernel shared by the symv_L / symv_U 4x4 drivers.
 *
 * For rows r = from, from + 4, ... while r < to, four rows at a time:
 *   y[r]     += temp1[0]*a0[r] + temp1[1]*a1[r] + temp1[2]*a2[r] + temp1[3]*a3[r]
 *   temp2[k] += ak[r] * x[r]                      for k = 0..3
 *
 * Every iteration loads and stores four consecutive elements starting at r,
 * so callers must pass a range whose length is a multiple of four.
 * The per-column dot products are accumulated in vector registers and
 * added to temp2[] once, after the loop.
 */
static void symv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3,
                            FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
  BLASLONG row;

#ifdef DOUBLE
  /* Two float64x2_t registers cover one group of four rows. */
  float64x2_t dot0 = {0.0, 0.0};
  float64x2_t dot1 = {0.0, 0.0};
  float64x2_t dot2 = {0.0, 0.0};
  float64x2_t dot3 = {0.0, 0.0};

  float64x2_t sx0 = vld1q_dup_f64(&temp1[0]);
  float64x2_t sx1 = vld1q_dup_f64(&temp1[1]);
  float64x2_t sx2 = vld1q_dup_f64(&temp1[2]);
  float64x2_t sx3 = vld1q_dup_f64(&temp1[3]);

  float64x2_t x_lo, x_hi, y_lo, y_hi;
  float64x2_t c0_lo, c0_hi, c1_lo, c1_hi, c2_lo, c2_hi, c3_lo, c3_hi;

  for (row = from; row < to; row += 4) {
    y_lo  = vld1q_f64(&y[row]);
    y_hi  = vld1q_f64(&y[row + 2]);
    x_lo  = vld1q_f64(&x[row]);
    x_hi  = vld1q_f64(&x[row + 2]);
    c0_lo = vld1q_f64(&a0[row]);
    c0_hi = vld1q_f64(&a0[row + 2]);
    c1_lo = vld1q_f64(&a1[row]);
    c1_hi = vld1q_f64(&a1[row + 2]);
    c2_lo = vld1q_f64(&a2[row]);
    c2_hi = vld1q_f64(&a2[row + 2]);
    c3_lo = vld1q_f64(&a3[row]);
    c3_hi = vld1q_f64(&a3[row + 2]);

    /* y update: columns are folded in the order 0, 1, 2, 3. */
    y_lo = vfmaq_f64(y_lo, sx0, c0_lo);
    y_lo = vfmaq_f64(y_lo, sx1, c1_lo);
    y_lo = vfmaq_f64(y_lo, sx2, c2_lo);
    y_lo = vfmaq_f64(y_lo, sx3, c3_lo);

    y_hi = vfmaq_f64(y_hi, sx0, c0_hi);
    y_hi = vfmaq_f64(y_hi, sx1, c1_hi);
    y_hi = vfmaq_f64(y_hi, sx2, c2_hi);
    y_hi = vfmaq_f64(y_hi, sx3, c3_hi);

    /* Dot products: low half first, then high half, per column. */
    dot0 = vfmaq_f64(dot0, x_lo, c0_lo);
    dot0 = vfmaq_f64(dot0, x_hi, c0_hi);
    dot1 = vfmaq_f64(dot1, x_lo, c1_lo);
    dot1 = vfmaq_f64(dot1, x_hi, c1_hi);
    dot2 = vfmaq_f64(dot2, x_lo, c2_lo);
    dot2 = vfmaq_f64(dot2, x_hi, c2_hi);
    dot3 = vfmaq_f64(dot3, x_lo, c3_lo);
    dot3 = vfmaq_f64(dot3, x_hi, c3_hi);

    vst1q_f64(&y[row], y_lo);
    vst1q_f64(&y[row + 2], y_hi);
  }

  /* Horizontal add of each accumulator into the caller's running sums. */
  temp2[0] += vaddvq_f64(dot0);
  temp2[1] += vaddvq_f64(dot1);
  temp2[2] += vaddvq_f64(dot2);
  temp2[3] += vaddvq_f64(dot3);
#else
  /* One float32x4_t register covers one group of four rows. */
  float32x4_t dot0 = {0.0, 0.0, 0.0, 0.0};
  float32x4_t dot1 = {0.0, 0.0, 0.0, 0.0};
  float32x4_t dot2 = {0.0, 0.0, 0.0, 0.0};
  float32x4_t dot3 = {0.0, 0.0, 0.0, 0.0};

  float32x4_t sx0 = vld1q_dup_f32(&temp1[0]);
  float32x4_t sx1 = vld1q_dup_f32(&temp1[1]);
  float32x4_t sx2 = vld1q_dup_f32(&temp1[2]);
  float32x4_t sx3 = vld1q_dup_f32(&temp1[3]);

  float32x4_t xr, yr;
  float32x4_t c0, c1, c2, c3;

  for (row = from; row < to; row += 4) {
    yr = vld1q_f32(&y[row]);
    xr = vld1q_f32(&x[row]);
    c0 = vld1q_f32(&a0[row]);
    c1 = vld1q_f32(&a1[row]);
    c2 = vld1q_f32(&a2[row]);
    c3 = vld1q_f32(&a3[row]);

    /* y update: columns are folded in the order 0, 1, 2, 3. */
    yr = vfmaq_f32(yr, sx0, c0);
    yr = vfmaq_f32(yr, sx1, c1);
    yr = vfmaq_f32(yr, sx2, c2);
    yr = vfmaq_f32(yr, sx3, c3);

    dot0 = vfmaq_f32(dot0, xr, c0);
    dot1 = vfmaq_f32(dot1, xr, c1);
    dot2 = vfmaq_f32(dot2, xr, c2);
    dot3 = vfmaq_f32(dot3, xr, c3);

    vst1q_f32(&y[row], yr);
  }

  /* Horizontal add of each accumulator into the caller's running sums. */
  temp2[0] += vaddvq_f32(dot0);
  temp2[1] += vaddvq_f32(dot1);
  temp2[2] += vaddvq_f32(dot2);
  temp2[3] += vaddvq_f32(dot3);
#endif
}
@@ -0,0 +1,89 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
/*
 * Precision-dependent aliases for the SVE ACLE names used in this file.
 * DOUBLE selects the 64-bit element variants, otherwise the 32-bit ones.
 *
 *   SV_TYPE   vector type
 *   SV_DUP    broadcast a scalar into every lane
 *   SV_COUNT  number of elements per vector
 *   SV_TRUE   all-lanes-active predicate
 *   SV_WHILE  "while less than" predicate built from two 64-bit signed bounds
 */
#if defined(DOUBLE)
#define SV_TYPE  svfloat64_t
#define SV_DUP   svdup_f64
#define SV_COUNT svcntd
#define SV_TRUE  svptrue_b64
#define SV_WHILE svwhilelt_b64_s64
#else
#define SV_TYPE  svfloat32_t
#define SV_DUP   svdup_f32
#define SV_COUNT svcntw
#define SV_TRUE  svptrue_b32
#define SV_WHILE svwhilelt_b32_s64
#endif
/*
 * SVE inner kernel shared by the symv_L / symv_U v1x4 drivers.
 *
 * For every row r in [from, to):
 *   y[r]     += temp1[0]*a0[r] + temp1[1]*a1[r] + temp1[2]*a2[r] + temp1[3]*a3[r]
 *   temp2[k] += ak[r] * x[r]                      for k = 0..3
 *
 * Rows are consumed one vector (SV_COUNT() elements) per iteration.  Each
 * iteration is governed by the predicate SV_WHILE(r, to), so the range may
 * have any length.  The per-column dot products are accumulated in vector
 * registers and added to temp2[] once, after the loop.
 */
static void symv_kernel_v1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3,
                             FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
  BLASLONG row;
  uint64_t lanes = SV_COUNT();
  svbool_t active;

  /* Running dot products, one accumulator per column. */
  SV_TYPE dot0 = SV_DUP(0.0);
  SV_TYPE dot1 = SV_DUP(0.0);
  SV_TYPE dot2 = SV_DUP(0.0);
  SV_TYPE dot3 = SV_DUP(0.0);

  /* Broadcast copies of the four scalars in temp1[]. */
  SV_TYPE sx0 = SV_DUP(temp1[0]);
  SV_TYPE sx1 = SV_DUP(temp1[1]);
  SV_TYPE sx2 = SV_DUP(temp1[2]);
  SV_TYPE sx3 = SV_DUP(temp1[3]);

  SV_TYPE xr, yr;
  SV_TYPE c0, c1, c2, c3;

  for (row = from; row < to; row += lanes) {
    active = SV_WHILE(row, to);

    yr = svld1(active, &y[row]);
    xr = svld1(active, &x[row]);
    c0 = svld1(active, &a0[row]);
    c1 = svld1(active, &a1[row]);
    c2 = svld1(active, &a2[row]);
    c3 = svld1(active, &a3[row]);

    /* y update: columns are folded in the order 0, 1, 2, 3. */
    yr = svmla_m(active, yr, sx0, c0);
    yr = svmla_m(active, yr, sx1, c1);
    yr = svmla_m(active, yr, sx2, c2);
    yr = svmla_m(active, yr, sx3, c3);

    /* Merging form: lanes outside `active` keep their accumulated value. */
    dot0 = svmla_m(active, dot0, xr, c0);
    dot1 = svmla_m(active, dot1, xr, c1);
    dot2 = svmla_m(active, dot2, xr, c2);
    dot3 = svmla_m(active, dot3, xr, c3);

    svst1(active, &y[row], yr);
  }

  /* Reduce every lane of each accumulator into the caller's running sums. */
  active = SV_TRUE();
  temp2[0] += svaddv(active, dot0);
  temp2[1] += svaddv(active, dot1);
  temp2[2] += svaddv(active, dot2);
  temp2[3] += svaddv(active, dot3);
}