@@ -84,6 +84,11 @@ DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||||
CGEMVTKERNEL = zgemv_t.S | CGEMVTKERNEL = zgemv_t.S | ||||
ZGEMVTKERNEL = zgemv_t.S | ZGEMVTKERNEL = zgemv_t.S | ||||
SSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||||
SSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||||
DSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||||
DSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||||
SASUMKERNEL = sasum_thunderx2t99.c | SASUMKERNEL = sasum_thunderx2t99.c | ||||
DASUMKERNEL = dasum_thunderx2t99.c | DASUMKERNEL = dasum_thunderx2t99.c | ||||
CASUMKERNEL = casum_thunderx2t99.c | CASUMKERNEL = casum_thunderx2t99.c | ||||
@@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S | |||||
CGEMVTKERNEL = zgemv_t.S | CGEMVTKERNEL = zgemv_t.S | ||||
ZGEMVTKERNEL = zgemv_t.S | ZGEMVTKERNEL = zgemv_t.S | ||||
SSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||||
SSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||||
DSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||||
DSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||||
SASUMKERNEL = sasum_thunderx2t99.c | SASUMKERNEL = sasum_thunderx2t99.c | ||||
DASUMKERNEL = dasum_thunderx2t99.c | DASUMKERNEL = dasum_thunderx2t99.c | ||||
@@ -0,0 +1,113 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2025, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "symv_microk_asimd_4x4.c" | |||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||||
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
{ | |||||
BLASLONG i, j; | |||||
FLOAT temp1, temp2; | |||||
FLOAT tmp1[4]; | |||||
FLOAT tmp2[4]; | |||||
FLOAT *a0, *a1, *a2, *a3; | |||||
FLOAT x0, x1, x2, x3; | |||||
FLOAT *X = x; | |||||
FLOAT *Y = y; | |||||
if (inc_y != 1) { | |||||
Y = buffer; | |||||
COPY_K(m, y, inc_y, Y, 1); | |||||
} | |||||
if (inc_x != 1) { | |||||
if (inc_y != 1) { | |||||
X = Y + m; | |||||
} else { | |||||
X = buffer; | |||||
} | |||||
COPY_K(m, x, inc_x, X, 1); | |||||
} | |||||
BLASLONG offset1 = (offset / 4) * 4; | |||||
for (j = 0; j < offset1; j+=4) { | |||||
a0 = &a[j*lda]; | |||||
a1 = a0 + lda; | |||||
a2 = a1 + lda; | |||||
a3 = a2 + lda; | |||||
x0 = X[j]; | |||||
x1 = X[j+1]; | |||||
x2 = X[j+2]; | |||||
x3 = X[j+3]; | |||||
tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3; | |||||
tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3; | |||||
tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3; | |||||
tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3; | |||||
tmp1[0] = alpha * x0; | |||||
tmp1[1] = alpha * x1; | |||||
tmp1[2] = alpha * x2; | |||||
tmp1[3] = alpha * x3; | |||||
BLASLONG m2 = (m/4)*4; | |||||
if (m2 > j+4) | |||||
symv_kernel_4x4(j+4, m2, a0, a1, a2, a3, X, Y, tmp1, tmp2); | |||||
for (i = m2; i < m; i++) { | |||||
Y[i] += tmp1[0] * a0[i]; | |||||
tmp2[0] += a0[i] * X[i]; | |||||
Y[i] += tmp1[1] * a1[i]; | |||||
tmp2[1] += a1[i] * X[i]; | |||||
Y[i] += tmp1[2] * a2[i]; | |||||
tmp2[2] += a2[i] * X[i]; | |||||
Y[i] += tmp1[3] * a3[i]; | |||||
tmp2[3] += a3[i] * X[i]; | |||||
} | |||||
Y[j] += alpha * tmp2[0]; | |||||
Y[j+1] += alpha * tmp2[1]; | |||||
Y[j+2] += alpha * tmp2[2]; | |||||
Y[j+3] += alpha * tmp2[3]; | |||||
} | |||||
for (j = offset1; j < offset; j++) { | |||||
temp1 = alpha * X[j]; | |||||
temp2 = 0.0; | |||||
Y[j] += temp1 * a[j*lda+j]; | |||||
for (i = j+1; i < m; i++) { | |||||
Y[i] += temp1 * a[j*lda+i]; | |||||
temp2 += a[j*lda+i] * X[i]; | |||||
} | |||||
Y[j] += alpha * temp2; | |||||
} | |||||
if (inc_y != 1) { | |||||
COPY_K(m, Y, 1, y, inc_y); | |||||
} | |||||
return(0); | |||||
} |
@@ -0,0 +1,103 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2025, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "symv_microk_sve_v1x4.c" | |||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||||
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
{ | |||||
BLASLONG i, j; | |||||
FLOAT temp1, temp2; | |||||
FLOAT tmp1[4]; | |||||
FLOAT tmp2[4]; | |||||
FLOAT *a0, *a1, *a2, *a3; | |||||
FLOAT x0, x1, x2, x3; | |||||
FLOAT *X = x; | |||||
FLOAT *Y = y; | |||||
if (inc_y != 1) { | |||||
Y = buffer; | |||||
COPY_K(m, y, inc_y, Y, 1); | |||||
} | |||||
if (inc_x != 1) { | |||||
if (inc_y != 1) { | |||||
X = Y + m; | |||||
} else { | |||||
X = buffer; | |||||
} | |||||
COPY_K(m, x, inc_x, X, 1); | |||||
} | |||||
BLASLONG offset1 = (offset / 4) * 4; | |||||
for (j = 0; j < offset1; j+=4) { | |||||
a0 = &a[j*lda]; | |||||
a1 = a0 + lda; | |||||
a2 = a1 + lda; | |||||
a3 = a2 + lda; | |||||
x0 = X[j]; | |||||
x1 = X[j+1]; | |||||
x2 = X[j+2]; | |||||
x3 = X[j+3]; | |||||
tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3; | |||||
tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3; | |||||
tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3; | |||||
tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3; | |||||
tmp1[0] = alpha * x0; | |||||
tmp1[1] = alpha * x1; | |||||
tmp1[2] = alpha * x2; | |||||
tmp1[3] = alpha * x3; | |||||
symv_kernel_v1x4(j+4, m, a0, a1, a2, a3, X, Y, tmp1, tmp2); | |||||
Y[j] += alpha * tmp2[0]; | |||||
Y[j+1] += alpha * tmp2[1]; | |||||
Y[j+2] += alpha * tmp2[2]; | |||||
Y[j+3] += alpha * tmp2[3]; | |||||
} | |||||
for (j = offset1; j < offset; j++) { | |||||
temp1 = alpha * X[j]; | |||||
temp2 = 0.0; | |||||
a0 = &a[j*lda]; | |||||
Y[j] += temp1 * a0[j]; | |||||
for (i = j+1; i < m; i++) { | |||||
Y[i] += temp1 * a0[i]; | |||||
temp2 += a0[i] * X[i]; | |||||
} | |||||
Y[j] += alpha * temp2; | |||||
} | |||||
if (inc_y != 1) { | |||||
COPY_K(m, Y, 1, y, inc_y); | |||||
} | |||||
return(0); | |||||
} |
@@ -0,0 +1,106 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2025, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "symv_microk_asimd_4x4.c" | |||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||||
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
{ | |||||
BLASLONG i, j, j1, j2, m2; | |||||
FLOAT temp1, temp2; | |||||
FLOAT tmp1[4]; | |||||
FLOAT tmp2[4]; | |||||
FLOAT *a0, *a1, *a2, *a3; | |||||
FLOAT *X = x; | |||||
FLOAT *Y = y; | |||||
BLASLONG m1 = m - offset; | |||||
if (inc_y != 1) { | |||||
Y = buffer; | |||||
COPY_K(m, y, inc_y, Y, 1); | |||||
} | |||||
if (inc_x != 1) { | |||||
if (inc_y != 1) { | |||||
X = Y + m; | |||||
} else { | |||||
X = buffer; | |||||
} | |||||
COPY_K(m, x, inc_x, X, 1); | |||||
} | |||||
m2 = m - (offset % 4); | |||||
for (j = m1; j < m2; j += 4) { | |||||
tmp1[0] = alpha * X[j]; | |||||
tmp1[1] = alpha * X[j+1]; | |||||
tmp1[2] = alpha * X[j+2]; | |||||
tmp1[3] = alpha * X[j+3]; | |||||
tmp2[0] = 0.0; | |||||
tmp2[1] = 0.0; | |||||
tmp2[2] = 0.0; | |||||
tmp2[3] = 0.0; | |||||
a0 = &a[j*lda]; | |||||
a1 = a0 + lda; | |||||
a2 = a1 + lda; | |||||
a3 = a2 + lda; | |||||
j1 = (j / 4) * 4; | |||||
if ( j1 ) | |||||
symv_kernel_4x4(0, j1, a0, a1, a2, a3, X, Y, tmp1, tmp2); | |||||
j2 = 0; | |||||
for (j1 = j ; j1 < j+4 ; j1++) { | |||||
temp1 = tmp1[j2]; | |||||
temp2 = tmp2[j2]; | |||||
a0 = &a[j1*lda]; | |||||
for (i=j ; i<j1; i++) { | |||||
Y[i] += temp1 * a0[i]; | |||||
temp2 += a0[i] * X[i]; | |||||
} | |||||
Y[j1] += temp1 * a0[j1] + alpha * temp2; | |||||
j2++; | |||||
} | |||||
} | |||||
for ( ; j < m; j++) { | |||||
temp1 = alpha * X[j]; | |||||
temp2 = 0.0; | |||||
a0 = &a[j*lda]; | |||||
for (i = 0 ; i < j; i++) { | |||||
Y[i] += temp1 * a0[i]; | |||||
temp2 += a0[i] * X[i]; | |||||
} | |||||
Y[j] += temp1 * a0[j] + alpha * temp2; | |||||
} | |||||
if (inc_y != 1) { | |||||
COPY_K(m, Y, 1, y, inc_y); | |||||
} | |||||
return(0); | |||||
} |
@@ -0,0 +1,104 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2025, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "symv_microk_sve_v1x4.c" | |||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||||
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
{ | |||||
BLASLONG i, j, j1, j2, m2; | |||||
FLOAT temp1, temp2; | |||||
FLOAT tmp1[4]; | |||||
FLOAT tmp2[4]; | |||||
FLOAT *a0, *a1, *a2, *a3; | |||||
FLOAT *X = x; | |||||
FLOAT *Y = y; | |||||
BLASLONG m1 = m - offset; | |||||
if (inc_y != 1) { | |||||
Y = buffer; | |||||
COPY_K(m, y, inc_y, Y, 1); | |||||
} | |||||
if (inc_x != 1) { | |||||
if (inc_y != 1) { | |||||
X = Y + m; | |||||
} else { | |||||
X = buffer; | |||||
} | |||||
COPY_K(m, x, inc_x, X, 1); | |||||
} | |||||
m2 = m - (offset % 4); | |||||
for (j = m1; j < m2; j += 4) { | |||||
tmp1[0] = alpha * X[j]; | |||||
tmp1[1] = alpha * X[j+1]; | |||||
tmp1[2] = alpha * X[j+2]; | |||||
tmp1[3] = alpha * X[j+3]; | |||||
tmp2[0] = 0.0; | |||||
tmp2[1] = 0.0; | |||||
tmp2[2] = 0.0; | |||||
tmp2[3] = 0.0; | |||||
a0 = &a[j*lda]; | |||||
a1 = a0 + lda; | |||||
a2 = a1 + lda; | |||||
a3 = a2 + lda; | |||||
symv_kernel_v1x4(0, j, a0, a1, a2, a3, X, Y, tmp1, tmp2); | |||||
j2 = 0; | |||||
for (j1 = j ; j1 < j+4 ; j1++) { | |||||
temp1 = tmp1[j2]; | |||||
temp2 = tmp2[j2]; | |||||
a0 = &a[j1*lda]; | |||||
for (i=j ; i<j1; i++) { | |||||
Y[i] += temp1 * a0[i]; | |||||
temp2 += a0[i] * X[i]; | |||||
} | |||||
Y[j1] += temp1 * a0[j1] + alpha * temp2; | |||||
j2++; | |||||
} | |||||
} | |||||
for ( ; j < m; j++) { | |||||
temp1 = alpha * X[j]; | |||||
temp2 = 0.0; | |||||
a0 = &a[j*lda]; | |||||
for (i = 0 ; i < j; i++) { | |||||
Y[i] += temp1 * a0[i]; | |||||
temp2 += a0[i] * X[i]; | |||||
} | |||||
Y[j] += temp1 * a0[j] + alpha * temp2; | |||||
} | |||||
if (inc_y != 1) { | |||||
COPY_K(m, Y, 1, y, inc_y); | |||||
} | |||||
return(0); | |||||
} |
@@ -0,0 +1,120 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2025, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <arm_neon.h> | |||||
static void symv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, | |||||
FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||||
{ | |||||
#ifdef DOUBLE | |||||
float64x2_t vtmpx0 = vld1q_dup_f64(&temp1[0]); | |||||
float64x2_t vtmpx1 = vld1q_dup_f64(&temp1[1]); | |||||
float64x2_t vtmpx2 = vld1q_dup_f64(&temp1[2]); | |||||
float64x2_t vtmpx3 = vld1q_dup_f64(&temp1[3]); | |||||
float64x2_t vtmpy0 = {0.0, 0.0}; | |||||
float64x2_t vtmpy1 = {0.0, 0.0}; | |||||
float64x2_t vtmpy2 = {0.0, 0.0}; | |||||
float64x2_t vtmpy3 = {0.0, 0.0}; | |||||
float64x2_t vxl, vxh, vyl, vyh; | |||||
float64x2_t vap0l, vap0h, vap1l, vap1h, vap2l, vap2h, vap3l, vap3h; | |||||
BLASLONG i; | |||||
for (i = from; i < to; i+=4) { | |||||
vyl = vld1q_f64(&y[i]); | |||||
vyh = vld1q_f64(&y[i+2]); | |||||
vxl = vld1q_f64(&x[i]); | |||||
vxh = vld1q_f64(&x[i+2]); | |||||
vap0l = vld1q_f64(&a0[i]); | |||||
vap0h = vld1q_f64(&a0[i+2]); | |||||
vap1l = vld1q_f64(&a1[i]); | |||||
vap1h = vld1q_f64(&a1[i+2]); | |||||
vap2l = vld1q_f64(&a2[i]); | |||||
vap2h = vld1q_f64(&a2[i+2]); | |||||
vap3l = vld1q_f64(&a3[i]); | |||||
vap3h = vld1q_f64(&a3[i+2]); | |||||
vyl = vfmaq_f64(vyl, vtmpx0, vap0l); | |||||
vyh = vfmaq_f64(vyh, vtmpx0, vap0h); | |||||
vyl = vfmaq_f64(vyl, vtmpx1, vap1l); | |||||
vyh = vfmaq_f64(vyh, vtmpx1, vap1h); | |||||
vyl = vfmaq_f64(vyl, vtmpx2, vap2l); | |||||
vyh = vfmaq_f64(vyh, vtmpx2, vap2h); | |||||
vyl = vfmaq_f64(vyl, vtmpx3, vap3l); | |||||
vyh = vfmaq_f64(vyh, vtmpx3, vap3h); | |||||
vtmpy0 = vfmaq_f64(vtmpy0, vxl, vap0l); | |||||
vtmpy0 = vfmaq_f64(vtmpy0, vxh, vap0h); | |||||
vtmpy1 = vfmaq_f64(vtmpy1, vxl, vap1l); | |||||
vtmpy2 = vfmaq_f64(vtmpy2, vxl, vap2l); | |||||
vtmpy1 = vfmaq_f64(vtmpy1, vxh, vap1h); | |||||
vtmpy2 = vfmaq_f64(vtmpy2, vxh, vap2h); | |||||
vtmpy3 = vfmaq_f64(vtmpy3, vxl, vap3l); | |||||
vtmpy3 = vfmaq_f64(vtmpy3, vxh, vap3h); | |||||
vst1q_f64(&y[i], vyl); | |||||
vst1q_f64(&y[i+2], vyh); | |||||
} | |||||
temp2[0] += vaddvq_f64(vtmpy0); | |||||
temp2[1] += vaddvq_f64(vtmpy1); | |||||
temp2[2] += vaddvq_f64(vtmpy2); | |||||
temp2[3] += vaddvq_f64(vtmpy3); | |||||
#else | |||||
float32x4_t vtmpx0 = vld1q_dup_f32(&temp1[0]); | |||||
float32x4_t vtmpx1 = vld1q_dup_f32(&temp1[1]); | |||||
float32x4_t vtmpx2 = vld1q_dup_f32(&temp1[2]); | |||||
float32x4_t vtmpx3 = vld1q_dup_f32(&temp1[3]); | |||||
float32x4_t vtmpy0 = {0.0, 0.0, 0.0, 0.0}; | |||||
float32x4_t vtmpy1 = {0.0, 0.0, 0.0, 0.0}; | |||||
float32x4_t vtmpy2 = {0.0, 0.0, 0.0, 0.0}; | |||||
float32x4_t vtmpy3 = {0.0, 0.0, 0.0, 0.0}; | |||||
float32x4_t vx, vy; | |||||
float32x4_t vap0, vap1, vap2, vap3; | |||||
BLASLONG i; | |||||
for (i = from; i < to; i+=4) { | |||||
vy = vld1q_f32(&y[i]); | |||||
vx = vld1q_f32(&x[i]); | |||||
vap0 = vld1q_f32(&a0[i]); | |||||
vap1 = vld1q_f32(&a1[i]); | |||||
vap2 = vld1q_f32(&a2[i]); | |||||
vap3 = vld1q_f32(&a3[i]); | |||||
vy = vfmaq_f32(vy, vtmpx0, vap0); | |||||
vy = vfmaq_f32(vy, vtmpx1, vap1); | |||||
vy = vfmaq_f32(vy, vtmpx2, vap2); | |||||
vy = vfmaq_f32(vy, vtmpx3, vap3); | |||||
vtmpy0 = vfmaq_f32(vtmpy0, vx, vap0); | |||||
vtmpy1 = vfmaq_f32(vtmpy1, vx, vap1); | |||||
vtmpy2 = vfmaq_f32(vtmpy2, vx, vap2); | |||||
vtmpy3 = vfmaq_f32(vtmpy3, vx, vap3); | |||||
vst1q_f32(&y[i], vy); | |||||
} | |||||
temp2[0] += vaddvq_f32(vtmpy0); | |||||
temp2[1] += vaddvq_f32(vtmpy1); | |||||
temp2[2] += vaddvq_f32(vtmpy2); | |||||
temp2[3] += vaddvq_f32(vtmpy3); | |||||
#endif | |||||
} |
@@ -0,0 +1,89 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2025, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <arm_sve.h> | |||||
#ifdef DOUBLE | |||||
#define SV_COUNT svcntd | |||||
#define SV_TYPE svfloat64_t | |||||
#define SV_TRUE svptrue_b64 | |||||
#define SV_WHILE svwhilelt_b64_s64 | |||||
#define SV_DUP svdup_f64 | |||||
#else | |||||
#define SV_COUNT svcntw | |||||
#define SV_TYPE svfloat32_t | |||||
#define SV_TRUE svptrue_b32 | |||||
#define SV_WHILE svwhilelt_b32_s64 | |||||
#define SV_DUP svdup_f32 | |||||
#endif | |||||
static void symv_kernel_v1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, | |||||
FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||||
{ | |||||
SV_TYPE vtmpx0 = SV_DUP(temp1[0]); | |||||
SV_TYPE vtmpx1 = SV_DUP(temp1[1]); | |||||
SV_TYPE vtmpx2 = SV_DUP(temp1[2]); | |||||
SV_TYPE vtmpx3 = SV_DUP(temp1[3]); | |||||
SV_TYPE vtmpy0 = SV_DUP(0.0); | |||||
SV_TYPE vtmpy1 = SV_DUP(0.0); | |||||
SV_TYPE vtmpy2 = SV_DUP(0.0); | |||||
SV_TYPE vtmpy3 = SV_DUP(0.0); | |||||
SV_TYPE vx, vy; | |||||
SV_TYPE vap0, vap1, vap2, vap3; | |||||
BLASLONG i; | |||||
uint64_t sve_size = SV_COUNT(); | |||||
svbool_t pg; | |||||
for (i = from; i < to; i += sve_size) { | |||||
pg = SV_WHILE(i, to); | |||||
vy = svld1(pg, &y[i]); | |||||
vx = svld1(pg, &x[i]); | |||||
vap0 = svld1(pg, &a0[i]); | |||||
vap1 = svld1(pg, &a1[i]); | |||||
vap2 = svld1(pg, &a2[i]); | |||||
vap3 = svld1(pg, &a3[i]); | |||||
vy = svmla_m(pg, vy, vtmpx0, vap0); | |||||
vy = svmla_m(pg, vy, vtmpx1, vap1); | |||||
vy = svmla_m(pg, vy, vtmpx2, vap2); | |||||
vy = svmla_m(pg, vy, vtmpx3, vap3); | |||||
vtmpy0 = svmla_m(pg, vtmpy0, vx, vap0); | |||||
vtmpy1 = svmla_m(pg, vtmpy1, vx, vap1); | |||||
vtmpy2 = svmla_m(pg, vtmpy2, vx, vap2); | |||||
vtmpy3 = svmla_m(pg, vtmpy3, vx, vap3); | |||||
svst1(pg, &y[i], vy); | |||||
} | |||||
pg = SV_TRUE(); | |||||
temp2[0] += svaddv(pg, vtmpy0); | |||||
temp2[1] += svaddv(pg, vtmpy1); | |||||
temp2[2] += svaddv(pg, vtmpy2); | |||||
temp2[3] += svaddv(pg, vtmpy3); | |||||
} |