From d53572880398016fb5bee5b7aa96131926d295ec Mon Sep 17 00:00:00 2001
From: Annop Wongwathanarat
Date: Wed, 9 Apr 2025 12:54:57 +0000
Subject: [PATCH] Improve performance for SGEMVN on NEOVERSEN1

---
 kernel/arm64/KERNEL.NEOVERSEN1 |   2 +-
 kernel/arm64/sgemv_n_neon.c    | 219 +++++++++++++++++++++++++++++++++
 2 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 kernel/arm64/sgemv_n_neon.c

diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1
index e623814d6..de4d33c74 100644
--- a/kernel/arm64/KERNEL.NEOVERSEN1
+++ b/kernel/arm64/KERNEL.NEOVERSEN1
@@ -60,7 +60,7 @@ DSCALKERNEL = scal.S
 CSCALKERNEL = zscal.S
 ZSCALKERNEL = zscal.S
 
-SGEMVNKERNEL = gemv_n.S
+SGEMVNKERNEL = sgemv_n_neon.c
 DGEMVNKERNEL = gemv_n.S
 CGEMVNKERNEL = zgemv_n.S
 ZGEMVNKERNEL = zgemv_n.S
diff --git a/kernel/arm64/sgemv_n_neon.c b/kernel/arm64/sgemv_n_neon.c
new file mode 100644
index 000000000..5fa86b350
--- /dev/null
+++ b/kernel/arm64/sgemv_n_neon.c
@@ -0,0 +1,219 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <arm_neon.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+  BLASLONG i;
+  BLASLONG ix,iy;
+  BLASLONG j;
+  FLOAT *a_ptr;
+  FLOAT temp;
+
+  ix = 0;
+  a_ptr = a;
+
+  if (inc_x == 1 && inc_y == 1) {
+    FLOAT *a0_ptr = a + lda * 0;
+    FLOAT *a1_ptr = a + lda * 1;
+    FLOAT *a2_ptr = a + lda * 2;
+    FLOAT *a3_ptr = a + lda * 3;
+    FLOAT *a4_ptr = a + lda * 4;
+    FLOAT *a5_ptr = a + lda * 5;
+    FLOAT *a6_ptr = a + lda * 6;
+    FLOAT *a7_ptr = a + lda * 7;
+
+    j = 0;
+    while (j + 3 < n) {
+      float32x4_t x0_vec = vld1q_f32(x + j);
+      x0_vec = vmulq_n_f32(x0_vec, alpha);
+      i = 0;
+      while (i + 7 < m) {
+        float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
+        float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
+        float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
+        float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);
+        float32x4_t a20_vec = vld1q_f32(a2_ptr + i);
+        float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4);
+        float32x4_t a30_vec = vld1q_f32(a3_ptr + i);
+        float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4);
+
+        float32x4_t y0_vec = vld1q_f32(y + i);
+        float32x4_t y1_vec = vld1q_f32(y + i + 4);
+        y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0);
+        y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1);
+        y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2);
+        y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3);
+        y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0);
+        y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1);
+        y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2);
+        y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3);
+
+        vst1q_f32(y + i, y0_vec);
+        vst1q_f32(y + i + 4, y1_vec);
+
+        i += 8;
+      }
+      while (i + 3 < m) {
+        float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
+        float32x4_t a1_vec = vld1q_f32(a1_ptr + i);
+        float32x4_t a2_vec = vld1q_f32(a2_ptr + i);
+        float32x4_t a3_vec = vld1q_f32(a3_ptr + i);
+
+        float32x4_t y_vec = vld1q_f32(y + i);
+        y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0);
+        y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1);
+        y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2);
+        y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3);
+
+        vst1q_f32(y + i, y_vec);
+
+        i += 4;
+      }
+      while (i + 1 < m) {
+        float32x2_t a0_vec = vld1_f32(a0_ptr + i);
+        float32x2_t a1_vec = vld1_f32(a1_ptr + i);
+        float32x2_t a2_vec = vld1_f32(a2_ptr + i);
+        float32x2_t a3_vec = vld1_f32(a3_ptr + i);
+
+        float32x2_t y_vec = vld1_f32(y + i);
+        y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0);
+        y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1);
+        y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2);
+        y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3);
+
+        vst1_f32(y + i, y_vec);
+
+        i += 2;
+      }
+      while (i < m) {
+        y[i] += a0_ptr[i] * x0_vec[0];
+        y[i] += a1_ptr[i] * x0_vec[1];
+        y[i] += a2_ptr[i] * x0_vec[2];
+        y[i] += a3_ptr[i] * x0_vec[3];
+
+        i++;
+      }
+
+      a0_ptr += lda * 4;
+      a1_ptr += lda * 4;
+      a2_ptr += lda * 4;
+      a3_ptr += lda * 4;
+
+      j += 4;
+    }
+    while (j + 1 < n) {
+      float32x2_t x0_vec = vld1_f32(x + j);
+      x0_vec = vmul_n_f32(x0_vec, alpha);
+      i = 0;
+      while (i + 7 < m) {
+        float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
+        float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
+        float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
+        float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);
+
+        float32x4_t y0_vec = vld1q_f32(y + i);
+        float32x4_t y1_vec = vld1q_f32(y + i + 4);
+        y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0);
+        y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1);
+        y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0);
+        y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1);
+
+        vst1q_f32(y + i, y0_vec);
+        vst1q_f32(y + i + 4, y1_vec);
+
+        i += 8;
+      }
+      while (i + 3 < m) {
+        float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
+        float32x4_t a1_vec = vld1q_f32(a1_ptr + i);
+
+        float32x4_t y_vec = vld1q_f32(y + i);
+        y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0);
+        y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1);
+
+        vst1q_f32(y + i, y_vec);
+
+        i += 4;
+      }
+      while (i + 1 < m) {
+        float32x2_t a0_vec = vld1_f32(a0_ptr + i);
+        float32x2_t a1_vec = vld1_f32(a1_ptr + i);
+
+        float32x2_t y_vec = vld1_f32(y + i);
+        y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0);
+        y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1);
+
+        vst1_f32(y + i, y_vec);
+
+        i += 2;
+      }
+      while (i < m) {
+        y[i] += a0_ptr[i] * x0_vec[0];
+        y[i] += a1_ptr[i] * x0_vec[1];
+
+        i++;
+      }
+
+      a0_ptr += lda * 2;
+      a1_ptr += lda * 2;
+
+      j += 2;
+    }
+    while (j < n) {
+      i = 0;
+      temp = alpha * x[j];
+      while (i < m) {
+        y[i] += a0_ptr[i] * temp;
+        i++;
+      }
+
+      a0_ptr += lda;
+      j++;
+    }
+    return (0);
+  }
+
+  for (j = 0; j < n; j++) {
+    temp = alpha * x[ix];
+    iy = 0;
+    for (i = 0; i < m; i++) {
+      y[iy] += temp * a_ptr[i];
+      iy += inc_y;
+    }
+    a_ptr += lda;
+    ix += inc_x;
+  }
+  return (0);
+}
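
Note (not part of the patch): a minimal standalone check can be used to confirm that the new unit-stride NEON path gives the same result as a scalar reference. The sketch below is an assumption-laden example, not part of the OpenBLAS test suite: it assumes an OpenBLAS build linked with -lopenblas and calls cblas_sgemv with column-major, non-transposed input and unit strides, which is exactly the case the kernel above vectorizes; odd m and n are chosen so the 8-, 4-, 2- and 1-row tail loops all execute. Sizes, file name, and tolerance are illustrative only.

/* sgemv_n_check.c (hypothetical file name) -- scalar cross-check for SGEMV NoTrans */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cblas.h>

int main(void) {
  const int m = 13, n = 7, lda = m;        /* odd sizes exercise every tail loop */
  const float alpha = 0.75f, beta = 1.0f;  /* beta == 1: kernel accumulates into y */
  float *a = malloc(sizeof(float) * lda * n);
  float *x = malloc(sizeof(float) * n);
  float *y = malloc(sizeof(float) * m);
  float *y_ref = malloc(sizeof(float) * m);

  /* column-major A, simple deterministic fill */
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++)
      a[j * lda + i] = (float)(i - j) * 0.25f;
  for (int j = 0; j < n; j++) x[j] = 1.0f + 0.5f * j;
  for (int i = 0; i < m; i++) y[i] = y_ref[i] = (float)i;

  /* scalar reference: y_ref += alpha * A * x */
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++)
      y_ref[i] += alpha * a[j * lda + i] * x[j];

  /* OpenBLAS call that dispatches to the SGEMVN kernel */
  cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, alpha, a, lda,
              x, 1, beta, y, 1);

  float max_err = 0.0f;
  for (int i = 0; i < m; i++)
    max_err = fmaxf(max_err, fabsf(y[i] - y_ref[i]));
  printf("max abs error: %g\n", max_err);

  free(a); free(x); free(y); free(y_ref);
  return max_err < 1e-4f ? 0 : 1;
}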