From d53572880398016fb5bee5b7aa96131926d295ec Mon Sep 17 00:00:00 2001
From: Annop Wongwathanarat
Date: Wed, 9 Apr 2025 12:54:57 +0000
Subject: [PATCH] Improve performance for SGEMVN on NEOVERSEN1

---
 kernel/arm64/KERNEL.NEOVERSEN1 |   2 +-
 kernel/arm64/sgemv_n_neon.c    | 219 +++++++++++++++++++++++++++++++++
 2 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 kernel/arm64/sgemv_n_neon.c

diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1
index e623814d6..de4d33c74 100644
--- a/kernel/arm64/KERNEL.NEOVERSEN1
+++ b/kernel/arm64/KERNEL.NEOVERSEN1
@@ -60,7 +60,7 @@ DSCALKERNEL = scal.S
 CSCALKERNEL = zscal.S
 ZSCALKERNEL = zscal.S
 
-SGEMVNKERNEL = gemv_n.S
+SGEMVNKERNEL = sgemv_n_neon.c
 DGEMVNKERNEL = gemv_n.S
 CGEMVNKERNEL = zgemv_n.S
 ZGEMVNKERNEL = zgemv_n.S
diff --git a/kernel/arm64/sgemv_n_neon.c b/kernel/arm64/sgemv_n_neon.c
new file mode 100644
index 000000000..5fa86b350
--- /dev/null
+++ b/kernel/arm64/sgemv_n_neon.c
@@ -0,0 +1,219 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <arm_neon.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+  BLASLONG i;
+  BLASLONG ix,iy;
+  BLASLONG j;
+  FLOAT *a_ptr;
+  FLOAT temp;
+
+  ix = 0;
+  a_ptr = a;
+
+  if (inc_x == 1 && inc_y == 1) {
+    FLOAT *a0_ptr = a + lda * 0;
+    FLOAT *a1_ptr = a + lda * 1;
+    FLOAT *a2_ptr = a + lda * 2;
+    FLOAT *a3_ptr = a + lda * 3;
+    FLOAT *a4_ptr = a + lda * 4;
+    FLOAT *a5_ptr = a + lda * 5;
+    FLOAT *a6_ptr = a + lda * 6;
+    FLOAT *a7_ptr = a + lda * 7;
+
+    j = 0;
+    while (j + 3 < n) {
+      float32x4_t x0_vec = vld1q_f32(x + j);
+      x0_vec = vmulq_n_f32(x0_vec, alpha);
+      i = 0;
+      while (i + 7 < m) {
+        float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
+        float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
+        float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
+        float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);
+        float32x4_t a20_vec = vld1q_f32(a2_ptr + i);
+        float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4);
+        float32x4_t a30_vec = vld1q_f32(a3_ptr + i);
+        float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4);
+
+        float32x4_t y0_vec = vld1q_f32(y + i);
+        float32x4_t y1_vec = vld1q_f32(y + i + 4);
+        y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0);
+        y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1);
+        y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2);
+        y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3);
+        y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0);
+        y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1);
+        y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2);
+        y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3);
+
+        vst1q_f32(y + i, y0_vec);
+        vst1q_f32(y + i + 4, y1_vec);
+
+        i += 8;
+      }
+      while (i + 3 < m) {
+        float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
+        float32x4_t a1_vec = vld1q_f32(a1_ptr + i);
+        float32x4_t a2_vec = vld1q_f32(a2_ptr + i);
+        float32x4_t a3_vec = vld1q_f32(a3_ptr + i);
+
+        float32x4_t y_vec = vld1q_f32(y + i);
+        y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0);
+        y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1);
+        y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2);
+        y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3);
+
+        vst1q_f32(y + i, y_vec);
+
+        i += 4;
+      }
+      while (i + 1 < m) {
+        float32x2_t a0_vec = vld1_f32(a0_ptr + i);
+        float32x2_t a1_vec = vld1_f32(a1_ptr + i);
+        float32x2_t a2_vec = vld1_f32(a2_ptr + i);
+        float32x2_t a3_vec = vld1_f32(a3_ptr + i);
+
+        float32x2_t y_vec = vld1_f32(y + i);
+        y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0);
+        y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1);
+        y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2);
+        y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3);
+
+        vst1_f32(y + i, y_vec);
+
+        i += 2;
+      }
+      while (i < m) {
+        y[i] += a0_ptr[i] * x0_vec[0];
+        y[i] += a1_ptr[i] * x0_vec[1];
+        y[i] += a2_ptr[i] * x0_vec[2];
+        y[i] += a3_ptr[i] * x0_vec[3];
+
+        i++;
+      }
+
+      a0_ptr += lda * 4;
+      a1_ptr += lda * 4;
+      a2_ptr += lda * 4;
+      a3_ptr += lda * 4;
+
+      j += 4;
+    }
+    while (j + 1 < n) {
+      float32x2_t x0_vec = vld1_f32(x + j);
+      x0_vec = vmul_n_f32(x0_vec, alpha);
+      i = 0;
+      while (i + 7 < m) {
+        float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
+        float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
+        float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
+        float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);
+
+        float32x4_t y0_vec = vld1q_f32(y + i);
+        float32x4_t y1_vec = vld1q_f32(y + i + 4);
+        y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0);
+        y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1);
+        y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0);
+        y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1);
+
+        vst1q_f32(y + i, y0_vec);
+        vst1q_f32(y + i + 4, y1_vec);
+
+        i += 8;
+      }
+      while (i + 3 < m) {
+        float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
+        float32x4_t a1_vec = vld1q_f32(a1_ptr + i);
+
+        float32x4_t y_vec = vld1q_f32(y + i);
+        y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0);
+        y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1);
+
+        vst1q_f32(y + i, y_vec);
+
+        i += 4;
+      }
+      while (i + 1 < m) {
+        float32x2_t a0_vec = vld1_f32(a0_ptr + i);
+        float32x2_t a1_vec = vld1_f32(a1_ptr + i);
+
+        float32x2_t y_vec = vld1_f32(y + i);
+        y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0);
+        y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1);
+
+        vst1_f32(y + i, y_vec);
+
+        i += 2;
+      }
+      while (i < m) {
+        y[i] += a0_ptr[i] * x0_vec[0];
+        y[i] += a1_ptr[i] * x0_vec[1];
+
+        i++;
+      }
+
+      a0_ptr += lda * 2;
+      a1_ptr += lda * 2;
+
+      j += 2;
+    }
+    while (j < n) {
+      i = 0;
+      temp = alpha * x[j];
+      while (i < m) {
+        y[i] += a0_ptr[i] * temp;
+        i++;
+      }
+
+      a0_ptr += lda;
+      j++;
+    }
+    return (0);
+  }
+
+  for (j = 0; j < n; j++) {
+    temp = alpha * x[ix];
+    iy = 0;
+    for (i = 0; i < m; i++) {
+      y[iy] += temp * a_ptr[i];
+      iy += inc_y;
+    }
+    a_ptr += lda;
+    ix += inc_x;
+  }
+  return (0);
+}
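
Note (not part of the patch): a minimal standalone check can be used to confirm that the new unit-stride NEON path gives the same result as a scalar reference. The sketch below is an assumption-laden example, not part of the OpenBLAS test suite: it assumes an OpenBLAS build linked with -lopenblas and calls cblas_sgemv with column-major, non-transposed input and unit strides, which is exactly the case the kernel above vectorizes; odd m and n are chosen so the 8-, 4-, 2- and 1-row tail loops all execute. Sizes, file name, and tolerance are illustrative only.

/* sgemv_n_check.c (hypothetical file name) -- scalar cross-check for SGEMV NoTrans */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cblas.h>

int main(void) {
  const int m = 13, n = 7, lda = m;        /* odd sizes exercise every tail loop */
  const float alpha = 0.75f, beta = 1.0f;  /* beta == 1: kernel accumulates into y */
  float *a = malloc(sizeof(float) * lda * n);
  float *x = malloc(sizeof(float) * n);
  float *y = malloc(sizeof(float) * m);
  float *y_ref = malloc(sizeof(float) * m);

  /* column-major A, simple deterministic fill */
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++)
      a[j * lda + i] = (float)(i - j) * 0.25f;
  for (int j = 0; j < n; j++) x[j] = 1.0f + 0.5f * j;
  for (int i = 0; i < m; i++) y[i] = y_ref[i] = (float)i;

  /* scalar reference: y_ref += alpha * A * x */
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++)
      y_ref[i] += alpha * a[j * lda + i] * x[j];

  /* OpenBLAS call that dispatches to the SGEMVN kernel */
  cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, alpha, a, lda,
              x, 1, beta, y, 1);

  float max_err = 0.0f;
  for (int i = 0; i < m; i++)
    max_err = fmaxf(max_err, fabsf(y[i] - y_ref[i]));
  printf("max abs error: %g\n", max_err);

  free(a); free(x); free(y); free(y_ref);
  return max_err < 1e-4f ? 0 : 1;
}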