You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sbgemm_ncopy_4_neoversev1.c 5.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. /***************************************************************************
  2. * Copyright (c) 2024-2025, The OpenBLAS Project
  3. * All rights reserved.
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. * 1. Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * 2. Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in
  11. * the documentation and/or other materials provided with the
  12. * distribution.
  13. * 3. Neither the name of the OpenBLAS project nor the names of
  14. * its contributors may be used to endorse or promote products
  15. * derived from this software without specific prior written permission.
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26. * POSSIBILITY OF SUCH DAMAGE.
  27. * *****************************************************************************/
  28. #include <arm_sve.h>
  29. #include "common.h"
  30. int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
  31. IFLOAT *a_offset;
  32. IFLOAT *a_offsetx[4];
  33. IFLOAT *b_offset;
  34. a_offset = a;
  35. b_offset = b;
  36. bfloat16_t zero_value_bf16;
  37. *((uint16_t *)(&zero_value_bf16)) = 0;
  38. svbool_t pg16_all = svptrue_b16(); // 16 elements for sve-256 machine.
  39. svbool_t pg16_first_8 = svwhilelt_b16(0, 8);
  40. svbfloat16_t v0, v1, v2, v3;
  41. svuint64_t t0, t1;
  42. BLASLONG rest = m & 7;
  43. svbool_t pg16_rest = svwhilelt_b16_s32(0, rest);
  44. for (BLASLONG j = 0; j < n / 4; j++) {
  45. a_offsetx[0] = a_offset;
  46. a_offsetx[1] = a_offsetx[0] + lda;
  47. a_offsetx[2] = a_offsetx[1] + lda;
  48. a_offsetx[3] = a_offsetx[2] + lda;
  49. a_offset += 4 * lda;
  50. for (BLASLONG i = 0; i < m / 8; i++) {
  51. v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]);
  52. v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]);
  53. v2 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[2]);
  54. v3 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[3]);
  55. t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
  56. t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3));
  57. svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));
  58. svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16,
  59. svreinterpret_bf16_u64(t1));
  60. a_offsetx[0] += 8;
  61. a_offsetx[1] += 8;
  62. a_offsetx[2] += 8;
  63. a_offsetx[3] += 8;
  64. b_offset += 32;
  65. }
  66. if (rest) { // remainder along k dim
  67. v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]);
  68. v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]);
  69. v2 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[2]);
  70. v3 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[3]);
  71. t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
  72. t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3));
  73. svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));
  74. svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16,
  75. svreinterpret_bf16_u64(t1));
  76. b_offset += 32;
  77. }
  78. }
  79. if (n & 2) {
  80. a_offsetx[0] = a_offset;
  81. a_offsetx[1] = a_offsetx[0] + lda;
  82. a_offset += 2 * lda;
  83. for (BLASLONG i = 0; i < m / 8; i++) {
  84. v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]);
  85. v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]);
  86. t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
  87. svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));
  88. b_offset += 16;
  89. a_offsetx[0] += 8;
  90. a_offsetx[1] += 8;
  91. }
  92. if (rest) { // remainder along k dim
  93. v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]);
  94. v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]);
  95. t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
  96. svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));
  97. b_offset += 16;
  98. }
  99. }
  100. if (n & 1) {
  101. a_offsetx[0] = a_offset;
  102. for (BLASLONG i = 0; i < m / 8; i++) {
  103. v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]);
  104. v1 = svdup_bf16(zero_value_bf16);
  105. t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
  106. svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));
  107. b_offset += 16;
  108. a_offsetx[0] += 8;
  109. }
  110. if (rest) { // remainder along k dim
  111. v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]);
  112. v1 = svdup_bf16(zero_value_bf16);
  113. t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
  114. svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));
  115. }
  116. }
  117. return 0;
  118. }