You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sbgemm_oncopy_16_spr.c 4.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. /***************************************************************************
  2. * Copyright (c) 2021, The OpenBLAS Project
  3. * All rights reserved.
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. * 1. Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * 2. Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in
  11. * the documentation and/or other materials provided with the
  12. * distribution.
  13. * 3. Neither the name of the OpenBLAS project nor the names of
  14. * its contributors may be used to endorse or promote products
  15. * derived from this software without specific prior written permission.
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. * *****************************************************************************/
  27. #include <immintrin.h>
  28. #include "common.h"
  29. typedef struct {
  30. char palette_id;
  31. char start_row;
  32. char dummy0[14]; // bytes 2-15 reserved, must be zero
  33. short tile_colsb[8];
  34. char dummy1[16]; // bytes 32-47 reserved, must be zero
  35. char tile_rows[8];
  36. char dummy2[16]; // bytes 56-63 reserved, must be zero
  37. } tilecfg;
  38. #define T_16x32 0
  39. #define T_16xm 1
  40. #define T_nx32 2
  41. #define T_nxm 3
  42. #define TCONF(cfg, m, n) \
  43. memset(&cfg, 0, sizeof(tilecfg)); \
  44. cfg.palette_id = 1; \
  45. cfg.tile_rows[T_16x32] = 16; \
  46. cfg.tile_colsb[T_16x32] = 64; \
  47. if (m) { \
  48. cfg.tile_rows[T_16xm] = 16; \
  49. cfg.tile_colsb[T_16xm] = m * 2; \
  50. } \
  51. if (n) { \
  52. cfg.tile_rows[T_nx32] = n; \
  53. cfg.tile_colsb[T_nx32] = 64; \
  54. } \
  55. if (m && n) { \
  56. cfg.tile_rows[T_nxm] = n; \
  57. cfg.tile_colsb[T_nxm] = m * 2; \
  58. } \
  59. _tile_loadconfig(&cfg);
  60. int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
  61. BLASLONG i, j;
  62. IFLOAT *aoffset, *boffset;
  63. IFLOAT *aoffset0;
  64. aoffset = a;
  65. boffset = b;
  66. BLASLONG n16 = n & ~15;
  67. BLASLONG m32 = m & ~31;
  68. BLASLONG m2 = m & ~1;
  69. BLASLONG tail_m = m2 - m32;
  70. BLASLONG tail_n = n - n16;
  71. tilecfg cfg;
  72. TCONF(cfg, tail_m, tail_n);
  73. for (j = 0; j < n16; j += 16) {
  74. aoffset0 = aoffset;
  75. for (i = 0; i < m32; i += 32) {
  76. _tile_loadd(T_16x32, aoffset0, lda * 2);
  77. _tile_stored(T_16x32, boffset, 32 * 2);
  78. aoffset0 += 32;
  79. boffset += 32 * 16;
  80. }
  81. if (i < m2) {
  82. _tile_loadd(T_16xm, aoffset0, lda * 2);
  83. _tile_stored(T_16xm, boffset, tail_m * 2);
  84. aoffset0 += tail_m;
  85. boffset += tail_m * 16;
  86. i = m2;
  87. }
  88. if (i < m) {
  89. /* the tail odd k should put alone */
  90. for (int ii = 0; ii < 16; ii++) {
  91. *(boffset + ii) = *(aoffset0 + lda * ii);
  92. }
  93. boffset += 16;
  94. }
  95. aoffset += 16 * lda;
  96. }
  97. if (j < n) {
  98. aoffset0 = aoffset;
  99. for (i = 0; i < m32; i += 32) {
  100. _tile_loadd(T_nx32, aoffset0, lda * 2);
  101. _tile_stored(T_nx32, boffset, 32 * 2);
  102. aoffset0 += 32;
  103. boffset += 32 * tail_n;
  104. }
  105. if (i < m2) {
  106. _tile_loadd(T_nxm, aoffset0, lda * 2);
  107. _tile_stored(T_nxm, boffset, tail_m * 2);
  108. aoffset0 += tail_m;
  109. boffset += tail_m * tail_n;
  110. }
  111. if (i < m) {
  112. for (int ii = 0; ii < tail_n; ii++) {
  113. *(boffset + ii) = *(aoffset0 + lda * ii);
  114. }
  115. }
  116. }
  117. return 0;
  118. }