You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_beta.c 5.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* Copyright 2025 The OpenBLAS Project. */
  4. /* All rights reserved. */
  5. /* */
  6. /* Redistribution and use in source and binary forms, with or */
  7. /* without modification, are permitted provided that the following */
  8. /* conditions are met: */
  9. /* */
  10. /* 1. Redistributions of source code must retain the above */
  11. /* copyright notice, this list of conditions and the following */
  12. /* disclaimer. */
  13. /* */
  14. /* 2. Redistributions in binary form must reproduce the above */
  15. /* copyright notice, this list of conditions and the following */
  16. /* disclaimer in the documentation and/or other materials */
  17. /* provided with the distribution. */
  18. /* */
  19. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  20. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  21. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  22. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  23. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  24. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  25. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  26. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  27. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  28. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  29. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  30. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  31. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  32. /* POSSIBILITY OF SUCH DAMAGE. */
  33. /* */
  34. /* The views and conclusions contained in the software and */
  35. /* documentation are those of the authors and should not be */
  36. /* interpreted as representing official policies, either expressed */
  37. /* or implied, of The University of Texas at Austin. */
  38. /*********************************************************************/
  39. #include "common.h"
  #if defined(BFLOAT16) && defined(BGEMM) && defined(BFLOAT16CONVERSION)

  /*
   * Index of the 16-bit half of a float that is exchanged with a bfloat16
   * value.  The byte-order macros are tested with defined() first: inside
   * #if an undefined identifier evaluates to 0, so a bare
   * `__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__` reads as `0 == 0` and would
   * select the big-endian layout on compilers that provide neither macro.
   */
  #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
      (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  #define BF16_HIGH_HALF 0
  #else
  #define BF16_HIGH_HALF 1
  #endif

  /*
   * Widen a bfloat16 to float: the 16 bits of f16 are placed in the selected
   * half of a zero-initialised float and the other half stays zero.
   *
   * The bytes are reinterpreted through a union rather than by dereferencing
   * the float through an unsigned short pointer, which breaks the strict
   * aliasing rule and may be miscompiled at higher optimisation levels.
   * NOTE(review): like the pointer-cast version, this assumes float is
   * exactly two unsigned shorts wide -- confirm for exotic targets.
   */
  static float
  bfloat16tof32 (bfloat16 f16)
  {
    union { float f32; unsigned short half[2]; } bits;

    bits.f32 = 0;
    bits.half[BF16_HIGH_HALF] = f16;
    return bits.f32;
  }

  /*
   * Narrow a float to bfloat16 by returning the selected 16-bit half of its
   * representation.  The other half is simply dropped, i.e. the value is
   * truncated and no rounding is applied.
   */
  static bfloat16
  f32tobfloat16(float f32)
  {
    union { float f32; unsigned short half[2]; } bits;

    bits.f32 = f32;
    return bits.half[BF16_HIGH_HALF];
  }

  #define BF16TOF32(x) (bfloat16tof32(x))
  #define F32TOBF16(x) (f32tobfloat16(x))

  #else

  /* No conversion needed: both macros pass the value through.  The argument
     is parenthesised so an expression argument keeps its own grouping when
     the expansion sits next to other operators. */
  #define BF16TOF32(x) (x)
  #define F32TOBF16(x) (x)

  #endif
  69. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
  70. IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
  71. FLOAT *c, BLASLONG ldc){
  72. BLASLONG i, j;
  73. BLASLONG chunk, remain;
  74. FLOAT *c_offset1, *c_offset;
  75. c_offset = c;
  76. chunk = m >> 3;
  77. remain = m & 7;
  78. if (beta == ZERO){
  79. for(j=n; j>0; j--){
  80. c_offset1 = c_offset;
  81. c_offset += ldc;
  82. for(i=chunk; i>0; i--){
  83. *(c_offset1 + 0) = F32TOBF16(ZERO);
  84. *(c_offset1 + 1) = F32TOBF16(ZERO);
  85. *(c_offset1 + 2) = F32TOBF16(ZERO);
  86. *(c_offset1 + 3) = F32TOBF16(ZERO);
  87. *(c_offset1 + 4) = F32TOBF16(ZERO);
  88. *(c_offset1 + 5) = F32TOBF16(ZERO);
  89. *(c_offset1 + 6) = F32TOBF16(ZERO);
  90. *(c_offset1 + 7) = F32TOBF16(ZERO);
  91. c_offset1 += 8;
  92. }
  93. for(i=remain; i>0; i--){
  94. *c_offset1 = F32TOBF16(ZERO);
  95. c_offset1 ++;
  96. }
  97. }
  98. } else {
  99. for(j=n; j>0; j--){
  100. c_offset1 = c_offset;
  101. c_offset += ldc;
  102. for(i=chunk; i>0; i--){
  103. *(c_offset1 + 0) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[0]));
  104. *(c_offset1 + 1) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[1]));
  105. *(c_offset1 + 2) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[2]));
  106. *(c_offset1 + 3) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[3]));
  107. *(c_offset1 + 4) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[4]));
  108. *(c_offset1 + 5) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[5]));
  109. *(c_offset1 + 6) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[6]));
  110. *(c_offset1 + 7) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[7]));
  111. c_offset1 += 8;
  112. }
  113. for(i=remain; i>0; i--){
  114. *c_offset1 = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[0]));
  115. c_offset1 ++;
  116. }
  117. }
  118. }
  119. return 0;
  120. };