You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zherk_kernel.c 5.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. #ifndef CONJ
  41. #define GEMM_KERNEL GEMM_KERNEL_R
  42. #define GEMM_KERNEL_B0 GEMM_KERNEL_R_B0
  43. #else
  44. #define GEMM_KERNEL GEMM_KERNEL_L
  45. #define GEMM_KERNEL_B0 GEMM_KERNEL_L_B0
  46. #endif
  47. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
  48. FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
  49. BLASLONG i, j;
  50. BLASLONG loop;
  51. FLOAT *cc, *ss;
  52. FLOAT subbuffer[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE];
  53. if (m + offset < 0) {
  54. #ifndef LOWER
  55. GEMM_KERNEL(m, n, k,
  56. alpha_r, ZERO,
  57. a, b, c, ldc);
  58. #endif
  59. return 0;
  60. }
  61. if (n < offset) {
  62. #ifdef LOWER
  63. GEMM_KERNEL(m, n, k,
  64. alpha_r, ZERO,
  65. a, b, c, ldc);
  66. #endif
  67. return 0;
  68. }
  69. if (offset > 0) {
  70. #ifdef LOWER
  71. GEMM_KERNEL(m, offset, k,
  72. alpha_r, ZERO,
  73. a, b, c, ldc);
  74. #endif
  75. b += offset * k * COMPSIZE;
  76. c += offset * ldc * COMPSIZE;
  77. n -= offset;
  78. offset = 0;
  79. if (n <= 0) return 0;
  80. }
  81. if (n > m + offset) {
  82. #ifndef LOWER
  83. GEMM_KERNEL(m, n - m - offset, k,
  84. alpha_r, ZERO,
  85. a,
  86. b + (m + offset) * k * COMPSIZE,
  87. c + (m + offset) * ldc * COMPSIZE, ldc);
  88. #endif
  89. n = m + offset;
  90. if (n <= 0) return 0;
  91. }
  92. if (offset < 0) {
  93. #ifndef LOWER
  94. GEMM_KERNEL(-offset, n, k,
  95. alpha_r, ZERO,
  96. a, b, c, ldc);
  97. #endif
  98. a -= offset * k * COMPSIZE;
  99. c -= offset * COMPSIZE;
  100. m += offset;
  101. offset = 0;
  102. if (m <= 0) return 0;
  103. }
  104. if (m > n - offset) {
  105. #ifdef LOWER
  106. GEMM_KERNEL(m - n + offset, n, k,
  107. alpha_r, ZERO,
  108. a + (n - offset) * k * COMPSIZE,
  109. b,
  110. c + (n - offset) * COMPSIZE, ldc);
  111. #endif
  112. m = n + offset;
  113. if (m <= 0) return 0;
  114. }
  115. for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) {
  116. int mm, nn;
  117. mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
  118. nn = MIN(GEMM_UNROLL_MN, n - loop);
  119. #ifndef LOWER
  120. GEMM_KERNEL(mm, nn, k,
  121. alpha_r, ZERO,
  122. a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
  123. #endif
  124. GEMM_BETA(nn, nn, 0, ZERO, ZERO,
  125. NULL, 0, NULL, 0, subbuffer, nn);
  126. GEMM_KERNEL(nn, nn, k,
  127. alpha_r, ZERO,
  128. a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
  129. cc = c + (loop + loop * ldc) * COMPSIZE;
  130. ss = subbuffer;
  131. #ifndef LOWER
  132. for (j = 0; j < nn; j ++) {
  133. for (i = 0; i <j; i ++) {
  134. cc[i * 2 + 0] += ss[i * 2 + 0];
  135. cc[i * 2 + 1] += ss[i * 2 + 1];
  136. }
  137. cc[j * 2 + 0] += ss[i * 2 + 0];
  138. cc[j * 2 + 1] = ZERO;
  139. ss += nn * COMPSIZE;
  140. cc += ldc * COMPSIZE;
  141. }
  142. #else
  143. for (j = 0; j < nn; j ++) {
  144. cc[j * 2 + 0] += ss[j * 2 + 0];
  145. cc[j * 2 + 1] = ZERO;
  146. for (i = j + 1; i < nn; i ++) {
  147. cc[i * 2 + 0] += ss[i * 2 + 0];
  148. cc[i * 2 + 1] += ss[i * 2 + 1];
  149. }
  150. ss += nn * COMPSIZE;
  151. cc += ldc * COMPSIZE;
  152. }
  153. #endif
  154. #ifdef LOWER
  155. GEMM_KERNEL(m - mm - nn, nn, k,
  156. alpha_r, ZERO,
  157. a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
  158. c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
  159. #endif
  160. }
  161. return 0;
  162. }