You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemmkernel_2x2.c 4.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #include "common.h"
  2. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  3. #ifdef TRMMKERNEL
  4. ,BLASLONG offset
  5. #endif
  6. )
  7. {
  8. BLASLONG i,j,k;
  9. FLOAT *C0,*C1,*ptrba,*ptrbb;
  10. FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
  11. for (j=0; j<bn/2; j+=1)
  12. {
  13. C0 = C;
  14. C1 = C0+ldc;
  15. ptrba = ba;
  16. for (i=0; i<bm/2; i+=1)
  17. {
  18. ptrbb = bb;
  19. res0 = 0;
  20. res1 = 0;
  21. res2 = 0;
  22. res3 = 0;
  23. for (k=0; k<bk/4; k+=1)
  24. {
  25. load0 = ptrba[2*0+0];
  26. load1 = ptrbb[2*0+0];
  27. res0 = res0+load0*load1;
  28. load2 = ptrba[2*0+1];
  29. res1 = res1+load2*load1;
  30. load3 = ptrbb[2*0+1];
  31. res2 = res2+load0*load3;
  32. res3 = res3+load2*load3;
  33. load4 = ptrba[2*1+0];
  34. load5 = ptrbb[2*1+0];
  35. res0 = res0+load4*load5;
  36. load6 = ptrba[2*1+1];
  37. res1 = res1+load6*load5;
  38. load7 = ptrbb[2*1+1];
  39. res2 = res2+load4*load7;
  40. res3 = res3+load6*load7;
  41. load0 = ptrba[2*2+0];
  42. load1 = ptrbb[2*2+0];
  43. res0 = res0+load0*load1;
  44. load2 = ptrba[2*2+1];
  45. res1 = res1+load2*load1;
  46. load3 = ptrbb[2*2+1];
  47. res2 = res2+load0*load3;
  48. res3 = res3+load2*load3;
  49. load4 = ptrba[2*3+0];
  50. load5 = ptrbb[2*3+0];
  51. res0 = res0+load4*load5;
  52. load6 = ptrba[2*3+1];
  53. res1 = res1+load6*load5;
  54. load7 = ptrbb[2*3+1];
  55. res2 = res2+load4*load7;
  56. res3 = res3+load6*load7;
  57. ptrba = ptrba+8;
  58. ptrbb = ptrbb+8;
  59. }
  60. for (k=0; k<(bk&3); k+=1)
  61. {
  62. load0 = ptrba[2*0+0];
  63. load1 = ptrbb[2*0+0];
  64. res0 = res0+load0*load1;
  65. load2 = ptrba[2*0+1];
  66. res1 = res1+load2*load1;
  67. load3 = ptrbb[2*0+1];
  68. res2 = res2+load0*load3;
  69. res3 = res3+load2*load3;
  70. ptrba = ptrba+2;
  71. ptrbb = ptrbb+2;
  72. }
  73. res0 = res0*alpha;
  74. C0[0] = C0[0]+res0;
  75. res1 = res1*alpha;
  76. C0[1] = C0[1]+res1;
  77. res2 = res2*alpha;
  78. C1[0] = C1[0]+res2;
  79. res3 = res3*alpha;
  80. C1[1] = C1[1]+res3;
  81. C0 = C0+2;
  82. C1 = C1+2;
  83. }
  84. for (i=0; i<(bm&1); i+=1)
  85. {
  86. ptrbb = bb;
  87. res0 = 0;
  88. res1 = 0;
  89. for (k=0; k<bk; k+=1)
  90. {
  91. load0 = ptrba[0+0];
  92. load1 = ptrbb[2*0+0];
  93. res0 = res0+load0*load1;
  94. load2 = ptrbb[2*0+1];
  95. res1 = res1+load0*load2;
  96. ptrba = ptrba+1;
  97. ptrbb = ptrbb+2;
  98. }
  99. res0 = res0*alpha;
  100. C0[0] = C0[0]+res0;
  101. res1 = res1*alpha;
  102. C1[0] = C1[0]+res1;
  103. C0 = C0+1;
  104. C1 = C1+1;
  105. }
  106. k = (bk<<1);
  107. bb = bb+k;
  108. i = (ldc<<1);
  109. C = C+i;
  110. }
  111. for (j=0; j<(bn&1); j+=1)
  112. {
  113. C0 = C;
  114. ptrba = ba;
  115. for (i=0; i<bm/2; i+=1)
  116. {
  117. ptrbb = bb;
  118. res0 = 0;
  119. res1 = 0;
  120. for (k=0; k<bk; k+=1)
  121. {
  122. load0 = ptrba[2*0+0];
  123. load1 = ptrbb[0+0];
  124. res0 = res0+load0*load1;
  125. load2 = ptrba[2*0+1];
  126. res1 = res1+load2*load1;
  127. ptrba = ptrba+2;
  128. ptrbb = ptrbb+1;
  129. }
  130. res0 = res0*alpha;
  131. C0[0] = C0[0]+res0;
  132. res1 = res1*alpha;
  133. C0[1] = C0[1]+res1;
  134. C0 = C0+2;
  135. }
  136. for (i=0; i<(bm&1); i+=1)
  137. {
  138. ptrbb = bb;
  139. res0 = 0;
  140. for (k=0; k<bk; k+=1)
  141. {
  142. load0 = ptrba[0+0];
  143. load1 = ptrbb[0+0];
  144. res0 = res0+load0*load1;
  145. ptrba = ptrba+1;
  146. ptrbb = ptrbb+1;
  147. }
  148. res0 = res0*alpha;
  149. C0[0] = C0[0]+res0;
  150. C0 = C0+1;
  151. }
  152. k = (bk<<0);
  153. bb = bb+k;
  154. C = C+ldc;
  155. }
  156. return 0;
  157. }