You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trmmkernel_2x2.c 7.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. #include "common.h"
  2. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  3. #ifdef TRMMKERNEL
  4. ,BLASLONG offset
  5. #endif
  6. )
  7. {
  8. BLASLONG i,j,k;
  9. FLOAT *C0,*C1,*ptrba,*ptrbb;
  10. FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
  11. BLASLONG off, temp;
  12. #if defined(TRMMKERNEL) && !defined(LEFT)
  13. off = -offset;
  14. #else
  15. off = 0;
  16. #endif
  17. for (j=0; j<bn/2; j+=1)
  18. {
  19. C0 = C;
  20. C1 = C0+ldc;
  21. #if defined(TRMMKERNEL) && defined(LEFT)
  22. off = offset;
  23. #endif
  24. ptrba = ba;
  25. for (i=0; i<bm/2; i+=1)
  26. {
  27. #if (defined(LEFT) && defined(TRANSA)) || \
  28. (!defined(LEFT) && !defined(TRANSA))
  29. ptrbb = bb;
  30. #else
  31. ptrba += off*2;
  32. ptrbb = bb + off*2;
  33. #endif
  34. res0 = 0;
  35. res1 = 0;
  36. res2 = 0;
  37. res3 = 0;
  38. #if (defined(LEFT) && !defined(TRANSA)) || \
  39. (!defined(LEFT) && defined(TRANSA))
  40. temp = bk-off;
  41. #elif defined(LEFT)
  42. temp = off+2;
  43. #else
  44. temp = off+2;
  45. #endif
  46. for (k=0; k<temp/4; k+=1)
  47. {
  48. load0 = ptrba[2*0+0];
  49. load1 = ptrbb[2*0+0];
  50. res0 = res0+load0*load1;
  51. load2 = ptrba[2*0+1];
  52. res1 = res1+load2*load1;
  53. load3 = ptrbb[2*0+1];
  54. res2 = res2+load0*load3;
  55. res3 = res3+load2*load3;
  56. load4 = ptrba[2*1+0];
  57. load5 = ptrbb[2*1+0];
  58. res0 = res0+load4*load5;
  59. load6 = ptrba[2*1+1];
  60. res1 = res1+load6*load5;
  61. load7 = ptrbb[2*1+1];
  62. res2 = res2+load4*load7;
  63. res3 = res3+load6*load7;
  64. load0 = ptrba[2*2+0];
  65. load1 = ptrbb[2*2+0];
  66. res0 = res0+load0*load1;
  67. load2 = ptrba[2*2+1];
  68. res1 = res1+load2*load1;
  69. load3 = ptrbb[2*2+1];
  70. res2 = res2+load0*load3;
  71. res3 = res3+load2*load3;
  72. load4 = ptrba[2*3+0];
  73. load5 = ptrbb[2*3+0];
  74. res0 = res0+load4*load5;
  75. load6 = ptrba[2*3+1];
  76. res1 = res1+load6*load5;
  77. load7 = ptrbb[2*3+1];
  78. res2 = res2+load4*load7;
  79. res3 = res3+load6*load7;
  80. ptrba = ptrba+8;
  81. ptrbb = ptrbb+8;
  82. }
  83. for (k=0; k<(temp&3); k+=1)
  84. {
  85. load0 = ptrba[2*0+0];
  86. load1 = ptrbb[2*0+0];
  87. res0 = res0+load0*load1;
  88. load2 = ptrba[2*0+1];
  89. res1 = res1+load2*load1;
  90. load3 = ptrbb[2*0+1];
  91. res2 = res2+load0*load3;
  92. res3 = res3+load2*load3;
  93. ptrba = ptrba+2;
  94. ptrbb = ptrbb+2;
  95. }
  96. res0 = res0*alpha;
  97. C0[0] = res0;
  98. res1 = res1*alpha;
  99. C0[1] = res1;
  100. res2 = res2*alpha;
  101. C1[0] = res2;
  102. res3 = res3*alpha;
  103. C1[1] = res3;
  104. #if ( defined(LEFT) && defined(TRANSA)) || \
  105. (!defined(LEFT) && !defined(TRANSA))
  106. temp = bk - off;
  107. #ifdef LEFT
  108. temp -= 2;
  109. #else
  110. temp -= 2;
  111. #endif
  112. ptrba += temp*2;
  113. ptrbb += temp*2;
  114. #endif
  115. #ifdef LEFT
  116. off += 2;
  117. #endif
  118. C0 = C0+2;
  119. C1 = C1+2;
  120. }
  121. for (i=0; i<(bm&1); i+=1)
  122. {
  123. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  124. ptrbb = bb;
  125. #else
  126. ptrba += off;
  127. ptrbb = bb+off*2;
  128. #endif
  129. res0 = 0;
  130. res1 = 0;
  131. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  132. temp = bk-off;
  133. #elif defined(LEFT)
  134. temp = off+1;
  135. #else
  136. temp = off+2;
  137. #endif
  138. for (k=0; k<temp; k+=1)
  139. {
  140. load0 = ptrba[0+0];
  141. load1 = ptrbb[2*0+0];
  142. res0 = res0+load0*load1;
  143. load2 = ptrbb[2*0+1];
  144. res1 = res1+load0*load2;
  145. ptrba = ptrba+1;
  146. ptrbb = ptrbb+2;
  147. }
  148. res0 = res0*alpha;
  149. C0[0] = res0;
  150. res1 = res1*alpha;
  151. C1[0] = res1;
  152. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  153. temp = bk-off;
  154. #ifdef LEFT
  155. temp -= 1;
  156. #else
  157. temp -= 2;
  158. #endif
  159. ptrba += temp;
  160. ptrbb += temp*2;
  161. #endif
  162. #ifdef LEFT
  163. off += 1;
  164. #endif
  165. C0 = C0+1;
  166. C1 = C1+1;
  167. }
  168. #if defined(TRMMKERNEL) && !defined(LEFT)
  169. off += 2;
  170. #endif
  171. k = (bk<<1);
  172. bb = bb+k;
  173. i = (ldc<<1);
  174. C = C+i;
  175. }
  176. for (j=0; j<(bn&1); j+=1)
  177. {
  178. C0 = C;
  179. #if defined(TRMMKERNEL) && defined(LEFT)
  180. off = offset;
  181. #endif
  182. ptrba = ba;
  183. for (i=0; i<bm/2; i+=1)
  184. {
  185. #if (defined(LEFT) && defined(TRANSA)) || \
  186. (!defined(LEFT) && !defined(TRANSA))
  187. ptrbb = bb;
  188. #else
  189. ptrba += off*2;
  190. ptrbb = bb + off;
  191. #endif
  192. res0 = 0;
  193. res1 = 0;
  194. #if (defined(LEFT) && !defined(TRANSA)) || \
  195. (!defined(LEFT) && defined(TRANSA))
  196. temp = bk-off;
  197. #elif defined(LEFT)
  198. temp = off+2;
  199. #else
  200. temp = off+1;
  201. #endif
  202. for (k=0; k<temp; k+=1)
  203. {
  204. load0 = ptrba[2*0+0];
  205. load1 = ptrbb[0+0];
  206. res0 = res0+load0*load1;
  207. load2 = ptrba[2*0+1];
  208. res1 = res1+load2*load1;
  209. ptrba = ptrba+2;
  210. ptrbb = ptrbb+1;
  211. }
  212. res0 = res0*alpha;
  213. C0[0] = res0;
  214. res1 = res1*alpha;
  215. C0[1] = res1;
  216. #if ( defined(LEFT) && defined(TRANSA)) || \
  217. (!defined(LEFT) && !defined(TRANSA))
  218. temp = bk - off;
  219. #ifdef LEFT
  220. temp -= 2;
  221. #else
  222. temp -= 1;
  223. #endif
  224. ptrba += temp*2;
  225. ptrbb += temp;
  226. #endif
  227. #ifdef LEFT
  228. off += 2;
  229. #endif
  230. C0 = C0+2;
  231. }
  232. for (i=0; i<(bm&1); i+=1)
  233. {
  234. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  235. ptrbb = bb;
  236. #else
  237. ptrba += off;
  238. ptrbb = bb+off;
  239. #endif
  240. res0 = 0;
  241. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  242. temp = bk-off;
  243. #elif defined(LEFT)
  244. temp = off + 1;
  245. #else
  246. temp = off + 1;
  247. #endif
  248. for (k=0; k<temp; k+=1)
  249. {
  250. load0 = ptrba[0+0];
  251. load1 = ptrbb[0+0];
  252. res0 = res0+load0*load1;
  253. ptrba = ptrba+1;
  254. ptrbb = ptrbb+1;
  255. }
  256. res0 = res0*alpha;
  257. C0[0] = res0;
  258. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  259. temp = bk-off;
  260. #ifdef LEFT
  261. temp -= 1;
  262. #else
  263. temp -= 1;
  264. #endif
  265. ptrba += temp;
  266. ptrbb += temp;
  267. #endif
  268. #ifdef LEFT
  269. off += 1;
  270. #endif
  271. C0 = C0+1;
  272. }
  273. #if defined(TRMMKERNEL) && !defined(LEFT)
  274. off += 1;
  275. #endif
  276. k = (bk<<0);
  277. bb = bb+k;
  278. C = C+ldc;
  279. }
  280. return 0;
  281. }