You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trmmkernel_2x2.c 7.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. #include "common.h"
  /*
   * CNAME - 2x2 blocked multiply kernel: for every element of the bm x bn
   * output it forms a dot product of values read from ba and bb, scales it
   * by alpha and STORES it into C (the previous contents of C are not read).
   * Presumably the triangular (TRMM) micro-kernel, judging by the TRMMKERNEL
   * macro; FLOAT, BLASLONG and CNAME are expected to come from common.h.
   *
   * bm, bn  - number of output rows / columns handled by this call.
   * bk      - full length of the k (inner product) range; the part actually
   *           summed for a given tile is derived from bk and `off`.
   * alpha   - scale factor applied to every result before it is stored.
   * ba      - first input panel; read 2 values per k step for a 2-row tile
   *           and 1 value per k step for the left-over single row.
   * bb      - second input panel; read 2 values per k step for a 2-column
   *           tile and 1 value per k step for the left-over single column.
   * C, ldc  - output; the two columns of a tile are ldc elements apart.
   * offset  - (only when TRMMKERNEL is defined) seed for `off`.
   *
   * Always returns 0.
   *
   * NOTE(review): `off` is assigned only under TRMMKERNEL but is read in
   * every configuration, so building this file without TRMMKERNEL would
   * read an uninitialized variable -- confirm it is never built that way.
   * NOTE(review): nothing here checks that `temp` is non-negative;
   * presumably the caller guarantees a consistent bk/offset.
   */
  2. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  3. #ifdef TRMMKERNEL
  4. ,BLASLONG offset
  5. #endif
  6. )
  7. {
  8. BLASLONG i,j,k;
  /* C0/C1: current write position in the first/second column of the tile;
     ptrba/ptrbb: read cursors into ba/bb. */
  9. FLOAT *C0,*C1,*ptrba,*ptrbb;
  /* res0..res3: accumulators for the (up to) four outputs of a tile;
     load0..load7: scratch for values read from the panels. */
  10. FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
  /* off: current k offset for the tile; temp: k trip count / skip count. */
  11. BLASLONG off, temp;
  /* Without LEFT, off starts at -offset once and grows per column block. */
  12. #if defined(TRMMKERNEL) && !defined(LEFT)
  13. off = -offset;
  14. #endif
  /* ---- column pairs: two output columns per iteration ---- */
  15. for (j=0; j<bn/2; j+=1)
  16. {
  17. C0 = C;
  18. C1 = C0+ldc;
  /* With LEFT, off restarts at offset for every column block. */
  19. #if defined(TRMMKERNEL) && defined(LEFT)
  20. off = offset;
  21. #endif
  22. ptrba = ba;
  /* -- 2x2 tiles: two rows by two columns -- */
  23. for (i=0; i<bm/2; i+=1)
  24. {
  /* Either start at the beginning of the bb block, or skip the first
     `off` k steps (2 values per step) in both panels. */
  25. #if (defined(LEFT) && defined(TRANSA)) || \
  26. (!defined(LEFT) && !defined(TRANSA))
  27. ptrbb = bb;
  28. #else
  29. ptrba += off*2;
  30. ptrbb = bb + off*2;
  31. #endif
  32. res0 = 0;
  33. res1 = 0;
  34. res2 = 0;
  35. res3 = 0;
  /* temp = number of k steps to accumulate for this tile. */
  36. #if (defined(LEFT) && !defined(TRANSA)) || \
  37. (!defined(LEFT) && defined(TRANSA))
  38. temp = bk-off;
  39. #elif defined(LEFT)
  40. temp = off+2;
  41. #else
  42. temp = off+2;
  43. #endif
  /* Main k loop, manually unrolled by 4 (8 values consumed per panel). */
  44. for (k=0; k<temp/4; k+=1)
  45. {
  /* k step 0 */
  46. load0 = ptrba[2*0+0];
  47. load1 = ptrbb[2*0+0];
  48. res0 = res0+load0*load1;
  49. load2 = ptrba[2*0+1];
  50. res1 = res1+load2*load1;
  51. load3 = ptrbb[2*0+1];
  52. res2 = res2+load0*load3;
  53. res3 = res3+load2*load3;
  /* k step 1 */
  54. load4 = ptrba[2*1+0];
  55. load5 = ptrbb[2*1+0];
  56. res0 = res0+load4*load5;
  57. load6 = ptrba[2*1+1];
  58. res1 = res1+load6*load5;
  59. load7 = ptrbb[2*1+1];
  60. res2 = res2+load4*load7;
  61. res3 = res3+load6*load7;
  /* k step 2 */
  62. load0 = ptrba[2*2+0];
  63. load1 = ptrbb[2*2+0];
  64. res0 = res0+load0*load1;
  65. load2 = ptrba[2*2+1];
  66. res1 = res1+load2*load1;
  67. load3 = ptrbb[2*2+1];
  68. res2 = res2+load0*load3;
  69. res3 = res3+load2*load3;
  /* k step 3 */
  70. load4 = ptrba[2*3+0];
  71. load5 = ptrbb[2*3+0];
  72. res0 = res0+load4*load5;
  73. load6 = ptrba[2*3+1];
  74. res1 = res1+load6*load5;
  75. load7 = ptrbb[2*3+1];
  76. res2 = res2+load4*load7;
  77. res3 = res3+load6*load7;
  78. ptrba = ptrba+8;
  79. ptrbb = ptrbb+8;
  80. }
  /* Remaining 0..3 k steps, one at a time. */
  81. for (k=0; k<(temp&3); k+=1)
  82. {
  83. load0 = ptrba[2*0+0];
  84. load1 = ptrbb[2*0+0];
  85. res0 = res0+load0*load1;
  86. load2 = ptrba[2*0+1];
  87. res1 = res1+load2*load1;
  88. load3 = ptrbb[2*0+1];
  89. res2 = res2+load0*load3;
  90. res3 = res3+load2*load3;
  91. ptrba = ptrba+2;
  92. ptrbb = ptrbb+2;
  93. }
  /* Scale by alpha and overwrite the 2x2 tile: res0/res1 go to the first
     column (C0), res2/res3 to the second column (C1). */
  94. res0 = res0*alpha;
  95. C0[0] = res0;
  96. res1 = res1*alpha;
  97. C0[1] = res1;
  98. res2 = res2*alpha;
  99. C1[0] = res2;
  100. res3 = res3*alpha;
  101. C1[1] = res3;
  /* In these configurations only off+2 k steps were consumed; step the
     cursors over the remaining bk-off-2 steps of this tile's panel data. */
  102. #if ( defined(LEFT) && defined(TRANSA)) || \
  103. (!defined(LEFT) && !defined(TRANSA))
  104. temp = bk - off;
  105. #ifdef LEFT
  106. temp -= 2;
  107. #else
  108. temp -= 2;
  109. #endif
  110. ptrba += temp*2;
  111. ptrbb += temp*2;
  112. #endif
  /* With LEFT, off advances by the two rows just processed. */
  113. #ifdef LEFT
  114. off += 2;
  115. #endif
  116. C0 = C0+2;
  117. C1 = C1+2;
  118. }
  /* -- 1x2 tile: left-over single row (bm odd), still two columns -- */
  119. for (i=0; i<(bm&1); i+=1)
  120. {
  121. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  122. ptrbb = bb;
  123. #else
  /* ba holds 1 value per k step here, bb still 2. */
  124. ptrba += off;
  125. ptrbb = bb+off*2;
  126. #endif
  127. res0 = 0;
  128. res1 = 0;
  129. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  130. temp = bk-off;
  131. #elif defined(LEFT)
  132. temp = off+1;
  133. #else
  134. temp = off+2;
  135. #endif
  136. for (k=0; k<temp; k+=1)
  137. {
  138. load0 = ptrba[0+0];
  139. load1 = ptrbb[2*0+0];
  140. res0 = res0+load0*load1;
  141. load2 = ptrbb[2*0+1];
  142. res1 = res1+load0*load2;
  143. ptrba = ptrba+1;
  144. ptrbb = ptrbb+2;
  145. }
  /* One result per column. */
  146. res0 = res0*alpha;
  147. C0[0] = res0;
  148. res1 = res1*alpha;
  149. C1[0] = res1;
  /* Step over the k steps that were not consumed by the loop above. */
  150. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  151. temp = bk-off;
  152. #ifdef LEFT
  153. temp -= 1;
  154. #else
  155. temp -= 2;
  156. #endif
  157. ptrba += temp;
  158. ptrbb += temp*2;
  159. #endif
  160. #ifdef LEFT
  161. off += 1;
  162. #endif
  163. C0 = C0+1;
  164. C1 = C1+1;
  165. }
  /* Without LEFT, off advances by the two columns just processed. */
  166. #if defined(TRMMKERNEL) && !defined(LEFT)
  167. off += 2;
  168. #endif
  /* Advance bb by 2*bk values and C by two columns (2*ldc) for the next pair;
     k and i are reused here as plain temporaries. */
  169. k = (bk<<1);
  170. bb = bb+k;
  171. i = (ldc<<1);
  172. C = C+i;
  173. }
  /* ---- left-over single column (bn odd) ---- */
  174. for (j=0; j<(bn&1); j+=1)
  175. {
  176. C0 = C;
  177. #if defined(TRMMKERNEL) && defined(LEFT)
  178. off = offset;
  179. #endif
  180. ptrba = ba;
  /* -- 2x1 tiles: two rows, one column -- */
  181. for (i=0; i<bm/2; i+=1)
  182. {
  183. #if (defined(LEFT) && defined(TRANSA)) || \
  184. (!defined(LEFT) && !defined(TRANSA))
  185. ptrbb = bb;
  186. #else
  /* ba holds 2 values per k step here, bb only 1. */
  187. ptrba += off*2;
  188. ptrbb = bb + off;
  189. #endif
  190. res0 = 0;
  191. res1 = 0;
  192. #if (defined(LEFT) && !defined(TRANSA)) || \
  193. (!defined(LEFT) && defined(TRANSA))
  194. temp = bk-off;
  195. #elif defined(LEFT)
  196. temp = off+2;
  197. #else
  198. temp = off+1;
  199. #endif
  200. for (k=0; k<temp; k+=1)
  201. {
  202. load0 = ptrba[2*0+0];
  203. load1 = ptrbb[0+0];
  204. res0 = res0+load0*load1;
  205. load2 = ptrba[2*0+1];
  206. res1 = res1+load2*load1;
  207. ptrba = ptrba+2;
  208. ptrbb = ptrbb+1;
  209. }
  /* Two results, both in the single column C0. */
  210. res0 = res0*alpha;
  211. C0[0] = res0;
  212. res1 = res1*alpha;
  213. C0[1] = res1;
  /* Step over the k steps that were not consumed by the loop above. */
  214. #if ( defined(LEFT) && defined(TRANSA)) || \
  215. (!defined(LEFT) && !defined(TRANSA))
  216. temp = bk - off;
  217. #ifdef LEFT
  218. temp -= 2;
  219. #else
  220. temp -= 1;
  221. #endif
  222. ptrba += temp*2;
  223. ptrbb += temp;
  224. #endif
  225. #ifdef LEFT
  226. off += 2;
  227. #endif
  228. C0 = C0+2;
  229. }
  /* -- 1x1 tile: last row of the last column -- */
  230. for (i=0; i<(bm&1); i+=1)
  231. {
  232. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  233. ptrbb = bb;
  234. #else
  235. ptrba += off;
  236. ptrbb = bb+off;
  237. #endif
  238. res0 = 0;
  239. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  240. temp = bk-off;
  241. #elif defined(LEFT)
  242. temp = off + 1;
  243. #else
  244. temp = off + 1;
  245. #endif
  /* Plain dot product, one value from each panel per k step. */
  246. for (k=0; k<temp; k+=1)
  247. {
  248. load0 = ptrba[0+0];
  249. load1 = ptrbb[0+0];
  250. res0 = res0+load0*load1;
  251. ptrba = ptrba+1;
  252. ptrbb = ptrbb+1;
  253. }
  254. res0 = res0*alpha;
  255. C0[0] = res0;
  256. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  257. temp = bk-off;
  258. #ifdef LEFT
  259. temp -= 1;
  260. #else
  261. temp -= 1;
  262. #endif
  263. ptrba += temp;
  264. ptrbb += temp;
  265. #endif
  266. #ifdef LEFT
  267. off += 1;
  268. #endif
  269. C0 = C0+1;
  270. }
  /* Without LEFT, off advances by the single column just processed. */
  271. #if defined(TRMMKERNEL) && !defined(LEFT)
  272. off += 1;
  273. #endif
  /* Advance bb by bk values (the shift by 0 leaves bk unchanged) and C by one column. */
  274. k = (bk<<0);
  275. bb = bb+k;
  276. C = C+ldc;
  277. }
  278. return 0;
  279. }