You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trmmkernel_4x2.c 8.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528
  1. #include "common.h"
  2. #include <stdbool.h>
  3. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
  4. {
  5. BLASLONG i,j,k;
  6. FLOAT *C0,*C1,*ptrba,*ptrbb;
  7. FLOAT res0_0;
  8. FLOAT res0_1;
  9. FLOAT res0_2;
  10. FLOAT res0_3;
  11. FLOAT res1_0;
  12. FLOAT res1_1;
  13. FLOAT res1_2;
  14. FLOAT res1_3;
  15. FLOAT a0;
  16. FLOAT a1;
  17. FLOAT b0;
  18. FLOAT b1;
  19. BLASLONG off, temp;
  20. bool left;
  21. bool transposed;
  22. bool backwards;
  23. #ifdef LEFT
  24. left = true;
  25. #else
  26. left = false;
  27. #endif
  28. #ifdef TRANSA
  29. transposed = true;
  30. #else
  31. transposed = false;
  32. #endif
  33. backwards = left != transposed;
  34. if (!left) {
  35. off = -offset;
  36. }
  37. for (j=0; j<(bn/2); j+=2) // do the Mx2 loops
  38. {
  39. C0 = C;
  40. C1 = C0+ldc;
  41. #if defined(TRMMKERNEL) && defined(LEFT)
  42. off = offset;
  43. #endif
  44. ptrba = ba;
  45. for (i=0; i<bm/4; i+=1) // do blocks of 4x2
  46. {
  47. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  48. ptrbb = bb;
  49. #else
  50. ptrba += off*4;
  51. ptrbb = bb + off*2;
  52. #endif
  53. res0_0 = 0;
  54. res0_1 = 0;
  55. res0_2 = 0;
  56. res0_3 = 0;
  57. res1_0 = 0;
  58. res1_1 = 0;
  59. res1_2 = 0;
  60. res1_3 = 0;
  61. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  62. temp = bk-off;
  63. #elif defined(LEFT)
  64. temp = off+4; // number of values in A
  65. #else
  66. temp = off+2; // number of values in B
  67. #endif
  68. for (k=0; k<temp; k++)
  69. {
  70. b0 = ptrbb[0];
  71. b1 = ptrbb[1];
  72. a0 = ptrba[0];
  73. res0_0 += a0*b0;
  74. res1_0 += a0*b1;
  75. a1 = ptrba[1];
  76. res0_1 += a1*b0;
  77. res1_1 += a1*b1;
  78. a0 = ptrba[2];
  79. res0_2 += a0*b0;
  80. res1_2 += a0*b1;
  81. a1 = ptrba[3];
  82. res0_3 += a1*b0;
  83. res1_3 += a1*b1;
  84. ptrba = ptrba+4;
  85. ptrbb = ptrbb+2;
  86. }
  87. res0_0 *= alpha;
  88. res0_1 *= alpha;
  89. res0_2 *= alpha;
  90. res0_3 *= alpha;
  91. res1_0 *= alpha;
  92. res1_1 *= alpha;
  93. res1_2 *= alpha;
  94. res1_3 *= alpha;
  95. C0[0] = res0_0;
  96. C0[1] = res0_1;
  97. C0[2] = res0_2;
  98. C0[3] = res0_3;
  99. C1[0] = res1_0;
  100. C1[1] = res1_1;
  101. C1[2] = res1_2;
  102. C1[3] = res1_3;
  103. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  104. temp = bk - off;
  105. #ifdef LEFT
  106. temp -= 4; // number of values in A
  107. #else
  108. temp -= 2; // number of values in B
  109. #endif
  110. ptrba += temp*4;
  111. ptrbb += temp*2;
  112. #endif
  113. #ifdef LEFT
  114. off += 4; // number of values in A
  115. #endif
  116. C0 = C0+4;
  117. C1 = C1+4;
  118. }
  119. if ( bm & 2 ) // do any 2x2 loop
  120. {
  121. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  122. ptrbb = bb;
  123. #else
  124. ptrba += off*2;
  125. ptrbb = bb + off*2;
  126. #endif
  127. res0_0 = 0;
  128. res0_1 = 0;
  129. res1_0 = 0;
  130. res1_1 = 0;
  131. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  132. temp = bk-off;
  133. #elif defined(LEFT)
  134. temp = off+2; // number of values in A
  135. #else
  136. temp = off+2; // number of values in B
  137. #endif
  138. for (k=0; k<temp; k++)
  139. {
  140. b0 = ptrbb[0];
  141. b1 = ptrbb[1];
  142. a0 = ptrba[0];
  143. res0_0 += a0*b0;
  144. res1_0 += a0*b1;
  145. a1 = ptrba[1];
  146. res0_1 += a1*b0;
  147. res1_1 += a1*b1;
  148. ptrba = ptrba+2;
  149. ptrbb = ptrbb+2;
  150. }
  151. res0_0 *= alpha;
  152. res0_1 *= alpha;
  153. res1_0 *= alpha;
  154. res1_1 *= alpha;
  155. C0[0] = res0_0;
  156. C0[1] = res0_1;
  157. C1[0] = res1_0;
  158. C1[1] = res1_1;
  159. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  160. temp = bk - off;
  161. #ifdef LEFT
  162. temp -= 2; // number of values in A
  163. #else
  164. temp -= 2; // number of values in B
  165. #endif
  166. ptrba += temp*2;
  167. ptrbb += temp*2;
  168. #endif
  169. #ifdef LEFT
  170. off += 2; // number of values in A
  171. #endif
  172. C0 = C0+2;
  173. C1 = C1+2;
  174. }
  175. if ( bm & 1 ) // do any 1x2 loop
  176. {
  177. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  178. ptrbb = bb;
  179. #else
  180. ptrba += off*1;
  181. ptrbb = bb + off*2;
  182. #endif
  183. res0_0 = 0;
  184. res1_0 = 0;
  185. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  186. temp = bk-off;
  187. #elif defined(LEFT)
  188. temp = off+1; // number of values in A
  189. #else
  190. temp = off+2; // number of values in B
  191. #endif
  192. for (k=0; k<temp; k++)
  193. {
  194. b0 = ptrbb[0];
  195. b1 = ptrbb[1];
  196. a0 = ptrba[0];
  197. res0_0 += a0*b0;
  198. res1_0 += a0*b1;
  199. ptrba = ptrba+1;
  200. ptrbb = ptrbb+2;
  201. }
  202. res0_0 *= alpha;
  203. res1_0 *= alpha;
  204. C0[0] = res0_0;
  205. C1[0] = res1_0;
  206. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  207. temp = bk - off;
  208. #ifdef LEFT
  209. temp -= 1; // number of values in A
  210. #else
  211. temp -= 2; // number of values in B
  212. #endif
  213. ptrba += temp*1;
  214. ptrbb += temp*2;
  215. #endif
  216. #ifdef LEFT
  217. off += 1; // number of values in A
  218. #endif
  219. C0 = C0+1;
  220. C1 = C1+1;
  221. }
  222. #if defined(TRMMKERNEL) && !defined(LEFT)
  223. off += 2;
  224. #endif
  225. k = (bk<<1);
  226. bb = bb+k;
  227. i = (ldc<<1);
  228. C = C+i;
  229. }
  230. for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
  231. {
  232. C0 = C;
  233. #if defined(TRMMKERNEL) && defined(LEFT)
  234. off = offset;
  235. #endif
  236. ptrba = ba;
  237. for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
  238. {
  239. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  240. ptrbb = bb;
  241. #else
  242. ptrba += off*4;
  243. ptrbb = bb + off*1;
  244. #endif
  245. res0_0 = 0;
  246. res0_1 = 0;
  247. res0_2 = 0;
  248. res0_3 = 0;
  249. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  250. temp = bk-off;
  251. #elif defined(LEFT)
  252. temp = off+4; // number of values in A
  253. #else
  254. temp = off+1; // number of values in B
  255. #endif
  256. for (k=0; k<temp; k++)
  257. {
  258. b0 = ptrbb[0];
  259. a0 = ptrba[0];
  260. res0_0 += a0*b0;
  261. a1 = ptrba[1];
  262. res0_1 += a1*b0;
  263. a0 = ptrba[2];
  264. res0_2 += a0*b0;
  265. a1 = ptrba[3];
  266. res0_3 += a1*b0;
  267. ptrba = ptrba+4;
  268. ptrbb = ptrbb+1;
  269. }
  270. res0_0 *= alpha;
  271. res0_1 *= alpha;
  272. res0_2 *= alpha;
  273. res0_3 *= alpha;
  274. C0[0] = res0_0;
  275. C0[1] = res0_1;
  276. C0[2] = res0_2;
  277. C0[3] = res0_3;
  278. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  279. temp = bk - off;
  280. #ifdef LEFT
  281. temp -= 4; // number of values in A
  282. #else
  283. temp -= 1; // number of values in B
  284. #endif
  285. ptrba += temp*4;
  286. ptrbb += temp*1;
  287. #endif
  288. #ifdef LEFT
  289. off += 4; // number of values in A
  290. #endif
  291. C0 = C0+4;
  292. }
  293. if ( bm & 2 ) // do any 2x1 loop
  294. {
  295. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  296. ptrbb = bb;
  297. #else
  298. ptrba += off*2;
  299. ptrbb = bb + off*1;
  300. #endif
  301. res0_0 = 0;
  302. res0_1 = 0;
  303. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  304. temp = bk-off;
  305. #elif defined(LEFT)
  306. temp = off+2; // number of values in A
  307. #else
  308. temp = off+1; // number of values in B
  309. #endif
  310. for (k=0; k<temp; k++)
  311. {
  312. b0 = ptrbb[0];
  313. a0 = ptrba[0];
  314. res0_0 += a0*b0;
  315. a1 = ptrba[1];
  316. res0_1 += a1*b0;
  317. ptrba = ptrba+2;
  318. ptrbb = ptrbb+1;
  319. }
  320. res0_0 *= alpha;
  321. res0_1 *= alpha;
  322. C0[0] = res0_0;
  323. C0[1] = res0_1;
  324. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  325. temp = bk - off;
  326. #ifdef LEFT
  327. temp -= 2; // number of values in A
  328. #else
  329. temp -= 1; // number of values in B
  330. #endif
  331. ptrba += temp*2;
  332. ptrbb += temp*1;
  333. #endif
  334. #ifdef LEFT
  335. off += 2; // number of values in A
  336. #endif
  337. C0 = C0+2;
  338. }
  339. if ( bm & 1 ) // do any 1x1 loop
  340. {
  341. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  342. ptrbb = bb;
  343. #else
  344. ptrba += off*1;
  345. ptrbb = bb + off*1;
  346. #endif
  347. res0_0 = 0;
  348. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  349. temp = bk-off;
  350. #elif defined(LEFT)
  351. temp = off+1; // number of values in A
  352. #else
  353. temp = off+1; // number of values in B
  354. #endif
  355. for (k=0; k<temp; k++)
  356. {
  357. b0 = ptrbb[0];
  358. a0 = ptrba[0];
  359. res0_0 += a0*b0;
  360. ptrba = ptrba+1;
  361. ptrbb = ptrbb+1;
  362. }
  363. res0_0 *= alpha;
  364. C0[0] = res0_0;
  365. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  366. temp = bk - off;
  367. #ifdef LEFT
  368. temp -= 1; // number of values in A
  369. #else
  370. temp -= 1; // number of values in B
  371. #endif
  372. ptrba += temp*1;
  373. ptrbb += temp*1;
  374. #endif
  375. #ifdef LEFT
  376. off += 1; // number of values in A
  377. #endif
  378. C0 = C0+1;
  379. }
  380. #if defined(TRMMKERNEL) && !defined(LEFT)
  381. off += 1;
  382. #endif
  383. k = (bk<<0);
  384. bb = bb+k;
  385. C = C+ldc;
  386. }
  387. return 0;
  388. }