You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trmmkernel_8x2.c 12 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750
  1. #include "common.h"
  2. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
  3. {
  4. BLASLONG i,j,k;
  5. FLOAT *C0,*C1,*ptrba,*ptrbb;
  6. FLOAT res0_0;
  7. FLOAT res0_1;
  8. FLOAT res0_2;
  9. FLOAT res0_3;
  10. FLOAT res0_4;
  11. FLOAT res0_5;
  12. FLOAT res0_6;
  13. FLOAT res0_7;
  14. FLOAT res1_0;
  15. FLOAT res1_1;
  16. FLOAT res1_2;
  17. FLOAT res1_3;
  18. FLOAT res1_4;
  19. FLOAT res1_5;
  20. FLOAT res1_6;
  21. FLOAT res1_7;
  22. FLOAT a0;
  23. FLOAT a1;
  24. FLOAT b0;
  25. FLOAT b1;
  26. BLASLONG off, temp;
  27. #if !defined(LEFT)
  28. off = -offset;
  29. #endif
  30. for (j=0; j<bn/2; j+=1)
  31. {
  32. C0 = C;
  33. C1 = C0+ldc;
  34. #if defined(TRMMKERNEL) && defined(LEFT)
  35. off = offset;
  36. #endif
  37. ptrba = ba;
  38. for (i=0; i<bm/8; i+=1)
  39. {
  40. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  41. ptrbb = bb;
  42. #else
  43. ptrba += off*8;
  44. ptrbb = bb + off*2;
  45. #endif
  46. res0_0 = 0;
  47. res0_1 = 0;
  48. res0_2 = 0;
  49. res0_3 = 0;
  50. res0_4 = 0;
  51. res0_5 = 0;
  52. res0_6 = 0;
  53. res0_7 = 0;
  54. res1_0 = 0;
  55. res1_1 = 0;
  56. res1_2 = 0;
  57. res1_3 = 0;
  58. res1_4 = 0;
  59. res1_5 = 0;
  60. res1_6 = 0;
  61. res1_7 = 0;
  62. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  63. temp = bk-off;
  64. #elif defined(LEFT)
  65. temp = off+8; // number of values in A
  66. #else
  67. temp = off+2; // number of values in B
  68. #endif
  69. for (k=0; k<temp; k++)
  70. {
  71. b0 = ptrbb[0];
  72. b1 = ptrbb[1];
  73. a0 = ptrba[0];
  74. res0_0 += a0*b0;
  75. res1_0 += a0*b1;
  76. a1 = ptrba[1];
  77. res0_1 += a1*b0;
  78. res1_1 += a1*b1;
  79. a0 = ptrba[2];
  80. res0_2 += a0*b0;
  81. res1_2 += a0*b1;
  82. a1 = ptrba[3];
  83. res0_3 += a1*b0;
  84. res1_3 += a1*b1;
  85. a0 = ptrba[4];
  86. res0_4 += a0*b0;
  87. res1_4 += a0*b1;
  88. a1 = ptrba[5];
  89. res0_5 += a1*b0;
  90. res1_5 += a1*b1;
  91. a0 = ptrba[6];
  92. res0_6 += a0*b0;
  93. res1_6 += a0*b1;
  94. a1 = ptrba[7];
  95. res0_7 += a1*b0;
  96. res1_7 += a1*b1;
  97. ptrba = ptrba+8;
  98. ptrbb = ptrbb+2;
  99. }
  100. res0_0 *= alpha;
  101. res0_1 *= alpha;
  102. res0_2 *= alpha;
  103. res0_3 *= alpha;
  104. res0_4 *= alpha;
  105. res0_5 *= alpha;
  106. res0_6 *= alpha;
  107. res0_7 *= alpha;
  108. res1_0 *= alpha;
  109. res1_1 *= alpha;
  110. res1_2 *= alpha;
  111. res1_3 *= alpha;
  112. res1_4 *= alpha;
  113. res1_5 *= alpha;
  114. res1_6 *= alpha;
  115. res1_7 *= alpha;
  116. C0[0] = res0_0;
  117. C0[1] = res0_1;
  118. C0[2] = res0_2;
  119. C0[3] = res0_3;
  120. C0[4] = res0_4;
  121. C0[5] = res0_5;
  122. C0[6] = res0_6;
  123. C0[7] = res0_7;
  124. C1[0] = res1_0;
  125. C1[1] = res1_1;
  126. C1[2] = res1_2;
  127. C1[3] = res1_3;
  128. C1[4] = res1_4;
  129. C1[5] = res1_5;
  130. C1[6] = res1_6;
  131. C1[7] = res1_7;
  132. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  133. temp = bk - off;
  134. #ifdef LEFT
  135. temp -= 8; // number of values in A
  136. #else
  137. temp -= 2; // number of values in B
  138. #endif
  139. ptrba += temp*8;
  140. ptrbb += temp*2;
  141. #endif
  142. #ifdef LEFT
  143. off += 8; // number of values in A
  144. #endif
  145. C0 = C0+8;
  146. C1 = C1+8;
  147. }
  148. if ( bm & 4 )
  149. {
  150. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  151. ptrbb = bb;
  152. #else
  153. ptrba += off*4;
  154. ptrbb = bb + off*2;
  155. #endif
  156. res0_0 = 0;
  157. res0_1 = 0;
  158. res0_2 = 0;
  159. res0_3 = 0;
  160. res1_0 = 0;
  161. res1_1 = 0;
  162. res1_2 = 0;
  163. res1_3 = 0;
  164. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  165. temp = bk-off;
  166. #elif defined(LEFT)
  167. temp = off+4; // number of values in A
  168. #else
  169. temp = off+2; // number of values in B
  170. #endif
  171. for (k=0; k<temp; k++)
  172. {
  173. b0 = ptrbb[0];
  174. b1 = ptrbb[1];
  175. a0 = ptrba[0];
  176. res0_0 += a0*b0;
  177. res1_0 += a0*b1;
  178. a1 = ptrba[1];
  179. res0_1 += a1*b0;
  180. res1_1 += a1*b1;
  181. a0 = ptrba[2];
  182. res0_2 += a0*b0;
  183. res1_2 += a0*b1;
  184. a1 = ptrba[3];
  185. res0_3 += a1*b0;
  186. res1_3 += a1*b1;
  187. ptrba = ptrba+4;
  188. ptrbb = ptrbb+2;
  189. }
  190. res0_0 *= alpha;
  191. res0_1 *= alpha;
  192. res0_2 *= alpha;
  193. res0_3 *= alpha;
  194. res1_0 *= alpha;
  195. res1_1 *= alpha;
  196. res1_2 *= alpha;
  197. res1_3 *= alpha;
  198. C0[0] = res0_0;
  199. C0[1] = res0_1;
  200. C0[2] = res0_2;
  201. C0[3] = res0_3;
  202. C1[0] = res1_0;
  203. C1[1] = res1_1;
  204. C1[2] = res1_2;
  205. C1[3] = res1_3;
  206. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  207. temp = bk - off;
  208. #ifdef LEFT
  209. temp -= 4; // number of values in A
  210. #else
  211. temp -= 2; // number of values in B
  212. #endif
  213. ptrba += temp*4;
  214. ptrbb += temp*2;
  215. #endif
  216. #ifdef LEFT
  217. off += 4; // number of values in A
  218. #endif
  219. C0 = C0+4;
  220. C1 = C1+4;
  221. }
  222. if ( bm & 2 )
  223. {
  224. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  225. ptrbb = bb;
  226. #else
  227. ptrba += off*2;
  228. ptrbb = bb + off*2;
  229. #endif
  230. res0_0 = 0;
  231. res0_1 = 0;
  232. res1_0 = 0;
  233. res1_1 = 0;
  234. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  235. temp = bk-off;
  236. #elif defined(LEFT)
  237. temp = off+2; // number of values in A
  238. #else
  239. temp = off+2; // number of values in B
  240. #endif
  241. for (k=0; k<temp; k++)
  242. {
  243. b0 = ptrbb[0];
  244. b1 = ptrbb[1];
  245. a0 = ptrba[0];
  246. res0_0 += a0*b0;
  247. res1_0 += a0*b1;
  248. a1 = ptrba[1];
  249. res0_1 += a1*b0;
  250. res1_1 += a1*b1;
  251. ptrba = ptrba+2;
  252. ptrbb = ptrbb+2;
  253. }
  254. res0_0 *= alpha;
  255. res0_1 *= alpha;
  256. res1_0 *= alpha;
  257. res1_1 *= alpha;
  258. C0[0] = res0_0;
  259. C0[1] = res0_1;
  260. C1[0] = res1_0;
  261. C1[1] = res1_1;
  262. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  263. temp = bk - off;
  264. #ifdef LEFT
  265. temp -= 2; // number of values in A
  266. #else
  267. temp -= 2; // number of values in B
  268. #endif
  269. ptrba += temp*2;
  270. ptrbb += temp*2;
  271. #endif
  272. #ifdef LEFT
  273. off += 2; // number of values in A
  274. #endif
  275. C0 = C0+2;
  276. C1 = C1+2;
  277. }
  278. if ( bm & 1 )
  279. {
  280. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  281. ptrbb = bb;
  282. #else
  283. ptrba += off*1;
  284. ptrbb = bb + off*2;
  285. #endif
  286. res0_0 = 0;
  287. res1_0 = 0;
  288. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  289. temp = bk-off;
  290. #elif defined(LEFT)
  291. temp = off+1; // number of values in A
  292. #else
  293. temp = off+2; // number of values in B
  294. #endif
  295. for (k=0; k<temp; k++)
  296. {
  297. b0 = ptrbb[0];
  298. b1 = ptrbb[1];
  299. a0 = ptrba[0];
  300. res0_0 += a0*b0;
  301. res1_0 += a0*b1;
  302. ptrba = ptrba+1;
  303. ptrbb = ptrbb+2;
  304. }
  305. res0_0 *= alpha;
  306. res1_0 *= alpha;
  307. C0[0] = res0_0;
  308. C1[0] = res1_0;
  309. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  310. temp = bk - off;
  311. #ifdef LEFT
  312. temp -= 1; // number of values in A
  313. #else
  314. temp -= 2; // number of values in B
  315. #endif
  316. ptrba += temp*1;
  317. ptrbb += temp*2;
  318. #endif
  319. #ifdef LEFT
  320. off += 1; // number of values in A
  321. #endif
  322. C0 = C0+1;
  323. C1 = C1+1;
  324. }
  325. #if defined(TRMMKERNEL) && !defined(LEFT)
  326. off += 2;
  327. #endif
  328. k = (bk<<1);
  329. bb = bb+k;
  330. i = (ldc<<1);
  331. C = C+i;
  332. }
  333. for (j=0; j<(bn&1); j+=1)
  334. {
  335. C0 = C;
  336. #if defined(TRMMKERNEL) && defined(LEFT)
  337. off = offset;
  338. #endif
  339. ptrba = ba;
  340. for (i=0; i<bm/8; i+=1)
  341. {
  342. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  343. ptrbb = bb;
  344. #else
  345. ptrba += off*8;
  346. ptrbb = bb + off*1;
  347. #endif
  348. res0_0 = 0;
  349. res0_1 = 0;
  350. res0_2 = 0;
  351. res0_3 = 0;
  352. res0_4 = 0;
  353. res0_5 = 0;
  354. res0_6 = 0;
  355. res0_7 = 0;
  356. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  357. temp = bk-off;
  358. #elif defined(LEFT)
  359. temp = off+8; // number of values in A
  360. #else
  361. temp = off+1; // number of values in B
  362. #endif
  363. for (k=0; k<temp; k++)
  364. {
  365. b0 = ptrbb[0];
  366. a0 = ptrba[0];
  367. res0_0 += a0*b0;
  368. a1 = ptrba[1];
  369. res0_1 += a1*b0;
  370. a0 = ptrba[2];
  371. res0_2 += a0*b0;
  372. a1 = ptrba[3];
  373. res0_3 += a1*b0;
  374. a0 = ptrba[4];
  375. res0_4 += a0*b0;
  376. a1 = ptrba[5];
  377. res0_5 += a1*b0;
  378. a0 = ptrba[6];
  379. res0_6 += a0*b0;
  380. a1 = ptrba[7];
  381. res0_7 += a1*b0;
  382. ptrba = ptrba+8;
  383. ptrbb = ptrbb+1;
  384. }
  385. res0_0 *= alpha;
  386. res0_1 *= alpha;
  387. res0_2 *= alpha;
  388. res0_3 *= alpha;
  389. res0_4 *= alpha;
  390. res0_5 *= alpha;
  391. res0_6 *= alpha;
  392. res0_7 *= alpha;
  393. C0[0] = res0_0;
  394. C0[1] = res0_1;
  395. C0[2] = res0_2;
  396. C0[3] = res0_3;
  397. C0[4] = res0_4;
  398. C0[5] = res0_5;
  399. C0[6] = res0_6;
  400. C0[7] = res0_7;
  401. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  402. temp = bk - off;
  403. #ifdef LEFT
  404. temp -= 8; // number of values in A
  405. #else
  406. temp -= 1; // number of values in B
  407. #endif
  408. ptrba += temp*8;
  409. ptrbb += temp*1;
  410. #endif
  411. #ifdef LEFT
  412. off += 8; // number of values in A
  413. #endif
  414. C0 = C0+8;
  415. }
  416. if ( bm & 4 )
  417. {
  418. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  419. ptrbb = bb;
  420. #else
  421. ptrba += off*4;
  422. ptrbb = bb + off*1;
  423. #endif
  424. res0_0 = 0;
  425. res0_1 = 0;
  426. res0_2 = 0;
  427. res0_3 = 0;
  428. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  429. temp = bk-off;
  430. #elif defined(LEFT)
  431. temp = off+4; // number of values in A
  432. #else
  433. temp = off+1; // number of values in B
  434. #endif
  435. for (k=0; k<temp; k++)
  436. {
  437. b0 = ptrbb[0];
  438. a0 = ptrba[0];
  439. res0_0 += a0*b0;
  440. a1 = ptrba[1];
  441. res0_1 += a1*b0;
  442. a0 = ptrba[2];
  443. res0_2 += a0*b0;
  444. a1 = ptrba[3];
  445. res0_3 += a1*b0;
  446. ptrba = ptrba+4;
  447. ptrbb = ptrbb+1;
  448. }
  449. res0_0 *= alpha;
  450. res0_1 *= alpha;
  451. res0_2 *= alpha;
  452. res0_3 *= alpha;
  453. C0[0] = res0_0;
  454. C0[1] = res0_1;
  455. C0[2] = res0_2;
  456. C0[3] = res0_3;
  457. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  458. temp = bk - off;
  459. #ifdef LEFT
  460. temp -= 4; // number of values in A
  461. #else
  462. temp -= 1; // number of values in B
  463. #endif
  464. ptrba += temp*4;
  465. ptrbb += temp*1;
  466. #endif
  467. #ifdef LEFT
  468. off += 4; // number of values in A
  469. #endif
  470. C0 = C0+4;
  471. }
  472. if ( bm & 2 )
  473. {
  474. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  475. ptrbb = bb;
  476. #else
  477. ptrba += off*2;
  478. ptrbb = bb + off*1;
  479. #endif
  480. res0_0 = 0;
  481. res0_1 = 0;
  482. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  483. temp = bk-off;
  484. #elif defined(LEFT)
  485. temp = off+2; // number of values in A
  486. #else
  487. temp = off+1; // number of values in B
  488. #endif
  489. for (k=0; k<temp; k++)
  490. {
  491. b0 = ptrbb[0];
  492. a0 = ptrba[0];
  493. res0_0 += a0*b0;
  494. a1 = ptrba[1];
  495. res0_1 += a1*b0;
  496. ptrba = ptrba+2;
  497. ptrbb = ptrbb+1;
  498. }
  499. res0_0 *= alpha;
  500. res0_1 *= alpha;
  501. C0[0] = res0_0;
  502. C0[1] = res0_1;
  503. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  504. temp = bk - off;
  505. #ifdef LEFT
  506. temp -= 2; // number of values in A
  507. #else
  508. temp -= 1; // number of values in B
  509. #endif
  510. ptrba += temp*2;
  511. ptrbb += temp*1;
  512. #endif
  513. #ifdef LEFT
  514. off += 2; // number of values in A
  515. #endif
  516. C0 = C0+2;
  517. }
  518. if ( bm & 1 )
  519. {
  520. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  521. ptrbb = bb;
  522. #else
  523. ptrba += off*1;
  524. ptrbb = bb + off*1;
  525. #endif
  526. res0_0 = 0;
  527. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  528. temp = bk-off;
  529. #elif defined(LEFT)
  530. temp = off+1; // number of values in A
  531. #else
  532. temp = off+1; // number of values in B
  533. #endif
  534. for (k=0; k<temp; k++)
  535. {
  536. b0 = ptrbb[0];
  537. a0 = ptrba[0];
  538. res0_0 += a0*b0;
  539. ptrba = ptrba+1;
  540. ptrbb = ptrbb+1;
  541. }
  542. res0_0 *= alpha;
  543. C0[0] = res0_0;
  544. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  545. temp = bk - off;
  546. #ifdef LEFT
  547. temp -= 1; // number of values in A
  548. #else
  549. temp -= 1; // number of values in B
  550. #endif
  551. ptrba += temp*1;
  552. ptrbb += temp*1;
  553. #endif
  554. #ifdef LEFT
  555. off += 1; // number of values in A
  556. #endif
  557. C0 = C0+1;
  558. }
  559. #if defined(TRMMKERNEL) && !defined(LEFT)
  560. off += 1;
  561. #endif
  562. k = (bk<<0);
  563. bb = bb+k;
  564. C = C+ldc;
  565. }
  566. return 0;
  567. }