You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trmmkernel_8x2.c 12 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752
  1. #include "common.h"
  2. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
  3. {
  4. BLASLONG i,j,k;
  5. FLOAT *C0,*C1,*ptrba,*ptrbb;
  6. FLOAT res0_0;
  7. FLOAT res0_1;
  8. FLOAT res0_2;
  9. FLOAT res0_3;
  10. FLOAT res0_4;
  11. FLOAT res0_5;
  12. FLOAT res0_6;
  13. FLOAT res0_7;
  14. FLOAT res1_0;
  15. FLOAT res1_1;
  16. FLOAT res1_2;
  17. FLOAT res1_3;
  18. FLOAT res1_4;
  19. FLOAT res1_5;
  20. FLOAT res1_6;
  21. FLOAT res1_7;
  22. FLOAT a0;
  23. FLOAT a1;
  24. FLOAT b0;
  25. FLOAT b1;
  26. BLASLONG off, temp;
  27. #if !defined(LEFT)
  28. off = -offset;
  29. #else
  30. off = 0;
  31. #endif
  32. for (j=0; j<bn/2; j+=1)
  33. {
  34. C0 = C;
  35. C1 = C0+ldc;
  36. #if defined(TRMMKERNEL) && defined(LEFT)
  37. off = offset;
  38. #endif
  39. ptrba = ba;
  40. for (i=0; i<bm/8; i+=1)
  41. {
  42. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  43. ptrbb = bb;
  44. #else
  45. ptrba += off*8;
  46. ptrbb = bb + off*2;
  47. #endif
  48. res0_0 = 0;
  49. res0_1 = 0;
  50. res0_2 = 0;
  51. res0_3 = 0;
  52. res0_4 = 0;
  53. res0_5 = 0;
  54. res0_6 = 0;
  55. res0_7 = 0;
  56. res1_0 = 0;
  57. res1_1 = 0;
  58. res1_2 = 0;
  59. res1_3 = 0;
  60. res1_4 = 0;
  61. res1_5 = 0;
  62. res1_6 = 0;
  63. res1_7 = 0;
  64. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  65. temp = bk-off;
  66. #elif defined(LEFT)
  67. temp = off+8; // number of values in A
  68. #else
  69. temp = off+2; // number of values in B
  70. #endif
  71. for (k=0; k<temp; k++)
  72. {
  73. b0 = ptrbb[0];
  74. b1 = ptrbb[1];
  75. a0 = ptrba[0];
  76. res0_0 += a0*b0;
  77. res1_0 += a0*b1;
  78. a1 = ptrba[1];
  79. res0_1 += a1*b0;
  80. res1_1 += a1*b1;
  81. a0 = ptrba[2];
  82. res0_2 += a0*b0;
  83. res1_2 += a0*b1;
  84. a1 = ptrba[3];
  85. res0_3 += a1*b0;
  86. res1_3 += a1*b1;
  87. a0 = ptrba[4];
  88. res0_4 += a0*b0;
  89. res1_4 += a0*b1;
  90. a1 = ptrba[5];
  91. res0_5 += a1*b0;
  92. res1_5 += a1*b1;
  93. a0 = ptrba[6];
  94. res0_6 += a0*b0;
  95. res1_6 += a0*b1;
  96. a1 = ptrba[7];
  97. res0_7 += a1*b0;
  98. res1_7 += a1*b1;
  99. ptrba = ptrba+8;
  100. ptrbb = ptrbb+2;
  101. }
  102. res0_0 *= alpha;
  103. res0_1 *= alpha;
  104. res0_2 *= alpha;
  105. res0_3 *= alpha;
  106. res0_4 *= alpha;
  107. res0_5 *= alpha;
  108. res0_6 *= alpha;
  109. res0_7 *= alpha;
  110. res1_0 *= alpha;
  111. res1_1 *= alpha;
  112. res1_2 *= alpha;
  113. res1_3 *= alpha;
  114. res1_4 *= alpha;
  115. res1_5 *= alpha;
  116. res1_6 *= alpha;
  117. res1_7 *= alpha;
  118. C0[0] = res0_0;
  119. C0[1] = res0_1;
  120. C0[2] = res0_2;
  121. C0[3] = res0_3;
  122. C0[4] = res0_4;
  123. C0[5] = res0_5;
  124. C0[6] = res0_6;
  125. C0[7] = res0_7;
  126. C1[0] = res1_0;
  127. C1[1] = res1_1;
  128. C1[2] = res1_2;
  129. C1[3] = res1_3;
  130. C1[4] = res1_4;
  131. C1[5] = res1_5;
  132. C1[6] = res1_6;
  133. C1[7] = res1_7;
  134. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  135. temp = bk - off;
  136. #ifdef LEFT
  137. temp -= 8; // number of values in A
  138. #else
  139. temp -= 2; // number of values in B
  140. #endif
  141. ptrba += temp*8;
  142. ptrbb += temp*2;
  143. #endif
  144. #ifdef LEFT
  145. off += 8; // number of values in A
  146. #endif
  147. C0 = C0+8;
  148. C1 = C1+8;
  149. }
  150. if ( bm & 4 )
  151. {
  152. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  153. ptrbb = bb;
  154. #else
  155. ptrba += off*4;
  156. ptrbb = bb + off*2;
  157. #endif
  158. res0_0 = 0;
  159. res0_1 = 0;
  160. res0_2 = 0;
  161. res0_3 = 0;
  162. res1_0 = 0;
  163. res1_1 = 0;
  164. res1_2 = 0;
  165. res1_3 = 0;
  166. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  167. temp = bk-off;
  168. #elif defined(LEFT)
  169. temp = off+4; // number of values in A
  170. #else
  171. temp = off+2; // number of values in B
  172. #endif
  173. for (k=0; k<temp; k++)
  174. {
  175. b0 = ptrbb[0];
  176. b1 = ptrbb[1];
  177. a0 = ptrba[0];
  178. res0_0 += a0*b0;
  179. res1_0 += a0*b1;
  180. a1 = ptrba[1];
  181. res0_1 += a1*b0;
  182. res1_1 += a1*b1;
  183. a0 = ptrba[2];
  184. res0_2 += a0*b0;
  185. res1_2 += a0*b1;
  186. a1 = ptrba[3];
  187. res0_3 += a1*b0;
  188. res1_3 += a1*b1;
  189. ptrba = ptrba+4;
  190. ptrbb = ptrbb+2;
  191. }
  192. res0_0 *= alpha;
  193. res0_1 *= alpha;
  194. res0_2 *= alpha;
  195. res0_3 *= alpha;
  196. res1_0 *= alpha;
  197. res1_1 *= alpha;
  198. res1_2 *= alpha;
  199. res1_3 *= alpha;
  200. C0[0] = res0_0;
  201. C0[1] = res0_1;
  202. C0[2] = res0_2;
  203. C0[3] = res0_3;
  204. C1[0] = res1_0;
  205. C1[1] = res1_1;
  206. C1[2] = res1_2;
  207. C1[3] = res1_3;
  208. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  209. temp = bk - off;
  210. #ifdef LEFT
  211. temp -= 4; // number of values in A
  212. #else
  213. temp -= 2; // number of values in B
  214. #endif
  215. ptrba += temp*4;
  216. ptrbb += temp*2;
  217. #endif
  218. #ifdef LEFT
  219. off += 4; // number of values in A
  220. #endif
  221. C0 = C0+4;
  222. C1 = C1+4;
  223. }
  224. if ( bm & 2 )
  225. {
  226. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  227. ptrbb = bb;
  228. #else
  229. ptrba += off*2;
  230. ptrbb = bb + off*2;
  231. #endif
  232. res0_0 = 0;
  233. res0_1 = 0;
  234. res1_0 = 0;
  235. res1_1 = 0;
  236. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  237. temp = bk-off;
  238. #elif defined(LEFT)
  239. temp = off+2; // number of values in A
  240. #else
  241. temp = off+2; // number of values in B
  242. #endif
  243. for (k=0; k<temp; k++)
  244. {
  245. b0 = ptrbb[0];
  246. b1 = ptrbb[1];
  247. a0 = ptrba[0];
  248. res0_0 += a0*b0;
  249. res1_0 += a0*b1;
  250. a1 = ptrba[1];
  251. res0_1 += a1*b0;
  252. res1_1 += a1*b1;
  253. ptrba = ptrba+2;
  254. ptrbb = ptrbb+2;
  255. }
  256. res0_0 *= alpha;
  257. res0_1 *= alpha;
  258. res1_0 *= alpha;
  259. res1_1 *= alpha;
  260. C0[0] = res0_0;
  261. C0[1] = res0_1;
  262. C1[0] = res1_0;
  263. C1[1] = res1_1;
  264. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  265. temp = bk - off;
  266. #ifdef LEFT
  267. temp -= 2; // number of values in A
  268. #else
  269. temp -= 2; // number of values in B
  270. #endif
  271. ptrba += temp*2;
  272. ptrbb += temp*2;
  273. #endif
  274. #ifdef LEFT
  275. off += 2; // number of values in A
  276. #endif
  277. C0 = C0+2;
  278. C1 = C1+2;
  279. }
  280. if ( bm & 1 )
  281. {
  282. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  283. ptrbb = bb;
  284. #else
  285. ptrba += off*1;
  286. ptrbb = bb + off*2;
  287. #endif
  288. res0_0 = 0;
  289. res1_0 = 0;
  290. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  291. temp = bk-off;
  292. #elif defined(LEFT)
  293. temp = off+1; // number of values in A
  294. #else
  295. temp = off+2; // number of values in B
  296. #endif
  297. for (k=0; k<temp; k++)
  298. {
  299. b0 = ptrbb[0];
  300. b1 = ptrbb[1];
  301. a0 = ptrba[0];
  302. res0_0 += a0*b0;
  303. res1_0 += a0*b1;
  304. ptrba = ptrba+1;
  305. ptrbb = ptrbb+2;
  306. }
  307. res0_0 *= alpha;
  308. res1_0 *= alpha;
  309. C0[0] = res0_0;
  310. C1[0] = res1_0;
  311. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  312. temp = bk - off;
  313. #ifdef LEFT
  314. temp -= 1; // number of values in A
  315. #else
  316. temp -= 2; // number of values in B
  317. #endif
  318. ptrba += temp*1;
  319. ptrbb += temp*2;
  320. #endif
  321. #ifdef LEFT
  322. off += 1; // number of values in A
  323. #endif
  324. C0 = C0+1;
  325. C1 = C1+1;
  326. }
  327. #if defined(TRMMKERNEL) && !defined(LEFT)
  328. off += 2;
  329. #endif
  330. k = (bk<<1);
  331. bb = bb+k;
  332. i = (ldc<<1);
  333. C = C+i;
  334. }
  335. for (j=0; j<(bn&1); j+=1)
  336. {
  337. C0 = C;
  338. #if defined(TRMMKERNEL) && defined(LEFT)
  339. off = offset;
  340. #endif
  341. ptrba = ba;
  342. for (i=0; i<bm/8; i+=1)
  343. {
  344. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  345. ptrbb = bb;
  346. #else
  347. ptrba += off*8;
  348. ptrbb = bb + off*1;
  349. #endif
  350. res0_0 = 0;
  351. res0_1 = 0;
  352. res0_2 = 0;
  353. res0_3 = 0;
  354. res0_4 = 0;
  355. res0_5 = 0;
  356. res0_6 = 0;
  357. res0_7 = 0;
  358. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  359. temp = bk-off;
  360. #elif defined(LEFT)
  361. temp = off+8; // number of values in A
  362. #else
  363. temp = off+1; // number of values in B
  364. #endif
  365. for (k=0; k<temp; k++)
  366. {
  367. b0 = ptrbb[0];
  368. a0 = ptrba[0];
  369. res0_0 += a0*b0;
  370. a1 = ptrba[1];
  371. res0_1 += a1*b0;
  372. a0 = ptrba[2];
  373. res0_2 += a0*b0;
  374. a1 = ptrba[3];
  375. res0_3 += a1*b0;
  376. a0 = ptrba[4];
  377. res0_4 += a0*b0;
  378. a1 = ptrba[5];
  379. res0_5 += a1*b0;
  380. a0 = ptrba[6];
  381. res0_6 += a0*b0;
  382. a1 = ptrba[7];
  383. res0_7 += a1*b0;
  384. ptrba = ptrba+8;
  385. ptrbb = ptrbb+1;
  386. }
  387. res0_0 *= alpha;
  388. res0_1 *= alpha;
  389. res0_2 *= alpha;
  390. res0_3 *= alpha;
  391. res0_4 *= alpha;
  392. res0_5 *= alpha;
  393. res0_6 *= alpha;
  394. res0_7 *= alpha;
  395. C0[0] = res0_0;
  396. C0[1] = res0_1;
  397. C0[2] = res0_2;
  398. C0[3] = res0_3;
  399. C0[4] = res0_4;
  400. C0[5] = res0_5;
  401. C0[6] = res0_6;
  402. C0[7] = res0_7;
  403. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  404. temp = bk - off;
  405. #ifdef LEFT
  406. temp -= 8; // number of values in A
  407. #else
  408. temp -= 1; // number of values in B
  409. #endif
  410. ptrba += temp*8;
  411. ptrbb += temp*1;
  412. #endif
  413. #ifdef LEFT
  414. off += 8; // number of values in A
  415. #endif
  416. C0 = C0+8;
  417. }
  418. if ( bm & 4 )
  419. {
  420. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  421. ptrbb = bb;
  422. #else
  423. ptrba += off*4;
  424. ptrbb = bb + off*1;
  425. #endif
  426. res0_0 = 0;
  427. res0_1 = 0;
  428. res0_2 = 0;
  429. res0_3 = 0;
  430. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  431. temp = bk-off;
  432. #elif defined(LEFT)
  433. temp = off+4; // number of values in A
  434. #else
  435. temp = off+1; // number of values in B
  436. #endif
  437. for (k=0; k<temp; k++)
  438. {
  439. b0 = ptrbb[0];
  440. a0 = ptrba[0];
  441. res0_0 += a0*b0;
  442. a1 = ptrba[1];
  443. res0_1 += a1*b0;
  444. a0 = ptrba[2];
  445. res0_2 += a0*b0;
  446. a1 = ptrba[3];
  447. res0_3 += a1*b0;
  448. ptrba = ptrba+4;
  449. ptrbb = ptrbb+1;
  450. }
  451. res0_0 *= alpha;
  452. res0_1 *= alpha;
  453. res0_2 *= alpha;
  454. res0_3 *= alpha;
  455. C0[0] = res0_0;
  456. C0[1] = res0_1;
  457. C0[2] = res0_2;
  458. C0[3] = res0_3;
  459. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  460. temp = bk - off;
  461. #ifdef LEFT
  462. temp -= 4; // number of values in A
  463. #else
  464. temp -= 1; // number of values in B
  465. #endif
  466. ptrba += temp*4;
  467. ptrbb += temp*1;
  468. #endif
  469. #ifdef LEFT
  470. off += 4; // number of values in A
  471. #endif
  472. C0 = C0+4;
  473. }
  474. if ( bm & 2 )
  475. {
  476. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  477. ptrbb = bb;
  478. #else
  479. ptrba += off*2;
  480. ptrbb = bb + off*1;
  481. #endif
  482. res0_0 = 0;
  483. res0_1 = 0;
  484. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  485. temp = bk-off;
  486. #elif defined(LEFT)
  487. temp = off+2; // number of values in A
  488. #else
  489. temp = off+1; // number of values in B
  490. #endif
  491. for (k=0; k<temp; k++)
  492. {
  493. b0 = ptrbb[0];
  494. a0 = ptrba[0];
  495. res0_0 += a0*b0;
  496. a1 = ptrba[1];
  497. res0_1 += a1*b0;
  498. ptrba = ptrba+2;
  499. ptrbb = ptrbb+1;
  500. }
  501. res0_0 *= alpha;
  502. res0_1 *= alpha;
  503. C0[0] = res0_0;
  504. C0[1] = res0_1;
  505. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  506. temp = bk - off;
  507. #ifdef LEFT
  508. temp -= 2; // number of values in A
  509. #else
  510. temp -= 1; // number of values in B
  511. #endif
  512. ptrba += temp*2;
  513. ptrbb += temp*1;
  514. #endif
  515. #ifdef LEFT
  516. off += 2; // number of values in A
  517. #endif
  518. C0 = C0+2;
  519. }
  520. if ( bm & 1 )
  521. {
  522. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  523. ptrbb = bb;
  524. #else
  525. ptrba += off*1;
  526. ptrbb = bb + off*1;
  527. #endif
  528. res0_0 = 0;
  529. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  530. temp = bk-off;
  531. #elif defined(LEFT)
  532. temp = off+1; // number of values in A
  533. #else
  534. temp = off+1; // number of values in B
  535. #endif
  536. for (k=0; k<temp; k++)
  537. {
  538. b0 = ptrbb[0];
  539. a0 = ptrba[0];
  540. res0_0 += a0*b0;
  541. ptrba = ptrba+1;
  542. ptrbb = ptrbb+1;
  543. }
  544. res0_0 *= alpha;
  545. C0[0] = res0_0;
  546. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  547. temp = bk - off;
  548. #ifdef LEFT
  549. temp -= 1; // number of values in A
  550. #else
  551. temp -= 1; // number of values in B
  552. #endif
  553. ptrba += temp*1;
  554. ptrbb += temp*1;
  555. #endif
  556. #ifdef LEFT
  557. off += 1; // number of values in A
  558. #endif
  559. C0 = C0+1;
  560. }
  561. #if defined(TRMMKERNEL) && !defined(LEFT)
  562. off += 1;
  563. #endif
  564. k = (bk<<0);
  565. bb = bb+k;
  566. C = C+ldc;
  567. }
  568. return 0;
  569. }