You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trmmkernel_8x4.c 28 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317
  1. #include "common.h"
  2. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
  3. {
  4. BLASLONG i,j,k;
  5. FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
  6. FLOAT res0_0;
  7. FLOAT res0_1;
  8. FLOAT res0_2;
  9. FLOAT res0_3;
  10. FLOAT res0_4;
  11. FLOAT res0_5;
  12. FLOAT res0_6;
  13. FLOAT res0_7;
  14. FLOAT res1_0;
  15. FLOAT res1_1;
  16. FLOAT res1_2;
  17. FLOAT res1_3;
  18. FLOAT res1_4;
  19. FLOAT res1_5;
  20. FLOAT res1_6;
  21. FLOAT res1_7;
  22. FLOAT res2_0;
  23. FLOAT res2_1;
  24. FLOAT res2_2;
  25. FLOAT res2_3;
  26. FLOAT res2_4;
  27. FLOAT res2_5;
  28. FLOAT res2_6;
  29. FLOAT res2_7;
  30. FLOAT res3_0;
  31. FLOAT res3_1;
  32. FLOAT res3_2;
  33. FLOAT res3_3;
  34. FLOAT res3_4;
  35. FLOAT res3_5;
  36. FLOAT res3_6;
  37. FLOAT res3_7;
  38. FLOAT a0;
  39. FLOAT a1;
  40. FLOAT b0;
  41. FLOAT b1;
  42. FLOAT b2;
  43. FLOAT b3;
  44. BLASLONG off, temp;
  45. #if !defined(LEFT)
  46. off = -offset;
  47. #else
  48. off = 0;
  49. #endif
  50. for (j=0; j<bn/4; j+=1)
  51. {
  52. C0 = C;
  53. C1 = C0+ldc;
  54. C2 = C1+ldc;
  55. C3 = C2+ldc;
  56. #if defined(TRMMKERNEL) && defined(LEFT)
  57. off = offset;
  58. #endif
  59. ptrba = ba;
  60. for (i=0; i<bm/8; i+=1)
  61. {
  62. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  63. ptrbb = bb;
  64. #else
  65. ptrba += off*8;
  66. ptrbb = bb + off*4;
  67. #endif
  68. res0_0 = 0;
  69. res0_1 = 0;
  70. res0_2 = 0;
  71. res0_3 = 0;
  72. res0_4 = 0;
  73. res0_5 = 0;
  74. res0_6 = 0;
  75. res0_7 = 0;
  76. res1_0 = 0;
  77. res1_1 = 0;
  78. res1_2 = 0;
  79. res1_3 = 0;
  80. res1_4 = 0;
  81. res1_5 = 0;
  82. res1_6 = 0;
  83. res1_7 = 0;
  84. res2_0 = 0;
  85. res2_1 = 0;
  86. res2_2 = 0;
  87. res2_3 = 0;
  88. res2_4 = 0;
  89. res2_5 = 0;
  90. res2_6 = 0;
  91. res2_7 = 0;
  92. res3_0 = 0;
  93. res3_1 = 0;
  94. res3_2 = 0;
  95. res3_3 = 0;
  96. res3_4 = 0;
  97. res3_5 = 0;
  98. res3_6 = 0;
  99. res3_7 = 0;
  100. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  101. temp = bk-off;
  102. #elif defined(LEFT)
  103. temp = off+8; // number of values in A
  104. #else
  105. temp = off+4; // number of values in B
  106. #endif
  107. for (k=0; k<temp; k++)
  108. {
  109. b0 = ptrbb[0];
  110. b1 = ptrbb[1];
  111. b2 = ptrbb[2];
  112. b3 = ptrbb[3];
  113. a0 = ptrba[0];
  114. res0_0 += a0*b0;
  115. res1_0 += a0*b1;
  116. res2_0 += a0*b2;
  117. res3_0 += a0*b3;
  118. a1 = ptrba[1];
  119. res0_1 += a1*b0;
  120. res1_1 += a1*b1;
  121. res2_1 += a1*b2;
  122. res3_1 += a1*b3;
  123. a0 = ptrba[2];
  124. res0_2 += a0*b0;
  125. res1_2 += a0*b1;
  126. res2_2 += a0*b2;
  127. res3_2 += a0*b3;
  128. a1 = ptrba[3];
  129. res0_3 += a1*b0;
  130. res1_3 += a1*b1;
  131. res2_3 += a1*b2;
  132. res3_3 += a1*b3;
  133. a0 = ptrba[4];
  134. res0_4 += a0*b0;
  135. res1_4 += a0*b1;
  136. res2_4 += a0*b2;
  137. res3_4 += a0*b3;
  138. a1 = ptrba[5];
  139. res0_5 += a1*b0;
  140. res1_5 += a1*b1;
  141. res2_5 += a1*b2;
  142. res3_5 += a1*b3;
  143. a0 = ptrba[6];
  144. res0_6 += a0*b0;
  145. res1_6 += a0*b1;
  146. res2_6 += a0*b2;
  147. res3_6 += a0*b3;
  148. a1 = ptrba[7];
  149. res0_7 += a1*b0;
  150. res1_7 += a1*b1;
  151. res2_7 += a1*b2;
  152. res3_7 += a1*b3;
  153. ptrba = ptrba+8;
  154. ptrbb = ptrbb+4;
  155. }
  156. res0_0 *= alpha;
  157. res0_1 *= alpha;
  158. res0_2 *= alpha;
  159. res0_3 *= alpha;
  160. res0_4 *= alpha;
  161. res0_5 *= alpha;
  162. res0_6 *= alpha;
  163. res0_7 *= alpha;
  164. res1_0 *= alpha;
  165. res1_1 *= alpha;
  166. res1_2 *= alpha;
  167. res1_3 *= alpha;
  168. res1_4 *= alpha;
  169. res1_5 *= alpha;
  170. res1_6 *= alpha;
  171. res1_7 *= alpha;
  172. res2_0 *= alpha;
  173. res2_1 *= alpha;
  174. res2_2 *= alpha;
  175. res2_3 *= alpha;
  176. res2_4 *= alpha;
  177. res2_5 *= alpha;
  178. res2_6 *= alpha;
  179. res2_7 *= alpha;
  180. res3_0 *= alpha;
  181. res3_1 *= alpha;
  182. res3_2 *= alpha;
  183. res3_3 *= alpha;
  184. res3_4 *= alpha;
  185. res3_5 *= alpha;
  186. res3_6 *= alpha;
  187. res3_7 *= alpha;
  188. C0[0] = res0_0;
  189. C0[1] = res0_1;
  190. C0[2] = res0_2;
  191. C0[3] = res0_3;
  192. C0[4] = res0_4;
  193. C0[5] = res0_5;
  194. C0[6] = res0_6;
  195. C0[7] = res0_7;
  196. C1[0] = res1_0;
  197. C1[1] = res1_1;
  198. C1[2] = res1_2;
  199. C1[3] = res1_3;
  200. C1[4] = res1_4;
  201. C1[5] = res1_5;
  202. C1[6] = res1_6;
  203. C1[7] = res1_7;
  204. C2[0] = res2_0;
  205. C2[1] = res2_1;
  206. C2[2] = res2_2;
  207. C2[3] = res2_3;
  208. C2[4] = res2_4;
  209. C2[5] = res2_5;
  210. C2[6] = res2_6;
  211. C2[7] = res2_7;
  212. C3[0] = res3_0;
  213. C3[1] = res3_1;
  214. C3[2] = res3_2;
  215. C3[3] = res3_3;
  216. C3[4] = res3_4;
  217. C3[5] = res3_5;
  218. C3[6] = res3_6;
  219. C3[7] = res3_7;
  220. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  221. temp = bk - off;
  222. #ifdef LEFT
  223. temp -= 8; // number of values in A
  224. #else
  225. temp -= 4; // number of values in B
  226. #endif
  227. ptrba += temp*8;
  228. ptrbb += temp*4;
  229. #endif
  230. #ifdef LEFT
  231. off += 8; // number of values in A
  232. #endif
  233. C0 = C0+8;
  234. C1 = C1+8;
  235. C2 = C2+8;
  236. C3 = C3+8;
  237. }
  238. if ( bm & 4 )
  239. {
  240. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  241. ptrbb = bb;
  242. #else
  243. ptrba += off*4;
  244. ptrbb = bb + off*4;
  245. #endif
  246. res0_0 = 0;
  247. res0_1 = 0;
  248. res0_2 = 0;
  249. res0_3 = 0;
  250. res1_0 = 0;
  251. res1_1 = 0;
  252. res1_2 = 0;
  253. res1_3 = 0;
  254. res2_0 = 0;
  255. res2_1 = 0;
  256. res2_2 = 0;
  257. res2_3 = 0;
  258. res3_0 = 0;
  259. res3_1 = 0;
  260. res3_2 = 0;
  261. res3_3 = 0;
  262. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  263. temp = bk-off;
  264. #elif defined(LEFT)
  265. temp = off+4; // number of values in A
  266. #else
  267. temp = off+4; // number of values in B
  268. #endif
  269. for (k=0; k<temp; k++)
  270. {
  271. b0 = ptrbb[0];
  272. b1 = ptrbb[1];
  273. b2 = ptrbb[2];
  274. b3 = ptrbb[3];
  275. a0 = ptrba[0];
  276. res0_0 += a0*b0;
  277. res1_0 += a0*b1;
  278. res2_0 += a0*b2;
  279. res3_0 += a0*b3;
  280. a1 = ptrba[1];
  281. res0_1 += a1*b0;
  282. res1_1 += a1*b1;
  283. res2_1 += a1*b2;
  284. res3_1 += a1*b3;
  285. a0 = ptrba[2];
  286. res0_2 += a0*b0;
  287. res1_2 += a0*b1;
  288. res2_2 += a0*b2;
  289. res3_2 += a0*b3;
  290. a1 = ptrba[3];
  291. res0_3 += a1*b0;
  292. res1_3 += a1*b1;
  293. res2_3 += a1*b2;
  294. res3_3 += a1*b3;
  295. ptrba = ptrba+4;
  296. ptrbb = ptrbb+4;
  297. }
  298. res0_0 *= alpha;
  299. res0_1 *= alpha;
  300. res0_2 *= alpha;
  301. res0_3 *= alpha;
  302. res1_0 *= alpha;
  303. res1_1 *= alpha;
  304. res1_2 *= alpha;
  305. res1_3 *= alpha;
  306. res2_0 *= alpha;
  307. res2_1 *= alpha;
  308. res2_2 *= alpha;
  309. res2_3 *= alpha;
  310. res3_0 *= alpha;
  311. res3_1 *= alpha;
  312. res3_2 *= alpha;
  313. res3_3 *= alpha;
  314. C0[0] = res0_0;
  315. C0[1] = res0_1;
  316. C0[2] = res0_2;
  317. C0[3] = res0_3;
  318. C1[0] = res1_0;
  319. C1[1] = res1_1;
  320. C1[2] = res1_2;
  321. C1[3] = res1_3;
  322. C2[0] = res2_0;
  323. C2[1] = res2_1;
  324. C2[2] = res2_2;
  325. C2[3] = res2_3;
  326. C3[0] = res3_0;
  327. C3[1] = res3_1;
  328. C3[2] = res3_2;
  329. C3[3] = res3_3;
  330. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  331. temp = bk - off;
  332. #ifdef LEFT
  333. temp -= 4; // number of values in A
  334. #else
  335. temp -= 4; // number of values in B
  336. #endif
  337. ptrba += temp*4;
  338. ptrbb += temp*4;
  339. #endif
  340. #ifdef LEFT
  341. off += 4; // number of values in A
  342. #endif
  343. C0 = C0+4;
  344. C1 = C1+4;
  345. C2 = C2+4;
  346. C3 = C3+4;
  347. }
  348. if ( bm & 2 )
  349. {
  350. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  351. ptrbb = bb;
  352. #else
  353. ptrba += off*2;
  354. ptrbb = bb + off*4;
  355. #endif
  356. res0_0 = 0;
  357. res0_1 = 0;
  358. res1_0 = 0;
  359. res1_1 = 0;
  360. res2_0 = 0;
  361. res2_1 = 0;
  362. res3_0 = 0;
  363. res3_1 = 0;
  364. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  365. temp = bk-off;
  366. #elif defined(LEFT)
  367. temp = off+2; // number of values in A
  368. #else
  369. temp = off+4; // number of values in B
  370. #endif
  371. for (k=0; k<temp; k++)
  372. {
  373. b0 = ptrbb[0];
  374. b1 = ptrbb[1];
  375. b2 = ptrbb[2];
  376. b3 = ptrbb[3];
  377. a0 = ptrba[0];
  378. res0_0 += a0*b0;
  379. res1_0 += a0*b1;
  380. res2_0 += a0*b2;
  381. res3_0 += a0*b3;
  382. a1 = ptrba[1];
  383. res0_1 += a1*b0;
  384. res1_1 += a1*b1;
  385. res2_1 += a1*b2;
  386. res3_1 += a1*b3;
  387. ptrba = ptrba+2;
  388. ptrbb = ptrbb+4;
  389. }
  390. res0_0 *= alpha;
  391. res0_1 *= alpha;
  392. res1_0 *= alpha;
  393. res1_1 *= alpha;
  394. res2_0 *= alpha;
  395. res2_1 *= alpha;
  396. res3_0 *= alpha;
  397. res3_1 *= alpha;
  398. C0[0] = res0_0;
  399. C0[1] = res0_1;
  400. C1[0] = res1_0;
  401. C1[1] = res1_1;
  402. C2[0] = res2_0;
  403. C2[1] = res2_1;
  404. C3[0] = res3_0;
  405. C3[1] = res3_1;
  406. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  407. temp = bk - off;
  408. #ifdef LEFT
  409. temp -= 2; // number of values in A
  410. #else
  411. temp -= 4; // number of values in B
  412. #endif
  413. ptrba += temp*2;
  414. ptrbb += temp*4;
  415. #endif
  416. #ifdef LEFT
  417. off += 2; // number of values in A
  418. #endif
  419. C0 = C0+2;
  420. C1 = C1+2;
  421. C2 = C2+2;
  422. C3 = C3+2;
  423. }
  424. if ( bm & 1 )
  425. {
  426. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  427. ptrbb = bb;
  428. #else
  429. ptrba += off*1;
  430. ptrbb = bb + off*4;
  431. #endif
  432. res0_0 = 0;
  433. res1_0 = 0;
  434. res2_0 = 0;
  435. res3_0 = 0;
  436. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  437. temp = bk-off;
  438. #elif defined(LEFT)
  439. temp = off+1; // number of values in A
  440. #else
  441. temp = off+4; // number of values in B
  442. #endif
  443. for (k=0; k<temp; k++)
  444. {
  445. b0 = ptrbb[0];
  446. b1 = ptrbb[1];
  447. b2 = ptrbb[2];
  448. b3 = ptrbb[3];
  449. a0 = ptrba[0];
  450. res0_0 += a0*b0;
  451. res1_0 += a0*b1;
  452. res2_0 += a0*b2;
  453. res3_0 += a0*b3;
  454. ptrba = ptrba+1;
  455. ptrbb = ptrbb+4;
  456. }
  457. res0_0 *= alpha;
  458. res1_0 *= alpha;
  459. res2_0 *= alpha;
  460. res3_0 *= alpha;
  461. C0[0] = res0_0;
  462. C1[0] = res1_0;
  463. C2[0] = res2_0;
  464. C3[0] = res3_0;
  465. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  466. temp = bk - off;
  467. #ifdef LEFT
  468. temp -= 1; // number of values in A
  469. #else
  470. temp -= 4; // number of values in B
  471. #endif
  472. ptrba += temp*1;
  473. ptrbb += temp*4;
  474. #endif
  475. #ifdef LEFT
  476. off += 1; // number of values in A
  477. #endif
  478. C0 = C0+1;
  479. C1 = C1+1;
  480. C2 = C2+1;
  481. C3 = C3+1;
  482. }
  483. #if defined(TRMMKERNEL) && !defined(LEFT)
  484. off += 4;
  485. #endif
  486. k = (bk<<2);
  487. bb = bb+k;
  488. i = (ldc<<2);
  489. C = C+i;
  490. }
  491. for (j=0; j<(bn&2); j+=2)
  492. {
  493. C0 = C;
  494. C1 = C0+ldc;
  495. #if defined(TRMMKERNEL) && defined(LEFT)
  496. off = offset;
  497. #endif
  498. ptrba = ba;
  499. for (i=0; i<bm/8; i+=1)
  500. {
  501. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  502. ptrbb = bb;
  503. #else
  504. ptrba += off*8;
  505. ptrbb = bb + off*2;
  506. #endif
  507. res0_0 = 0;
  508. res0_1 = 0;
  509. res0_2 = 0;
  510. res0_3 = 0;
  511. res0_4 = 0;
  512. res0_5 = 0;
  513. res0_6 = 0;
  514. res0_7 = 0;
  515. res1_0 = 0;
  516. res1_1 = 0;
  517. res1_2 = 0;
  518. res1_3 = 0;
  519. res1_4 = 0;
  520. res1_5 = 0;
  521. res1_6 = 0;
  522. res1_7 = 0;
  523. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  524. temp = bk-off;
  525. #elif defined(LEFT)
  526. temp = off+8; // number of values in A
  527. #else
  528. temp = off+2; // number of values in B
  529. #endif
  530. for (k=0; k<temp; k++)
  531. {
  532. b0 = ptrbb[0];
  533. b1 = ptrbb[1];
  534. a0 = ptrba[0];
  535. res0_0 += a0*b0;
  536. res1_0 += a0*b1;
  537. a1 = ptrba[1];
  538. res0_1 += a1*b0;
  539. res1_1 += a1*b1;
  540. a0 = ptrba[2];
  541. res0_2 += a0*b0;
  542. res1_2 += a0*b1;
  543. a1 = ptrba[3];
  544. res0_3 += a1*b0;
  545. res1_3 += a1*b1;
  546. a0 = ptrba[4];
  547. res0_4 += a0*b0;
  548. res1_4 += a0*b1;
  549. a1 = ptrba[5];
  550. res0_5 += a1*b0;
  551. res1_5 += a1*b1;
  552. a0 = ptrba[6];
  553. res0_6 += a0*b0;
  554. res1_6 += a0*b1;
  555. a1 = ptrba[7];
  556. res0_7 += a1*b0;
  557. res1_7 += a1*b1;
  558. ptrba = ptrba+8;
  559. ptrbb = ptrbb+2;
  560. }
  561. res0_0 *= alpha;
  562. res0_1 *= alpha;
  563. res0_2 *= alpha;
  564. res0_3 *= alpha;
  565. res0_4 *= alpha;
  566. res0_5 *= alpha;
  567. res0_6 *= alpha;
  568. res0_7 *= alpha;
  569. res1_0 *= alpha;
  570. res1_1 *= alpha;
  571. res1_2 *= alpha;
  572. res1_3 *= alpha;
  573. res1_4 *= alpha;
  574. res1_5 *= alpha;
  575. res1_6 *= alpha;
  576. res1_7 *= alpha;
  577. C0[0] = res0_0;
  578. C0[1] = res0_1;
  579. C0[2] = res0_2;
  580. C0[3] = res0_3;
  581. C0[4] = res0_4;
  582. C0[5] = res0_5;
  583. C0[6] = res0_6;
  584. C0[7] = res0_7;
  585. C1[0] = res1_0;
  586. C1[1] = res1_1;
  587. C1[2] = res1_2;
  588. C1[3] = res1_3;
  589. C1[4] = res1_4;
  590. C1[5] = res1_5;
  591. C1[6] = res1_6;
  592. C1[7] = res1_7;
  593. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  594. temp = bk - off;
  595. #ifdef LEFT
  596. temp -= 8; // number of values in A
  597. #else
  598. temp -= 2; // number of values in B
  599. #endif
  600. ptrba += temp*8;
  601. ptrbb += temp*2;
  602. #endif
  603. #ifdef LEFT
  604. off += 8; // number of values in A
  605. #endif
  606. C0 = C0+8;
  607. C1 = C1+8;
  608. }
  609. if ( bm & 4 )
  610. {
  611. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  612. ptrbb = bb;
  613. #else
  614. ptrba += off*4;
  615. ptrbb = bb + off*2;
  616. #endif
  617. res0_0 = 0;
  618. res0_1 = 0;
  619. res0_2 = 0;
  620. res0_3 = 0;
  621. res1_0 = 0;
  622. res1_1 = 0;
  623. res1_2 = 0;
  624. res1_3 = 0;
  625. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  626. temp = bk-off;
  627. #elif defined(LEFT)
  628. temp = off+4; // number of values in A
  629. #else
  630. temp = off+2; // number of values in B
  631. #endif
  632. for (k=0; k<temp; k++)
  633. {
  634. b0 = ptrbb[0];
  635. b1 = ptrbb[1];
  636. a0 = ptrba[0];
  637. res0_0 += a0*b0;
  638. res1_0 += a0*b1;
  639. a1 = ptrba[1];
  640. res0_1 += a1*b0;
  641. res1_1 += a1*b1;
  642. a0 = ptrba[2];
  643. res0_2 += a0*b0;
  644. res1_2 += a0*b1;
  645. a1 = ptrba[3];
  646. res0_3 += a1*b0;
  647. res1_3 += a1*b1;
  648. ptrba = ptrba+4;
  649. ptrbb = ptrbb+2;
  650. }
  651. res0_0 *= alpha;
  652. res0_1 *= alpha;
  653. res0_2 *= alpha;
  654. res0_3 *= alpha;
  655. res1_0 *= alpha;
  656. res1_1 *= alpha;
  657. res1_2 *= alpha;
  658. res1_3 *= alpha;
  659. C0[0] = res0_0;
  660. C0[1] = res0_1;
  661. C0[2] = res0_2;
  662. C0[3] = res0_3;
  663. C1[0] = res1_0;
  664. C1[1] = res1_1;
  665. C1[2] = res1_2;
  666. C1[3] = res1_3;
  667. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  668. temp = bk - off;
  669. #ifdef LEFT
  670. temp -= 4; // number of values in A
  671. #else
  672. temp -= 2; // number of values in B
  673. #endif
  674. ptrba += temp*4;
  675. ptrbb += temp*2;
  676. #endif
  677. #ifdef LEFT
  678. off += 4; // number of values in A
  679. #endif
  680. C0 = C0+4;
  681. C1 = C1+4;
  682. }
  683. if ( bm & 2 )
  684. {
  685. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  686. ptrbb = bb;
  687. #else
  688. ptrba += off*2;
  689. ptrbb = bb + off*2;
  690. #endif
  691. res0_0 = 0;
  692. res0_1 = 0;
  693. res1_0 = 0;
  694. res1_1 = 0;
  695. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  696. temp = bk-off;
  697. #elif defined(LEFT)
  698. temp = off+2; // number of values in A
  699. #else
  700. temp = off+2; // number of values in B
  701. #endif
  702. for (k=0; k<temp; k++)
  703. {
  704. b0 = ptrbb[0];
  705. b1 = ptrbb[1];
  706. a0 = ptrba[0];
  707. res0_0 += a0*b0;
  708. res1_0 += a0*b1;
  709. a1 = ptrba[1];
  710. res0_1 += a1*b0;
  711. res1_1 += a1*b1;
  712. ptrba = ptrba+2;
  713. ptrbb = ptrbb+2;
  714. }
  715. res0_0 *= alpha;
  716. res0_1 *= alpha;
  717. res1_0 *= alpha;
  718. res1_1 *= alpha;
  719. C0[0] = res0_0;
  720. C0[1] = res0_1;
  721. C1[0] = res1_0;
  722. C1[1] = res1_1;
  723. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  724. temp = bk - off;
  725. #ifdef LEFT
  726. temp -= 2; // number of values in A
  727. #else
  728. temp -= 2; // number of values in B
  729. #endif
  730. ptrba += temp*2;
  731. ptrbb += temp*2;
  732. #endif
  733. #ifdef LEFT
  734. off += 2; // number of values in A
  735. #endif
  736. C0 = C0+2;
  737. C1 = C1+2;
  738. }
  739. if ( bm & 1 )
  740. {
  741. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  742. ptrbb = bb;
  743. #else
  744. ptrba += off*1;
  745. ptrbb = bb + off*2;
  746. #endif
  747. res0_0 = 0;
  748. res1_0 = 0;
  749. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  750. temp = bk-off;
  751. #elif defined(LEFT)
  752. temp = off+1; // number of values in A
  753. #else
  754. temp = off+2; // number of values in B
  755. #endif
  756. for (k=0; k<temp; k++)
  757. {
  758. b0 = ptrbb[0];
  759. b1 = ptrbb[1];
  760. a0 = ptrba[0];
  761. res0_0 += a0*b0;
  762. res1_0 += a0*b1;
  763. ptrba = ptrba+1;
  764. ptrbb = ptrbb+2;
  765. }
  766. res0_0 *= alpha;
  767. res1_0 *= alpha;
  768. C0[0] = res0_0;
  769. C1[0] = res1_0;
  770. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  771. temp = bk - off;
  772. #ifdef LEFT
  773. temp -= 1; // number of values in A
  774. #else
  775. temp -= 2; // number of values in B
  776. #endif
  777. ptrba += temp*1;
  778. ptrbb += temp*2;
  779. #endif
  780. #ifdef LEFT
  781. off += 1; // number of values in A
  782. #endif
  783. C0 = C0+1;
  784. C1 = C1+1;
  785. }
  786. #if defined(TRMMKERNEL) && !defined(LEFT)
  787. off += 2;
  788. #endif
  789. k = (bk<<1);
  790. bb = bb+k;
  791. i = (ldc<<1);
  792. C = C+i;
  793. }
  794. for (j=0; j<(bn&1); j+=1)
  795. {
  796. C0 = C;
  797. #if defined(TRMMKERNEL) && defined(LEFT)
  798. off = offset;
  799. #endif
  800. ptrba = ba;
  801. for (i=0; i<bm/8; i+=1)
  802. {
  803. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  804. ptrbb = bb;
  805. #else
  806. ptrba += off*8;
  807. ptrbb = bb + off*1;
  808. #endif
  809. res0_0 = 0;
  810. res0_1 = 0;
  811. res0_2 = 0;
  812. res0_3 = 0;
  813. res0_4 = 0;
  814. res0_5 = 0;
  815. res0_6 = 0;
  816. res0_7 = 0;
  817. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  818. temp = bk-off;
  819. #elif defined(LEFT)
  820. temp = off+8; // number of values in A
  821. #else
  822. temp = off+1; // number of values in B
  823. #endif
  824. for (k=0; k<temp; k++)
  825. {
  826. b0 = ptrbb[0];
  827. a0 = ptrba[0];
  828. res0_0 += a0*b0;
  829. a1 = ptrba[1];
  830. res0_1 += a1*b0;
  831. a0 = ptrba[2];
  832. res0_2 += a0*b0;
  833. a1 = ptrba[3];
  834. res0_3 += a1*b0;
  835. a0 = ptrba[4];
  836. res0_4 += a0*b0;
  837. a1 = ptrba[5];
  838. res0_5 += a1*b0;
  839. a0 = ptrba[6];
  840. res0_6 += a0*b0;
  841. a1 = ptrba[7];
  842. res0_7 += a1*b0;
  843. ptrba = ptrba+8;
  844. ptrbb = ptrbb+1;
  845. }
  846. res0_0 *= alpha;
  847. res0_1 *= alpha;
  848. res0_2 *= alpha;
  849. res0_3 *= alpha;
  850. res0_4 *= alpha;
  851. res0_5 *= alpha;
  852. res0_6 *= alpha;
  853. res0_7 *= alpha;
  854. C0[0] = res0_0;
  855. C0[1] = res0_1;
  856. C0[2] = res0_2;
  857. C0[3] = res0_3;
  858. C0[4] = res0_4;
  859. C0[5] = res0_5;
  860. C0[6] = res0_6;
  861. C0[7] = res0_7;
  862. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  863. temp = bk - off;
  864. #ifdef LEFT
  865. temp -= 8; // number of values in A
  866. #else
  867. temp -= 1; // number of values in B
  868. #endif
  869. ptrba += temp*8;
  870. ptrbb += temp*1;
  871. #endif
  872. #ifdef LEFT
  873. off += 8; // number of values in A
  874. #endif
  875. C0 = C0+8;
  876. }
  877. if ( bm & 4 )
  878. {
  879. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  880. ptrbb = bb;
  881. #else
  882. ptrba += off*4;
  883. ptrbb = bb + off*1;
  884. #endif
  885. res0_0 = 0;
  886. res0_1 = 0;
  887. res0_2 = 0;
  888. res0_3 = 0;
  889. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  890. temp = bk-off;
  891. #elif defined(LEFT)
  892. temp = off+4; // number of values in A
  893. #else
  894. temp = off+1; // number of values in B
  895. #endif
  896. for (k=0; k<temp; k++)
  897. {
  898. b0 = ptrbb[0];
  899. a0 = ptrba[0];
  900. res0_0 += a0*b0;
  901. a1 = ptrba[1];
  902. res0_1 += a1*b0;
  903. a0 = ptrba[2];
  904. res0_2 += a0*b0;
  905. a1 = ptrba[3];
  906. res0_3 += a1*b0;
  907. ptrba = ptrba+4;
  908. ptrbb = ptrbb+1;
  909. }
  910. res0_0 *= alpha;
  911. res0_1 *= alpha;
  912. res0_2 *= alpha;
  913. res0_3 *= alpha;
  914. C0[0] = res0_0;
  915. C0[1] = res0_1;
  916. C0[2] = res0_2;
  917. C0[3] = res0_3;
  918. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  919. temp = bk - off;
  920. #ifdef LEFT
  921. temp -= 4; // number of values in A
  922. #else
  923. temp -= 1; // number of values in B
  924. #endif
  925. ptrba += temp*4;
  926. ptrbb += temp*1;
  927. #endif
  928. #ifdef LEFT
  929. off += 4; // number of values in A
  930. #endif
  931. C0 = C0+4;
  932. }
  933. if ( bm & 2 )
  934. {
  935. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  936. ptrbb = bb;
  937. #else
  938. ptrba += off*2;
  939. ptrbb = bb + off*1;
  940. #endif
  941. res0_0 = 0;
  942. res0_1 = 0;
  943. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  944. temp = bk-off;
  945. #elif defined(LEFT)
  946. temp = off+2; // number of values in A
  947. #else
  948. temp = off+1; // number of values in B
  949. #endif
  950. for (k=0; k<temp; k++)
  951. {
  952. b0 = ptrbb[0];
  953. a0 = ptrba[0];
  954. res0_0 += a0*b0;
  955. a1 = ptrba[1];
  956. res0_1 += a1*b0;
  957. ptrba = ptrba+2;
  958. ptrbb = ptrbb+1;
  959. }
  960. res0_0 *= alpha;
  961. res0_1 *= alpha;
  962. C0[0] = res0_0;
  963. C0[1] = res0_1;
  964. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  965. temp = bk - off;
  966. #ifdef LEFT
  967. temp -= 2; // number of values in A
  968. #else
  969. temp -= 1; // number of values in B
  970. #endif
  971. ptrba += temp*2;
  972. ptrbb += temp*1;
  973. #endif
  974. #ifdef LEFT
  975. off += 2; // number of values in A
  976. #endif
  977. C0 = C0+2;
  978. }
  979. if ( bm & 1 )
  980. {
  981. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  982. ptrbb = bb;
  983. #else
  984. ptrba += off*1;
  985. ptrbb = bb + off*1;
  986. #endif
  987. res0_0 = 0;
  988. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  989. temp = bk-off;
  990. #elif defined(LEFT)
  991. temp = off+1; // number of values in A
  992. #else
  993. temp = off+1; // number of values in B
  994. #endif
  995. for (k=0; k<temp; k++)
  996. {
  997. b0 = ptrbb[0];
  998. a0 = ptrba[0];
  999. res0_0 += a0*b0;
  1000. ptrba = ptrba+1;
  1001. ptrbb = ptrbb+1;
  1002. }
  1003. res0_0 *= alpha;
  1004. C0[0] = res0_0;
  1005. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1006. temp = bk - off;
  1007. #ifdef LEFT
  1008. temp -= 1; // number of values in A
  1009. #else
  1010. temp -= 1; // number of values in B
  1011. #endif
  1012. ptrba += temp*1;
  1013. ptrbb += temp*1;
  1014. #endif
  1015. #ifdef LEFT
  1016. off += 1; // number of values in A
  1017. #endif
  1018. C0 = C0+1;
  1019. }
  1020. #if defined(TRMMKERNEL) && !defined(LEFT)
  1021. off += 1;
  1022. #endif
  1023. k = (bk<<0);
  1024. bb = bb+k;
  1025. C = C+ldc;
  1026. }
  1027. return 0;
  1028. }