You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trmmkernel_4x4.c 14 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875
  1. #include "common.h"
  2. #include <stdbool.h>
  3. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
  4. {
  5. BLASLONG i,j,k;
  6. FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
  7. FLOAT res0_0;
  8. FLOAT res0_1;
  9. FLOAT res0_2;
  10. FLOAT res0_3;
  11. FLOAT res1_0;
  12. FLOAT res1_1;
  13. FLOAT res1_2;
  14. FLOAT res1_3;
  15. FLOAT res2_0;
  16. FLOAT res2_1;
  17. FLOAT res2_2;
  18. FLOAT res2_3;
  19. FLOAT res3_0;
  20. FLOAT res3_1;
  21. FLOAT res3_2;
  22. FLOAT res3_3;
  23. FLOAT a0;
  24. FLOAT a1;
  25. FLOAT b0;
  26. FLOAT b1;
  27. FLOAT b2;
  28. FLOAT b3;
  29. BLASLONG off, temp;
  30. bool left;
  31. bool transposed;
  32. bool backwards;
  33. #ifdef LEFT
  34. left = true;
  35. #else
  36. left = false;
  37. #endif
  38. #ifdef TRANSA
  39. transposed = true;
  40. #else
  41. transposed = false;
  42. #endif
  43. backwards = left != transposed;
  44. if (!left) {
  45. off = -offset;
  46. }
  47. for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
  48. {
  49. C0 = C;
  50. C1 = C0+ldc;
  51. C2 = C1+ldc;
  52. C3 = C2+ldc;
  53. if (left) {
  54. off = offset;
  55. }
  56. ptrba = ba;
  57. for (i=0; i<bm/4; i+=1) // do blocks of 4x4
  58. {
  59. ptrbb = bb;
  60. if (backwards)
  61. {
  62. ptrba += off*4; // number of values in A
  63. ptrbb += off*4; // number of values in B
  64. }
  65. res0_0 = 0;
  66. res0_1 = 0;
  67. res0_2 = 0;
  68. res0_3 = 0;
  69. res1_0 = 0;
  70. res1_1 = 0;
  71. res1_2 = 0;
  72. res1_3 = 0;
  73. res2_0 = 0;
  74. res2_1 = 0;
  75. res2_2 = 0;
  76. res2_3 = 0;
  77. res3_0 = 0;
  78. res3_1 = 0;
  79. res3_2 = 0;
  80. res3_3 = 0;
  81. temp = backwards ? bk-off :
  82. left ? off + 4 : // number of values in A
  83. off + 4; // number of values in B
  84. for (k=0; k<temp; k++)
  85. {
  86. b0 = ptrbb[0];
  87. b1 = ptrbb[1];
  88. b2 = ptrbb[2];
  89. b3 = ptrbb[3];
  90. a0 = ptrba[0];
  91. res0_0 += a0*b0;
  92. res1_0 += a0*b1;
  93. res2_0 += a0*b2;
  94. res3_0 += a0*b3;
  95. a1 = ptrba[1];
  96. res0_1 += a1*b0;
  97. res1_1 += a1*b1;
  98. res2_1 += a1*b2;
  99. res3_1 += a1*b3;
  100. a0 = ptrba[2];
  101. res0_2 += a0*b0;
  102. res1_2 += a0*b1;
  103. res2_2 += a0*b2;
  104. res3_2 += a0*b3;
  105. a1 = ptrba[3];
  106. res0_3 += a1*b0;
  107. res1_3 += a1*b1;
  108. res2_3 += a1*b2;
  109. res3_3 += a1*b3;
  110. ptrba = ptrba+4;
  111. ptrbb = ptrbb+4;
  112. }
  113. res0_0 *= alpha;
  114. res0_1 *= alpha;
  115. res0_2 *= alpha;
  116. res0_3 *= alpha;
  117. res1_0 *= alpha;
  118. res1_1 *= alpha;
  119. res1_2 *= alpha;
  120. res1_3 *= alpha;
  121. res2_0 *= alpha;
  122. res2_1 *= alpha;
  123. res2_2 *= alpha;
  124. res2_3 *= alpha;
  125. res3_0 *= alpha;
  126. res3_1 *= alpha;
  127. res3_2 *= alpha;
  128. res3_3 *= alpha;
  129. C0[0] = res0_0;
  130. C0[1] = res0_1;
  131. C0[2] = res0_2;
  132. C0[3] = res0_3;
  133. C1[0] = res1_0;
  134. C1[1] = res1_1;
  135. C1[2] = res1_2;
  136. C1[3] = res1_3;
  137. C2[0] = res2_0;
  138. C2[1] = res2_1;
  139. C2[2] = res2_2;
  140. C2[3] = res2_3;
  141. C3[0] = res3_0;
  142. C3[1] = res3_1;
  143. C3[2] = res3_2;
  144. C3[3] = res3_3;
  145. if (!backwards) {
  146. temp = bk-off;
  147. temp = left ? temp - 4 : // number of values in A
  148. temp - 4; // number of values in B
  149. ptrba += temp*4; // number of values in A
  150. ptrbb += temp*4; // number of values in B
  151. }
  152. #ifdef LEFT
  153. off += 4; // number of values in A
  154. #endif
  155. C0 = C0+4;
  156. C1 = C1+4;
  157. C2 = C2+4;
  158. C3 = C3+4;
  159. }
  160. if ( bm & 2 ) // do any 2x4 loop
  161. {
  162. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  163. ptrbb = bb;
  164. #else
  165. ptrba += off*2;
  166. ptrbb = bb + off*4;
  167. #endif
  168. res0_0 = 0;
  169. res0_1 = 0;
  170. res1_0 = 0;
  171. res1_1 = 0;
  172. res2_0 = 0;
  173. res2_1 = 0;
  174. res3_0 = 0;
  175. res3_1 = 0;
  176. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  177. temp = bk-off;
  178. #elif defined(LEFT)
  179. temp = off+2; // number of values in A
  180. #else
  181. temp = off+4; // number of values in B
  182. #endif
  183. for (k=0; k<temp; k++)
  184. {
  185. b0 = ptrbb[0];
  186. b1 = ptrbb[1];
  187. b2 = ptrbb[2];
  188. b3 = ptrbb[3];
  189. a0 = ptrba[0];
  190. res0_0 += a0*b0;
  191. res1_0 += a0*b1;
  192. res2_0 += a0*b2;
  193. res3_0 += a0*b3;
  194. a1 = ptrba[1];
  195. res0_1 += a1*b0;
  196. res1_1 += a1*b1;
  197. res2_1 += a1*b2;
  198. res3_1 += a1*b3;
  199. ptrba = ptrba+2;
  200. ptrbb = ptrbb+4;
  201. }
  202. res0_0 *= alpha;
  203. res0_1 *= alpha;
  204. res1_0 *= alpha;
  205. res1_1 *= alpha;
  206. res2_0 *= alpha;
  207. res2_1 *= alpha;
  208. res3_0 *= alpha;
  209. res3_1 *= alpha;
  210. C0[0] = res0_0;
  211. C0[1] = res0_1;
  212. C1[0] = res1_0;
  213. C1[1] = res1_1;
  214. C2[0] = res2_0;
  215. C2[1] = res2_1;
  216. C3[0] = res3_0;
  217. C3[1] = res3_1;
  218. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  219. temp = bk - off;
  220. #ifdef LEFT
  221. temp -= 2; // number of values in A
  222. #else
  223. temp -= 4; // number of values in B
  224. #endif
  225. ptrba += temp*2;
  226. ptrbb += temp*4;
  227. #endif
  228. #ifdef LEFT
  229. off += 2; // number of values in A
  230. #endif
  231. C0 = C0+2;
  232. C1 = C1+2;
  233. C2 = C2+2;
  234. C3 = C3+2;
  235. }
  236. if ( bm & 1 ) // do any 1x4 loop
  237. {
  238. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  239. ptrbb = bb;
  240. #else
  241. ptrba += off*1;
  242. ptrbb = bb + off*4;
  243. #endif
  244. res0_0 = 0;
  245. res1_0 = 0;
  246. res2_0 = 0;
  247. res3_0 = 0;
  248. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  249. temp = bk-off;
  250. #elif defined(LEFT)
  251. temp = off+1; // number of values in A
  252. #else
  253. temp = off+4; // number of values in B
  254. #endif
  255. for (k=0; k<temp; k++)
  256. {
  257. b0 = ptrbb[0];
  258. b1 = ptrbb[1];
  259. b2 = ptrbb[2];
  260. b3 = ptrbb[3];
  261. a0 = ptrba[0];
  262. res0_0 += a0*b0;
  263. res1_0 += a0*b1;
  264. res2_0 += a0*b2;
  265. res3_0 += a0*b3;
  266. ptrba = ptrba+1;
  267. ptrbb = ptrbb+4;
  268. }
  269. res0_0 *= alpha;
  270. res1_0 *= alpha;
  271. res2_0 *= alpha;
  272. res3_0 *= alpha;
  273. C0[0] = res0_0;
  274. C1[0] = res1_0;
  275. C2[0] = res2_0;
  276. C3[0] = res3_0;
  277. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  278. temp = bk - off;
  279. #ifdef LEFT
  280. temp -= 1; // number of values in A
  281. #else
  282. temp -= 4; // number of values in B
  283. #endif
  284. ptrba += temp*1;
  285. ptrbb += temp*4;
  286. #endif
  287. #ifdef LEFT
  288. off += 1; // number of values in A
  289. #endif
  290. C0 = C0+1;
  291. C1 = C1+1;
  292. C2 = C2+1;
  293. C3 = C3+1;
  294. }
  295. #if defined(TRMMKERNEL) && !defined(LEFT)
  296. off += 4;
  297. #endif
  298. k = (bk<<2);
  299. bb = bb+k;
  300. i = (ldc<<2);
  301. C = C+i;
  302. }
  303. for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
  304. {
  305. C0 = C;
  306. C1 = C0+ldc;
  307. #if defined(TRMMKERNEL) && defined(LEFT)
  308. off = offset;
  309. #endif
  310. ptrba = ba;
  311. for (i=0; i<bm/4; i+=1) // do blocks of 4x2
  312. {
  313. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  314. ptrbb = bb;
  315. #else
  316. ptrba += off*4;
  317. ptrbb = bb + off*2;
  318. #endif
  319. res0_0 = 0;
  320. res0_1 = 0;
  321. res0_2 = 0;
  322. res0_3 = 0;
  323. res1_0 = 0;
  324. res1_1 = 0;
  325. res1_2 = 0;
  326. res1_3 = 0;
  327. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  328. temp = bk-off;
  329. #elif defined(LEFT)
  330. temp = off+4; // number of values in A
  331. #else
  332. temp = off+2; // number of values in B
  333. #endif
  334. for (k=0; k<temp; k++)
  335. {
  336. b0 = ptrbb[0];
  337. b1 = ptrbb[1];
  338. a0 = ptrba[0];
  339. res0_0 += a0*b0;
  340. res1_0 += a0*b1;
  341. a1 = ptrba[1];
  342. res0_1 += a1*b0;
  343. res1_1 += a1*b1;
  344. a0 = ptrba[2];
  345. res0_2 += a0*b0;
  346. res1_2 += a0*b1;
  347. a1 = ptrba[3];
  348. res0_3 += a1*b0;
  349. res1_3 += a1*b1;
  350. ptrba = ptrba+4;
  351. ptrbb = ptrbb+2;
  352. }
  353. res0_0 *= alpha;
  354. res0_1 *= alpha;
  355. res0_2 *= alpha;
  356. res0_3 *= alpha;
  357. res1_0 *= alpha;
  358. res1_1 *= alpha;
  359. res1_2 *= alpha;
  360. res1_3 *= alpha;
  361. C0[0] = res0_0;
  362. C0[1] = res0_1;
  363. C0[2] = res0_2;
  364. C0[3] = res0_3;
  365. C1[0] = res1_0;
  366. C1[1] = res1_1;
  367. C1[2] = res1_2;
  368. C1[3] = res1_3;
  369. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  370. temp = bk - off;
  371. #ifdef LEFT
  372. temp -= 4; // number of values in A
  373. #else
  374. temp -= 2; // number of values in B
  375. #endif
  376. ptrba += temp*4;
  377. ptrbb += temp*2;
  378. #endif
  379. #ifdef LEFT
  380. off += 4; // number of values in A
  381. #endif
  382. C0 = C0+4;
  383. C1 = C1+4;
  384. }
  385. if ( bm & 2 ) // do any 2x2 loop
  386. {
  387. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  388. ptrbb = bb;
  389. #else
  390. ptrba += off*2;
  391. ptrbb = bb + off*2;
  392. #endif
  393. res0_0 = 0;
  394. res0_1 = 0;
  395. res1_0 = 0;
  396. res1_1 = 0;
  397. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  398. temp = bk-off;
  399. #elif defined(LEFT)
  400. temp = off+2; // number of values in A
  401. #else
  402. temp = off+2; // number of values in B
  403. #endif
  404. for (k=0; k<temp; k++)
  405. {
  406. b0 = ptrbb[0];
  407. b1 = ptrbb[1];
  408. a0 = ptrba[0];
  409. res0_0 += a0*b0;
  410. res1_0 += a0*b1;
  411. a1 = ptrba[1];
  412. res0_1 += a1*b0;
  413. res1_1 += a1*b1;
  414. ptrba = ptrba+2;
  415. ptrbb = ptrbb+2;
  416. }
  417. res0_0 *= alpha;
  418. res0_1 *= alpha;
  419. res1_0 *= alpha;
  420. res1_1 *= alpha;
  421. C0[0] = res0_0;
  422. C0[1] = res0_1;
  423. C1[0] = res1_0;
  424. C1[1] = res1_1;
  425. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  426. temp = bk - off;
  427. #ifdef LEFT
  428. temp -= 2; // number of values in A
  429. #else
  430. temp -= 2; // number of values in B
  431. #endif
  432. ptrba += temp*2;
  433. ptrbb += temp*2;
  434. #endif
  435. #ifdef LEFT
  436. off += 2; // number of values in A
  437. #endif
  438. C0 = C0+2;
  439. C1 = C1+2;
  440. }
  441. if ( bm & 1 ) // do any 1x2 loop
  442. {
  443. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  444. ptrbb = bb;
  445. #else
  446. ptrba += off*1;
  447. ptrbb = bb + off*2;
  448. #endif
  449. res0_0 = 0;
  450. res1_0 = 0;
  451. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  452. temp = bk-off;
  453. #elif defined(LEFT)
  454. temp = off+1; // number of values in A
  455. #else
  456. temp = off+2; // number of values in B
  457. #endif
  458. for (k=0; k<temp; k++)
  459. {
  460. b0 = ptrbb[0];
  461. b1 = ptrbb[1];
  462. a0 = ptrba[0];
  463. res0_0 += a0*b0;
  464. res1_0 += a0*b1;
  465. ptrba = ptrba+1;
  466. ptrbb = ptrbb+2;
  467. }
  468. res0_0 *= alpha;
  469. res1_0 *= alpha;
  470. C0[0] = res0_0;
  471. C1[0] = res1_0;
  472. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  473. temp = bk - off;
  474. #ifdef LEFT
  475. temp -= 1; // number of values in A
  476. #else
  477. temp -= 2; // number of values in B
  478. #endif
  479. ptrba += temp*1;
  480. ptrbb += temp*2;
  481. #endif
  482. #ifdef LEFT
  483. off += 1; // number of values in A
  484. #endif
  485. C0 = C0+1;
  486. C1 = C1+1;
  487. }
  488. #if defined(TRMMKERNEL) && !defined(LEFT)
  489. off += 2;
  490. #endif
  491. k = (bk<<1);
  492. bb = bb+k;
  493. i = (ldc<<1);
  494. C = C+i;
  495. }
  496. for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
  497. {
  498. C0 = C;
  499. #if defined(TRMMKERNEL) && defined(LEFT)
  500. off = offset;
  501. #endif
  502. ptrba = ba;
  503. for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
  504. {
  505. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  506. ptrbb = bb;
  507. #else
  508. ptrba += off*4;
  509. ptrbb = bb + off*1;
  510. #endif
  511. res0_0 = 0;
  512. res0_1 = 0;
  513. res0_2 = 0;
  514. res0_3 = 0;
  515. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  516. temp = bk-off;
  517. #elif defined(LEFT)
  518. temp = off+4; // number of values in A
  519. #else
  520. temp = off+1; // number of values in B
  521. #endif
  522. for (k=0; k<temp; k++)
  523. {
  524. b0 = ptrbb[0];
  525. a0 = ptrba[0];
  526. res0_0 += a0*b0;
  527. a1 = ptrba[1];
  528. res0_1 += a1*b0;
  529. a0 = ptrba[2];
  530. res0_2 += a0*b0;
  531. a1 = ptrba[3];
  532. res0_3 += a1*b0;
  533. ptrba = ptrba+4;
  534. ptrbb = ptrbb+1;
  535. }
  536. res0_0 *= alpha;
  537. res0_1 *= alpha;
  538. res0_2 *= alpha;
  539. res0_3 *= alpha;
  540. C0[0] = res0_0;
  541. C0[1] = res0_1;
  542. C0[2] = res0_2;
  543. C0[3] = res0_3;
  544. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  545. temp = bk - off;
  546. #ifdef LEFT
  547. temp -= 4; // number of values in A
  548. #else
  549. temp -= 1; // number of values in B
  550. #endif
  551. ptrba += temp*4;
  552. ptrbb += temp*1;
  553. #endif
  554. #ifdef LEFT
  555. off += 4; // number of values in A
  556. #endif
  557. C0 = C0+4;
  558. }
  559. if ( bm & 2 ) // do any 2x1 loop
  560. {
  561. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  562. ptrbb = bb;
  563. #else
  564. ptrba += off*2;
  565. ptrbb = bb + off*1;
  566. #endif
  567. res0_0 = 0;
  568. res0_1 = 0;
  569. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  570. temp = bk-off;
  571. #elif defined(LEFT)
  572. temp = off+2; // number of values in A
  573. #else
  574. temp = off+1; // number of values in B
  575. #endif
  576. for (k=0; k<temp; k++)
  577. {
  578. b0 = ptrbb[0];
  579. a0 = ptrba[0];
  580. res0_0 += a0*b0;
  581. a1 = ptrba[1];
  582. res0_1 += a1*b0;
  583. ptrba = ptrba+2;
  584. ptrbb = ptrbb+1;
  585. }
  586. res0_0 *= alpha;
  587. res0_1 *= alpha;
  588. C0[0] = res0_0;
  589. C0[1] = res0_1;
  590. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  591. temp = bk - off;
  592. #ifdef LEFT
  593. temp -= 2; // number of values in A
  594. #else
  595. temp -= 1; // number of values in B
  596. #endif
  597. ptrba += temp*2;
  598. ptrbb += temp*1;
  599. #endif
  600. #ifdef LEFT
  601. off += 2; // number of values in A
  602. #endif
  603. C0 = C0+2;
  604. }
  605. if ( bm & 1 ) // do any 1x1 loop
  606. {
  607. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  608. ptrbb = bb;
  609. #else
  610. ptrba += off*1;
  611. ptrbb = bb + off*1;
  612. #endif
  613. res0_0 = 0;
  614. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  615. temp = bk-off;
  616. #elif defined(LEFT)
  617. temp = off+1; // number of values in A
  618. #else
  619. temp = off+1; // number of values in B
  620. #endif
  621. for (k=0; k<temp; k++)
  622. {
  623. b0 = ptrbb[0];
  624. a0 = ptrba[0];
  625. res0_0 += a0*b0;
  626. ptrba = ptrba+1;
  627. ptrbb = ptrbb+1;
  628. }
  629. res0_0 *= alpha;
  630. C0[0] = res0_0;
  631. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  632. temp = bk - off;
  633. #ifdef LEFT
  634. temp -= 1; // number of values in A
  635. #else
  636. temp -= 1; // number of values in B
  637. #endif
  638. ptrba += temp*1;
  639. ptrbb += temp*1;
  640. #endif
  641. #ifdef LEFT
  642. off += 1; // number of values in A
  643. #endif
  644. C0 = C0+1;
  645. }
  646. #if defined(TRMMKERNEL) && !defined(LEFT)
  647. off += 1;
  648. #endif
  649. k = (bk<<0);
  650. bb = bb+k;
  651. C = C+ldc;
  652. }
  653. return 0;
  654. }