You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trmmkernel_8x8.c 49 kB


  1. #include "common.h"
  2. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
  3. {
  4. BLASLONG i,j,k;
  5. FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
  6. FLOAT res0_0;
  7. FLOAT res0_1;
  8. FLOAT res0_2;
  9. FLOAT res0_3;
  10. FLOAT res0_4;
  11. FLOAT res0_5;
  12. FLOAT res0_6;
  13. FLOAT res0_7;
  14. FLOAT res1_0;
  15. FLOAT res1_1;
  16. FLOAT res1_2;
  17. FLOAT res1_3;
  18. FLOAT res1_4;
  19. FLOAT res1_5;
  20. FLOAT res1_6;
  21. FLOAT res1_7;
  22. FLOAT res2_0;
  23. FLOAT res2_1;
  24. FLOAT res2_2;
  25. FLOAT res2_3;
  26. FLOAT res2_4;
  27. FLOAT res2_5;
  28. FLOAT res2_6;
  29. FLOAT res2_7;
  30. FLOAT res3_0;
  31. FLOAT res3_1;
  32. FLOAT res3_2;
  33. FLOAT res3_3;
  34. FLOAT res3_4;
  35. FLOAT res3_5;
  36. FLOAT res3_6;
  37. FLOAT res3_7;
  38. FLOAT res4_0;
  39. FLOAT res4_1;
  40. FLOAT res4_2;
  41. FLOAT res4_3;
  42. FLOAT res4_4;
  43. FLOAT res4_5;
  44. FLOAT res4_6;
  45. FLOAT res4_7;
  46. FLOAT res5_0;
  47. FLOAT res5_1;
  48. FLOAT res5_2;
  49. FLOAT res5_3;
  50. FLOAT res5_4;
  51. FLOAT res5_5;
  52. FLOAT res5_6;
  53. FLOAT res5_7;
  54. FLOAT res6_0;
  55. FLOAT res6_1;
  56. FLOAT res6_2;
  57. FLOAT res6_3;
  58. FLOAT res6_4;
  59. FLOAT res6_5;
  60. FLOAT res6_6;
  61. FLOAT res6_7;
  62. FLOAT res7_0;
  63. FLOAT res7_1;
  64. FLOAT res7_2;
  65. FLOAT res7_3;
  66. FLOAT res7_4;
  67. FLOAT res7_5;
  68. FLOAT res7_6;
  69. FLOAT res7_7;
  70. FLOAT a0;
  71. FLOAT a1;
  72. FLOAT b0;
  73. FLOAT b1;
  74. FLOAT b2;
  75. FLOAT b3;
  76. FLOAT b4;
  77. FLOAT b5;
  78. FLOAT b6;
  79. FLOAT b7;
  80. BLASLONG off, temp;
  81. #if !defined(LEFT)
  82. off = -offset;
  83. #else
  84. off = 0;
  85. #endif
  86. for (j=0; j<bn/8; j+=1)
  87. {
  88. C0 = C;
  89. C1 = C0+ldc;
  90. C2 = C1+ldc;
  91. C3 = C2+ldc;
  92. C4 = C3+ldc;
  93. C5 = C4+ldc;
  94. C6 = C5+ldc;
  95. C7 = C6+ldc;
  96. #if defined(TRMMKERNEL) && defined(LEFT)
  97. off = offset;
  98. #endif
  99. ptrba = ba;
  100. for (i=0; i<bm/8; i+=1)
  101. {
  102. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  103. ptrbb = bb;
  104. #else
  105. ptrba += off*8;
  106. ptrbb = bb + off*8;
  107. #endif
  108. res0_0 = 0;
  109. res0_1 = 0;
  110. res0_2 = 0;
  111. res0_3 = 0;
  112. res0_4 = 0;
  113. res0_5 = 0;
  114. res0_6 = 0;
  115. res0_7 = 0;
  116. res1_0 = 0;
  117. res1_1 = 0;
  118. res1_2 = 0;
  119. res1_3 = 0;
  120. res1_4 = 0;
  121. res1_5 = 0;
  122. res1_6 = 0;
  123. res1_7 = 0;
  124. res2_0 = 0;
  125. res2_1 = 0;
  126. res2_2 = 0;
  127. res2_3 = 0;
  128. res2_4 = 0;
  129. res2_5 = 0;
  130. res2_6 = 0;
  131. res2_7 = 0;
  132. res3_0 = 0;
  133. res3_1 = 0;
  134. res3_2 = 0;
  135. res3_3 = 0;
  136. res3_4 = 0;
  137. res3_5 = 0;
  138. res3_6 = 0;
  139. res3_7 = 0;
  140. res4_0 = 0;
  141. res4_1 = 0;
  142. res4_2 = 0;
  143. res4_3 = 0;
  144. res4_4 = 0;
  145. res4_5 = 0;
  146. res4_6 = 0;
  147. res4_7 = 0;
  148. res5_0 = 0;
  149. res5_1 = 0;
  150. res5_2 = 0;
  151. res5_3 = 0;
  152. res5_4 = 0;
  153. res5_5 = 0;
  154. res5_6 = 0;
  155. res5_7 = 0;
  156. res6_0 = 0;
  157. res6_1 = 0;
  158. res6_2 = 0;
  159. res6_3 = 0;
  160. res6_4 = 0;
  161. res6_5 = 0;
  162. res6_6 = 0;
  163. res6_7 = 0;
  164. res7_0 = 0;
  165. res7_1 = 0;
  166. res7_2 = 0;
  167. res7_3 = 0;
  168. res7_4 = 0;
  169. res7_5 = 0;
  170. res7_6 = 0;
  171. res7_7 = 0;
  172. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  173. temp = bk-off;
  174. #elif defined(LEFT)
  175. temp = off+8; // number of values in A
  176. #else
  177. temp = off+8; // number of values in B
  178. #endif
  179. for (k=0; k<temp; k++)
  180. {
  181. b0 = ptrbb[0];
  182. b1 = ptrbb[1];
  183. b2 = ptrbb[2];
  184. b3 = ptrbb[3];
  185. b4 = ptrbb[4];
  186. b5 = ptrbb[5];
  187. b6 = ptrbb[6];
  188. b7 = ptrbb[7];
  189. a0 = ptrba[0];
  190. res0_0 += a0*b0;
  191. res1_0 += a0*b1;
  192. res2_0 += a0*b2;
  193. res3_0 += a0*b3;
  194. res4_0 += a0*b4;
  195. res5_0 += a0*b5;
  196. res6_0 += a0*b6;
  197. res7_0 += a0*b7;
  198. a1 = ptrba[1];
  199. res0_1 += a1*b0;
  200. res1_1 += a1*b1;
  201. res2_1 += a1*b2;
  202. res3_1 += a1*b3;
  203. res4_1 += a1*b4;
  204. res5_1 += a1*b5;
  205. res6_1 += a1*b6;
  206. res7_1 += a1*b7;
  207. a0 = ptrba[2];
  208. res0_2 += a0*b0;
  209. res1_2 += a0*b1;
  210. res2_2 += a0*b2;
  211. res3_2 += a0*b3;
  212. res4_2 += a0*b4;
  213. res5_2 += a0*b5;
  214. res6_2 += a0*b6;
  215. res7_2 += a0*b7;
  216. a1 = ptrba[3];
  217. res0_3 += a1*b0;
  218. res1_3 += a1*b1;
  219. res2_3 += a1*b2;
  220. res3_3 += a1*b3;
  221. res4_3 += a1*b4;
  222. res5_3 += a1*b5;
  223. res6_3 += a1*b6;
  224. res7_3 += a1*b7;
  225. a0 = ptrba[4];
  226. res0_4 += a0*b0;
  227. res1_4 += a0*b1;
  228. res2_4 += a0*b2;
  229. res3_4 += a0*b3;
  230. res4_4 += a0*b4;
  231. res5_4 += a0*b5;
  232. res6_4 += a0*b6;
  233. res7_4 += a0*b7;
  234. a1 = ptrba[5];
  235. res0_5 += a1*b0;
  236. res1_5 += a1*b1;
  237. res2_5 += a1*b2;
  238. res3_5 += a1*b3;
  239. res4_5 += a1*b4;
  240. res5_5 += a1*b5;
  241. res6_5 += a1*b6;
  242. res7_5 += a1*b7;
  243. a0 = ptrba[6];
  244. res0_6 += a0*b0;
  245. res1_6 += a0*b1;
  246. res2_6 += a0*b2;
  247. res3_6 += a0*b3;
  248. res4_6 += a0*b4;
  249. res5_6 += a0*b5;
  250. res6_6 += a0*b6;
  251. res7_6 += a0*b7;
  252. a1 = ptrba[7];
  253. res0_7 += a1*b0;
  254. res1_7 += a1*b1;
  255. res2_7 += a1*b2;
  256. res3_7 += a1*b3;
  257. res4_7 += a1*b4;
  258. res5_7 += a1*b5;
  259. res6_7 += a1*b6;
  260. res7_7 += a1*b7;
  261. ptrba = ptrba+8;
  262. ptrbb = ptrbb+8;
  263. }
  264. res0_0 *= alpha;
  265. res0_1 *= alpha;
  266. res0_2 *= alpha;
  267. res0_3 *= alpha;
  268. res0_4 *= alpha;
  269. res0_5 *= alpha;
  270. res0_6 *= alpha;
  271. res0_7 *= alpha;
  272. res1_0 *= alpha;
  273. res1_1 *= alpha;
  274. res1_2 *= alpha;
  275. res1_3 *= alpha;
  276. res1_4 *= alpha;
  277. res1_5 *= alpha;
  278. res1_6 *= alpha;
  279. res1_7 *= alpha;
  280. res2_0 *= alpha;
  281. res2_1 *= alpha;
  282. res2_2 *= alpha;
  283. res2_3 *= alpha;
  284. res2_4 *= alpha;
  285. res2_5 *= alpha;
  286. res2_6 *= alpha;
  287. res2_7 *= alpha;
  288. res3_0 *= alpha;
  289. res3_1 *= alpha;
  290. res3_2 *= alpha;
  291. res3_3 *= alpha;
  292. res3_4 *= alpha;
  293. res3_5 *= alpha;
  294. res3_6 *= alpha;
  295. res3_7 *= alpha;
  296. res4_0 *= alpha;
  297. res4_1 *= alpha;
  298. res4_2 *= alpha;
  299. res4_3 *= alpha;
  300. res4_4 *= alpha;
  301. res4_5 *= alpha;
  302. res4_6 *= alpha;
  303. res4_7 *= alpha;
  304. res5_0 *= alpha;
  305. res5_1 *= alpha;
  306. res5_2 *= alpha;
  307. res5_3 *= alpha;
  308. res5_4 *= alpha;
  309. res5_5 *= alpha;
  310. res5_6 *= alpha;
  311. res5_7 *= alpha;
  312. res6_0 *= alpha;
  313. res6_1 *= alpha;
  314. res6_2 *= alpha;
  315. res6_3 *= alpha;
  316. res6_4 *= alpha;
  317. res6_5 *= alpha;
  318. res6_6 *= alpha;
  319. res6_7 *= alpha;
  320. res7_0 *= alpha;
  321. res7_1 *= alpha;
  322. res7_2 *= alpha;
  323. res7_3 *= alpha;
  324. res7_4 *= alpha;
  325. res7_5 *= alpha;
  326. res7_6 *= alpha;
  327. res7_7 *= alpha;
  328. C0[0] = res0_0;
  329. C0[1] = res0_1;
  330. C0[2] = res0_2;
  331. C0[3] = res0_3;
  332. C0[4] = res0_4;
  333. C0[5] = res0_5;
  334. C0[6] = res0_6;
  335. C0[7] = res0_7;
  336. C1[0] = res1_0;
  337. C1[1] = res1_1;
  338. C1[2] = res1_2;
  339. C1[3] = res1_3;
  340. C1[4] = res1_4;
  341. C1[5] = res1_5;
  342. C1[6] = res1_6;
  343. C1[7] = res1_7;
  344. C2[0] = res2_0;
  345. C2[1] = res2_1;
  346. C2[2] = res2_2;
  347. C2[3] = res2_3;
  348. C2[4] = res2_4;
  349. C2[5] = res2_5;
  350. C2[6] = res2_6;
  351. C2[7] = res2_7;
  352. C3[0] = res3_0;
  353. C3[1] = res3_1;
  354. C3[2] = res3_2;
  355. C3[3] = res3_3;
  356. C3[4] = res3_4;
  357. C3[5] = res3_5;
  358. C3[6] = res3_6;
  359. C3[7] = res3_7;
  360. C4[0] = res4_0;
  361. C4[1] = res4_1;
  362. C4[2] = res4_2;
  363. C4[3] = res4_3;
  364. C4[4] = res4_4;
  365. C4[5] = res4_5;
  366. C4[6] = res4_6;
  367. C4[7] = res4_7;
  368. C5[0] = res5_0;
  369. C5[1] = res5_1;
  370. C5[2] = res5_2;
  371. C5[3] = res5_3;
  372. C5[4] = res5_4;
  373. C5[5] = res5_5;
  374. C5[6] = res5_6;
  375. C5[7] = res5_7;
  376. C6[0] = res6_0;
  377. C6[1] = res6_1;
  378. C6[2] = res6_2;
  379. C6[3] = res6_3;
  380. C6[4] = res6_4;
  381. C6[5] = res6_5;
  382. C6[6] = res6_6;
  383. C6[7] = res6_7;
  384. C7[0] = res7_0;
  385. C7[1] = res7_1;
  386. C7[2] = res7_2;
  387. C7[3] = res7_3;
  388. C7[4] = res7_4;
  389. C7[5] = res7_5;
  390. C7[6] = res7_6;
  391. C7[7] = res7_7;
  392. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  393. temp = bk - off;
  394. #ifdef LEFT
  395. temp -= 8; // number of values in A
  396. #else
  397. temp -= 8; // number of values in B
  398. #endif
  399. ptrba += temp*8;
  400. ptrbb += temp*8;
  401. #endif
  402. #ifdef LEFT
  403. off += 8; // number of values in A
  404. #endif
  405. C0 = C0+8;
  406. C1 = C1+8;
  407. C2 = C2+8;
  408. C3 = C3+8;
  409. C4 = C4+8;
  410. C5 = C5+8;
  411. C6 = C6+8;
  412. C7 = C7+8;
  413. }
  414. if ( bm & 4 )
  415. {
  416. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  417. ptrbb = bb;
  418. #else
  419. ptrba += off*4;
  420. ptrbb = bb + off*8;
  421. #endif
  422. res0_0 = 0;
  423. res0_1 = 0;
  424. res0_2 = 0;
  425. res0_3 = 0;
  426. res1_0 = 0;
  427. res1_1 = 0;
  428. res1_2 = 0;
  429. res1_3 = 0;
  430. res2_0 = 0;
  431. res2_1 = 0;
  432. res2_2 = 0;
  433. res2_3 = 0;
  434. res3_0 = 0;
  435. res3_1 = 0;
  436. res3_2 = 0;
  437. res3_3 = 0;
  438. res4_0 = 0;
  439. res4_1 = 0;
  440. res4_2 = 0;
  441. res4_3 = 0;
  442. res5_0 = 0;
  443. res5_1 = 0;
  444. res5_2 = 0;
  445. res5_3 = 0;
  446. res6_0 = 0;
  447. res6_1 = 0;
  448. res6_2 = 0;
  449. res6_3 = 0;
  450. res7_0 = 0;
  451. res7_1 = 0;
  452. res7_2 = 0;
  453. res7_3 = 0;
  454. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  455. temp = bk-off;
  456. #elif defined(LEFT)
  457. temp = off+4; // number of values in A
  458. #else
  459. temp = off+8; // number of values in B
  460. #endif
  461. for (k=0; k<temp; k++)
  462. {
  463. b0 = ptrbb[0];
  464. b1 = ptrbb[1];
  465. b2 = ptrbb[2];
  466. b3 = ptrbb[3];
  467. b4 = ptrbb[4];
  468. b5 = ptrbb[5];
  469. b6 = ptrbb[6];
  470. b7 = ptrbb[7];
  471. a0 = ptrba[0];
  472. res0_0 += a0*b0;
  473. res1_0 += a0*b1;
  474. res2_0 += a0*b2;
  475. res3_0 += a0*b3;
  476. res4_0 += a0*b4;
  477. res5_0 += a0*b5;
  478. res6_0 += a0*b6;
  479. res7_0 += a0*b7;
  480. a1 = ptrba[1];
  481. res0_1 += a1*b0;
  482. res1_1 += a1*b1;
  483. res2_1 += a1*b2;
  484. res3_1 += a1*b3;
  485. res4_1 += a1*b4;
  486. res5_1 += a1*b5;
  487. res6_1 += a1*b6;
  488. res7_1 += a1*b7;
  489. a0 = ptrba[2];
  490. res0_2 += a0*b0;
  491. res1_2 += a0*b1;
  492. res2_2 += a0*b2;
  493. res3_2 += a0*b3;
  494. res4_2 += a0*b4;
  495. res5_2 += a0*b5;
  496. res6_2 += a0*b6;
  497. res7_2 += a0*b7;
  498. a1 = ptrba[3];
  499. res0_3 += a1*b0;
  500. res1_3 += a1*b1;
  501. res2_3 += a1*b2;
  502. res3_3 += a1*b3;
  503. res4_3 += a1*b4;
  504. res5_3 += a1*b5;
  505. res6_3 += a1*b6;
  506. res7_3 += a1*b7;
  507. ptrba = ptrba+4;
  508. ptrbb = ptrbb+8;
  509. }
  510. res0_0 *= alpha;
  511. res0_1 *= alpha;
  512. res0_2 *= alpha;
  513. res0_3 *= alpha;
  514. res1_0 *= alpha;
  515. res1_1 *= alpha;
  516. res1_2 *= alpha;
  517. res1_3 *= alpha;
  518. res2_0 *= alpha;
  519. res2_1 *= alpha;
  520. res2_2 *= alpha;
  521. res2_3 *= alpha;
  522. res3_0 *= alpha;
  523. res3_1 *= alpha;
  524. res3_2 *= alpha;
  525. res3_3 *= alpha;
  526. res4_0 *= alpha;
  527. res4_1 *= alpha;
  528. res4_2 *= alpha;
  529. res4_3 *= alpha;
  530. res5_0 *= alpha;
  531. res5_1 *= alpha;
  532. res5_2 *= alpha;
  533. res5_3 *= alpha;
  534. res6_0 *= alpha;
  535. res6_1 *= alpha;
  536. res6_2 *= alpha;
  537. res6_3 *= alpha;
  538. res7_0 *= alpha;
  539. res7_1 *= alpha;
  540. res7_2 *= alpha;
  541. res7_3 *= alpha;
  542. C0[0] = res0_0;
  543. C0[1] = res0_1;
  544. C0[2] = res0_2;
  545. C0[3] = res0_3;
  546. C1[0] = res1_0;
  547. C1[1] = res1_1;
  548. C1[2] = res1_2;
  549. C1[3] = res1_3;
  550. C2[0] = res2_0;
  551. C2[1] = res2_1;
  552. C2[2] = res2_2;
  553. C2[3] = res2_3;
  554. C3[0] = res3_0;
  555. C3[1] = res3_1;
  556. C3[2] = res3_2;
  557. C3[3] = res3_3;
  558. C4[0] = res4_0;
  559. C4[1] = res4_1;
  560. C4[2] = res4_2;
  561. C4[3] = res4_3;
  562. C5[0] = res5_0;
  563. C5[1] = res5_1;
  564. C5[2] = res5_2;
  565. C5[3] = res5_3;
  566. C6[0] = res6_0;
  567. C6[1] = res6_1;
  568. C6[2] = res6_2;
  569. C6[3] = res6_3;
  570. C7[0] = res7_0;
  571. C7[1] = res7_1;
  572. C7[2] = res7_2;
  573. C7[3] = res7_3;
  574. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  575. temp = bk - off;
  576. #ifdef LEFT
  577. temp -= 4; // number of values in A
  578. #else
  579. temp -= 8; // number of values in B
  580. #endif
  581. ptrba += temp*4;
  582. ptrbb += temp*8;
  583. #endif
  584. #ifdef LEFT
  585. off += 4; // number of values in A
  586. #endif
  587. C0 = C0+4;
  588. C1 = C1+4;
  589. C2 = C2+4;
  590. C3 = C3+4;
  591. C4 = C4+4;
  592. C5 = C5+4;
  593. C6 = C6+4;
  594. C7 = C7+4;
  595. }
  596. if ( bm & 2 )
  597. {
  598. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  599. ptrbb = bb;
  600. #else
  601. ptrba += off*2;
  602. ptrbb = bb + off*8;
  603. #endif
  604. res0_0 = 0;
  605. res0_1 = 0;
  606. res1_0 = 0;
  607. res1_1 = 0;
  608. res2_0 = 0;
  609. res2_1 = 0;
  610. res3_0 = 0;
  611. res3_1 = 0;
  612. res4_0 = 0;
  613. res4_1 = 0;
  614. res5_0 = 0;
  615. res5_1 = 0;
  616. res6_0 = 0;
  617. res6_1 = 0;
  618. res7_0 = 0;
  619. res7_1 = 0;
  620. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  621. temp = bk-off;
  622. #elif defined(LEFT)
  623. temp = off+2; // number of values in A
  624. #else
  625. temp = off+8; // number of values in B
  626. #endif
  627. for (k=0; k<temp; k++)
  628. {
  629. b0 = ptrbb[0];
  630. b1 = ptrbb[1];
  631. b2 = ptrbb[2];
  632. b3 = ptrbb[3];
  633. b4 = ptrbb[4];
  634. b5 = ptrbb[5];
  635. b6 = ptrbb[6];
  636. b7 = ptrbb[7];
  637. a0 = ptrba[0];
  638. res0_0 += a0*b0;
  639. res1_0 += a0*b1;
  640. res2_0 += a0*b2;
  641. res3_0 += a0*b3;
  642. res4_0 += a0*b4;
  643. res5_0 += a0*b5;
  644. res6_0 += a0*b6;
  645. res7_0 += a0*b7;
  646. a1 = ptrba[1];
  647. res0_1 += a1*b0;
  648. res1_1 += a1*b1;
  649. res2_1 += a1*b2;
  650. res3_1 += a1*b3;
  651. res4_1 += a1*b4;
  652. res5_1 += a1*b5;
  653. res6_1 += a1*b6;
  654. res7_1 += a1*b7;
  655. ptrba = ptrba+2;
  656. ptrbb = ptrbb+8;
  657. }
  658. res0_0 *= alpha;
  659. res0_1 *= alpha;
  660. res1_0 *= alpha;
  661. res1_1 *= alpha;
  662. res2_0 *= alpha;
  663. res2_1 *= alpha;
  664. res3_0 *= alpha;
  665. res3_1 *= alpha;
  666. res4_0 *= alpha;
  667. res4_1 *= alpha;
  668. res5_0 *= alpha;
  669. res5_1 *= alpha;
  670. res6_0 *= alpha;
  671. res6_1 *= alpha;
  672. res7_0 *= alpha;
  673. res7_1 *= alpha;
  674. C0[0] = res0_0;
  675. C0[1] = res0_1;
  676. C1[0] = res1_0;
  677. C1[1] = res1_1;
  678. C2[0] = res2_0;
  679. C2[1] = res2_1;
  680. C3[0] = res3_0;
  681. C3[1] = res3_1;
  682. C4[0] = res4_0;
  683. C4[1] = res4_1;
  684. C5[0] = res5_0;
  685. C5[1] = res5_1;
  686. C6[0] = res6_0;
  687. C6[1] = res6_1;
  688. C7[0] = res7_0;
  689. C7[1] = res7_1;
  690. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  691. temp = bk - off;
  692. #ifdef LEFT
  693. temp -= 2; // number of values in A
  694. #else
  695. temp -= 8; // number of values in B
  696. #endif
  697. ptrba += temp*2;
  698. ptrbb += temp*8;
  699. #endif
  700. #ifdef LEFT
  701. off += 2; // number of values in A
  702. #endif
  703. C0 = C0+2;
  704. C1 = C1+2;
  705. C2 = C2+2;
  706. C3 = C3+2;
  707. C4 = C4+2;
  708. C5 = C5+2;
  709. C6 = C6+2;
  710. C7 = C7+2;
  711. }
  712. if ( bm & 1 )
  713. {
  714. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  715. ptrbb = bb;
  716. #else
  717. ptrba += off*1;
  718. ptrbb = bb + off*8;
  719. #endif
  720. res0_0 = 0;
  721. res1_0 = 0;
  722. res2_0 = 0;
  723. res3_0 = 0;
  724. res4_0 = 0;
  725. res5_0 = 0;
  726. res6_0 = 0;
  727. res7_0 = 0;
  728. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  729. temp = bk-off;
  730. #elif defined(LEFT)
  731. temp = off+1; // number of values in A
  732. #else
  733. temp = off+8; // number of values in B
  734. #endif
  735. for (k=0; k<temp; k++)
  736. {
  737. b0 = ptrbb[0];
  738. b1 = ptrbb[1];
  739. b2 = ptrbb[2];
  740. b3 = ptrbb[3];
  741. b4 = ptrbb[4];
  742. b5 = ptrbb[5];
  743. b6 = ptrbb[6];
  744. b7 = ptrbb[7];
  745. a0 = ptrba[0];
  746. res0_0 += a0*b0;
  747. res1_0 += a0*b1;
  748. res2_0 += a0*b2;
  749. res3_0 += a0*b3;
  750. res4_0 += a0*b4;
  751. res5_0 += a0*b5;
  752. res6_0 += a0*b6;
  753. res7_0 += a0*b7;
  754. ptrba = ptrba+1;
  755. ptrbb = ptrbb+8;
  756. }
  757. res0_0 *= alpha;
  758. res1_0 *= alpha;
  759. res2_0 *= alpha;
  760. res3_0 *= alpha;
  761. res4_0 *= alpha;
  762. res5_0 *= alpha;
  763. res6_0 *= alpha;
  764. res7_0 *= alpha;
  765. C0[0] = res0_0;
  766. C1[0] = res1_0;
  767. C2[0] = res2_0;
  768. C3[0] = res3_0;
  769. C4[0] = res4_0;
  770. C5[0] = res5_0;
  771. C6[0] = res6_0;
  772. C7[0] = res7_0;
  773. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  774. temp = bk - off;
  775. #ifdef LEFT
  776. temp -= 1; // number of values in A
  777. #else
  778. temp -= 8; // number of values in B
  779. #endif
  780. ptrba += temp*1;
  781. ptrbb += temp*8;
  782. #endif
  783. #ifdef LEFT
  784. off += 1; // number of values in A
  785. #endif
  786. C0 = C0+1;
  787. C1 = C1+1;
  788. C2 = C2+1;
  789. C3 = C3+1;
  790. C4 = C4+1;
  791. C5 = C5+1;
  792. C6 = C6+1;
  793. C7 = C7+1;
  794. }
  795. #if defined(TRMMKERNEL) && !defined(LEFT)
  796. off += 8;
  797. #endif
  798. k = (bk<<3);
  799. bb = bb+k;
  800. i = (ldc<<3);
  801. C = C+i;
  802. }
  803. if (bn&4)
  804. {
  805. C0 = C;
  806. C1 = C0+ldc;
  807. C2 = C1+ldc;
  808. C3 = C2+ldc;
  809. #if defined(TRMMKERNEL) && defined(LEFT)
  810. off = offset;
  811. #endif
  812. ptrba = ba;
  813. for (i=0; i<bm/8; i+=1)
  814. {
  815. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  816. ptrbb = bb;
  817. #else
  818. ptrba += off*8;
  819. ptrbb = bb + off*4;
  820. #endif
  821. res0_0 = 0;
  822. res0_1 = 0;
  823. res0_2 = 0;
  824. res0_3 = 0;
  825. res0_4 = 0;
  826. res0_5 = 0;
  827. res0_6 = 0;
  828. res0_7 = 0;
  829. res1_0 = 0;
  830. res1_1 = 0;
  831. res1_2 = 0;
  832. res1_3 = 0;
  833. res1_4 = 0;
  834. res1_5 = 0;
  835. res1_6 = 0;
  836. res1_7 = 0;
  837. res2_0 = 0;
  838. res2_1 = 0;
  839. res2_2 = 0;
  840. res2_3 = 0;
  841. res2_4 = 0;
  842. res2_5 = 0;
  843. res2_6 = 0;
  844. res2_7 = 0;
  845. res3_0 = 0;
  846. res3_1 = 0;
  847. res3_2 = 0;
  848. res3_3 = 0;
  849. res3_4 = 0;
  850. res3_5 = 0;
  851. res3_6 = 0;
  852. res3_7 = 0;
  853. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  854. temp = bk-off;
  855. #elif defined(LEFT)
  856. temp = off+8; // number of values in A
  857. #else
  858. temp = off+4; // number of values in B
  859. #endif
  860. for (k=0; k<temp; k++)
  861. {
  862. b0 = ptrbb[0];
  863. b1 = ptrbb[1];
  864. b2 = ptrbb[2];
  865. b3 = ptrbb[3];
  866. a0 = ptrba[0];
  867. res0_0 += a0*b0;
  868. res1_0 += a0*b1;
  869. res2_0 += a0*b2;
  870. res3_0 += a0*b3;
  871. a1 = ptrba[1];
  872. res0_1 += a1*b0;
  873. res1_1 += a1*b1;
  874. res2_1 += a1*b2;
  875. res3_1 += a1*b3;
  876. a0 = ptrba[2];
  877. res0_2 += a0*b0;
  878. res1_2 += a0*b1;
  879. res2_2 += a0*b2;
  880. res3_2 += a0*b3;
  881. a1 = ptrba[3];
  882. res0_3 += a1*b0;
  883. res1_3 += a1*b1;
  884. res2_3 += a1*b2;
  885. res3_3 += a1*b3;
  886. a0 = ptrba[4];
  887. res0_4 += a0*b0;
  888. res1_4 += a0*b1;
  889. res2_4 += a0*b2;
  890. res3_4 += a0*b3;
  891. a1 = ptrba[5];
  892. res0_5 += a1*b0;
  893. res1_5 += a1*b1;
  894. res2_5 += a1*b2;
  895. res3_5 += a1*b3;
  896. a0 = ptrba[6];
  897. res0_6 += a0*b0;
  898. res1_6 += a0*b1;
  899. res2_6 += a0*b2;
  900. res3_6 += a0*b3;
  901. a1 = ptrba[7];
  902. res0_7 += a1*b0;
  903. res1_7 += a1*b1;
  904. res2_7 += a1*b2;
  905. res3_7 += a1*b3;
  906. ptrba = ptrba+8;
  907. ptrbb = ptrbb+4;
  908. }
  909. res0_0 *= alpha;
  910. res0_1 *= alpha;
  911. res0_2 *= alpha;
  912. res0_3 *= alpha;
  913. res0_4 *= alpha;
  914. res0_5 *= alpha;
  915. res0_6 *= alpha;
  916. res0_7 *= alpha;
  917. res1_0 *= alpha;
  918. res1_1 *= alpha;
  919. res1_2 *= alpha;
  920. res1_3 *= alpha;
  921. res1_4 *= alpha;
  922. res1_5 *= alpha;
  923. res1_6 *= alpha;
  924. res1_7 *= alpha;
  925. res2_0 *= alpha;
  926. res2_1 *= alpha;
  927. res2_2 *= alpha;
  928. res2_3 *= alpha;
  929. res2_4 *= alpha;
  930. res2_5 *= alpha;
  931. res2_6 *= alpha;
  932. res2_7 *= alpha;
  933. res3_0 *= alpha;
  934. res3_1 *= alpha;
  935. res3_2 *= alpha;
  936. res3_3 *= alpha;
  937. res3_4 *= alpha;
  938. res3_5 *= alpha;
  939. res3_6 *= alpha;
  940. res3_7 *= alpha;
  941. C0[0] = res0_0;
  942. C0[1] = res0_1;
  943. C0[2] = res0_2;
  944. C0[3] = res0_3;
  945. C0[4] = res0_4;
  946. C0[5] = res0_5;
  947. C0[6] = res0_6;
  948. C0[7] = res0_7;
  949. C1[0] = res1_0;
  950. C1[1] = res1_1;
  951. C1[2] = res1_2;
  952. C1[3] = res1_3;
  953. C1[4] = res1_4;
  954. C1[5] = res1_5;
  955. C1[6] = res1_6;
  956. C1[7] = res1_7;
  957. C2[0] = res2_0;
  958. C2[1] = res2_1;
  959. C2[2] = res2_2;
  960. C2[3] = res2_3;
  961. C2[4] = res2_4;
  962. C2[5] = res2_5;
  963. C2[6] = res2_6;
  964. C2[7] = res2_7;
  965. C3[0] = res3_0;
  966. C3[1] = res3_1;
  967. C3[2] = res3_2;
  968. C3[3] = res3_3;
  969. C3[4] = res3_4;
  970. C3[5] = res3_5;
  971. C3[6] = res3_6;
  972. C3[7] = res3_7;
  973. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  974. temp = bk - off;
  975. #ifdef LEFT
  976. temp -= 8; // number of values in A
  977. #else
  978. temp -= 4; // number of values in B
  979. #endif
  980. ptrba += temp*8;
  981. ptrbb += temp*4;
  982. #endif
  983. #ifdef LEFT
  984. off += 8; // number of values in A
  985. #endif
  986. C0 = C0+8;
  987. C1 = C1+8;
  988. C2 = C2+8;
  989. C3 = C3+8;
  990. }
  991. if ( bm & 4 )
  992. {
  993. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  994. ptrbb = bb;
  995. #else
  996. ptrba += off*4;
  997. ptrbb = bb + off*4;
  998. #endif
  999. res0_0 = 0;
  1000. res0_1 = 0;
  1001. res0_2 = 0;
  1002. res0_3 = 0;
  1003. res1_0 = 0;
  1004. res1_1 = 0;
  1005. res1_2 = 0;
  1006. res1_3 = 0;
  1007. res2_0 = 0;
  1008. res2_1 = 0;
  1009. res2_2 = 0;
  1010. res2_3 = 0;
  1011. res3_0 = 0;
  1012. res3_1 = 0;
  1013. res3_2 = 0;
  1014. res3_3 = 0;
  1015. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1016. temp = bk-off;
  1017. #elif defined(LEFT)
  1018. temp = off+4; // number of values in A
  1019. #else
  1020. temp = off+4; // number of values in B
  1021. #endif
  1022. for (k=0; k<temp; k++)
  1023. {
  1024. b0 = ptrbb[0];
  1025. b1 = ptrbb[1];
  1026. b2 = ptrbb[2];
  1027. b3 = ptrbb[3];
  1028. a0 = ptrba[0];
  1029. res0_0 += a0*b0;
  1030. res1_0 += a0*b1;
  1031. res2_0 += a0*b2;
  1032. res3_0 += a0*b3;
  1033. a1 = ptrba[1];
  1034. res0_1 += a1*b0;
  1035. res1_1 += a1*b1;
  1036. res2_1 += a1*b2;
  1037. res3_1 += a1*b3;
  1038. a0 = ptrba[2];
  1039. res0_2 += a0*b0;
  1040. res1_2 += a0*b1;
  1041. res2_2 += a0*b2;
  1042. res3_2 += a0*b3;
  1043. a1 = ptrba[3];
  1044. res0_3 += a1*b0;
  1045. res1_3 += a1*b1;
  1046. res2_3 += a1*b2;
  1047. res3_3 += a1*b3;
  1048. ptrba = ptrba+4;
  1049. ptrbb = ptrbb+4;
  1050. }
  1051. res0_0 *= alpha;
  1052. res0_1 *= alpha;
  1053. res0_2 *= alpha;
  1054. res0_3 *= alpha;
  1055. res1_0 *= alpha;
  1056. res1_1 *= alpha;
  1057. res1_2 *= alpha;
  1058. res1_3 *= alpha;
  1059. res2_0 *= alpha;
  1060. res2_1 *= alpha;
  1061. res2_2 *= alpha;
  1062. res2_3 *= alpha;
  1063. res3_0 *= alpha;
  1064. res3_1 *= alpha;
  1065. res3_2 *= alpha;
  1066. res3_3 *= alpha;
  1067. C0[0] = res0_0;
  1068. C0[1] = res0_1;
  1069. C0[2] = res0_2;
  1070. C0[3] = res0_3;
  1071. C1[0] = res1_0;
  1072. C1[1] = res1_1;
  1073. C1[2] = res1_2;
  1074. C1[3] = res1_3;
  1075. C2[0] = res2_0;
  1076. C2[1] = res2_1;
  1077. C2[2] = res2_2;
  1078. C2[3] = res2_3;
  1079. C3[0] = res3_0;
  1080. C3[1] = res3_1;
  1081. C3[2] = res3_2;
  1082. C3[3] = res3_3;
  1083. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1084. temp = bk - off;
  1085. #ifdef LEFT
  1086. temp -= 4; // number of values in A
  1087. #else
  1088. temp -= 4; // number of values in B
  1089. #endif
  1090. ptrba += temp*4;
  1091. ptrbb += temp*4;
  1092. #endif
  1093. #ifdef LEFT
  1094. off += 4; // number of values in A
  1095. #endif
  1096. C0 = C0+4;
  1097. C1 = C1+4;
  1098. C2 = C2+4;
  1099. C3 = C3+4;
  1100. }
  1101. if ( bm & 2 )
  1102. {
  1103. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1104. ptrbb = bb;
  1105. #else
  1106. ptrba += off*2;
  1107. ptrbb = bb + off*4;
  1108. #endif
  1109. res0_0 = 0;
  1110. res0_1 = 0;
  1111. res1_0 = 0;
  1112. res1_1 = 0;
  1113. res2_0 = 0;
  1114. res2_1 = 0;
  1115. res3_0 = 0;
  1116. res3_1 = 0;
  1117. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1118. temp = bk-off;
  1119. #elif defined(LEFT)
  1120. temp = off+2; // number of values in A
  1121. #else
  1122. temp = off+4; // number of values in B
  1123. #endif
  1124. for (k=0; k<temp; k++)
  1125. {
  1126. b0 = ptrbb[0];
  1127. b1 = ptrbb[1];
  1128. b2 = ptrbb[2];
  1129. b3 = ptrbb[3];
  1130. a0 = ptrba[0];
  1131. res0_0 += a0*b0;
  1132. res1_0 += a0*b1;
  1133. res2_0 += a0*b2;
  1134. res3_0 += a0*b3;
  1135. a1 = ptrba[1];
  1136. res0_1 += a1*b0;
  1137. res1_1 += a1*b1;
  1138. res2_1 += a1*b2;
  1139. res3_1 += a1*b3;
  1140. ptrba = ptrba+2;
  1141. ptrbb = ptrbb+4;
  1142. }
  1143. res0_0 *= alpha;
  1144. res0_1 *= alpha;
  1145. res1_0 *= alpha;
  1146. res1_1 *= alpha;
  1147. res2_0 *= alpha;
  1148. res2_1 *= alpha;
  1149. res3_0 *= alpha;
  1150. res3_1 *= alpha;
  1151. C0[0] = res0_0;
  1152. C0[1] = res0_1;
  1153. C1[0] = res1_0;
  1154. C1[1] = res1_1;
  1155. C2[0] = res2_0;
  1156. C2[1] = res2_1;
  1157. C3[0] = res3_0;
  1158. C3[1] = res3_1;
  1159. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1160. temp = bk - off;
  1161. #ifdef LEFT
  1162. temp -= 2; // number of values in A
  1163. #else
  1164. temp -= 4; // number of values in B
  1165. #endif
  1166. ptrba += temp*2;
  1167. ptrbb += temp*4;
  1168. #endif
  1169. #ifdef LEFT
  1170. off += 2; // number of values in A
  1171. #endif
  1172. C0 = C0+2;
  1173. C1 = C1+2;
  1174. C2 = C2+2;
  1175. C3 = C3+2;
  1176. }
  1177. if ( bm & 1 )
  1178. {
  1179. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1180. ptrbb = bb;
  1181. #else
  1182. ptrba += off*1;
  1183. ptrbb = bb + off*4;
  1184. #endif
  1185. res0_0 = 0;
  1186. res1_0 = 0;
  1187. res2_0 = 0;
  1188. res3_0 = 0;
  1189. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1190. temp = bk-off;
  1191. #elif defined(LEFT)
  1192. temp = off+1; // number of values in A
  1193. #else
  1194. temp = off+4; // number of values in B
  1195. #endif
  1196. for (k=0; k<temp; k++)
  1197. {
  1198. b0 = ptrbb[0];
  1199. b1 = ptrbb[1];
  1200. b2 = ptrbb[2];
  1201. b3 = ptrbb[3];
  1202. a0 = ptrba[0];
  1203. res0_0 += a0*b0;
  1204. res1_0 += a0*b1;
  1205. res2_0 += a0*b2;
  1206. res3_0 += a0*b3;
  1207. ptrba = ptrba+1;
  1208. ptrbb = ptrbb+4;
  1209. }
  1210. res0_0 *= alpha;
  1211. res1_0 *= alpha;
  1212. res2_0 *= alpha;
  1213. res3_0 *= alpha;
  1214. C0[0] = res0_0;
  1215. C1[0] = res1_0;
  1216. C2[0] = res2_0;
  1217. C3[0] = res3_0;
  1218. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1219. temp = bk - off;
  1220. #ifdef LEFT
  1221. temp -= 1; // number of values in A
  1222. #else
  1223. temp -= 4; // number of values in B
  1224. #endif
  1225. ptrba += temp*1;
  1226. ptrbb += temp*4;
  1227. #endif
  1228. #ifdef LEFT
  1229. off += 1; // number of values in A
  1230. #endif
  1231. C0 = C0+1;
  1232. C1 = C1+1;
  1233. C2 = C2+1;
  1234. C3 = C3+1;
  1235. }
  1236. #if defined(TRMMKERNEL) && !defined(LEFT)
  1237. off += 4;
  1238. #endif
  1239. k = (bk<<2);
  1240. bb = bb+k;
  1241. i = (ldc<<2);
  1242. C = C+i;
  1243. }
  1244. for (j=0; j<(bn&2); j+=2)
  1245. {
  1246. C0 = C;
  1247. C1 = C0+ldc;
  1248. #if defined(TRMMKERNEL) && defined(LEFT)
  1249. off = offset;
  1250. #endif
  1251. ptrba = ba;
  1252. for (i=0; i<bm/8; i+=1)
  1253. {
  1254. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1255. ptrbb = bb;
  1256. #else
  1257. ptrba += off*8;
  1258. ptrbb = bb + off*2;
  1259. #endif
  1260. res0_0 = 0;
  1261. res0_1 = 0;
  1262. res0_2 = 0;
  1263. res0_3 = 0;
  1264. res0_4 = 0;
  1265. res0_5 = 0;
  1266. res0_6 = 0;
  1267. res0_7 = 0;
  1268. res1_0 = 0;
  1269. res1_1 = 0;
  1270. res1_2 = 0;
  1271. res1_3 = 0;
  1272. res1_4 = 0;
  1273. res1_5 = 0;
  1274. res1_6 = 0;
  1275. res1_7 = 0;
  1276. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1277. temp = bk-off;
  1278. #elif defined(LEFT)
  1279. temp = off+8; // number of values in A
  1280. #else
  1281. temp = off+2; // number of values in B
  1282. #endif
  1283. for (k=0; k<temp; k++)
  1284. {
  1285. b0 = ptrbb[0];
  1286. b1 = ptrbb[1];
  1287. a0 = ptrba[0];
  1288. res0_0 += a0*b0;
  1289. res1_0 += a0*b1;
  1290. a1 = ptrba[1];
  1291. res0_1 += a1*b0;
  1292. res1_1 += a1*b1;
  1293. a0 = ptrba[2];
  1294. res0_2 += a0*b0;
  1295. res1_2 += a0*b1;
  1296. a1 = ptrba[3];
  1297. res0_3 += a1*b0;
  1298. res1_3 += a1*b1;
  1299. a0 = ptrba[4];
  1300. res0_4 += a0*b0;
  1301. res1_4 += a0*b1;
  1302. a1 = ptrba[5];
  1303. res0_5 += a1*b0;
  1304. res1_5 += a1*b1;
  1305. a0 = ptrba[6];
  1306. res0_6 += a0*b0;
  1307. res1_6 += a0*b1;
  1308. a1 = ptrba[7];
  1309. res0_7 += a1*b0;
  1310. res1_7 += a1*b1;
  1311. ptrba = ptrba+8;
  1312. ptrbb = ptrbb+2;
  1313. }
  1314. res0_0 *= alpha;
  1315. res0_1 *= alpha;
  1316. res0_2 *= alpha;
  1317. res0_3 *= alpha;
  1318. res0_4 *= alpha;
  1319. res0_5 *= alpha;
  1320. res0_6 *= alpha;
  1321. res0_7 *= alpha;
  1322. res1_0 *= alpha;
  1323. res1_1 *= alpha;
  1324. res1_2 *= alpha;
  1325. res1_3 *= alpha;
  1326. res1_4 *= alpha;
  1327. res1_5 *= alpha;
  1328. res1_6 *= alpha;
  1329. res1_7 *= alpha;
  1330. C0[0] = res0_0;
  1331. C0[1] = res0_1;
  1332. C0[2] = res0_2;
  1333. C0[3] = res0_3;
  1334. C0[4] = res0_4;
  1335. C0[5] = res0_5;
  1336. C0[6] = res0_6;
  1337. C0[7] = res0_7;
  1338. C1[0] = res1_0;
  1339. C1[1] = res1_1;
  1340. C1[2] = res1_2;
  1341. C1[3] = res1_3;
  1342. C1[4] = res1_4;
  1343. C1[5] = res1_5;
  1344. C1[6] = res1_6;
  1345. C1[7] = res1_7;
  1346. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1347. temp = bk - off;
  1348. #ifdef LEFT
  1349. temp -= 8; // number of values in A
  1350. #else
  1351. temp -= 2; // number of values in B
  1352. #endif
  1353. ptrba += temp*8;
  1354. ptrbb += temp*2;
  1355. #endif
  1356. #ifdef LEFT
  1357. off += 8; // number of values in A
  1358. #endif
  1359. C0 = C0+8;
  1360. C1 = C1+8;
  1361. }
  1362. if ( bm & 4 )
  1363. {
  1364. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1365. ptrbb = bb;
  1366. #else
  1367. ptrba += off*4;
  1368. ptrbb = bb + off*2;
  1369. #endif
  1370. res0_0 = 0;
  1371. res0_1 = 0;
  1372. res0_2 = 0;
  1373. res0_3 = 0;
  1374. res1_0 = 0;
  1375. res1_1 = 0;
  1376. res1_2 = 0;
  1377. res1_3 = 0;
  1378. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1379. temp = bk-off;
  1380. #elif defined(LEFT)
  1381. temp = off+4; // number of values in A
  1382. #else
  1383. temp = off+2; // number of values in B
  1384. #endif
  1385. for (k=0; k<temp; k++)
  1386. {
  1387. b0 = ptrbb[0];
  1388. b1 = ptrbb[1];
  1389. a0 = ptrba[0];
  1390. res0_0 += a0*b0;
  1391. res1_0 += a0*b1;
  1392. a1 = ptrba[1];
  1393. res0_1 += a1*b0;
  1394. res1_1 += a1*b1;
  1395. a0 = ptrba[2];
  1396. res0_2 += a0*b0;
  1397. res1_2 += a0*b1;
  1398. a1 = ptrba[3];
  1399. res0_3 += a1*b0;
  1400. res1_3 += a1*b1;
  1401. ptrba = ptrba+4;
  1402. ptrbb = ptrbb+2;
  1403. }
  1404. res0_0 *= alpha;
  1405. res0_1 *= alpha;
  1406. res0_2 *= alpha;
  1407. res0_3 *= alpha;
  1408. res1_0 *= alpha;
  1409. res1_1 *= alpha;
  1410. res1_2 *= alpha;
  1411. res1_3 *= alpha;
  1412. C0[0] = res0_0;
  1413. C0[1] = res0_1;
  1414. C0[2] = res0_2;
  1415. C0[3] = res0_3;
  1416. C1[0] = res1_0;
  1417. C1[1] = res1_1;
  1418. C1[2] = res1_2;
  1419. C1[3] = res1_3;
  1420. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1421. temp = bk - off;
  1422. #ifdef LEFT
  1423. temp -= 4; // number of values in A
  1424. #else
  1425. temp -= 2; // number of values in B
  1426. #endif
  1427. ptrba += temp*4;
  1428. ptrbb += temp*2;
  1429. #endif
  1430. #ifdef LEFT
  1431. off += 4; // number of values in A
  1432. #endif
  1433. C0 = C0+4;
  1434. C1 = C1+4;
  1435. }
  1436. if ( bm & 2 )
  1437. {
  1438. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1439. ptrbb = bb;
  1440. #else
  1441. ptrba += off*2;
  1442. ptrbb = bb + off*2;
  1443. #endif
  1444. res0_0 = 0;
  1445. res0_1 = 0;
  1446. res1_0 = 0;
  1447. res1_1 = 0;
  1448. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1449. temp = bk-off;
  1450. #elif defined(LEFT)
  1451. temp = off+2; // number of values in A
  1452. #else
  1453. temp = off+2; // number of values in B
  1454. #endif
  1455. for (k=0; k<temp; k++)
  1456. {
  1457. b0 = ptrbb[0];
  1458. b1 = ptrbb[1];
  1459. a0 = ptrba[0];
  1460. res0_0 += a0*b0;
  1461. res1_0 += a0*b1;
  1462. a1 = ptrba[1];
  1463. res0_1 += a1*b0;
  1464. res1_1 += a1*b1;
  1465. ptrba = ptrba+2;
  1466. ptrbb = ptrbb+2;
  1467. }
  1468. res0_0 *= alpha;
  1469. res0_1 *= alpha;
  1470. res1_0 *= alpha;
  1471. res1_1 *= alpha;
  1472. C0[0] = res0_0;
  1473. C0[1] = res0_1;
  1474. C1[0] = res1_0;
  1475. C1[1] = res1_1;
  1476. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1477. temp = bk - off;
  1478. #ifdef LEFT
  1479. temp -= 2; // number of values in A
  1480. #else
  1481. temp -= 2; // number of values in B
  1482. #endif
  1483. ptrba += temp*2;
  1484. ptrbb += temp*2;
  1485. #endif
  1486. #ifdef LEFT
  1487. off += 2; // number of values in A
  1488. #endif
  1489. C0 = C0+2;
  1490. C1 = C1+2;
  1491. }
  1492. if ( bm & 1 )
  1493. {
  1494. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1495. ptrbb = bb;
  1496. #else
  1497. ptrba += off*1;
  1498. ptrbb = bb + off*2;
  1499. #endif
  1500. res0_0 = 0;
  1501. res1_0 = 0;
  1502. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1503. temp = bk-off;
  1504. #elif defined(LEFT)
  1505. temp = off+1; // number of values in A
  1506. #else
  1507. temp = off+2; // number of values in B
  1508. #endif
  1509. for (k=0; k<temp; k++)
  1510. {
  1511. b0 = ptrbb[0];
  1512. b1 = ptrbb[1];
  1513. a0 = ptrba[0];
  1514. res0_0 += a0*b0;
  1515. res1_0 += a0*b1;
  1516. ptrba = ptrba+1;
  1517. ptrbb = ptrbb+2;
  1518. }
  1519. res0_0 *= alpha;
  1520. res1_0 *= alpha;
  1521. C0[0] = res0_0;
  1522. C1[0] = res1_0;
  1523. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1524. temp = bk - off;
  1525. #ifdef LEFT
  1526. temp -= 1; // number of values in A
  1527. #else
  1528. temp -= 2; // number of values in B
  1529. #endif
  1530. ptrba += temp*1;
  1531. ptrbb += temp*2;
  1532. #endif
  1533. #ifdef LEFT
  1534. off += 1; // number of values in A
  1535. #endif
  1536. C0 = C0+1;
  1537. C1 = C1+1;
  1538. }
  1539. #if defined(TRMMKERNEL) && !defined(LEFT)
  1540. off += 2;
  1541. #endif
  1542. k = (bk<<1);
  1543. bb = bb+k;
  1544. i = (ldc<<1);
  1545. C = C+i;
  1546. }
  1547. for (j=0; j<(bn&1); j+=1)
  1548. {
  1549. C0 = C;
  1550. #if defined(TRMMKERNEL) && defined(LEFT)
  1551. off = offset;
  1552. #endif
  1553. ptrba = ba;
  1554. for (i=0; i<bm/8; i+=1)
  1555. {
  1556. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1557. ptrbb = bb;
  1558. #else
  1559. ptrba += off*8;
  1560. ptrbb = bb + off*1;
  1561. #endif
  1562. res0_0 = 0;
  1563. res0_1 = 0;
  1564. res0_2 = 0;
  1565. res0_3 = 0;
  1566. res0_4 = 0;
  1567. res0_5 = 0;
  1568. res0_6 = 0;
  1569. res0_7 = 0;
  1570. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1571. temp = bk-off;
  1572. #elif defined(LEFT)
  1573. temp = off+8; // number of values in A
  1574. #else
  1575. temp = off+1; // number of values in B
  1576. #endif
  1577. for (k=0; k<temp; k++)
  1578. {
  1579. b0 = ptrbb[0];
  1580. a0 = ptrba[0];
  1581. res0_0 += a0*b0;
  1582. a1 = ptrba[1];
  1583. res0_1 += a1*b0;
  1584. a0 = ptrba[2];
  1585. res0_2 += a0*b0;
  1586. a1 = ptrba[3];
  1587. res0_3 += a1*b0;
  1588. a0 = ptrba[4];
  1589. res0_4 += a0*b0;
  1590. a1 = ptrba[5];
  1591. res0_5 += a1*b0;
  1592. a0 = ptrba[6];
  1593. res0_6 += a0*b0;
  1594. a1 = ptrba[7];
  1595. res0_7 += a1*b0;
  1596. ptrba = ptrba+8;
  1597. ptrbb = ptrbb+1;
  1598. }
  1599. res0_0 *= alpha;
  1600. res0_1 *= alpha;
  1601. res0_2 *= alpha;
  1602. res0_3 *= alpha;
  1603. res0_4 *= alpha;
  1604. res0_5 *= alpha;
  1605. res0_6 *= alpha;
  1606. res0_7 *= alpha;
  1607. C0[0] = res0_0;
  1608. C0[1] = res0_1;
  1609. C0[2] = res0_2;
  1610. C0[3] = res0_3;
  1611. C0[4] = res0_4;
  1612. C0[5] = res0_5;
  1613. C0[6] = res0_6;
  1614. C0[7] = res0_7;
  1615. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1616. temp = bk - off;
  1617. #ifdef LEFT
  1618. temp -= 8; // number of values in A
  1619. #else
  1620. temp -= 1; // number of values in B
  1621. #endif
  1622. ptrba += temp*8;
  1623. ptrbb += temp*1;
  1624. #endif
  1625. #ifdef LEFT
  1626. off += 8; // number of values in A
  1627. #endif
  1628. C0 = C0+8;
  1629. }
  1630. if ( bm & 4 )
  1631. {
  1632. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1633. ptrbb = bb;
  1634. #else
  1635. ptrba += off*4;
  1636. ptrbb = bb + off*1;
  1637. #endif
  1638. res0_0 = 0;
  1639. res0_1 = 0;
  1640. res0_2 = 0;
  1641. res0_3 = 0;
  1642. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1643. temp = bk-off;
  1644. #elif defined(LEFT)
  1645. temp = off+4; // number of values in A
  1646. #else
  1647. temp = off+1; // number of values in B
  1648. #endif
  1649. for (k=0; k<temp; k++)
  1650. {
  1651. b0 = ptrbb[0];
  1652. a0 = ptrba[0];
  1653. res0_0 += a0*b0;
  1654. a1 = ptrba[1];
  1655. res0_1 += a1*b0;
  1656. a0 = ptrba[2];
  1657. res0_2 += a0*b0;
  1658. a1 = ptrba[3];
  1659. res0_3 += a1*b0;
  1660. ptrba = ptrba+4;
  1661. ptrbb = ptrbb+1;
  1662. }
  1663. res0_0 *= alpha;
  1664. res0_1 *= alpha;
  1665. res0_2 *= alpha;
  1666. res0_3 *= alpha;
  1667. C0[0] = res0_0;
  1668. C0[1] = res0_1;
  1669. C0[2] = res0_2;
  1670. C0[3] = res0_3;
  1671. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1672. temp = bk - off;
  1673. #ifdef LEFT
  1674. temp -= 4; // number of values in A
  1675. #else
  1676. temp -= 1; // number of values in B
  1677. #endif
  1678. ptrba += temp*4;
  1679. ptrbb += temp*1;
  1680. #endif
  1681. #ifdef LEFT
  1682. off += 4; // number of values in A
  1683. #endif
  1684. C0 = C0+4;
  1685. }
  1686. if ( bm & 2 )
  1687. {
  1688. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1689. ptrbb = bb;
  1690. #else
  1691. ptrba += off*2;
  1692. ptrbb = bb + off*1;
  1693. #endif
  1694. res0_0 = 0;
  1695. res0_1 = 0;
  1696. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1697. temp = bk-off;
  1698. #elif defined(LEFT)
  1699. temp = off+2; // number of values in A
  1700. #else
  1701. temp = off+1; // number of values in B
  1702. #endif
  1703. for (k=0; k<temp; k++)
  1704. {
  1705. b0 = ptrbb[0];
  1706. a0 = ptrba[0];
  1707. res0_0 += a0*b0;
  1708. a1 = ptrba[1];
  1709. res0_1 += a1*b0;
  1710. ptrba = ptrba+2;
  1711. ptrbb = ptrbb+1;
  1712. }
  1713. res0_0 *= alpha;
  1714. res0_1 *= alpha;
  1715. C0[0] = res0_0;
  1716. C0[1] = res0_1;
  1717. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1718. temp = bk - off;
  1719. #ifdef LEFT
  1720. temp -= 2; // number of values in A
  1721. #else
  1722. temp -= 1; // number of values in B
  1723. #endif
  1724. ptrba += temp*2;
  1725. ptrbb += temp*1;
  1726. #endif
  1727. #ifdef LEFT
  1728. off += 2; // number of values in A
  1729. #endif
  1730. C0 = C0+2;
  1731. }
  1732. if ( bm & 1 )
  1733. {
  1734. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1735. ptrbb = bb;
  1736. #else
  1737. ptrba += off*1;
  1738. ptrbb = bb + off*1;
  1739. #endif
  1740. res0_0 = 0;
  1741. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1742. temp = bk-off;
  1743. #elif defined(LEFT)
  1744. temp = off+1; // number of values in A
  1745. #else
  1746. temp = off+1; // number of values in B
  1747. #endif
  1748. for (k=0; k<temp; k++)
  1749. {
  1750. b0 = ptrbb[0];
  1751. a0 = ptrba[0];
  1752. res0_0 += a0*b0;
  1753. ptrba = ptrba+1;
  1754. ptrbb = ptrbb+1;
  1755. }
  1756. res0_0 *= alpha;
  1757. C0[0] = res0_0;
  1758. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1759. temp = bk - off;
  1760. #ifdef LEFT
  1761. temp -= 1; // number of values in A
  1762. #else
  1763. temp -= 1; // number of values in B
  1764. #endif
  1765. ptrba += temp*1;
  1766. ptrbb += temp*1;
  1767. #endif
  1768. #ifdef LEFT
  1769. off += 1; // number of values in A
  1770. #endif
  1771. C0 = C0+1;
  1772. }
  1773. #if defined(TRMMKERNEL) && !defined(LEFT)
  1774. off += 1;
  1775. #endif
  1776. k = (bk<<0);
  1777. bb = bb+k;
  1778. C = C+ldc;
  1779. }
  1780. return 0;
  1781. }