You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrmmkernel_4x4.c 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883
  1. #include "common.h"
  /*
   * Complex arithmetic helpers for the kernel below.
   *
   * Every macro argument except C is an identifier *prefix*: the macros paste
   * "_r" and "_i" onto it to reach the real and imaginary parts.  For example
   * MADD(acc, a, b) reads a_r, a_i, b_r, b_i and updates acc_r, acc_i.
   *
   * Each macro expands to a single do { ... } while (0) statement so that it
   * stays one statement under an unbraced if/else; callers terminate it with
   * their own semicolon.
   */

  /* C[0] + i*C[1] = res * alpha (complex product; C is overwritten). */
  #define MADD_ALPHA_N_STORE(C, res, alpha) \
      do { \
          (C)[0] = res ## _r * alpha ## _r - res ## _i * alpha ## _i; \
          (C)[1] = res ## _r * alpha ## _i + res ## _i * alpha ## _r; \
      } while (0)

  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  /* res += op1 * op2 */
  #define MADD(res, op1, op2) \
      do { \
          res ## _r += op1 ## _r * op2 ## _r; \
          res ## _r -= op1 ## _i * op2 ## _i; \
          res ## _i += op1 ## _r * op2 ## _i; \
          res ## _i += op1 ## _i * op2 ## _r; \
      } while (0)
  #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  /* res += op1 * conj(op2) */
  #define MADD(res, op1, op2) \
      do { \
          res ## _r += op1 ## _r * op2 ## _r; \
          res ## _r += op1 ## _i * op2 ## _i; \
          res ## _i -= op1 ## _r * op2 ## _i; \
          res ## _i += op1 ## _i * op2 ## _r; \
      } while (0)
  #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  /* res += conj(op1) * op2 */
  #define MADD(res, op1, op2) \
      do { \
          res ## _r += op1 ## _r * op2 ## _r; \
          res ## _r += op1 ## _i * op2 ## _i; \
          res ## _i += op1 ## _r * op2 ## _i; \
          res ## _i -= op1 ## _i * op2 ## _r; \
      } while (0)
  #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  /* res += conj(op1) * conj(op2) */
  #define MADD(res, op1, op2) \
      do { \
          res ## _r += op1 ## _r * op2 ## _r; \
          res ## _r -= op1 ## _i * op2 ## _i; \
          res ## _i -= op1 ## _r * op2 ## _i; \
          res ## _i -= op1 ## _i * op2 ## _r; \
      } while (0)
  #endif
  30. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  31. , BLASLONG offset
  32. )
  33. {
  34. BLASLONG i,j,k;
  35. FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
  36. FLOAT res00_r, res01_r, res02_r, res03_r;
  37. FLOAT res00_i, res01_i, res02_i, res03_i;
  38. FLOAT res10_r, res11_r, res12_r, res13_r;
  39. FLOAT res10_i, res11_i, res12_i, res13_i;
  40. FLOAT res20_r, res21_r, res22_r, res23_r;
  41. FLOAT res20_i, res21_i, res22_i, res23_i;
  42. FLOAT res30_r, res31_r, res32_r, res33_r;
  43. FLOAT res30_i, res31_i, res32_i, res33_i;
  44. FLOAT a0_r, a1_r;
  45. FLOAT a0_i, a1_i;
  46. FLOAT b0_r, b1_r, b2_r, b3_r;
  47. FLOAT b0_i, b1_i, b2_i, b3_i;
  48. BLASLONG off, temp;
  49. #if defined(TRMMKERNEL) && !defined(LEFT)
  50. off = -offset;
  51. #endif
  52. for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
  53. {
  54. C0 = C;
  55. C1 = C0+2*ldc;
  56. C2 = C1+2*ldc;
  57. C3 = C2+2*ldc;
  58. #if defined(TRMMKERNEL) && defined(LEFT)
  59. off = offset;
  60. #endif
  61. ptrba = ba;
  62. for (i=0; i<bm/4; i+=1) // do blocks of 4x4
  63. {
  64. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  65. ptrbb = bb;
  66. #else
  67. ptrba += off*4*2; // number of values in A
  68. ptrbb = bb + off*4*2; // number of values in B
  69. #endif
  70. res00_r = 0;
  71. res00_i = 0;
  72. res01_r = 0;
  73. res01_i = 0;
  74. res02_r = 0;
  75. res02_i = 0;
  76. res03_r = 0;
  77. res03_i = 0;
  78. res10_r = 0;
  79. res10_i = 0;
  80. res11_r = 0;
  81. res11_i = 0;
  82. res12_r = 0;
  83. res12_i = 0;
  84. res13_r = 0;
  85. res13_i = 0;
  86. res20_r = 0;
  87. res20_i = 0;
  88. res21_r = 0;
  89. res21_i = 0;
  90. res22_r = 0;
  91. res22_i = 0;
  92. res23_r = 0;
  93. res23_i = 0;
  94. res30_r = 0;
  95. res30_i = 0;
  96. res31_r = 0;
  97. res31_i = 0;
  98. res32_r = 0;
  99. res32_i = 0;
  100. res33_r = 0;
  101. res33_i = 0;
  102. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  103. temp = bk - off;
  104. #elif defined(LEFT)
  105. temp = off + 4;
  106. #else
  107. temp = off + 4;
  108. #endif
  109. for (k=0; k<temp; k++)
  110. {
  111. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  112. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  113. b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
  114. b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];
  115. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  116. MADD(res00, a0, b0);
  117. MADD(res10, a0, b1);
  118. MADD(res20, a0, b2);
  119. MADD(res30, a0, b3);
  120. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  121. MADD(res01, a1, b0);
  122. MADD(res11, a1, b1);
  123. MADD(res21, a1, b2);
  124. MADD(res31, a1, b3);
  125. a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
  126. MADD(res02, a0, b0);
  127. MADD(res12, a0, b1);
  128. MADD(res22, a0, b2);
  129. MADD(res32, a0, b3);
  130. a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
  131. MADD(res03, a1, b0);
  132. MADD(res13, a1, b1);
  133. MADD(res23, a1, b2);
  134. MADD(res33, a1, b3);
  135. ptrba = ptrba+8;
  136. ptrbb = ptrbb+8;
  137. }
  138. MADD_ALPHA_N_STORE(C0, res00, alpha);
  139. C0 = C0 + 2;
  140. MADD_ALPHA_N_STORE(C0, res01, alpha);
  141. C0 = C0 + 2;
  142. MADD_ALPHA_N_STORE(C0, res02, alpha);
  143. C0 = C0 + 2;
  144. MADD_ALPHA_N_STORE(C0, res03, alpha);
  145. C0 = C0 + 2;
  146. MADD_ALPHA_N_STORE(C1, res10, alpha);
  147. C1 = C1 + 2;
  148. MADD_ALPHA_N_STORE(C1, res11, alpha);
  149. C1 = C1 + 2;
  150. MADD_ALPHA_N_STORE(C1, res12, alpha);
  151. C1 = C1 + 2;
  152. MADD_ALPHA_N_STORE(C1, res13, alpha);
  153. C1 = C1 + 2;
  154. MADD_ALPHA_N_STORE(C2, res20, alpha);
  155. C2 = C2 + 2;
  156. MADD_ALPHA_N_STORE(C2, res21, alpha);
  157. C2 = C2 + 2;
  158. MADD_ALPHA_N_STORE(C2, res22, alpha);
  159. C2 = C2 + 2;
  160. MADD_ALPHA_N_STORE(C2, res23, alpha);
  161. C2 = C2 + 2;
  162. MADD_ALPHA_N_STORE(C3, res30, alpha);
  163. C3 = C3 + 2;
  164. MADD_ALPHA_N_STORE(C3, res31, alpha);
  165. C3 = C3 + 2;
  166. MADD_ALPHA_N_STORE(C3, res32, alpha);
  167. C3 = C3 + 2;
  168. MADD_ALPHA_N_STORE(C3, res33, alpha);
  169. C3 = C3 + 2;
  170. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  171. temp = bk-off;
  172. #if defined(LEFT)
  173. temp = temp - 4;
  174. #else
  175. temp = temp - 4;
  176. #endif
  177. ptrba += temp*4*2; // number of values in A
  178. ptrbb += temp*4*2; // number of values in B
  179. #endif
  180. #ifdef LEFT
  181. off += 4; // number of values in A
  182. #endif
  183. }
  184. if ( bm & 2 ) // do any 2x4 loop
  185. {
  186. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  187. ptrbb = bb;
  188. #else
  189. ptrba += off*2*2;
  190. ptrbb = bb + off*4*2;
  191. #endif
  192. res00_r = 0;
  193. res00_i = 0;
  194. res01_r = 0;
  195. res01_i = 0;
  196. res10_r = 0;
  197. res10_i = 0;
  198. res11_r = 0;
  199. res11_i = 0;
  200. res20_r = 0;
  201. res20_i = 0;
  202. res21_r = 0;
  203. res21_i = 0;
  204. res30_r = 0;
  205. res30_i = 0;
  206. res31_r = 0;
  207. res31_i = 0;
  208. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  209. temp = bk-off;
  210. #elif defined(LEFT)
  211. temp = off+2; // number of values in A
  212. #else
  213. temp = off+4; // number of values in B
  214. #endif
  215. for (k=0; k<temp; k++)
  216. {
  217. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  218. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  219. b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
  220. b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];
  221. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  222. MADD(res00, a0, b0);
  223. MADD(res10, a0, b1);
  224. MADD(res20, a0, b2);
  225. MADD(res30, a0, b3);
  226. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  227. MADD(res01, a1, b0);
  228. MADD(res11, a1, b1);
  229. MADD(res21, a1, b2);
  230. MADD(res31, a1, b3);
  231. ptrba = ptrba+4;
  232. ptrbb = ptrbb+8;
  233. }
  234. MADD_ALPHA_N_STORE(C0, res00, alpha);
  235. C0 = C0 + 2;
  236. MADD_ALPHA_N_STORE(C0, res01, alpha);
  237. C0 = C0 + 2;
  238. MADD_ALPHA_N_STORE(C1, res10, alpha);
  239. C1 = C1 + 2;
  240. MADD_ALPHA_N_STORE(C1, res11, alpha);
  241. C1 = C1 + 2;
  242. MADD_ALPHA_N_STORE(C2, res20, alpha);
  243. C2 = C2 + 2;
  244. MADD_ALPHA_N_STORE(C2, res21, alpha);
  245. C2 = C2 + 2;
  246. MADD_ALPHA_N_STORE(C3, res30, alpha);
  247. C3 = C3 + 2;
  248. MADD_ALPHA_N_STORE(C3, res31, alpha);
  249. C3 = C3 + 2;
  250. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  251. temp = bk - off;
  252. #ifdef LEFT
  253. temp -= 2; // number of values in A
  254. #else
  255. temp -= 4; // number of values in B
  256. #endif
  257. ptrba += temp*2*2;
  258. ptrbb += temp*4*2;
  259. #endif
  260. #ifdef LEFT
  261. off += 2; // number of values in A
  262. #endif
  263. }
  264. if ( bm & 1 ) // do any 1x4 loop
  265. {
  266. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  267. ptrbb = bb;
  268. #else
  269. ptrba += off*1*2;
  270. ptrbb = bb + off*4*2;
  271. #endif
  272. res00_r = 0;
  273. res00_i = 0;
  274. res10_r = 0;
  275. res10_i = 0;
  276. res20_r = 0;
  277. res20_i = 0;
  278. res30_r = 0;
  279. res30_i = 0;
  280. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  281. temp = bk-off;
  282. #elif defined(LEFT)
  283. temp = off+1; // number of values in A
  284. #else
  285. temp = off+4; // number of values in B
  286. #endif
  287. for (k=0; k<temp; k++)
  288. {
  289. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  290. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  291. b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
  292. b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];
  293. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  294. MADD(res00, a0, b0);
  295. MADD(res10, a0, b1);
  296. MADD(res20, a0, b2);
  297. MADD(res30, a0, b3);
  298. ptrba = ptrba+2;
  299. ptrbb = ptrbb+8;
  300. }
  301. MADD_ALPHA_N_STORE(C0, res00, alpha);
  302. C0 = C0 + 2;
  303. MADD_ALPHA_N_STORE(C1, res10, alpha);
  304. C1 = C1 + 2;
  305. MADD_ALPHA_N_STORE(C2, res20, alpha);
  306. C2 = C2 + 2;
  307. MADD_ALPHA_N_STORE(C3, res30, alpha);
  308. C3 = C3 + 2;
  309. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  310. temp = bk - off;
  311. #ifdef LEFT
  312. temp -= 1; // number of values in A
  313. #else
  314. temp -= 4; // number of values in B
  315. #endif
  316. ptrba += temp*1*2;
  317. ptrbb += temp*4*2;
  318. #endif
  319. #ifdef LEFT
  320. off += 1; // number of values in A
  321. #endif
  322. }
  323. #if defined(TRMMKERNEL) && !defined(LEFT)
  324. off += 4;
  325. #endif
  326. k = (bk<<3);
  327. bb = bb+k;
  328. i = (ldc<<3);
  329. C = C+i;
  330. }
  331. for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
  332. {
  333. C0 = C;
  334. C1 = C0+ldc*2;
  335. #if defined(TRMMKERNEL) && defined(LEFT)
  336. off = offset;
  337. #endif
  338. ptrba = ba;
  339. for (i=0; i<bm/4; i+=1) // do blocks of 4x2
  340. {
  341. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  342. ptrbb = bb;
  343. #else
  344. ptrba += off*4*2;
  345. ptrbb = bb + off*2*2;
  346. #endif
  347. res00_r = 0;
  348. res00_i = 0;
  349. res01_r = 0;
  350. res01_i = 0;
  351. res02_r = 0;
  352. res02_i = 0;
  353. res03_r = 0;
  354. res03_i = 0;
  355. res10_r = 0;
  356. res10_i = 0;
  357. res11_r = 0;
  358. res11_i = 0;
  359. res12_r = 0;
  360. res12_i = 0;
  361. res13_r = 0;
  362. res13_i = 0;
  363. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  364. temp = bk-off;
  365. #elif defined(LEFT)
  366. temp = off+4; // number of values in A
  367. #else
  368. temp = off+2; // number of values in B
  369. #endif
  370. for (k=0; k<temp; k++)
  371. {
  372. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  373. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  374. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  375. MADD(res00, a0, b0);
  376. MADD(res10, a0, b1);
  377. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  378. MADD(res01, a1, b0);
  379. MADD(res11, a1, b1);
  380. a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
  381. MADD(res02, a0, b0);
  382. MADD(res12, a0, b1);
  383. a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
  384. MADD(res03, a1, b0);
  385. MADD(res13, a1, b1);
  386. ptrba = ptrba+8;
  387. ptrbb = ptrbb+4;
  388. }
  389. MADD_ALPHA_N_STORE(C0, res00, alpha);
  390. C0 = C0 + 2;
  391. MADD_ALPHA_N_STORE(C0, res01, alpha);
  392. C0 = C0 + 2;
  393. MADD_ALPHA_N_STORE(C0, res02, alpha);
  394. C0 = C0 + 2;
  395. MADD_ALPHA_N_STORE(C0, res03, alpha);
  396. C0 = C0 + 2;
  397. MADD_ALPHA_N_STORE(C1, res10, alpha);
  398. C1 = C1 + 2;
  399. MADD_ALPHA_N_STORE(C1, res11, alpha);
  400. C1 = C1 + 2;
  401. MADD_ALPHA_N_STORE(C1, res12, alpha);
  402. C1 = C1 + 2;
  403. MADD_ALPHA_N_STORE(C1, res13, alpha);
  404. C1 = C1 + 2;
  405. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  406. temp = bk - off;
  407. #ifdef LEFT
  408. temp -= 4; // number of values in A
  409. #else
  410. temp -= 2; // number of values in B
  411. #endif
  412. ptrba += temp*4*2;
  413. ptrbb += temp*2*2;
  414. #endif
  415. #ifdef LEFT
  416. off += 4; // number of values in A
  417. #endif
  418. }
  419. if ( bm & 2 ) // do any 2x2 loop
  420. {
  421. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  422. ptrbb = bb;
  423. #else
  424. ptrba += off*2*2;
  425. ptrbb = bb + off*2*2;
  426. #endif
  427. res00_r = 0;
  428. res00_i = 0;
  429. res01_r = 0;
  430. res01_i = 0;
  431. res10_r = 0;
  432. res10_i = 0;
  433. res11_r = 0;
  434. res11_i = 0;
  435. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  436. temp = bk-off;
  437. #elif defined(LEFT)
  438. temp = off+2; // number of values in A
  439. #else
  440. temp = off+2; // number of values in B
  441. #endif
  442. for (k=0; k<temp; k++)
  443. {
  444. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  445. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  446. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  447. MADD(res00, a0, b0);
  448. MADD(res10, a0, b1);
  449. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  450. MADD(res01, a1, b0);
  451. MADD(res11, a1, b1);
  452. ptrba = ptrba+4;
  453. ptrbb = ptrbb+4;
  454. }
  455. MADD_ALPHA_N_STORE(C0, res00, alpha);
  456. C0 = C0 + 2;
  457. MADD_ALPHA_N_STORE(C0, res01, alpha);
  458. C0 = C0 + 2;
  459. MADD_ALPHA_N_STORE(C1, res10, alpha);
  460. C1 = C1 + 2;
  461. MADD_ALPHA_N_STORE(C1, res11, alpha);
  462. C1 = C1 + 2;
  463. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  464. temp = bk - off;
  465. #ifdef LEFT
  466. temp -= 2; // number of values in A
  467. #else
  468. temp -= 2; // number of values in B
  469. #endif
  470. ptrba += temp*2*2;
  471. ptrbb += temp*2*2;
  472. #endif
  473. #ifdef LEFT
  474. off += 2; // number of values in A
  475. #endif
  476. }
  477. if ( bm & 1 ) // do any 1x2 loop
  478. {
  479. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  480. ptrbb = bb;
  481. #else
  482. ptrba += off*1*2;
  483. ptrbb = bb + off*2*2;
  484. #endif
  485. res00_r = 0;
  486. res00_i = 0;
  487. res10_r = 0;
  488. res10_i = 0;
  489. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  490. temp = bk-off;
  491. #elif defined(LEFT)
  492. temp = off+1; // number of values in A
  493. #else
  494. temp = off+2; // number of values in B
  495. #endif
  496. for (k=0; k<temp; k++)
  497. {
  498. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  499. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  500. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  501. MADD(res00, a0, b0);
  502. MADD(res10, a0, b1);
  503. ptrba = ptrba+2;
  504. ptrbb = ptrbb+4;
  505. }
  506. MADD_ALPHA_N_STORE(C0, res00, alpha);
  507. C0 = C0 + 2;
  508. MADD_ALPHA_N_STORE(C1, res10, alpha);
  509. C1 = C1 + 2;
  510. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  511. temp = bk - off;
  512. #ifdef LEFT
  513. temp -= 1; // number of values in A
  514. #else
  515. temp -= 2; // number of values in B
  516. #endif
  517. ptrba += temp*1*2;
  518. ptrbb += temp*2*2;
  519. #endif
  520. #ifdef LEFT
  521. off += 1; // number of values in A
  522. #endif
  523. }
  524. #if defined(TRMMKERNEL) && !defined(LEFT)
  525. off += 2;
  526. #endif
  527. k = (bk<<2);
  528. bb = bb+k;
  529. i = (ldc<<2);
  530. C = C+i;
  531. }
  532. for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
  533. {
  534. C0 = C;
  535. #if defined(TRMMKERNEL) && defined(LEFT)
  536. off = offset;
  537. #endif
  538. ptrba = ba;
  539. for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
  540. {
  541. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  542. ptrbb = bb;
  543. #else
  544. ptrba += off*4*2;
  545. ptrbb = bb + off*1*2;
  546. #endif
  547. res00_r = 0;
  548. res00_i = 0;
  549. res01_r = 0;
  550. res01_i = 0;
  551. res02_r = 0;
  552. res02_i = 0;
  553. res03_r = 0;
  554. res03_i = 0;
  555. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  556. temp = bk-off;
  557. #elif defined(LEFT)
  558. temp = off+4; // number of values in A
  559. #else
  560. temp = off+1; // number of values in B
  561. #endif
  562. for (k=0; k<temp; k++)
  563. {
  564. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  565. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  566. MADD(res00, a0, b0);
  567. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  568. MADD(res01, a1, b0);
  569. a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
  570. MADD(res02, a0, b0);
  571. a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
  572. MADD(res03, a1, b0);
  573. ptrba = ptrba+8;
  574. ptrbb = ptrbb+2;
  575. }
  576. MADD_ALPHA_N_STORE(C0, res00, alpha);
  577. C0 = C0 + 2;
  578. MADD_ALPHA_N_STORE(C0, res01, alpha);
  579. C0 = C0 + 2;
  580. MADD_ALPHA_N_STORE(C0, res02, alpha);
  581. C0 = C0 + 2;
  582. MADD_ALPHA_N_STORE(C0, res03, alpha);
  583. C0 = C0 + 2;
  584. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  585. temp = bk - off;
  586. #ifdef LEFT
  587. temp -= 4; // number of values in A
  588. #else
  589. temp -= 1; // number of values in B
  590. #endif
  591. ptrba += temp*4*2;
  592. ptrbb += temp*1*2;
  593. #endif
  594. #ifdef LEFT
  595. off += 4; // number of values in A
  596. #endif
  597. }
  598. if ( bm & 2 ) // do any 2x1 loop
  599. {
  600. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  601. ptrbb = bb;
  602. #else
  603. ptrba += off*2*2;
  604. ptrbb = bb + off*1*2;
  605. #endif
  606. res00_r = 0;
  607. res00_i = 0;
  608. res01_r = 0;
  609. res01_i = 0;
  610. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  611. temp = bk-off;
  612. #elif defined(LEFT)
  613. temp = off+2; // number of values in A
  614. #else
  615. temp = off+1; // number of values in B
  616. #endif
  617. for (k=0; k<temp; k++)
  618. {
  619. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  620. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  621. MADD(res00, a0, b0);
  622. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  623. MADD(res01, a1, b0);
  624. ptrba = ptrba+4;
  625. ptrbb = ptrbb+2;
  626. }
  627. MADD_ALPHA_N_STORE(C0, res00, alpha);
  628. C0 = C0 + 2;
  629. MADD_ALPHA_N_STORE(C0, res01, alpha);
  630. C0 = C0 + 2;
  631. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  632. temp = bk - off;
  633. #ifdef LEFT
  634. temp -= 2; // number of values in A
  635. #else
  636. temp -= 1; // number of values in B
  637. #endif
  638. ptrba += temp*2*2;
  639. ptrbb += temp*1*2;
  640. #endif
  641. #ifdef LEFT
  642. off += 2; // number of values in A
  643. #endif
  644. }
  645. if ( bm & 1 ) // do any 1x1 loop
  646. {
  647. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  648. ptrbb = bb;
  649. #else
  650. ptrba += off*1*2;
  651. ptrbb = bb + off*1*2;
  652. #endif
  653. res00_r = 0;
  654. res00_i = 0;
  655. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  656. temp = bk-off;
  657. #elif defined(LEFT)
  658. temp = off+1; // number of values in A
  659. #else
  660. temp = off+1; // number of values in B
  661. #endif
  662. for (k=0; k<temp; k++)
  663. {
  664. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  665. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  666. MADD(res00, a0, b0);
  667. ptrba = ptrba+2;
  668. ptrbb = ptrbb+2;
  669. }
  670. MADD_ALPHA_N_STORE(C0, res00, alpha);
  671. C0 = C0 + 2;
  672. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  673. temp = bk - off;
  674. #ifdef LEFT
  675. temp -= 1; // number of values in A
  676. #else
  677. temp -= 1; // number of values in B
  678. #endif
  679. ptrba += temp*1*2;
  680. ptrbb += temp*1*2;
  681. #endif
  682. #ifdef LEFT
  683. off += 1; // number of values in A
  684. #endif
  685. }
  686. #if defined(TRMMKERNEL) && !defined(LEFT)
  687. off += 1;
  688. #endif
  689. k = (bk<<1);
  690. bb = bb+k;
  691. i = (ldc<<1);
  692. C = C+i;
  693. }
  694. return 0;
  695. }