You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

ztrmmkernel_4x4.c 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885
  1. #include "common.h"
  /*
   * Helper macros for complex arithmetic on split real/imaginary scalars.
   *
   * Every "complex" operand is named by a prefix X and is expected to exist as
   * the two variables X_r (real part) and X_i (imaginary part); the macros
   * build those names by token pasting.
   *
   * All macros are wrapped in do { ... } while (0) so that each expands to a
   * single statement and stays correct under an unbraced if/else. Call sites
   * must end the invocation with a semicolon.
   */

  /*
   * MADD_ALPHA_N_STORE(C, res, alpha)
   *   C[0] = Re(res * alpha), C[1] = Im(res * alpha).
   *   The destination is overwritten; its previous contents are not read.
   *   C is evaluated twice, so pass an expression without side effects.
   */
  #define MADD_ALPHA_N_STORE(C, res, alpha) \
    do { \
      (C)[0] = res ## _r * alpha ## _r - res ## _i * alpha ## _i; \
      (C)[1] = res ## _r * alpha ## _i + res ## _i * alpha ## _r; \
    } while (0)

  /*
   * MADD(res, op1, op2)
   *   res += f(op1) * g(op2), where f and g are either the identity or the
   *   complex conjugate, selected at compile time by the variant macro:
   *
   *     NN NT TN TT : op1       * op2
   *     NR NC TR TC : op1       * conj(op2)
   *     RN RT CN CT : conj(op1) * op2
   *     RR RC CR CC : conj(op1) * conj(op2)
   *
   *   If none of the variant macros is defined, MADD is left undefined.
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define MADD(res, op1, op2) \
    do { \
      res ## _r += op1 ## _r * op2 ## _r; \
      res ## _r -= op1 ## _i * op2 ## _i; \
      res ## _i += op1 ## _r * op2 ## _i; \
      res ## _i += op1 ## _i * op2 ## _r; \
    } while (0)
  #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define MADD(res, op1, op2) \
    do { \
      res ## _r += op1 ## _r * op2 ## _r; \
      res ## _r += op1 ## _i * op2 ## _i; \
      res ## _i -= op1 ## _r * op2 ## _i; \
      res ## _i += op1 ## _i * op2 ## _r; \
    } while (0)
  #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define MADD(res, op1, op2) \
    do { \
      res ## _r += op1 ## _r * op2 ## _r; \
      res ## _r += op1 ## _i * op2 ## _i; \
      res ## _i += op1 ## _r * op2 ## _i; \
      res ## _i -= op1 ## _i * op2 ## _r; \
    } while (0)
  #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  #define MADD(res, op1, op2) \
    do { \
      res ## _r += op1 ## _r * op2 ## _r; \
      res ## _r -= op1 ## _i * op2 ## _i; \
      res ## _i -= op1 ## _r * op2 ## _i; \
      res ## _i -= op1 ## _i * op2 ## _r; \
    } while (0)
  #endif
  /*
   * CNAME: complex matrix-multiply micro-kernel working on 4x4 tiles, with
   * 2- and 1-wide remainder tiles in both dimensions (presumably the TRMM
   * kernel, given the TRMMKERNEL/LEFT/TRANSA switches - confirm with caller).
   *
   * For every tile it zeroes a set of accumulators, adds MADD(a, b) products
   * over `temp` k steps, multiplies each sum by (alpha_r + i*alpha_i) and
   * stores the result into C. C is only written, never read.
   *
   * All FLOAT arrays are accessed as interleaved (real, imaginary) pairs.
   *
   *   bm, bn            rows / columns of C to produce
   *   bk                k extent; bb advances by bk*width complex values per
   *                     column panel
   *   alpha_r, alpha_i  real and imaginary part of the scale factor
   *   ba                A operand; 4, 2 or 1 complex values are consumed per
   *                     k step depending on the tile height
   *   bb                B operand; 4, 2 or 1 complex values are consumed per
   *                     k step depending on the panel width
   *   C                 output; consecutive columns are 2*ldc FLOATs apart
   *   ldc               column stride of C, counted in complex elements
   *   offset            read only when TRMMKERNEL is defined; seeds `off`,
   *                     which selects the k range each tile iterates over
   *
   * Returns 0 unconditionally.
   */
  30. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  31. , BLASLONG offset
  32. )
  33. {
  34. BLASLONG i,j,k;
  35. FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
  /* Accumulators resXY: X indexes the column of the tile (b value), Y the row (a value). */
  36. FLOAT res00_r, res01_r, res02_r, res03_r;
  37. FLOAT res00_i, res01_i, res02_i, res03_i;
  38. FLOAT res10_r, res11_r, res12_r, res13_r;
  39. FLOAT res10_i, res11_i, res12_i, res13_i;
  40. FLOAT res20_r, res21_r, res22_r, res23_r;
  41. FLOAT res20_i, res21_i, res22_i, res23_i;
  42. FLOAT res30_r, res31_r, res32_r, res33_r;
  43. FLOAT res30_i, res31_i, res32_i, res33_i;
  44. FLOAT a0_r, a1_r;
  45. FLOAT a0_i, a1_i;
  46. FLOAT b0_r, b1_r, b2_r, b3_r;
  47. FLOAT b0_i, b1_i, b2_i, b3_i;
  48. BLASLONG off, temp;
  /*
   * Initial k offset: the negated caller offset for TRMMKERNEL without LEFT,
   * otherwise 0. With TRMMKERNEL and LEFT it is reloaded from `offset` at the
   * start of every column panel below.
   */
  49. #if defined(TRMMKERNEL) && !defined(LEFT)
  50. off = -offset;
  51. #else
  52. off = 0;
  53. #endif
  /* ---- Column panels of width 4 ---- */
  54. for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
  55. {
  /* C0..C3 address four consecutive columns of C, 2*ldc FLOATs apart. */
  56. C0 = C;
  57. C1 = C0+2*ldc;
  58. C2 = C1+2*ldc;
  59. C3 = C2+2*ldc;
  60. #if defined(TRMMKERNEL) && defined(LEFT)
  61. off = offset;
  62. #endif
  63. ptrba = ba;
  64. for (i=0; i<bm/4; i+=1) // do blocks of 4x4
  65. {
  /*
   * Position the operand pointers on the first k step of this tile: either
   * start of the B panel, or skip `off` k steps in both A and B.
   */
  66. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  67. ptrbb = bb;
  68. #else
  69. ptrba += off*4*2; // number of values in A
  70. ptrbb = bb + off*4*2; // number of values in B
  71. #endif
  72. res00_r = 0;
  73. res00_i = 0;
  74. res01_r = 0;
  75. res01_i = 0;
  76. res02_r = 0;
  77. res02_i = 0;
  78. res03_r = 0;
  79. res03_i = 0;
  80. res10_r = 0;
  81. res10_i = 0;
  82. res11_r = 0;
  83. res11_i = 0;
  84. res12_r = 0;
  85. res12_i = 0;
  86. res13_r = 0;
  87. res13_i = 0;
  88. res20_r = 0;
  89. res20_i = 0;
  90. res21_r = 0;
  91. res21_i = 0;
  92. res22_r = 0;
  93. res22_i = 0;
  94. res23_r = 0;
  95. res23_i = 0;
  96. res30_r = 0;
  97. res30_i = 0;
  98. res31_r = 0;
  99. res31_i = 0;
  100. res32_r = 0;
  101. res32_i = 0;
  102. res33_r = 0;
  103. res33_i = 0;
  /* temp = number of k steps this tile iterates over. */
  104. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  105. temp = bk - off;
  106. #elif defined(LEFT)
  107. temp = off + 4;
  108. #else
  109. temp = off + 4;
  110. #endif
  /*
   * Each k step reads 4 complex values from B and 4 from A and accumulates
   * all 16 pairwise products; a0/a1 are reused as scratch for A values 0..3.
   */
  111. for (k=0; k<temp; k++)
  112. {
  113. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  114. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  115. b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
  116. b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];
  117. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  118. MADD(res00, a0, b0);
  119. MADD(res10, a0, b1);
  120. MADD(res20, a0, b2);
  121. MADD(res30, a0, b3);
  122. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  123. MADD(res01, a1, b0);
  124. MADD(res11, a1, b1);
  125. MADD(res21, a1, b2);
  126. MADD(res31, a1, b3);
  127. a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
  128. MADD(res02, a0, b0);
  129. MADD(res12, a0, b1);
  130. MADD(res22, a0, b2);
  131. MADD(res32, a0, b3);
  132. a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
  133. MADD(res03, a1, b0);
  134. MADD(res13, a1, b1);
  135. MADD(res23, a1, b2);
  136. MADD(res33, a1, b3);
  137. ptrba = ptrba+8;
  138. ptrbb = ptrbb+8;
  139. }
  /* Scale by alpha and store; each column pointer moves down 4 complex rows. */
  140. MADD_ALPHA_N_STORE(C0, res00, alpha);
  141. C0 = C0 + 2;
  142. MADD_ALPHA_N_STORE(C0, res01, alpha);
  143. C0 = C0 + 2;
  144. MADD_ALPHA_N_STORE(C0, res02, alpha);
  145. C0 = C0 + 2;
  146. MADD_ALPHA_N_STORE(C0, res03, alpha);
  147. C0 = C0 + 2;
  148. MADD_ALPHA_N_STORE(C1, res10, alpha);
  149. C1 = C1 + 2;
  150. MADD_ALPHA_N_STORE(C1, res11, alpha);
  151. C1 = C1 + 2;
  152. MADD_ALPHA_N_STORE(C1, res12, alpha);
  153. C1 = C1 + 2;
  154. MADD_ALPHA_N_STORE(C1, res13, alpha);
  155. C1 = C1 + 2;
  156. MADD_ALPHA_N_STORE(C2, res20, alpha);
  157. C2 = C2 + 2;
  158. MADD_ALPHA_N_STORE(C2, res21, alpha);
  159. C2 = C2 + 2;
  160. MADD_ALPHA_N_STORE(C2, res22, alpha);
  161. C2 = C2 + 2;
  162. MADD_ALPHA_N_STORE(C2, res23, alpha);
  163. C2 = C2 + 2;
  164. MADD_ALPHA_N_STORE(C3, res30, alpha);
  165. C3 = C3 + 2;
  166. MADD_ALPHA_N_STORE(C3, res31, alpha);
  167. C3 = C3 + 2;
  168. MADD_ALPHA_N_STORE(C3, res32, alpha);
  169. C3 = C3 + 2;
  170. MADD_ALPHA_N_STORE(C3, res33, alpha);
  171. C3 = C3 + 2;
  /*
   * In these variants the k loop stopped after off+4 steps; advance the
   * pointers over the remaining bk-off-4 steps so ptrba ends up on the next
   * row block of A.
   */
  172. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  173. temp = bk-off;
  174. #if defined(LEFT)
  175. temp = temp - 4;
  176. #else
  177. temp = temp - 4;
  178. #endif
  179. ptrba += temp*4*2; // number of values in A
  180. ptrbb += temp*4*2; // number of values in B
  181. #endif
  182. #ifdef LEFT
  183. off += 4; // number of values in A
  184. #endif
  185. }
  /* Remainder rows: same scheme as the 4x4 tile with 2 A values per k step. */
  186. if ( bm & 2 ) // do any 2x4 loop
  187. {
  188. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  189. ptrbb = bb;
  190. #else
  191. ptrba += off*2*2;
  192. ptrbb = bb + off*4*2;
  193. #endif
  194. res00_r = 0;
  195. res00_i = 0;
  196. res01_r = 0;
  197. res01_i = 0;
  198. res10_r = 0;
  199. res10_i = 0;
  200. res11_r = 0;
  201. res11_i = 0;
  202. res20_r = 0;
  203. res20_i = 0;
  204. res21_r = 0;
  205. res21_i = 0;
  206. res30_r = 0;
  207. res30_i = 0;
  208. res31_r = 0;
  209. res31_i = 0;
  210. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  211. temp = bk-off;
  212. #elif defined(LEFT)
  213. temp = off+2; // number of values in A
  214. #else
  215. temp = off+4; // number of values in B
  216. #endif
  217. for (k=0; k<temp; k++)
  218. {
  219. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  220. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  221. b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
  222. b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];
  223. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  224. MADD(res00, a0, b0);
  225. MADD(res10, a0, b1);
  226. MADD(res20, a0, b2);
  227. MADD(res30, a0, b3);
  228. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  229. MADD(res01, a1, b0);
  230. MADD(res11, a1, b1);
  231. MADD(res21, a1, b2);
  232. MADD(res31, a1, b3);
  233. ptrba = ptrba+4;
  234. ptrbb = ptrbb+8;
  235. }
  236. MADD_ALPHA_N_STORE(C0, res00, alpha);
  237. C0 = C0 + 2;
  238. MADD_ALPHA_N_STORE(C0, res01, alpha);
  239. C0 = C0 + 2;
  240. MADD_ALPHA_N_STORE(C1, res10, alpha);
  241. C1 = C1 + 2;
  242. MADD_ALPHA_N_STORE(C1, res11, alpha);
  243. C1 = C1 + 2;
  244. MADD_ALPHA_N_STORE(C2, res20, alpha);
  245. C2 = C2 + 2;
  246. MADD_ALPHA_N_STORE(C2, res21, alpha);
  247. C2 = C2 + 2;
  248. MADD_ALPHA_N_STORE(C3, res30, alpha);
  249. C3 = C3 + 2;
  250. MADD_ALPHA_N_STORE(C3, res31, alpha);
  251. C3 = C3 + 2;
  /* Advance over the k steps the loop above did not visit. */
  252. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  253. temp = bk - off;
  254. #ifdef LEFT
  255. temp -= 2; // number of values in A
  256. #else
  257. temp -= 4; // number of values in B
  258. #endif
  259. ptrba += temp*2*2;
  260. ptrbb += temp*4*2;
  261. #endif
  262. #ifdef LEFT
  263. off += 2; // number of values in A
  264. #endif
  265. }
  /* Last single row: 1 A value per k step. */
  266. if ( bm & 1 ) // do any 1x4 loop
  267. {
  268. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  269. ptrbb = bb;
  270. #else
  271. ptrba += off*1*2;
  272. ptrbb = bb + off*4*2;
  273. #endif
  274. res00_r = 0;
  275. res00_i = 0;
  276. res10_r = 0;
  277. res10_i = 0;
  278. res20_r = 0;
  279. res20_i = 0;
  280. res30_r = 0;
  281. res30_i = 0;
  282. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  283. temp = bk-off;
  284. #elif defined(LEFT)
  285. temp = off+1; // number of values in A
  286. #else
  287. temp = off+4; // number of values in B
  288. #endif
  289. for (k=0; k<temp; k++)
  290. {
  291. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  292. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  293. b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
  294. b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];
  295. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  296. MADD(res00, a0, b0);
  297. MADD(res10, a0, b1);
  298. MADD(res20, a0, b2);
  299. MADD(res30, a0, b3);
  300. ptrba = ptrba+2;
  301. ptrbb = ptrbb+8;
  302. }
  303. MADD_ALPHA_N_STORE(C0, res00, alpha);
  304. C0 = C0 + 2;
  305. MADD_ALPHA_N_STORE(C1, res10, alpha);
  306. C1 = C1 + 2;
  307. MADD_ALPHA_N_STORE(C2, res20, alpha);
  308. C2 = C2 + 2;
  309. MADD_ALPHA_N_STORE(C3, res30, alpha);
  310. C3 = C3 + 2;
  311. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  312. temp = bk - off;
  313. #ifdef LEFT
  314. temp -= 1; // number of values in A
  315. #else
  316. temp -= 4; // number of values in B
  317. #endif
  318. ptrba += temp*1*2;
  319. ptrbb += temp*4*2;
  320. #endif
  321. #ifdef LEFT
  322. off += 1; // number of values in A
  323. #endif
  324. }
  325. #if defined(TRMMKERNEL) && !defined(LEFT)
  326. off += 4;
  327. #endif
  /*
   * Step to the next column panel: bb moves by bk*8 FLOATs (bk steps of 4
   * complex values), C by ldc*8 FLOATs (4 columns). k and i are reused here
   * as plain scratch variables.
   */
  328. k = (bk<<3);
  329. bb = bb+k;
  330. i = (ldc<<3);
  331. C = C+i;
  332. }
  /* ---- Column panel of width 2 (runs at most once, when bn & 2) ---- */
  333. for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
  334. {
  335. C0 = C;
  336. C1 = C0+ldc*2;
  337. #if defined(TRMMKERNEL) && defined(LEFT)
  338. off = offset;
  339. #endif
  340. ptrba = ba;
  341. for (i=0; i<bm/4; i+=1) // do blocks of 4x2
  342. {
  343. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  344. ptrbb = bb;
  345. #else
  346. ptrba += off*4*2;
  347. ptrbb = bb + off*2*2;
  348. #endif
  349. res00_r = 0;
  350. res00_i = 0;
  351. res01_r = 0;
  352. res01_i = 0;
  353. res02_r = 0;
  354. res02_i = 0;
  355. res03_r = 0;
  356. res03_i = 0;
  357. res10_r = 0;
  358. res10_i = 0;
  359. res11_r = 0;
  360. res11_i = 0;
  361. res12_r = 0;
  362. res12_i = 0;
  363. res13_r = 0;
  364. res13_i = 0;
  365. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  366. temp = bk-off;
  367. #elif defined(LEFT)
  368. temp = off+4; // number of values in A
  369. #else
  370. temp = off+2; // number of values in B
  371. #endif
  /* Each k step: 2 complex values from B, 4 from A, 8 products. */
  372. for (k=0; k<temp; k++)
  373. {
  374. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  375. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  376. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  377. MADD(res00, a0, b0);
  378. MADD(res10, a0, b1);
  379. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  380. MADD(res01, a1, b0);
  381. MADD(res11, a1, b1);
  382. a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
  383. MADD(res02, a0, b0);
  384. MADD(res12, a0, b1);
  385. a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
  386. MADD(res03, a1, b0);
  387. MADD(res13, a1, b1);
  388. ptrba = ptrba+8;
  389. ptrbb = ptrbb+4;
  390. }
  391. MADD_ALPHA_N_STORE(C0, res00, alpha);
  392. C0 = C0 + 2;
  393. MADD_ALPHA_N_STORE(C0, res01, alpha);
  394. C0 = C0 + 2;
  395. MADD_ALPHA_N_STORE(C0, res02, alpha);
  396. C0 = C0 + 2;
  397. MADD_ALPHA_N_STORE(C0, res03, alpha);
  398. C0 = C0 + 2;
  399. MADD_ALPHA_N_STORE(C1, res10, alpha);
  400. C1 = C1 + 2;
  401. MADD_ALPHA_N_STORE(C1, res11, alpha);
  402. C1 = C1 + 2;
  403. MADD_ALPHA_N_STORE(C1, res12, alpha);
  404. C1 = C1 + 2;
  405. MADD_ALPHA_N_STORE(C1, res13, alpha);
  406. C1 = C1 + 2;
  /* Advance over the k steps the loop above did not visit. */
  407. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  408. temp = bk - off;
  409. #ifdef LEFT
  410. temp -= 4; // number of values in A
  411. #else
  412. temp -= 2; // number of values in B
  413. #endif
  414. ptrba += temp*4*2;
  415. ptrbb += temp*2*2;
  416. #endif
  417. #ifdef LEFT
  418. off += 4; // number of values in A
  419. #endif
  420. }
  421. if ( bm & 2 ) // do any 2x2 loop
  422. {
  423. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  424. ptrbb = bb;
  425. #else
  426. ptrba += off*2*2;
  427. ptrbb = bb + off*2*2;
  428. #endif
  429. res00_r = 0;
  430. res00_i = 0;
  431. res01_r = 0;
  432. res01_i = 0;
  433. res10_r = 0;
  434. res10_i = 0;
  435. res11_r = 0;
  436. res11_i = 0;
  437. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  438. temp = bk-off;
  439. #elif defined(LEFT)
  440. temp = off+2; // number of values in A
  441. #else
  442. temp = off+2; // number of values in B
  443. #endif
  444. for (k=0; k<temp; k++)
  445. {
  446. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  447. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  448. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  449. MADD(res00, a0, b0);
  450. MADD(res10, a0, b1);
  451. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  452. MADD(res01, a1, b0);
  453. MADD(res11, a1, b1);
  454. ptrba = ptrba+4;
  455. ptrbb = ptrbb+4;
  456. }
  457. MADD_ALPHA_N_STORE(C0, res00, alpha);
  458. C0 = C0 + 2;
  459. MADD_ALPHA_N_STORE(C0, res01, alpha);
  460. C0 = C0 + 2;
  461. MADD_ALPHA_N_STORE(C1, res10, alpha);
  462. C1 = C1 + 2;
  463. MADD_ALPHA_N_STORE(C1, res11, alpha);
  464. C1 = C1 + 2;
  465. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  466. temp = bk - off;
  467. #ifdef LEFT
  468. temp -= 2; // number of values in A
  469. #else
  470. temp -= 2; // number of values in B
  471. #endif
  472. ptrba += temp*2*2;
  473. ptrbb += temp*2*2;
  474. #endif
  475. #ifdef LEFT
  476. off += 2; // number of values in A
  477. #endif
  478. }
  479. if ( bm & 1 ) // do any 1x2 loop
  480. {
  481. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  482. ptrbb = bb;
  483. #else
  484. ptrba += off*1*2;
  485. ptrbb = bb + off*2*2;
  486. #endif
  487. res00_r = 0;
  488. res00_i = 0;
  489. res10_r = 0;
  490. res10_i = 0;
  491. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  492. temp = bk-off;
  493. #elif defined(LEFT)
  494. temp = off+1; // number of values in A
  495. #else
  496. temp = off+2; // number of values in B
  497. #endif
  498. for (k=0; k<temp; k++)
  499. {
  500. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  501. b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
  502. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  503. MADD(res00, a0, b0);
  504. MADD(res10, a0, b1);
  505. ptrba = ptrba+2;
  506. ptrbb = ptrbb+4;
  507. }
  508. MADD_ALPHA_N_STORE(C0, res00, alpha);
  509. C0 = C0 + 2;
  510. MADD_ALPHA_N_STORE(C1, res10, alpha);
  511. C1 = C1 + 2;
  512. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  513. temp = bk - off;
  514. #ifdef LEFT
  515. temp -= 1; // number of values in A
  516. #else
  517. temp -= 2; // number of values in B
  518. #endif
  519. ptrba += temp*1*2;
  520. ptrbb += temp*2*2;
  521. #endif
  522. #ifdef LEFT
  523. off += 1; // number of values in A
  524. #endif
  525. }
  526. #if defined(TRMMKERNEL) && !defined(LEFT)
  527. off += 2;
  528. #endif
  /* Next panel: bb moves by bk*4 FLOATs (2 complex per k step), C by 2 columns. */
  529. k = (bk<<2);
  530. bb = bb+k;
  531. i = (ldc<<2);
  532. C = C+i;
  533. }
  /* ---- Column panel of width 1 (runs at most once, when bn & 1) ---- */
  534. for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
  535. {
  536. C0 = C;
  537. #if defined(TRMMKERNEL) && defined(LEFT)
  538. off = offset;
  539. #endif
  540. ptrba = ba;
  541. for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
  542. {
  543. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  544. ptrbb = bb;
  545. #else
  546. ptrba += off*4*2;
  547. ptrbb = bb + off*1*2;
  548. #endif
  549. res00_r = 0;
  550. res00_i = 0;
  551. res01_r = 0;
  552. res01_i = 0;
  553. res02_r = 0;
  554. res02_i = 0;
  555. res03_r = 0;
  556. res03_i = 0;
  557. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  558. temp = bk-off;
  559. #elif defined(LEFT)
  560. temp = off+4; // number of values in A
  561. #else
  562. temp = off+1; // number of values in B
  563. #endif
  /* Each k step: 1 complex value from B, 4 from A, 4 products. */
  564. for (k=0; k<temp; k++)
  565. {
  566. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  567. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  568. MADD(res00, a0, b0);
  569. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  570. MADD(res01, a1, b0);
  571. a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
  572. MADD(res02, a0, b0);
  573. a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
  574. MADD(res03, a1, b0);
  575. ptrba = ptrba+8;
  576. ptrbb = ptrbb+2;
  577. }
  578. MADD_ALPHA_N_STORE(C0, res00, alpha);
  579. C0 = C0 + 2;
  580. MADD_ALPHA_N_STORE(C0, res01, alpha);
  581. C0 = C0 + 2;
  582. MADD_ALPHA_N_STORE(C0, res02, alpha);
  583. C0 = C0 + 2;
  584. MADD_ALPHA_N_STORE(C0, res03, alpha);
  585. C0 = C0 + 2;
  /* Advance over the k steps the loop above did not visit. */
  586. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  587. temp = bk - off;
  588. #ifdef LEFT
  589. temp -= 4; // number of values in A
  590. #else
  591. temp -= 1; // number of values in B
  592. #endif
  593. ptrba += temp*4*2;
  594. ptrbb += temp*1*2;
  595. #endif
  596. #ifdef LEFT
  597. off += 4; // number of values in A
  598. #endif
  599. }
  600. if ( bm & 2 ) // do any 2x1 loop
  601. {
  602. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  603. ptrbb = bb;
  604. #else
  605. ptrba += off*2*2;
  606. ptrbb = bb + off*1*2;
  607. #endif
  608. res00_r = 0;
  609. res00_i = 0;
  610. res01_r = 0;
  611. res01_i = 0;
  612. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  613. temp = bk-off;
  614. #elif defined(LEFT)
  615. temp = off+2; // number of values in A
  616. #else
  617. temp = off+1; // number of values in B
  618. #endif
  619. for (k=0; k<temp; k++)
  620. {
  621. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  622. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  623. MADD(res00, a0, b0);
  624. a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
  625. MADD(res01, a1, b0);
  626. ptrba = ptrba+4;
  627. ptrbb = ptrbb+2;
  628. }
  629. MADD_ALPHA_N_STORE(C0, res00, alpha);
  630. C0 = C0 + 2;
  631. MADD_ALPHA_N_STORE(C0, res01, alpha);
  632. C0 = C0 + 2;
  633. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  634. temp = bk - off;
  635. #ifdef LEFT
  636. temp -= 2; // number of values in A
  637. #else
  638. temp -= 1; // number of values in B
  639. #endif
  640. ptrba += temp*2*2;
  641. ptrbb += temp*1*2;
  642. #endif
  643. #ifdef LEFT
  644. off += 2; // number of values in A
  645. #endif
  646. }
  647. if ( bm & 1 ) // do any 1x1 loop
  648. {
  649. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  650. ptrbb = bb;
  651. #else
  652. ptrba += off*1*2;
  653. ptrbb = bb + off*1*2;
  654. #endif
  655. res00_r = 0;
  656. res00_i = 0;
  657. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  658. temp = bk-off;
  659. #elif defined(LEFT)
  660. temp = off+1; // number of values in A
  661. #else
  662. temp = off+1; // number of values in B
  663. #endif
  664. for (k=0; k<temp; k++)
  665. {
  666. b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
  667. a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
  668. MADD(res00, a0, b0);
  669. ptrba = ptrba+2;
  670. ptrbb = ptrbb+2;
  671. }
  672. MADD_ALPHA_N_STORE(C0, res00, alpha);
  673. C0 = C0 + 2;
  674. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  675. temp = bk - off;
  676. #ifdef LEFT
  677. temp -= 1; // number of values in A
  678. #else
  679. temp -= 1; // number of values in B
  680. #endif
  681. ptrba += temp*1*2;
  682. ptrbb += temp*1*2;
  683. #endif
  684. #ifdef LEFT
  685. off += 1; // number of values in A
  686. #endif
  687. }
  688. #if defined(TRMMKERNEL) && !defined(LEFT)
  689. off += 1;
  690. #endif
  /* Next panel: bb moves by bk*2 FLOATs (1 complex per k step), C by 1 column. */
  691. k = (bk<<1);
  692. bb = bb+k;
  693. i = (ldc<<1);
  694. C = C+i;
  695. }
  696. return 0;
  697. }