You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemmkernel_2x2.c 33 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838
  1. #include "common.h"
  /********************************
  Partial products of (a + b*i) * (c + d*i):
  ADD1 a*c
  ADD2 b*c
  ADD3 a*d
  ADD4 b*d
  *********************************/
  8. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  9. #ifdef TRMMKERNEL
  10. , BLASLONG offset
  11. #endif
  12. )
  13. {
  14. BLASLONG i,j,k;
  15. FLOAT *C0,*C1,*ptrba,*ptrbb;
  16. FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
  17. for (j=0; j<bn/2; j+=1)
  18. {
  19. C0 = C;
  20. C1 = C0+2*ldc;
  21. ptrba = ba;
  22. for (i=0; i<bm/2; i+=1)
  23. {
  24. ptrbb = bb;
  25. res0 = 0;
  26. res1 = 0;
  27. res2 = 0;
  28. res3 = 0;
  29. res4 = 0;
  30. res5 = 0;
  31. res6 = 0;
  32. res7 = 0;
  33. for (k=0; k<bk/4; k+=1)
  34. {
  35. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  36. load0 = ptrba[4*0+0];
  37. load1 = ptrbb[4*0+0];
  38. res0 = res0+load0*load1;
  39. load2 = ptrba[4*0+1];
  40. res1 = res1+load2*load1;
  41. load3 = ptrbb[4*0+1];
  42. res0 = res0-load2*load3;
  43. res1 = res1+load0*load3;
  44. load4 = ptrba[4*0+2];
  45. res2 = res2+load4*load1;
  46. load5 = ptrba[4*0+3];
  47. res3 = res3+load5*load1;
  48. res2 = res2-load5*load3;
  49. res3 = res3+load4*load3;
  50. load6 = ptrbb[4*0+2];
  51. res4 = res4+load0*load6;
  52. res5 = res5+load2*load6;
  53. load7 = ptrbb[4*0+3];
  54. res4 = res4-load2*load7;
  55. res5 = res5+load0*load7;
  56. res6 = res6+load4*load6;
  57. res7 = res7+load5*load6;
  58. res6 = res6-load5*load7;
  59. res7 = res7+load4*load7;
  60. load8 = ptrba[4*1+0];
  61. load9 = ptrbb[4*1+0];
  62. res0 = res0+load8*load9;
  63. load10 = ptrba[4*1+1];
  64. res1 = res1+load10*load9;
  65. load11 = ptrbb[4*1+1];
  66. res0 = res0-load10*load11;
  67. res1 = res1+load8*load11;
  68. load12 = ptrba[4*1+2];
  69. res2 = res2+load12*load9;
  70. load13 = ptrba[4*1+3];
  71. res3 = res3+load13*load9;
  72. res2 = res2-load13*load11;
  73. res3 = res3+load12*load11;
  74. load14 = ptrbb[4*1+2];
  75. res4 = res4+load8*load14;
  76. res5 = res5+load10*load14;
  77. load15 = ptrbb[4*1+3];
  78. res4 = res4-load10*load15;
  79. res5 = res5+load8*load15;
  80. res6 = res6+load12*load14;
  81. res7 = res7+load13*load14;
  82. res6 = res6-load13*load15;
  83. res7 = res7+load12*load15;
  84. load0 = ptrba[4*2+0];
  85. load1 = ptrbb[4*2+0];
  86. res0 = res0+load0*load1;
  87. load2 = ptrba[4*2+1];
  88. res1 = res1+load2*load1;
  89. load3 = ptrbb[4*2+1];
  90. res0 = res0-load2*load3;
  91. res1 = res1+load0*load3;
  92. load4 = ptrba[4*2+2];
  93. res2 = res2+load4*load1;
  94. load5 = ptrba[4*2+3];
  95. res3 = res3+load5*load1;
  96. res2 = res2-load5*load3;
  97. res3 = res3+load4*load3;
  98. load6 = ptrbb[4*2+2];
  99. res4 = res4+load0*load6;
  100. res5 = res5+load2*load6;
  101. load7 = ptrbb[4*2+3];
  102. res4 = res4-load2*load7;
  103. res5 = res5+load0*load7;
  104. res6 = res6+load4*load6;
  105. res7 = res7+load5*load6;
  106. res6 = res6-load5*load7;
  107. res7 = res7+load4*load7;
  108. load8 = ptrba[4*3+0];
  109. load9 = ptrbb[4*3+0];
  110. res0 = res0+load8*load9;
  111. load10 = ptrba[4*3+1];
  112. res1 = res1+load10*load9;
  113. load11 = ptrbb[4*3+1];
  114. res0 = res0-load10*load11;
  115. res1 = res1+load8*load11;
  116. load12 = ptrba[4*3+2];
  117. res2 = res2+load12*load9;
  118. load13 = ptrba[4*3+3];
  119. res3 = res3+load13*load9;
  120. res2 = res2-load13*load11;
  121. res3 = res3+load12*load11;
  122. load14 = ptrbb[4*3+2];
  123. res4 = res4+load8*load14;
  124. res5 = res5+load10*load14;
  125. load15 = ptrbb[4*3+3];
  126. res4 = res4-load10*load15;
  127. res5 = res5+load8*load15;
  128. res6 = res6+load12*load14;
  129. res7 = res7+load13*load14;
  130. res6 = res6-load13*load15;
  131. res7 = res7+load12*load15;
  132. #endif
  133. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  134. load0 = ptrba[4*0+0];
  135. load1 = ptrbb[4*0+0];
  136. res0 = res0+load0*load1;
  137. load2 = ptrba[4*0+1];
  138. res1 = res1+load2*load1;
  139. load3 = ptrbb[4*0+1];
  140. res0 = res0+load2*load3;
  141. res1 = res1-load0*load3;
  142. load4 = ptrba[4*0+2];
  143. res2 = res2+load4*load1;
  144. load5 = ptrba[4*0+3];
  145. res3 = res3+load5*load1;
  146. res2 = res2+load5*load3;
  147. res3 = res3-load4*load3;
  148. load6 = ptrbb[4*0+2];
  149. res4 = res4+load0*load6;
  150. res5 = res5+load2*load6;
  151. load7 = ptrbb[4*0+3];
  152. res4 = res4+load2*load7;
  153. res5 = res5-load0*load7;
  154. res6 = res6+load4*load6;
  155. res7 = res7+load5*load6;
  156. res6 = res6+load5*load7;
  157. res7 = res7-load4*load7;
  158. load8 = ptrba[4*1+0];
  159. load9 = ptrbb[4*1+0];
  160. res0 = res0+load8*load9;
  161. load10 = ptrba[4*1+1];
  162. res1 = res1+load10*load9;
  163. load11 = ptrbb[4*1+1];
  164. res0 = res0+load10*load11;
  165. res1 = res1-load8*load11;
  166. load12 = ptrba[4*1+2];
  167. res2 = res2+load12*load9;
  168. load13 = ptrba[4*1+3];
  169. res3 = res3+load13*load9;
  170. res2 = res2+load13*load11;
  171. res3 = res3-load12*load11;
  172. load14 = ptrbb[4*1+2];
  173. res4 = res4+load8*load14;
  174. res5 = res5+load10*load14;
  175. load15 = ptrbb[4*1+3];
  176. res4 = res4+load10*load15;
  177. res5 = res5-load8*load15;
  178. res6 = res6+load12*load14;
  179. res7 = res7+load13*load14;
  180. res6 = res6+load13*load15;
  181. res7 = res7-load12*load15;
  182. load0 = ptrba[4*2+0];
  183. load1 = ptrbb[4*2+0];
  184. res0 = res0+load0*load1;
  185. load2 = ptrba[4*2+1];
  186. res1 = res1+load2*load1;
  187. load3 = ptrbb[4*2+1];
  188. res0 = res0+load2*load3;
  189. res1 = res1-load0*load3;
  190. load4 = ptrba[4*2+2];
  191. res2 = res2+load4*load1;
  192. load5 = ptrba[4*2+3];
  193. res3 = res3+load5*load1;
  194. res2 = res2+load5*load3;
  195. res3 = res3-load4*load3;
  196. load6 = ptrbb[4*2+2];
  197. res4 = res4+load0*load6;
  198. res5 = res5+load2*load6;
  199. load7 = ptrbb[4*2+3];
  200. res4 = res4+load2*load7;
  201. res5 = res5-load0*load7;
  202. res6 = res6+load4*load6;
  203. res7 = res7+load5*load6;
  204. res6 = res6+load5*load7;
  205. res7 = res7-load4*load7;
  206. load8 = ptrba[4*3+0];
  207. load9 = ptrbb[4*3+0];
  208. res0 = res0+load8*load9;
  209. load10 = ptrba[4*3+1];
  210. res1 = res1+load10*load9;
  211. load11 = ptrbb[4*3+1];
  212. res0 = res0+load10*load11;
  213. res1 = res1-load8*load11;
  214. load12 = ptrba[4*3+2];
  215. res2 = res2+load12*load9;
  216. load13 = ptrba[4*3+3];
  217. res3 = res3+load13*load9;
  218. res2 = res2+load13*load11;
  219. res3 = res3-load12*load11;
  220. load14 = ptrbb[4*3+2];
  221. res4 = res4+load8*load14;
  222. res5 = res5+load10*load14;
  223. load15 = ptrbb[4*3+3];
  224. res4 = res4+load10*load15;
  225. res5 = res5-load8*load15;
  226. res6 = res6+load12*load14;
  227. res7 = res7+load13*load14;
  228. res6 = res6+load13*load15;
  229. res7 = res7-load12*load15;
  230. #endif
  231. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  232. load0 = ptrba[4*0+0];
  233. load1 = ptrbb[4*0+0];
  234. res0 = res0+load0*load1;
  235. load2 = ptrba[4*0+1];
  236. res1 = res1-load2*load1;
  237. load3 = ptrbb[4*0+1];
  238. res0 = res0+load2*load3;
  239. res1 = res1+load0*load3;
  240. load4 = ptrba[4*0+2];
  241. res2 = res2+load4*load1;
  242. load5 = ptrba[4*0+3];
  243. res3 = res3-load5*load1;
  244. res2 = res2+load5*load3;
  245. res3 = res3+load4*load3;
  246. load6 = ptrbb[4*0+2];
  247. res4 = res4+load0*load6;
  248. res5 = res5-load2*load6;
  249. load7 = ptrbb[4*0+3];
  250. res4 = res4+load2*load7;
  251. res5 = res5+load0*load7;
  252. res6 = res6+load4*load6;
  253. res7 = res7-load5*load6;
  254. res6 = res6+load5*load7;
  255. res7 = res7+load4*load7;
  256. load8 = ptrba[4*1+0];
  257. load9 = ptrbb[4*1+0];
  258. res0 = res0+load8*load9;
  259. load10 = ptrba[4*1+1];
  260. res1 = res1-load10*load9;
  261. load11 = ptrbb[4*1+1];
  262. res0 = res0+load10*load11;
  263. res1 = res1+load8*load11;
  264. load12 = ptrba[4*1+2];
  265. res2 = res2+load12*load9;
  266. load13 = ptrba[4*1+3];
  267. res3 = res3-load13*load9;
  268. res2 = res2+load13*load11;
  269. res3 = res3+load12*load11;
  270. load14 = ptrbb[4*1+2];
  271. res4 = res4+load8*load14;
  272. res5 = res5-load10*load14;
  273. load15 = ptrbb[4*1+3];
  274. res4 = res4+load10*load15;
  275. res5 = res5+load8*load15;
  276. res6 = res6+load12*load14;
  277. res7 = res7-load13*load14;
  278. res6 = res6+load13*load15;
  279. res7 = res7+load12*load15;
  280. load0 = ptrba[4*2+0];
  281. load1 = ptrbb[4*2+0];
  282. res0 = res0+load0*load1;
  283. load2 = ptrba[4*2+1];
  284. res1 = res1-load2*load1;
  285. load3 = ptrbb[4*2+1];
  286. res0 = res0+load2*load3;
  287. res1 = res1+load0*load3;
  288. load4 = ptrba[4*2+2];
  289. res2 = res2+load4*load1;
  290. load5 = ptrba[4*2+3];
  291. res3 = res3-load5*load1;
  292. res2 = res2+load5*load3;
  293. res3 = res3+load4*load3;
  294. load6 = ptrbb[4*2+2];
  295. res4 = res4+load0*load6;
  296. res5 = res5-load2*load6;
  297. load7 = ptrbb[4*2+3];
  298. res4 = res4+load2*load7;
  299. res5 = res5+load0*load7;
  300. res6 = res6+load4*load6;
  301. res7 = res7-load5*load6;
  302. res6 = res6+load5*load7;
  303. res7 = res7+load4*load7;
  304. load8 = ptrba[4*3+0];
  305. load9 = ptrbb[4*3+0];
  306. res0 = res0+load8*load9;
  307. load10 = ptrba[4*3+1];
  308. res1 = res1-load10*load9;
  309. load11 = ptrbb[4*3+1];
  310. res0 = res0+load10*load11;
  311. res1 = res1+load8*load11;
  312. load12 = ptrba[4*3+2];
  313. res2 = res2+load12*load9;
  314. load13 = ptrba[4*3+3];
  315. res3 = res3-load13*load9;
  316. res2 = res2+load13*load11;
  317. res3 = res3+load12*load11;
  318. load14 = ptrbb[4*3+2];
  319. res4 = res4+load8*load14;
  320. res5 = res5-load10*load14;
  321. load15 = ptrbb[4*3+3];
  322. res4 = res4+load10*load15;
  323. res5 = res5+load8*load15;
  324. res6 = res6+load12*load14;
  325. res7 = res7-load13*load14;
  326. res6 = res6+load13*load15;
  327. res7 = res7+load12*load15;
  328. #endif
  329. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  330. load0 = ptrba[4*0+0];
  331. load1 = ptrbb[4*0+0];
  332. res0 = res0+load0*load1;
  333. load2 = ptrba[4*0+1];
  334. res1 = res1-load2*load1;
  335. load3 = ptrbb[4*0+1];
  336. res0 = res0-load2*load3;
  337. res1 = res1-load0*load3;
  338. load4 = ptrba[4*0+2];
  339. res2 = res2+load4*load1;
  340. load5 = ptrba[4*0+3];
  341. res3 = res3-load5*load1;
  342. res2 = res2-load5*load3;
  343. res3 = res3-load4*load3;
  344. load6 = ptrbb[4*0+2];
  345. res4 = res4+load0*load6;
  346. res5 = res5-load2*load6;
  347. load7 = ptrbb[4*0+3];
  348. res4 = res4-load2*load7;
  349. res5 = res5-load0*load7;
  350. res6 = res6+load4*load6;
  351. res7 = res7-load5*load6;
  352. res6 = res6-load5*load7;
  353. res7 = res7-load4*load7;
  354. load8 = ptrba[4*1+0];
  355. load9 = ptrbb[4*1+0];
  356. res0 = res0+load8*load9;
  357. load10 = ptrba[4*1+1];
  358. res1 = res1-load10*load9;
  359. load11 = ptrbb[4*1+1];
  360. res0 = res0-load10*load11;
  361. res1 = res1-load8*load11;
  362. load12 = ptrba[4*1+2];
  363. res2 = res2+load12*load9;
  364. load13 = ptrba[4*1+3];
  365. res3 = res3-load13*load9;
  366. res2 = res2-load13*load11;
  367. res3 = res3-load12*load11;
  368. load14 = ptrbb[4*1+2];
  369. res4 = res4+load8*load14;
  370. res5 = res5-load10*load14;
  371. load15 = ptrbb[4*1+3];
  372. res4 = res4-load10*load15;
  373. res5 = res5-load8*load15;
  374. res6 = res6+load12*load14;
  375. res7 = res7-load13*load14;
  376. res6 = res6-load13*load15;
  377. res7 = res7-load12*load15;
  378. load0 = ptrba[4*2+0];
  379. load1 = ptrbb[4*2+0];
  380. res0 = res0+load0*load1;
  381. load2 = ptrba[4*2+1];
  382. res1 = res1-load2*load1;
  383. load3 = ptrbb[4*2+1];
  384. res0 = res0-load2*load3;
  385. res1 = res1-load0*load3;
  386. load4 = ptrba[4*2+2];
  387. res2 = res2+load4*load1;
  388. load5 = ptrba[4*2+3];
  389. res3 = res3-load5*load1;
  390. res2 = res2-load5*load3;
  391. res3 = res3-load4*load3;
  392. load6 = ptrbb[4*2+2];
  393. res4 = res4+load0*load6;
  394. res5 = res5-load2*load6;
  395. load7 = ptrbb[4*2+3];
  396. res4 = res4-load2*load7;
  397. res5 = res5-load0*load7;
  398. res6 = res6+load4*load6;
  399. res7 = res7-load5*load6;
  400. res6 = res6-load5*load7;
  401. res7 = res7-load4*load7;
  402. load8 = ptrba[4*3+0];
  403. load9 = ptrbb[4*3+0];
  404. res0 = res0+load8*load9;
  405. load10 = ptrba[4*3+1];
  406. res1 = res1-load10*load9;
  407. load11 = ptrbb[4*3+1];
  408. res0 = res0-load10*load11;
  409. res1 = res1-load8*load11;
  410. load12 = ptrba[4*3+2];
  411. res2 = res2+load12*load9;
  412. load13 = ptrba[4*3+3];
  413. res3 = res3-load13*load9;
  414. res2 = res2-load13*load11;
  415. res3 = res3-load12*load11;
  416. load14 = ptrbb[4*3+2];
  417. res4 = res4+load8*load14;
  418. res5 = res5-load10*load14;
  419. load15 = ptrbb[4*3+3];
  420. res4 = res4-load10*load15;
  421. res5 = res5-load8*load15;
  422. res6 = res6+load12*load14;
  423. res7 = res7-load13*load14;
  424. res6 = res6-load13*load15;
  425. res7 = res7-load12*load15;
  426. #endif
  427. ptrba = ptrba+16;
  428. ptrbb = ptrbb+16;
  429. }
  430. for (k=0; k<(bk&3); k+=1)
  431. {
  432. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  433. load0 = ptrba[4*0+0];
  434. load1 = ptrbb[4*0+0];
  435. res0 = res0+load0*load1;
  436. load2 = ptrba[4*0+1];
  437. res1 = res1+load2*load1;
  438. load3 = ptrbb[4*0+1];
  439. res0 = res0-load2*load3;
  440. res1 = res1+load0*load3;
  441. load4 = ptrba[4*0+2];
  442. res2 = res2+load4*load1;
  443. load5 = ptrba[4*0+3];
  444. res3 = res3+load5*load1;
  445. res2 = res2-load5*load3;
  446. res3 = res3+load4*load3;
  447. load6 = ptrbb[4*0+2];
  448. res4 = res4+load0*load6;
  449. res5 = res5+load2*load6;
  450. load7 = ptrbb[4*0+3];
  451. res4 = res4-load2*load7;
  452. res5 = res5+load0*load7;
  453. res6 = res6+load4*load6;
  454. res7 = res7+load5*load6;
  455. res6 = res6-load5*load7;
  456. res7 = res7+load4*load7;
  457. #endif
  458. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  459. load0 = ptrba[4*0+0];
  460. load1 = ptrbb[4*0+0];
  461. res0 = res0+load0*load1;
  462. load2 = ptrba[4*0+1];
  463. res1 = res1+load2*load1;
  464. load3 = ptrbb[4*0+1];
  465. res0 = res0+load2*load3;
  466. res1 = res1-load0*load3;
  467. load4 = ptrba[4*0+2];
  468. res2 = res2+load4*load1;
  469. load5 = ptrba[4*0+3];
  470. res3 = res3+load5*load1;
  471. res2 = res2+load5*load3;
  472. res3 = res3-load4*load3;
  473. load6 = ptrbb[4*0+2];
  474. res4 = res4+load0*load6;
  475. res5 = res5+load2*load6;
  476. load7 = ptrbb[4*0+3];
  477. res4 = res4+load2*load7;
  478. res5 = res5-load0*load7;
  479. res6 = res6+load4*load6;
  480. res7 = res7+load5*load6;
  481. res6 = res6+load5*load7;
  482. res7 = res7-load4*load7;
  483. #endif
  484. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  485. load0 = ptrba[4*0+0];
  486. load1 = ptrbb[4*0+0];
  487. res0 = res0+load0*load1;
  488. load2 = ptrba[4*0+1];
  489. res1 = res1-load2*load1;
  490. load3 = ptrbb[4*0+1];
  491. res0 = res0+load2*load3;
  492. res1 = res1+load0*load3;
  493. load4 = ptrba[4*0+2];
  494. res2 = res2+load4*load1;
  495. load5 = ptrba[4*0+3];
  496. res3 = res3-load5*load1;
  497. res2 = res2+load5*load3;
  498. res3 = res3+load4*load3;
  499. load6 = ptrbb[4*0+2];
  500. res4 = res4+load0*load6;
  501. res5 = res5-load2*load6;
  502. load7 = ptrbb[4*0+3];
  503. res4 = res4+load2*load7;
  504. res5 = res5+load0*load7;
  505. res6 = res6+load4*load6;
  506. res7 = res7-load5*load6;
  507. res6 = res6+load5*load7;
  508. res7 = res7+load4*load7;
  509. #endif
  510. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  511. load0 = ptrba[4*0+0];
  512. load1 = ptrbb[4*0+0];
  513. res0 = res0+load0*load1;
  514. load2 = ptrba[4*0+1];
  515. res1 = res1-load2*load1;
  516. load3 = ptrbb[4*0+1];
  517. res0 = res0-load2*load3;
  518. res1 = res1-load0*load3;
  519. load4 = ptrba[4*0+2];
  520. res2 = res2+load4*load1;
  521. load5 = ptrba[4*0+3];
  522. res3 = res3-load5*load1;
  523. res2 = res2-load5*load3;
  524. res3 = res3-load4*load3;
  525. load6 = ptrbb[4*0+2];
  526. res4 = res4+load0*load6;
  527. res5 = res5-load2*load6;
  528. load7 = ptrbb[4*0+3];
  529. res4 = res4-load2*load7;
  530. res5 = res5-load0*load7;
  531. res6 = res6+load4*load6;
  532. res7 = res7-load5*load6;
  533. res6 = res6-load5*load7;
  534. res7 = res7-load4*load7;
  535. #endif
  536. ptrba = ptrba+4;
  537. ptrbb = ptrbb+4;
  538. }
  539. load0 = res0*alphar;
  540. C0[0] = C0[0]+load0;
  541. load1 = res1*alphar;
  542. C0[1] = C0[1]+load1;
  543. load0 = res1*alphai;
  544. C0[0] = C0[0]-load0;
  545. load1 = res0*alphai;
  546. C0[1] = C0[1]+load1;
  547. load2 = res2*alphar;
  548. C0[2] = C0[2]+load2;
  549. load3 = res3*alphar;
  550. C0[3] = C0[3]+load3;
  551. load2 = res3*alphai;
  552. C0[2] = C0[2]-load2;
  553. load3 = res2*alphai;
  554. C0[3] = C0[3]+load3;
  555. load4 = res4*alphar;
  556. C1[0] = C1[0]+load4;
  557. load5 = res5*alphar;
  558. C1[1] = C1[1]+load5;
  559. load4 = res5*alphai;
  560. C1[0] = C1[0]-load4;
  561. load5 = res4*alphai;
  562. C1[1] = C1[1]+load5;
  563. load6 = res6*alphar;
  564. C1[2] = C1[2]+load6;
  565. load7 = res7*alphar;
  566. C1[3] = C1[3]+load7;
  567. load6 = res7*alphai;
  568. C1[2] = C1[2]-load6;
  569. load7 = res6*alphai;
  570. C1[3] = C1[3]+load7;
  571. C0 = C0+4;
  572. C1 = C1+4;
  573. }
  574. for (i=0; i<(bm&1); i+=1)
  575. {
  576. ptrbb = bb;
  577. res0 = 0;
  578. res1 = 0;
  579. res2 = 0;
  580. res3 = 0;
  581. for (k=0; k<bk; k+=1)
  582. {
  583. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  584. load0 = ptrba[2*0+0];
  585. load1 = ptrbb[4*0+0];
  586. res0 = res0+load0*load1;
  587. load2 = ptrba[2*0+1];
  588. res1 = res1+load2*load1;
  589. load3 = ptrbb[4*0+1];
  590. res0 = res0-load2*load3;
  591. res1 = res1+load0*load3;
  592. load4 = ptrbb[4*0+2];
  593. res2 = res2+load0*load4;
  594. res3 = res3+load2*load4;
  595. load5 = ptrbb[4*0+3];
  596. res2 = res2-load2*load5;
  597. res3 = res3+load0*load5;
  598. #endif
  599. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  600. load0 = ptrba[2*0+0];
  601. load1 = ptrbb[4*0+0];
  602. res0 = res0+load0*load1;
  603. load2 = ptrba[2*0+1];
  604. res1 = res1+load2*load1;
  605. load3 = ptrbb[4*0+1];
  606. res0 = res0+load2*load3;
  607. res1 = res1-load0*load3;
  608. load4 = ptrbb[4*0+2];
  609. res2 = res2+load0*load4;
  610. res3 = res3+load2*load4;
  611. load5 = ptrbb[4*0+3];
  612. res2 = res2+load2*load5;
  613. res3 = res3-load0*load5;
  614. #endif
  615. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  616. load0 = ptrba[2*0+0];
  617. load1 = ptrbb[4*0+0];
  618. res0 = res0+load0*load1;
  619. load2 = ptrba[2*0+1];
  620. res1 = res1-load2*load1;
  621. load3 = ptrbb[4*0+1];
  622. res0 = res0+load2*load3;
  623. res1 = res1+load0*load3;
  624. load4 = ptrbb[4*0+2];
  625. res2 = res2+load0*load4;
  626. res3 = res3-load2*load4;
  627. load5 = ptrbb[4*0+3];
  628. res2 = res2+load2*load5;
  629. res3 = res3+load0*load5;
  630. #endif
  631. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  632. load0 = ptrba[2*0+0];
  633. load1 = ptrbb[4*0+0];
  634. res0 = res0+load0*load1;
  635. load2 = ptrba[2*0+1];
  636. res1 = res1-load2*load1;
  637. load3 = ptrbb[4*0+1];
  638. res0 = res0-load2*load3;
  639. res1 = res1-load0*load3;
  640. load4 = ptrbb[4*0+2];
  641. res2 = res2+load0*load4;
  642. res3 = res3-load2*load4;
  643. load5 = ptrbb[4*0+3];
  644. res2 = res2-load2*load5;
  645. res3 = res3-load0*load5;
  646. #endif
  647. ptrba = ptrba+2;
  648. ptrbb = ptrbb+4;
  649. }
  650. load0 = res0*alphar;
  651. C0[0] = C0[0]+load0;
  652. load1 = res1*alphar;
  653. C0[1] = C0[1]+load1;
  654. load0 = res1*alphai;
  655. C0[0] = C0[0]-load0;
  656. load1 = res0*alphai;
  657. C0[1] = C0[1]+load1;
  658. load2 = res2*alphar;
  659. C1[0] = C1[0]+load2;
  660. load3 = res3*alphar;
  661. C1[1] = C1[1]+load3;
  662. load2 = res3*alphai;
  663. C1[0] = C1[0]-load2;
  664. load3 = res2*alphai;
  665. C1[1] = C1[1]+load3;
  666. C0 = C0+2;
  667. C1 = C1+2;
  668. }
  669. k = (bk<<2);
  670. bb = bb+k;
  671. i = (ldc<<2);
  672. C = C+i;
  673. }
  674. for (j=0; j<(bn&1); j+=1)
  675. {
  676. C0 = C;
  677. ptrba = ba;
  678. for (i=0; i<bm/2; i+=1)
  679. {
  680. ptrbb = bb;
  681. res0 = 0;
  682. res1 = 0;
  683. res2 = 0;
  684. res3 = 0;
  685. for (k=0; k<bk; k+=1)
  686. {
  687. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  688. load0 = ptrba[4*0+0];
  689. load1 = ptrbb[2*0+0];
  690. res0 = res0+load0*load1;
  691. load2 = ptrba[4*0+1];
  692. res1 = res1+load2*load1;
  693. load3 = ptrbb[2*0+1];
  694. res0 = res0-load2*load3;
  695. res1 = res1+load0*load3;
  696. load4 = ptrba[4*0+2];
  697. res2 = res2+load4*load1;
  698. load5 = ptrba[4*0+3];
  699. res3 = res3+load5*load1;
  700. res2 = res2-load5*load3;
  701. res3 = res3+load4*load3;
  702. #endif
  703. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  704. load0 = ptrba[4*0+0];
  705. load1 = ptrbb[2*0+0];
  706. res0 = res0+load0*load1;
  707. load2 = ptrba[4*0+1];
  708. res1 = res1+load2*load1;
  709. load3 = ptrbb[2*0+1];
  710. res0 = res0+load2*load3;
  711. res1 = res1-load0*load3;
  712. load4 = ptrba[4*0+2];
  713. res2 = res2+load4*load1;
  714. load5 = ptrba[4*0+3];
  715. res3 = res3+load5*load1;
  716. res2 = res2+load5*load3;
  717. res3 = res3-load4*load3;
  718. #endif
  719. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  720. load0 = ptrba[4*0+0];
  721. load1 = ptrbb[2*0+0];
  722. res0 = res0+load0*load1;
  723. load2 = ptrba[4*0+1];
  724. res1 = res1-load2*load1;
  725. load3 = ptrbb[2*0+1];
  726. res0 = res0+load2*load3;
  727. res1 = res1+load0*load3;
  728. load4 = ptrba[4*0+2];
  729. res2 = res2+load4*load1;
  730. load5 = ptrba[4*0+3];
  731. res3 = res3-load5*load1;
  732. res2 = res2+load5*load3;
  733. res3 = res3+load4*load3;
  734. #endif
  735. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  736. load0 = ptrba[4*0+0];
  737. load1 = ptrbb[2*0+0];
  738. res0 = res0+load0*load1;
  739. load2 = ptrba[4*0+1];
  740. res1 = res1-load2*load1;
  741. load3 = ptrbb[2*0+1];
  742. res0 = res0-load2*load3;
  743. res1 = res1-load0*load3;
  744. load4 = ptrba[4*0+2];
  745. res2 = res2+load4*load1;
  746. load5 = ptrba[4*0+3];
  747. res3 = res3-load5*load1;
  748. res2 = res2-load5*load3;
  749. res3 = res3-load4*load3;
  750. #endif
  751. ptrba = ptrba+4;
  752. ptrbb = ptrbb+2;
  753. }
  754. load0 = res0*alphar;
  755. C0[0] = C0[0]+load0;
  756. load1 = res1*alphar;
  757. C0[1] = C0[1]+load1;
  758. load0 = res1*alphai;
  759. C0[0] = C0[0]-load0;
  760. load1 = res0*alphai;
  761. C0[1] = C0[1]+load1;
  762. load2 = res2*alphar;
  763. C0[2] = C0[2]+load2;
  764. load3 = res3*alphar;
  765. C0[3] = C0[3]+load3;
  766. load2 = res3*alphai;
  767. C0[2] = C0[2]-load2;
  768. load3 = res2*alphai;
  769. C0[3] = C0[3]+load3;
  770. C0 = C0+4;
  771. }
  772. for (i=0; i<(bm&1); i+=1)
  773. {
  774. ptrbb = bb;
  775. res0 = 0;
  776. res1 = 0;
  777. for (k=0; k<bk; k+=1)
  778. {
  779. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  780. load0 = ptrba[2*0+0];
  781. load1 = ptrbb[2*0+0];
  782. res0 = res0+load0*load1;
  783. load2 = ptrba[2*0+1];
  784. res1 = res1+load2*load1;
  785. load3 = ptrbb[2*0+1];
  786. res0 = res0-load2*load3;
  787. res1 = res1+load0*load3;
  788. #endif
  789. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  790. load0 = ptrba[2*0+0];
  791. load1 = ptrbb[2*0+0];
  792. res0 = res0+load0*load1;
  793. load2 = ptrba[2*0+1];
  794. res1 = res1+load2*load1;
  795. load3 = ptrbb[2*0+1];
  796. res0 = res0+load2*load3;
  797. res1 = res1-load0*load3;
  798. #endif
  799. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  800. load0 = ptrba[2*0+0];
  801. load1 = ptrbb[2*0+0];
  802. res0 = res0+load0*load1;
  803. load2 = ptrba[2*0+1];
  804. res1 = res1-load2*load1;
  805. load3 = ptrbb[2*0+1];
  806. res0 = res0+load2*load3;
  807. res1 = res1+load0*load3;
  808. #endif
  809. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  810. load0 = ptrba[2*0+0];
  811. load1 = ptrbb[2*0+0];
  812. res0 = res0+load0*load1;
  813. load2 = ptrba[2*0+1];
  814. res1 = res1-load2*load1;
  815. load3 = ptrbb[2*0+1];
  816. res0 = res0-load2*load3;
  817. res1 = res1-load0*load3;
  818. #endif
  819. ptrba = ptrba+2;
  820. ptrbb = ptrbb+2;
  821. }
  822. load0 = res0*alphar;
  823. C0[0] = C0[0]+load0;
  824. load1 = res1*alphar;
  825. C0[1] = C0[1]+load1;
  826. load0 = res1*alphai;
  827. C0[0] = C0[0]-load0;
  828. load1 = res0*alphai;
  829. C0[1] = C0[1]+load1;
  830. C0 = C0+2;
  831. }
  832. k = (bk<<1);
  833. bb = bb+k;
  834. i = (ldc<<1);
  835. C = C+i;
  836. }
  837. return 0;
  838. }