You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trmmkernel_16x4.c 33 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092
  1. #include "common.h"
  2. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
  3. {
  4. BLASLONG i,j,k;
  5. FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
  6. FLOAT res0_0;
  7. FLOAT res0_1;
  8. FLOAT res0_2;
  9. FLOAT res0_3;
  10. FLOAT res0_4;
  11. FLOAT res0_5;
  12. FLOAT res0_6;
  13. FLOAT res0_7;
  14. FLOAT res0_8;
  15. FLOAT res0_9;
  16. FLOAT res0_10;
  17. FLOAT res0_11;
  18. FLOAT res0_12;
  19. FLOAT res0_13;
  20. FLOAT res0_14;
  21. FLOAT res0_15;
  22. FLOAT res1_0;
  23. FLOAT res1_1;
  24. FLOAT res1_2;
  25. FLOAT res1_3;
  26. FLOAT res1_4;
  27. FLOAT res1_5;
  28. FLOAT res1_6;
  29. FLOAT res1_7;
  30. FLOAT res1_8;
  31. FLOAT res1_9;
  32. FLOAT res1_10;
  33. FLOAT res1_11;
  34. FLOAT res1_12;
  35. FLOAT res1_13;
  36. FLOAT res1_14;
  37. FLOAT res1_15;
  38. FLOAT res2_0;
  39. FLOAT res2_1;
  40. FLOAT res2_2;
  41. FLOAT res2_3;
  42. FLOAT res2_4;
  43. FLOAT res2_5;
  44. FLOAT res2_6;
  45. FLOAT res2_7;
  46. FLOAT res2_8;
  47. FLOAT res2_9;
  48. FLOAT res2_10;
  49. FLOAT res2_11;
  50. FLOAT res2_12;
  51. FLOAT res2_13;
  52. FLOAT res2_14;
  53. FLOAT res2_15;
  54. FLOAT res3_0;
  55. FLOAT res3_1;
  56. FLOAT res3_2;
  57. FLOAT res3_3;
  58. FLOAT res3_4;
  59. FLOAT res3_5;
  60. FLOAT res3_6;
  61. FLOAT res3_7;
  62. FLOAT res3_8;
  63. FLOAT res3_9;
  64. FLOAT res3_10;
  65. FLOAT res3_11;
  66. FLOAT res3_12;
  67. FLOAT res3_13;
  68. FLOAT res3_14;
  69. FLOAT res3_15;
  70. FLOAT a0;
  71. FLOAT a1;
  72. FLOAT b0;
  73. FLOAT b1;
  74. FLOAT b2;
  75. FLOAT b3;
  76. BLASLONG off, temp;
  77. #if !defined(LEFT)
  78. off = -offset;
  79. #else
  80. off = 0;
  81. #endif
  82. for (j=0; j<bn/4; j+=1)
  83. {
  84. C0 = C;
  85. C1 = C0+ldc;
  86. C2 = C0+2*ldc;
  87. C3 = C0+3*ldc;
  88. #if defined(TRMMKERNEL) && defined(LEFT)
  89. off = offset;
  90. #endif
  91. ptrba = ba;
  92. for (i=0; i<bm/16; i+=1)
  93. {
  94. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  95. ptrbb = bb;
  96. #else
  97. ptrba += off*16;
  98. ptrbb = bb + off*4;
  99. #endif
  100. res0_0 = 0;
  101. res0_1 = 0;
  102. res0_2 = 0;
  103. res0_3 = 0;
  104. res0_4 = 0;
  105. res0_5 = 0;
  106. res0_6 = 0;
  107. res0_7 = 0;
  108. res0_8 = 0;
  109. res0_9 = 0;
  110. res0_10 = 0;
  111. res0_11 = 0;
  112. res0_12 = 0;
  113. res0_13 = 0;
  114. res0_14 = 0;
  115. res0_15 = 0;
  116. res1_0 = 0;
  117. res1_1 = 0;
  118. res1_2 = 0;
  119. res1_3 = 0;
  120. res1_4 = 0;
  121. res1_5 = 0;
  122. res1_6 = 0;
  123. res1_7 = 0;
  124. res1_8 = 0;
  125. res1_9 = 0;
  126. res1_10 = 0;
  127. res1_11 = 0;
  128. res1_12 = 0;
  129. res1_13 = 0;
  130. res1_14 = 0;
  131. res1_15 = 0;
  132. res2_0 = 0;
  133. res2_1 = 0;
  134. res2_2 = 0;
  135. res2_3 = 0;
  136. res2_4 = 0;
  137. res2_5 = 0;
  138. res2_6 = 0;
  139. res2_7 = 0;
  140. res2_8 = 0;
  141. res2_9 = 0;
  142. res2_10 = 0;
  143. res2_11 = 0;
  144. res2_12 = 0;
  145. res2_13 = 0;
  146. res2_14 = 0;
  147. res2_15 = 0;
  148. res3_0 = 0;
  149. res3_1 = 0;
  150. res3_2 = 0;
  151. res3_3 = 0;
  152. res3_4 = 0;
  153. res3_5 = 0;
  154. res3_6 = 0;
  155. res3_7 = 0;
  156. res3_8 = 0;
  157. res3_9 = 0;
  158. res3_10 = 0;
  159. res3_11 = 0;
  160. res3_12 = 0;
  161. res3_13 = 0;
  162. res3_14 = 0;
  163. res3_15 = 0;
  164. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  165. temp = bk-off;
  166. #elif defined(LEFT)
  167. temp = off+16; // number of values in A
  168. #else
  169. temp = off+4; // number of values in B
  170. #endif
  171. for (k=0; k<temp; k++)
  172. {
  173. b0 = ptrbb[0];
  174. b1 = ptrbb[1];
  175. b2 = ptrbb[2];
  176. b3 = ptrbb[3];
  177. a0 = ptrba[0];
  178. res0_0 += a0*b0;
  179. res1_0 += a0*b1;
  180. res2_0 += a0*b2;
  181. res3_0 += a0*b3;
  182. a1 = ptrba[1];
  183. res0_1 += a1*b0;
  184. res1_1 += a1*b1;
  185. res2_1 += a1*b2;
  186. res3_1 += a1*b3;
  187. a0 = ptrba[2];
  188. res0_2 += a0*b0;
  189. res1_2 += a0*b1;
  190. res2_2 += a0*b2;
  191. res3_2 += a0*b3;
  192. a1 = ptrba[3];
  193. res0_3 += a1*b0;
  194. res1_3 += a1*b1;
  195. res2_3 += a1*b2;
  196. res3_3 += a1*b3;
  197. a0 = ptrba[4];
  198. res0_4 += a0*b0;
  199. res1_4 += a0*b1;
  200. res2_4 += a0*b2;
  201. res3_4 += a0*b3;
  202. a1 = ptrba[5];
  203. res0_5 += a1*b0;
  204. res1_5 += a1*b1;
  205. res2_5 += a1*b2;
  206. res3_5 += a1*b3;
  207. a0 = ptrba[6];
  208. res0_6 += a0*b0;
  209. res1_6 += a0*b1;
  210. res2_6 += a0*b2;
  211. res3_6 += a0*b3;
  212. a1 = ptrba[7];
  213. res0_7 += a1*b0;
  214. res1_7 += a1*b1;
  215. res2_7 += a1*b2;
  216. res3_7 += a1*b3;
  217. a0 = ptrba[8];
  218. res0_8 += a0*b0;
  219. res1_8 += a0*b1;
  220. res2_8 += a0*b2;
  221. res3_8 += a0*b3;
  222. a1 = ptrba[9];
  223. res0_9 += a1*b0;
  224. res1_9 += a1*b1;
  225. res2_9 += a1*b2;
  226. res3_9 += a1*b3;
  227. a0 = ptrba[10];
  228. res0_10 += a0*b0;
  229. res1_10 += a0*b1;
  230. res2_10 += a0*b2;
  231. res3_10 += a0*b3;
  232. a1 = ptrba[11];
  233. res0_11 += a1*b0;
  234. res1_11 += a1*b1;
  235. res2_11 += a1*b2;
  236. res3_11 += a1*b3;
  237. a0 = ptrba[12];
  238. res0_12 += a0*b0;
  239. res1_12 += a0*b1;
  240. res2_12 += a0*b2;
  241. res3_12 += a0*b3;
  242. a1 = ptrba[13];
  243. res0_13 += a1*b0;
  244. res1_13 += a1*b1;
  245. res2_13 += a1*b2;
  246. res3_13 += a1*b3;
  247. a0 = ptrba[14];
  248. res0_14 += a0*b0;
  249. res1_14 += a0*b1;
  250. res2_14 += a0*b2;
  251. res3_14 += a0*b3;
  252. a1 = ptrba[15];
  253. res0_15 += a1*b0;
  254. res1_15 += a1*b1;
  255. res2_15 += a1*b2;
  256. res3_15 += a1*b3;
  257. ptrba = ptrba+16;
  258. ptrbb = ptrbb+4;
  259. }
  260. res0_0 *= alpha;
  261. res0_1 *= alpha;
  262. res0_2 *= alpha;
  263. res0_3 *= alpha;
  264. res0_4 *= alpha;
  265. res0_5 *= alpha;
  266. res0_6 *= alpha;
  267. res0_7 *= alpha;
  268. res0_8 *= alpha;
  269. res0_9 *= alpha;
  270. res0_10 *= alpha;
  271. res0_11 *= alpha;
  272. res0_12 *= alpha;
  273. res0_13 *= alpha;
  274. res0_14 *= alpha;
  275. res0_15 *= alpha;
  276. res1_0 *= alpha;
  277. res1_1 *= alpha;
  278. res1_2 *= alpha;
  279. res1_3 *= alpha;
  280. res1_4 *= alpha;
  281. res1_5 *= alpha;
  282. res1_6 *= alpha;
  283. res1_7 *= alpha;
  284. res1_8 *= alpha;
  285. res1_9 *= alpha;
  286. res1_10 *= alpha;
  287. res1_11 *= alpha;
  288. res1_12 *= alpha;
  289. res1_13 *= alpha;
  290. res1_14 *= alpha;
  291. res1_15 *= alpha;
  292. res2_0 *= alpha;
  293. res2_1 *= alpha;
  294. res2_2 *= alpha;
  295. res2_3 *= alpha;
  296. res2_4 *= alpha;
  297. res2_5 *= alpha;
  298. res2_6 *= alpha;
  299. res2_7 *= alpha;
  300. res2_8 *= alpha;
  301. res2_9 *= alpha;
  302. res2_10 *= alpha;
  303. res2_11 *= alpha;
  304. res2_12 *= alpha;
  305. res2_13 *= alpha;
  306. res2_14 *= alpha;
  307. res2_15 *= alpha;
  308. res3_0 *= alpha;
  309. res3_1 *= alpha;
  310. res3_2 *= alpha;
  311. res3_3 *= alpha;
  312. res3_4 *= alpha;
  313. res3_5 *= alpha;
  314. res3_6 *= alpha;
  315. res3_7 *= alpha;
  316. res3_8 *= alpha;
  317. res3_9 *= alpha;
  318. res3_10 *= alpha;
  319. res3_11 *= alpha;
  320. res3_12 *= alpha;
  321. res3_13 *= alpha;
  322. res3_14 *= alpha;
  323. res3_15 *= alpha;
  324. C0[0] = res0_0;
  325. C0[1] = res0_1;
  326. C0[2] = res0_2;
  327. C0[3] = res0_3;
  328. C0[4] = res0_4;
  329. C0[5] = res0_5;
  330. C0[6] = res0_6;
  331. C0[7] = res0_7;
  332. C0[8] = res0_8;
  333. C0[9] = res0_9;
  334. C0[10] = res0_10;
  335. C0[11] = res0_11;
  336. C0[12] = res0_12;
  337. C0[13] = res0_13;
  338. C0[14] = res0_14;
  339. C0[15] = res0_15;
  340. C1[0] = res1_0;
  341. C1[1] = res1_1;
  342. C1[2] = res1_2;
  343. C1[3] = res1_3;
  344. C1[4] = res1_4;
  345. C1[5] = res1_5;
  346. C1[6] = res1_6;
  347. C1[7] = res1_7;
  348. C1[8] = res1_8;
  349. C1[9] = res1_9;
  350. C1[10] = res1_10;
  351. C1[11] = res1_11;
  352. C1[12] = res1_12;
  353. C1[13] = res1_13;
  354. C1[14] = res1_14;
  355. C1[15] = res1_15;
  356. C2[0] = res2_0;
  357. C2[1] = res2_1;
  358. C2[2] = res2_2;
  359. C2[3] = res2_3;
  360. C2[4] = res2_4;
  361. C2[5] = res2_5;
  362. C2[6] = res2_6;
  363. C2[7] = res2_7;
  364. C2[8] = res2_8;
  365. C2[9] = res2_9;
  366. C2[10] = res2_10;
  367. C2[11] = res2_11;
  368. C2[12] = res2_12;
  369. C2[13] = res2_13;
  370. C2[14] = res2_14;
  371. C2[15] = res2_15;
  372. C3[0] = res3_0;
  373. C3[1] = res3_1;
  374. C3[2] = res3_2;
  375. C3[3] = res3_3;
  376. C3[4] = res3_4;
  377. C3[5] = res3_5;
  378. C3[6] = res3_6;
  379. C3[7] = res3_7;
  380. C3[8] = res3_8;
  381. C3[9] = res3_9;
  382. C3[10] = res3_10;
  383. C3[11] = res3_11;
  384. C3[12] = res3_12;
  385. C3[13] = res3_13;
  386. C3[14] = res3_14;
  387. C3[15] = res3_15;
  388. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  389. temp = bk - off;
  390. #ifdef LEFT
  391. temp -= 16; // number of values in A
  392. #else
  393. temp -= 4; // number of values in B
  394. #endif
  395. ptrba += temp*16;
  396. ptrbb += temp*4;
  397. #endif
  398. #ifdef LEFT
  399. off += 16; // number of values in A
  400. #endif
  401. C0 = C0+16;
  402. C1 = C1+16;
  403. C2 = C2+16;
  404. C3 = C3+16;
  405. }
  406. if ( bm & 8)
  407. {
  408. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  409. ptrbb = bb;
  410. #else
  411. ptrba += off*8;
  412. ptrbb = bb + off*4;
  413. #endif
  414. res0_0 = 0;
  415. res0_1 = 0;
  416. res0_2 = 0;
  417. res0_3 = 0;
  418. res0_4 = 0;
  419. res0_5 = 0;
  420. res0_6 = 0;
  421. res0_7 = 0;
  422. res1_0 = 0;
  423. res1_1 = 0;
  424. res1_2 = 0;
  425. res1_3 = 0;
  426. res1_4 = 0;
  427. res1_5 = 0;
  428. res1_6 = 0;
  429. res1_7 = 0;
  430. res2_0 = 0;
  431. res2_1 = 0;
  432. res2_2 = 0;
  433. res2_3 = 0;
  434. res2_4 = 0;
  435. res2_5 = 0;
  436. res2_6 = 0;
  437. res2_7 = 0;
  438. res3_0 = 0;
  439. res3_1 = 0;
  440. res3_2 = 0;
  441. res3_3 = 0;
  442. res3_4 = 0;
  443. res3_5 = 0;
  444. res3_6 = 0;
  445. res3_7 = 0;
  446. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  447. temp = bk-off;
  448. #elif defined(LEFT)
  449. temp = off+8; // number of values in A
  450. #else
  451. temp = off+4; // number of values in B
  452. #endif
  453. for (k=0; k<temp; k++)
  454. {
  455. b0 = ptrbb[0];
  456. b1 = ptrbb[1];
  457. b2 = ptrbb[2];
  458. b3 = ptrbb[3];
  459. a0 = ptrba[0];
  460. res0_0 += a0*b0;
  461. res1_0 += a0*b1;
  462. res2_0 += a0*b2;
  463. res3_0 += a0*b3;
  464. a1 = ptrba[1];
  465. res0_1 += a1*b0;
  466. res1_1 += a1*b1;
  467. res2_1 += a1*b2;
  468. res3_1 += a1*b3;
  469. a0 = ptrba[2];
  470. res0_2 += a0*b0;
  471. res1_2 += a0*b1;
  472. res2_2 += a0*b2;
  473. res3_2 += a0*b3;
  474. a1 = ptrba[3];
  475. res0_3 += a1*b0;
  476. res1_3 += a1*b1;
  477. res2_3 += a1*b2;
  478. res3_3 += a1*b3;
  479. a0 = ptrba[4];
  480. res0_4 += a0*b0;
  481. res1_4 += a0*b1;
  482. res2_4 += a0*b2;
  483. res3_4 += a0*b3;
  484. a1 = ptrba[5];
  485. res0_5 += a1*b0;
  486. res1_5 += a1*b1;
  487. res2_5 += a1*b2;
  488. res3_5 += a1*b3;
  489. a0 = ptrba[6];
  490. res0_6 += a0*b0;
  491. res1_6 += a0*b1;
  492. res2_6 += a0*b2;
  493. res3_6 += a0*b3;
  494. a1 = ptrba[7];
  495. res0_7 += a1*b0;
  496. res1_7 += a1*b1;
  497. res2_7 += a1*b2;
  498. res3_7 += a1*b3;
  499. ptrba = ptrba+8;
  500. ptrbb = ptrbb+4;
  501. }
  502. res0_0 *= alpha;
  503. res0_1 *= alpha;
  504. res0_2 *= alpha;
  505. res0_3 *= alpha;
  506. res0_4 *= alpha;
  507. res0_5 *= alpha;
  508. res0_6 *= alpha;
  509. res0_7 *= alpha;
  510. res1_0 *= alpha;
  511. res1_1 *= alpha;
  512. res1_2 *= alpha;
  513. res1_3 *= alpha;
  514. res1_4 *= alpha;
  515. res1_5 *= alpha;
  516. res1_6 *= alpha;
  517. res1_7 *= alpha;
  518. res2_0 *= alpha;
  519. res2_1 *= alpha;
  520. res2_2 *= alpha;
  521. res2_3 *= alpha;
  522. res2_4 *= alpha;
  523. res2_5 *= alpha;
  524. res2_6 *= alpha;
  525. res2_7 *= alpha;
  526. res3_0 *= alpha;
  527. res3_1 *= alpha;
  528. res3_2 *= alpha;
  529. res3_3 *= alpha;
  530. res3_4 *= alpha;
  531. res3_5 *= alpha;
  532. res3_6 *= alpha;
  533. res3_7 *= alpha;
  534. C0[0] = res0_0;
  535. C0[1] = res0_1;
  536. C0[2] = res0_2;
  537. C0[3] = res0_3;
  538. C0[4] = res0_4;
  539. C0[5] = res0_5;
  540. C0[6] = res0_6;
  541. C0[7] = res0_7;
  542. C1[0] = res1_0;
  543. C1[1] = res1_1;
  544. C1[2] = res1_2;
  545. C1[3] = res1_3;
  546. C1[4] = res1_4;
  547. C1[5] = res1_5;
  548. C1[6] = res1_6;
  549. C1[7] = res1_7;
  550. C2[0] = res2_0;
  551. C2[1] = res2_1;
  552. C2[2] = res2_2;
  553. C2[3] = res2_3;
  554. C2[4] = res2_4;
  555. C2[5] = res2_5;
  556. C2[6] = res2_6;
  557. C2[7] = res2_7;
  558. C3[0] = res3_0;
  559. C3[1] = res3_1;
  560. C3[2] = res3_2;
  561. C3[3] = res3_3;
  562. C3[4] = res3_4;
  563. C3[5] = res3_5;
  564. C3[6] = res3_6;
  565. C3[7] = res3_7;
  566. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  567. temp = bk - off;
  568. #ifdef LEFT
  569. temp -= 8; // number of values in A
  570. #else
  571. temp -= 4; // number of values in B
  572. #endif
  573. ptrba += temp*8;
  574. ptrbb += temp*4;
  575. #endif
  576. #ifdef LEFT
  577. off += 8; // number of values in A
  578. #endif
  579. C0 = C0+8;
  580. C1 = C1+8;
  581. C2 = C2+8;
  582. C3 = C3+8;
  583. }
  584. if ( bm & 4 )
  585. {
  586. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  587. ptrbb = bb;
  588. #else
  589. ptrba += off*4;
  590. ptrbb = bb + off*4;
  591. #endif
  592. res0_0 = 0;
  593. res0_1 = 0;
  594. res0_2 = 0;
  595. res0_3 = 0;
  596. res1_0 = 0;
  597. res1_1 = 0;
  598. res1_2 = 0;
  599. res1_3 = 0;
  600. res2_0 = 0;
  601. res2_1 = 0;
  602. res2_2 = 0;
  603. res2_3 = 0;
  604. res3_0 = 0;
  605. res3_1 = 0;
  606. res3_2 = 0;
  607. res3_3 = 0;
  608. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  609. temp = bk-off;
  610. #elif defined(LEFT)
  611. temp = off+4; // number of values in A
  612. #else
  613. temp = off+4; // number of values in B
  614. #endif
  615. for (k=0; k<temp; k++)
  616. {
  617. b0 = ptrbb[0];
  618. b1 = ptrbb[1];
  619. b2 = ptrbb[2];
  620. b3 = ptrbb[3];
  621. a0 = ptrba[0];
  622. res0_0 += a0*b0;
  623. res1_0 += a0*b1;
  624. res2_0 += a0*b2;
  625. res3_0 += a0*b3;
  626. a1 = ptrba[1];
  627. res0_1 += a1*b0;
  628. res1_1 += a1*b1;
  629. res2_1 += a1*b2;
  630. res3_1 += a1*b3;
  631. a0 = ptrba[2];
  632. res0_2 += a0*b0;
  633. res1_2 += a0*b1;
  634. res2_2 += a0*b2;
  635. res3_2 += a0*b3;
  636. a1 = ptrba[3];
  637. res0_3 += a1*b0;
  638. res1_3 += a1*b1;
  639. res2_3 += a1*b2;
  640. res3_3 += a1*b3;
  641. ptrba = ptrba+4;
  642. ptrbb = ptrbb+4;
  643. }
  644. res0_0 *= alpha;
  645. res0_1 *= alpha;
  646. res0_2 *= alpha;
  647. res0_3 *= alpha;
  648. res1_0 *= alpha;
  649. res1_1 *= alpha;
  650. res1_2 *= alpha;
  651. res1_3 *= alpha;
  652. res2_0 *= alpha;
  653. res2_1 *= alpha;
  654. res2_2 *= alpha;
  655. res2_3 *= alpha;
  656. res3_0 *= alpha;
  657. res3_1 *= alpha;
  658. res3_2 *= alpha;
  659. res3_3 *= alpha;
  660. C0[0] = res0_0;
  661. C0[1] = res0_1;
  662. C0[2] = res0_2;
  663. C0[3] = res0_3;
  664. C1[0] = res1_0;
  665. C1[1] = res1_1;
  666. C1[2] = res1_2;
  667. C1[3] = res1_3;
  668. C2[0] = res2_0;
  669. C2[1] = res2_1;
  670. C2[2] = res2_2;
  671. C2[3] = res2_3;
  672. C3[0] = res3_0;
  673. C3[1] = res3_1;
  674. C3[2] = res3_2;
  675. C3[3] = res3_3;
  676. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  677. temp = bk - off;
  678. #ifdef LEFT
  679. temp -= 4; // number of values in A
  680. #else
  681. temp -= 4; // number of values in B
  682. #endif
  683. ptrba += temp*4;
  684. ptrbb += temp*4;
  685. #endif
  686. #ifdef LEFT
  687. off += 4; // number of values in A
  688. #endif
  689. C0 = C0+4;
  690. C1 = C1+4;
  691. C2 = C2+4;
  692. C3 = C3+4;
  693. }
  694. if ( bm & 2 )
  695. {
  696. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  697. ptrbb = bb;
  698. #else
  699. ptrba += off*2;
  700. ptrbb = bb + off*4;
  701. #endif
  702. res0_0 = 0;
  703. res0_1 = 0;
  704. res1_0 = 0;
  705. res1_1 = 0;
  706. res2_0 = 0;
  707. res2_1 = 0;
  708. res3_0 = 0;
  709. res3_1 = 0;
  710. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  711. temp = bk-off;
  712. #elif defined(LEFT)
  713. temp = off+2; // number of values in A
  714. #else
  715. temp = off+4; // number of values in B
  716. #endif
  717. for (k=0; k<temp; k++)
  718. {
  719. b0 = ptrbb[0];
  720. b1 = ptrbb[1];
  721. b2 = ptrbb[2];
  722. b3 = ptrbb[3];
  723. a0 = ptrba[0];
  724. res0_0 += a0*b0;
  725. res1_0 += a0*b1;
  726. res2_0 += a0*b2;
  727. res3_0 += a0*b3;
  728. a1 = ptrba[1];
  729. res0_1 += a1*b0;
  730. res1_1 += a1*b1;
  731. res2_1 += a1*b2;
  732. res3_1 += a1*b3;
  733. ptrba = ptrba+2;
  734. ptrbb = ptrbb+4;
  735. }
  736. res0_0 *= alpha;
  737. res0_1 *= alpha;
  738. res1_0 *= alpha;
  739. res1_1 *= alpha;
  740. res2_0 *= alpha;
  741. res2_1 *= alpha;
  742. res3_0 *= alpha;
  743. res3_1 *= alpha;
  744. C0[0] = res0_0;
  745. C0[1] = res0_1;
  746. C1[0] = res1_0;
  747. C1[1] = res1_1;
  748. C2[0] = res2_0;
  749. C2[1] = res2_1;
  750. C3[0] = res3_0;
  751. C3[1] = res3_1;
  752. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  753. temp = bk - off;
  754. #ifdef LEFT
  755. temp -= 2; // number of values in A
  756. #else
  757. temp -= 4; // number of values in B
  758. #endif
  759. ptrba += temp*2;
  760. ptrbb += temp*4;
  761. #endif
  762. #ifdef LEFT
  763. off += 2; // number of values in A
  764. #endif
  765. C0 = C0+2;
  766. C1 = C1+2;
  767. C2 = C2+2;
  768. C3 = C3+2;
  769. }
  770. if ( bm & 1 )
  771. {
  772. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  773. ptrbb = bb;
  774. #else
  775. ptrba += off*1;
  776. ptrbb = bb + off*4;
  777. #endif
  778. res0_0 = 0;
  779. res1_0 = 0;
  780. res2_0 = 0;
  781. res3_0 = 0;
  782. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  783. temp = bk-off;
  784. #elif defined(LEFT)
  785. temp = off+1; // number of values in A
  786. #else
  787. temp = off+4; // number of values in B
  788. #endif
  789. for (k=0; k<temp; k++)
  790. {
  791. b0 = ptrbb[0];
  792. b1 = ptrbb[1];
  793. b2 = ptrbb[2];
  794. b3 = ptrbb[3];
  795. a0 = ptrba[0];
  796. res0_0 += a0*b0;
  797. res1_0 += a0*b1;
  798. res2_0 += a0*b2;
  799. res3_0 += a0*b3;
  800. ptrba = ptrba+1;
  801. ptrbb = ptrbb+4;
  802. }
  803. res0_0 *= alpha;
  804. res1_0 *= alpha;
  805. res2_0 *= alpha;
  806. res3_0 *= alpha;
  807. C0[0] = res0_0;
  808. C1[0] = res1_0;
  809. C2[0] = res2_0;
  810. C3[0] = res3_0;
  811. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  812. temp = bk - off;
  813. #ifdef LEFT
  814. temp -= 1; // number of values in A
  815. #else
  816. temp -= 4; // number of values in B
  817. #endif
  818. ptrba += temp*1;
  819. ptrbb += temp*4;
  820. #endif
  821. #ifdef LEFT
  822. off += 1; // number of values in A
  823. #endif
  824. C0 = C0+1;
  825. C1 = C1+1;
  826. C2 = C2+1;
  827. C3 = C3+1;
  828. }
  829. #if defined(TRMMKERNEL) && !defined(LEFT)
  830. off += 4;
  831. #endif
  832. k = (bk<<2);
  833. bb = bb+k;
  834. i = (ldc<<2);
  835. C = C+i;
  836. }
  837. if(bn&2)
  838. {
  839. C0 = C;
  840. C1 = C0+ldc;
  841. #if defined(TRMMKERNEL) && defined(LEFT)
  842. off = offset;
  843. #endif
  844. ptrba = ba;
  845. for (i=0; i<bm/16; i+=1)
  846. {
  847. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  848. ptrbb = bb;
  849. #else
  850. ptrba += off*16;
  851. ptrbb = bb + off*2;
  852. #endif
  853. res0_0 = 0;
  854. res0_1 = 0;
  855. res0_2 = 0;
  856. res0_3 = 0;
  857. res0_4 = 0;
  858. res0_5 = 0;
  859. res0_6 = 0;
  860. res0_7 = 0;
  861. res0_8 = 0;
  862. res0_9 = 0;
  863. res0_10 = 0;
  864. res0_11 = 0;
  865. res0_12 = 0;
  866. res0_13 = 0;
  867. res0_14 = 0;
  868. res0_15 = 0;
  869. res1_0 = 0;
  870. res1_1 = 0;
  871. res1_2 = 0;
  872. res1_3 = 0;
  873. res1_4 = 0;
  874. res1_5 = 0;
  875. res1_6 = 0;
  876. res1_7 = 0;
  877. res1_8 = 0;
  878. res1_9 = 0;
  879. res1_10 = 0;
  880. res1_11 = 0;
  881. res1_12 = 0;
  882. res1_13 = 0;
  883. res1_14 = 0;
  884. res1_15 = 0;
  885. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  886. temp = bk-off;
  887. #elif defined(LEFT)
  888. temp = off+16; // number of values in A
  889. #else
  890. temp = off+2; // number of values in B
  891. #endif
  892. for (k=0; k<temp; k++)
  893. {
  894. b0 = ptrbb[0];
  895. b1 = ptrbb[1];
  896. a0 = ptrba[0];
  897. res0_0 += a0*b0;
  898. res1_0 += a0*b1;
  899. a1 = ptrba[1];
  900. res0_1 += a1*b0;
  901. res1_1 += a1*b1;
  902. a0 = ptrba[2];
  903. res0_2 += a0*b0;
  904. res1_2 += a0*b1;
  905. a1 = ptrba[3];
  906. res0_3 += a1*b0;
  907. res1_3 += a1*b1;
  908. a0 = ptrba[4];
  909. res0_4 += a0*b0;
  910. res1_4 += a0*b1;
  911. a1 = ptrba[5];
  912. res0_5 += a1*b0;
  913. res1_5 += a1*b1;
  914. a0 = ptrba[6];
  915. res0_6 += a0*b0;
  916. res1_6 += a0*b1;
  917. a1 = ptrba[7];
  918. res0_7 += a1*b0;
  919. res1_7 += a1*b1;
  920. a0 = ptrba[8];
  921. res0_8 += a0*b0;
  922. res1_8 += a0*b1;
  923. a1 = ptrba[9];
  924. res0_9 += a1*b0;
  925. res1_9 += a1*b1;
  926. a0 = ptrba[10];
  927. res0_10 += a0*b0;
  928. res1_10 += a0*b1;
  929. a1 = ptrba[11];
  930. res0_11 += a1*b0;
  931. res1_11 += a1*b1;
  932. a0 = ptrba[12];
  933. res0_12 += a0*b0;
  934. res1_12 += a0*b1;
  935. a1 = ptrba[13];
  936. res0_13 += a1*b0;
  937. res1_13 += a1*b1;
  938. a0 = ptrba[14];
  939. res0_14 += a0*b0;
  940. res1_14 += a0*b1;
  941. a1 = ptrba[15];
  942. res0_15 += a1*b0;
  943. res1_15 += a1*b1;
  944. ptrba = ptrba+16;
  945. ptrbb = ptrbb+2;
  946. }
  947. res0_0 *= alpha;
  948. res0_1 *= alpha;
  949. res0_2 *= alpha;
  950. res0_3 *= alpha;
  951. res0_4 *= alpha;
  952. res0_5 *= alpha;
  953. res0_6 *= alpha;
  954. res0_7 *= alpha;
  955. res0_8 *= alpha;
  956. res0_9 *= alpha;
  957. res0_10 *= alpha;
  958. res0_11 *= alpha;
  959. res0_12 *= alpha;
  960. res0_13 *= alpha;
  961. res0_14 *= alpha;
  962. res0_15 *= alpha;
  963. res1_0 *= alpha;
  964. res1_1 *= alpha;
  965. res1_2 *= alpha;
  966. res1_3 *= alpha;
  967. res1_4 *= alpha;
  968. res1_5 *= alpha;
  969. res1_6 *= alpha;
  970. res1_7 *= alpha;
  971. res1_8 *= alpha;
  972. res1_9 *= alpha;
  973. res1_10 *= alpha;
  974. res1_11 *= alpha;
  975. res1_12 *= alpha;
  976. res1_13 *= alpha;
  977. res1_14 *= alpha;
  978. res1_15 *= alpha;
  979. C0[0] = res0_0;
  980. C0[1] = res0_1;
  981. C0[2] = res0_2;
  982. C0[3] = res0_3;
  983. C0[4] = res0_4;
  984. C0[5] = res0_5;
  985. C0[6] = res0_6;
  986. C0[7] = res0_7;
  987. C0[8] = res0_8;
  988. C0[9] = res0_9;
  989. C0[10] = res0_10;
  990. C0[11] = res0_11;
  991. C0[12] = res0_12;
  992. C0[13] = res0_13;
  993. C0[14] = res0_14;
  994. C0[15] = res0_15;
  995. C1[0] = res1_0;
  996. C1[1] = res1_1;
  997. C1[2] = res1_2;
  998. C1[3] = res1_3;
  999. C1[4] = res1_4;
  1000. C1[5] = res1_5;
  1001. C1[6] = res1_6;
  1002. C1[7] = res1_7;
  1003. C1[8] = res1_8;
  1004. C1[9] = res1_9;
  1005. C1[10] = res1_10;
  1006. C1[11] = res1_11;
  1007. C1[12] = res1_12;
  1008. C1[13] = res1_13;
  1009. C1[14] = res1_14;
  1010. C1[15] = res1_15;
  1011. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1012. temp = bk - off;
  1013. #ifdef LEFT
  1014. temp -= 16; // number of values in A
  1015. #else
  1016. temp -= 2; // number of values in B
  1017. #endif
  1018. ptrba += temp*16;
  1019. ptrbb += temp*2;
  1020. #endif
  1021. #ifdef LEFT
  1022. off += 16; // number of values in A
  1023. #endif
  1024. C0 = C0+16;
  1025. C1 = C1+16;
  1026. }
  1027. if ( bm & 8)
  1028. {
  1029. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1030. ptrbb = bb;
  1031. #else
  1032. ptrba += off*8;
  1033. ptrbb = bb + off*2;
  1034. #endif
  1035. res0_0 = 0;
  1036. res0_1 = 0;
  1037. res0_2 = 0;
  1038. res0_3 = 0;
  1039. res0_4 = 0;
  1040. res0_5 = 0;
  1041. res0_6 = 0;
  1042. res0_7 = 0;
  1043. res1_0 = 0;
  1044. res1_1 = 0;
  1045. res1_2 = 0;
  1046. res1_3 = 0;
  1047. res1_4 = 0;
  1048. res1_5 = 0;
  1049. res1_6 = 0;
  1050. res1_7 = 0;
  1051. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1052. temp = bk-off;
  1053. #elif defined(LEFT)
  1054. temp = off+8; // number of values in A
  1055. #else
  1056. temp = off+2; // number of values in B
  1057. #endif
  1058. for (k=0; k<temp; k++)
  1059. {
  1060. b0 = ptrbb[0];
  1061. b1 = ptrbb[1];
  1062. a0 = ptrba[0];
  1063. res0_0 += a0*b0;
  1064. res1_0 += a0*b1;
  1065. a1 = ptrba[1];
  1066. res0_1 += a1*b0;
  1067. res1_1 += a1*b1;
  1068. a0 = ptrba[2];
  1069. res0_2 += a0*b0;
  1070. res1_2 += a0*b1;
  1071. a1 = ptrba[3];
  1072. res0_3 += a1*b0;
  1073. res1_3 += a1*b1;
  1074. a0 = ptrba[4];
  1075. res0_4 += a0*b0;
  1076. res1_4 += a0*b1;
  1077. a1 = ptrba[5];
  1078. res0_5 += a1*b0;
  1079. res1_5 += a1*b1;
  1080. a0 = ptrba[6];
  1081. res0_6 += a0*b0;
  1082. res1_6 += a0*b1;
  1083. a1 = ptrba[7];
  1084. res0_7 += a1*b0;
  1085. res1_7 += a1*b1;
  1086. ptrba = ptrba+8;
  1087. ptrbb = ptrbb+2;
  1088. }
  1089. res0_0 *= alpha;
  1090. res0_1 *= alpha;
  1091. res0_2 *= alpha;
  1092. res0_3 *= alpha;
  1093. res0_4 *= alpha;
  1094. res0_5 *= alpha;
  1095. res0_6 *= alpha;
  1096. res0_7 *= alpha;
  1097. res1_0 *= alpha;
  1098. res1_1 *= alpha;
  1099. res1_2 *= alpha;
  1100. res1_3 *= alpha;
  1101. res1_4 *= alpha;
  1102. res1_5 *= alpha;
  1103. res1_6 *= alpha;
  1104. res1_7 *= alpha;
  1105. C0[0] = res0_0;
  1106. C0[1] = res0_1;
  1107. C0[2] = res0_2;
  1108. C0[3] = res0_3;
  1109. C0[4] = res0_4;
  1110. C0[5] = res0_5;
  1111. C0[6] = res0_6;
  1112. C0[7] = res0_7;
  1113. C1[0] = res1_0;
  1114. C1[1] = res1_1;
  1115. C1[2] = res1_2;
  1116. C1[3] = res1_3;
  1117. C1[4] = res1_4;
  1118. C1[5] = res1_5;
  1119. C1[6] = res1_6;
  1120. C1[7] = res1_7;
  1121. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1122. temp = bk - off;
  1123. #ifdef LEFT
  1124. temp -= 8; // number of values in A
  1125. #else
  1126. temp -= 2; // number of values in B
  1127. #endif
  1128. ptrba += temp*8;
  1129. ptrbb += temp*2;
  1130. #endif
  1131. #ifdef LEFT
  1132. off += 8; // number of values in A
  1133. #endif
  1134. C0 = C0+8;
  1135. C1 = C1+8;
  1136. }
  1137. if ( bm & 4 )
  1138. {
  1139. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1140. ptrbb = bb;
  1141. #else
  1142. ptrba += off*4;
  1143. ptrbb = bb + off*2;
  1144. #endif
  1145. res0_0 = 0;
  1146. res0_1 = 0;
  1147. res0_2 = 0;
  1148. res0_3 = 0;
  1149. res1_0 = 0;
  1150. res1_1 = 0;
  1151. res1_2 = 0;
  1152. res1_3 = 0;
  1153. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1154. temp = bk-off;
  1155. #elif defined(LEFT)
  1156. temp = off+4; // number of values in A
  1157. #else
  1158. temp = off+2; // number of values in B
  1159. #endif
  1160. for (k=0; k<temp; k++)
  1161. {
  1162. b0 = ptrbb[0];
  1163. b1 = ptrbb[1];
  1164. a0 = ptrba[0];
  1165. res0_0 += a0*b0;
  1166. res1_0 += a0*b1;
  1167. a1 = ptrba[1];
  1168. res0_1 += a1*b0;
  1169. res1_1 += a1*b1;
  1170. a0 = ptrba[2];
  1171. res0_2 += a0*b0;
  1172. res1_2 += a0*b1;
  1173. a1 = ptrba[3];
  1174. res0_3 += a1*b0;
  1175. res1_3 += a1*b1;
  1176. ptrba = ptrba+4;
  1177. ptrbb = ptrbb+2;
  1178. }
  1179. res0_0 *= alpha;
  1180. res0_1 *= alpha;
  1181. res0_2 *= alpha;
  1182. res0_3 *= alpha;
  1183. res1_0 *= alpha;
  1184. res1_1 *= alpha;
  1185. res1_2 *= alpha;
  1186. res1_3 *= alpha;
  1187. C0[0] = res0_0;
  1188. C0[1] = res0_1;
  1189. C0[2] = res0_2;
  1190. C0[3] = res0_3;
  1191. C1[0] = res1_0;
  1192. C1[1] = res1_1;
  1193. C1[2] = res1_2;
  1194. C1[3] = res1_3;
  1195. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1196. temp = bk - off;
  1197. #ifdef LEFT
  1198. temp -= 4; // number of values in A
  1199. #else
  1200. temp -= 2; // number of values in B
  1201. #endif
  1202. ptrba += temp*4;
  1203. ptrbb += temp*2;
  1204. #endif
  1205. #ifdef LEFT
  1206. off += 4; // number of values in A
  1207. #endif
  1208. C0 = C0+4;
  1209. C1 = C1+4;
  1210. }
  1211. if ( bm & 2 )
  1212. {
  1213. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1214. ptrbb = bb;
  1215. #else
  1216. ptrba += off*2;
  1217. ptrbb = bb + off*2;
  1218. #endif
  1219. res0_0 = 0;
  1220. res0_1 = 0;
  1221. res1_0 = 0;
  1222. res1_1 = 0;
  1223. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1224. temp = bk-off;
  1225. #elif defined(LEFT)
  1226. temp = off+2; // number of values in A
  1227. #else
  1228. temp = off+2; // number of values in B
  1229. #endif
  1230. for (k=0; k<temp; k++)
  1231. {
  1232. b0 = ptrbb[0];
  1233. b1 = ptrbb[1];
  1234. a0 = ptrba[0];
  1235. res0_0 += a0*b0;
  1236. res1_0 += a0*b1;
  1237. a1 = ptrba[1];
  1238. res0_1 += a1*b0;
  1239. res1_1 += a1*b1;
  1240. ptrba = ptrba+2;
  1241. ptrbb = ptrbb+2;
  1242. }
  1243. res0_0 *= alpha;
  1244. res0_1 *= alpha;
  1245. res1_0 *= alpha;
  1246. res1_1 *= alpha;
  1247. C0[0] = res0_0;
  1248. C0[1] = res0_1;
  1249. C1[0] = res1_0;
  1250. C1[1] = res1_1;
  1251. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1252. temp = bk - off;
  1253. #ifdef LEFT
  1254. temp -= 2; // number of values in A
  1255. #else
  1256. temp -= 2; // number of values in B
  1257. #endif
  1258. ptrba += temp*2;
  1259. ptrbb += temp*2;
  1260. #endif
  1261. #ifdef LEFT
  1262. off += 2; // number of values in A
  1263. #endif
  1264. C0 = C0+2;
  1265. C1 = C1+2;
  1266. }
  1267. if ( bm & 1 )
  1268. {
  1269. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1270. ptrbb = bb;
  1271. #else
  1272. ptrba += off*1;
  1273. ptrbb = bb + off*2;
  1274. #endif
  1275. res0_0 = 0;
  1276. res1_0 = 0;
  1277. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1278. temp = bk-off;
  1279. #elif defined(LEFT)
  1280. temp = off+1; // number of values in A
  1281. #else
  1282. temp = off+2; // number of values in B
  1283. #endif
  1284. for (k=0; k<temp; k++)
  1285. {
  1286. b0 = ptrbb[0];
  1287. b1 = ptrbb[1];
  1288. a0 = ptrba[0];
  1289. res0_0 += a0*b0;
  1290. res1_0 += a0*b1;
  1291. ptrba = ptrba+1;
  1292. ptrbb = ptrbb+2;
  1293. }
  1294. res0_0 *= alpha;
  1295. res1_0 *= alpha;
  1296. C0[0] = res0_0;
  1297. C1[0] = res1_0;
  1298. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1299. temp = bk - off;
  1300. #ifdef LEFT
  1301. temp -= 1; // number of values in A
  1302. #else
  1303. temp -= 2; // number of values in B
  1304. #endif
  1305. ptrba += temp*1;
  1306. ptrbb += temp*2;
  1307. #endif
  1308. #ifdef LEFT
  1309. off += 1; // number of values in A
  1310. #endif
  1311. C0 = C0+1;
  1312. C1 = C1+1;
  1313. }
  1314. #if defined(TRMMKERNEL) && !defined(LEFT)
  1315. off += 2;
  1316. #endif
  1317. k = (bk<<1);
  1318. bb = bb+k;
  1319. i = (ldc<<1);
  1320. C = C+i;
  1321. }
  1322. for (j=0; j<(bn&1); j+=1)
  1323. {
  1324. C0 = C;
  1325. #if defined(TRMMKERNEL) && defined(LEFT)
  1326. off = offset;
  1327. #endif
  1328. ptrba = ba;
  1329. for (i=0; i<bm/16; i+=1)
  1330. {
  1331. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1332. ptrbb = bb;
  1333. #else
  1334. ptrba += off*16;
  1335. ptrbb = bb + off*1;
  1336. #endif
  1337. res0_0 = 0;
  1338. res0_1 = 0;
  1339. res0_2 = 0;
  1340. res0_3 = 0;
  1341. res0_4 = 0;
  1342. res0_5 = 0;
  1343. res0_6 = 0;
  1344. res0_7 = 0;
  1345. res0_8 = 0;
  1346. res0_9 = 0;
  1347. res0_10 = 0;
  1348. res0_11 = 0;
  1349. res0_12 = 0;
  1350. res0_13 = 0;
  1351. res0_14 = 0;
  1352. res0_15 = 0;
  1353. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1354. temp = bk-off;
  1355. #elif defined(LEFT)
  1356. temp = off+16; // number of values in A
  1357. #else
  1358. temp = off+1; // number of values in B
  1359. #endif
  1360. for (k=0; k<temp; k++)
  1361. {
  1362. b0 = ptrbb[0];
  1363. a0 = ptrba[0];
  1364. res0_0 += a0*b0;
  1365. a1 = ptrba[1];
  1366. res0_1 += a1*b0;
  1367. a0 = ptrba[2];
  1368. res0_2 += a0*b0;
  1369. a1 = ptrba[3];
  1370. res0_3 += a1*b0;
  1371. a0 = ptrba[4];
  1372. res0_4 += a0*b0;
  1373. a1 = ptrba[5];
  1374. res0_5 += a1*b0;
  1375. a0 = ptrba[6];
  1376. res0_6 += a0*b0;
  1377. a1 = ptrba[7];
  1378. res0_7 += a1*b0;
  1379. a0 = ptrba[8];
  1380. res0_8 += a0*b0;
  1381. a1 = ptrba[9];
  1382. res0_9 += a1*b0;
  1383. a0 = ptrba[10];
  1384. res0_10 += a0*b0;
  1385. a1 = ptrba[11];
  1386. res0_11 += a1*b0;
  1387. a0 = ptrba[12];
  1388. res0_12 += a0*b0;
  1389. a1 = ptrba[13];
  1390. res0_13 += a1*b0;
  1391. a0 = ptrba[14];
  1392. res0_14 += a0*b0;
  1393. a1 = ptrba[15];
  1394. res0_15 += a1*b0;
  1395. ptrba = ptrba+16;
  1396. ptrbb = ptrbb+1;
  1397. }
  1398. res0_0 *= alpha;
  1399. res0_1 *= alpha;
  1400. res0_2 *= alpha;
  1401. res0_3 *= alpha;
  1402. res0_4 *= alpha;
  1403. res0_5 *= alpha;
  1404. res0_6 *= alpha;
  1405. res0_7 *= alpha;
  1406. res0_8 *= alpha;
  1407. res0_9 *= alpha;
  1408. res0_10 *= alpha;
  1409. res0_11 *= alpha;
  1410. res0_12 *= alpha;
  1411. res0_13 *= alpha;
  1412. res0_14 *= alpha;
  1413. res0_15 *= alpha;
  1414. C0[0] = res0_0;
  1415. C0[1] = res0_1;
  1416. C0[2] = res0_2;
  1417. C0[3] = res0_3;
  1418. C0[4] = res0_4;
  1419. C0[5] = res0_5;
  1420. C0[6] = res0_6;
  1421. C0[7] = res0_7;
  1422. C0[8] = res0_8;
  1423. C0[9] = res0_9;
  1424. C0[10] = res0_10;
  1425. C0[11] = res0_11;
  1426. C0[12] = res0_12;
  1427. C0[13] = res0_13;
  1428. C0[14] = res0_14;
  1429. C0[15] = res0_15;
  1430. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1431. temp = bk - off;
  1432. #ifdef LEFT
  1433. temp -= 16; // number of values in A
  1434. #else
  1435. temp -= 1; // number of values in B
  1436. #endif
  1437. ptrba += temp*16;
  1438. ptrbb += temp*1;
  1439. #endif
  1440. #ifdef LEFT
  1441. off += 16; // number of values in A
  1442. #endif
  1443. C0 = C0+16;
  1444. }
  1445. if ( bm & 8 )
  1446. {
  1447. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1448. ptrbb = bb;
  1449. #else
  1450. ptrba += off*8;
  1451. ptrbb = bb + off*1;
  1452. #endif
  1453. res0_0 = 0;
  1454. res0_1 = 0;
  1455. res0_2 = 0;
  1456. res0_3 = 0;
  1457. res0_4 = 0;
  1458. res0_5 = 0;
  1459. res0_6 = 0;
  1460. res0_7 = 0;
  1461. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1462. temp = bk-off;
  1463. #elif defined(LEFT)
  1464. temp = off+8; // number of values in A
  1465. #else
  1466. temp = off+1; // number of values in B
  1467. #endif
  1468. for (k=0; k<temp; k++)
  1469. {
  1470. b0 = ptrbb[0];
  1471. a0 = ptrba[0];
  1472. res0_0 += a0*b0;
  1473. a1 = ptrba[1];
  1474. res0_1 += a1*b0;
  1475. a0 = ptrba[2];
  1476. res0_2 += a0*b0;
  1477. a1 = ptrba[3];
  1478. res0_3 += a1*b0;
  1479. a0 = ptrba[4];
  1480. res0_4 += a0*b0;
  1481. a1 = ptrba[5];
  1482. res0_5 += a1*b0;
  1483. a0 = ptrba[6];
  1484. res0_6 += a0*b0;
  1485. a1 = ptrba[7];
  1486. res0_7 += a1*b0;
  1487. ptrba = ptrba+8;
  1488. ptrbb = ptrbb+1;
  1489. }
  1490. res0_0 *= alpha;
  1491. res0_1 *= alpha;
  1492. res0_2 *= alpha;
  1493. res0_3 *= alpha;
  1494. res0_4 *= alpha;
  1495. res0_5 *= alpha;
  1496. res0_6 *= alpha;
  1497. res0_7 *= alpha;
  1498. C0[0] = res0_0;
  1499. C0[1] = res0_1;
  1500. C0[2] = res0_2;
  1501. C0[3] = res0_3;
  1502. C0[4] = res0_4;
  1503. C0[5] = res0_5;
  1504. C0[6] = res0_6;
  1505. C0[7] = res0_7;
  1506. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1507. temp = bk - off;
  1508. #ifdef LEFT
  1509. temp -= 8; // number of values in A
  1510. #else
  1511. temp -= 1; // number of values in B
  1512. #endif
  1513. ptrba += temp*8;
  1514. ptrbb += temp*1;
  1515. #endif
  1516. #ifdef LEFT
  1517. off += 8; // number of values in A
  1518. #endif
  1519. C0 = C0+8;
  1520. }
  1521. if ( bm & 4 )
  1522. {
  1523. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1524. ptrbb = bb;
  1525. #else
  1526. ptrba += off*4;
  1527. ptrbb = bb + off*1;
  1528. #endif
  1529. res0_0 = 0;
  1530. res0_1 = 0;
  1531. res0_2 = 0;
  1532. res0_3 = 0;
  1533. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1534. temp = bk-off;
  1535. #elif defined(LEFT)
  1536. temp = off+4; // number of values in A
  1537. #else
  1538. temp = off+1; // number of values in B
  1539. #endif
  1540. for (k=0; k<temp; k++)
  1541. {
  1542. b0 = ptrbb[0];
  1543. a0 = ptrba[0];
  1544. res0_0 += a0*b0;
  1545. a1 = ptrba[1];
  1546. res0_1 += a1*b0;
  1547. a0 = ptrba[2];
  1548. res0_2 += a0*b0;
  1549. a1 = ptrba[3];
  1550. res0_3 += a1*b0;
  1551. ptrba = ptrba+4;
  1552. ptrbb = ptrbb+1;
  1553. }
  1554. res0_0 *= alpha;
  1555. res0_1 *= alpha;
  1556. res0_2 *= alpha;
  1557. res0_3 *= alpha;
  1558. C0[0] = res0_0;
  1559. C0[1] = res0_1;
  1560. C0[2] = res0_2;
  1561. C0[3] = res0_3;
  1562. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1563. temp = bk - off;
  1564. #ifdef LEFT
  1565. temp -= 4; // number of values in A
  1566. #else
  1567. temp -= 1; // number of values in B
  1568. #endif
  1569. ptrba += temp*4;
  1570. ptrbb += temp*1;
  1571. #endif
  1572. #ifdef LEFT
  1573. off += 4; // number of values in A
  1574. #endif
  1575. C0 = C0+4;
  1576. }
  1577. if ( bm & 2 )
  1578. {
  1579. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1580. ptrbb = bb;
  1581. #else
  1582. ptrba += off*2;
  1583. ptrbb = bb + off*1;
  1584. #endif
  1585. res0_0 = 0;
  1586. res0_1 = 0;
  1587. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1588. temp = bk-off;
  1589. #elif defined(LEFT)
  1590. temp = off+2; // number of values in A
  1591. #else
  1592. temp = off+1; // number of values in B
  1593. #endif
  1594. for (k=0; k<temp; k++)
  1595. {
  1596. b0 = ptrbb[0];
  1597. a0 = ptrba[0];
  1598. res0_0 += a0*b0;
  1599. a1 = ptrba[1];
  1600. res0_1 += a1*b0;
  1601. ptrba = ptrba+2;
  1602. ptrbb = ptrbb+1;
  1603. }
  1604. res0_0 *= alpha;
  1605. res0_1 *= alpha;
  1606. C0[0] = res0_0;
  1607. C0[1] = res0_1;
  1608. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1609. temp = bk - off;
  1610. #ifdef LEFT
  1611. temp -= 2; // number of values in A
  1612. #else
  1613. temp -= 1; // number of values in B
  1614. #endif
  1615. ptrba += temp*2;
  1616. ptrbb += temp*1;
  1617. #endif
  1618. #ifdef LEFT
  1619. off += 2; // number of values in A
  1620. #endif
  1621. C0 = C0+2;
  1622. }
  1623. if ( bm & 1 )
  1624. {
  1625. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1626. ptrbb = bb;
  1627. #else
  1628. ptrba += off*1;
  1629. ptrbb = bb + off*1;
  1630. #endif
  1631. res0_0 = 0;
  1632. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1633. temp = bk-off;
  1634. #elif defined(LEFT)
  1635. temp = off+1; // number of values in A
  1636. #else
  1637. temp = off+1; // number of values in B
  1638. #endif
  1639. for (k=0; k<temp; k++)
  1640. {
  1641. b0 = ptrbb[0];
  1642. a0 = ptrba[0];
  1643. res0_0 += a0*b0;
  1644. ptrba = ptrba+1;
  1645. ptrbb = ptrbb+1;
  1646. }
  1647. res0_0 *= alpha;
  1648. C0[0] = res0_0;
  1649. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1650. temp = bk - off;
  1651. #ifdef LEFT
  1652. temp -= 1; // number of values in A
  1653. #else
  1654. temp -= 1; // number of values in B
  1655. #endif
  1656. ptrba += temp*1;
  1657. ptrbb += temp*1;
  1658. #endif
  1659. #ifdef LEFT
  1660. off += 1; // number of values in A
  1661. #endif
  1662. C0 = C0+1;
  1663. }
  1664. #if defined(TRMMKERNEL) && !defined(LEFT)
  1665. off += 1;
  1666. #endif
  1667. k = (bk<<0);
  1668. bb = bb+k;
  1669. C = C+ldc;
  1670. }
  1671. return 0;
  1672. }