You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_tcopy_8.c 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  41. BLASLONG i, j;
  42. IFLOAT *aoffset;
  43. IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
  44. IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
  45. IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
  46. IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
  47. IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
  48. IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
  49. IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
  50. IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
  51. IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
  52. IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
  53. IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
  54. IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
  55. IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
  56. IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
  57. IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
  58. IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
  59. IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
  60. IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
  61. IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;
  62. aoffset = a;
  63. boffset = b;
  64. #if 0
  65. fprintf(stderr, "M = %d N = %d\n", m, n);
  66. #endif
  67. boffset2 = b + m * (n & ~7);
  68. boffset3 = b + m * (n & ~3);
  69. boffset4 = b + m * (n & ~1);
  70. j = (m >> 3);
  71. if (j > 0){
  72. do{
  73. aoffset1 = aoffset;
  74. aoffset2 = aoffset1 + lda;
  75. aoffset3 = aoffset2 + lda;
  76. aoffset4 = aoffset3 + lda;
  77. aoffset5 = aoffset4 + lda;
  78. aoffset6 = aoffset5 + lda;
  79. aoffset7 = aoffset6 + lda;
  80. aoffset8 = aoffset7 + lda;
  81. aoffset += 8 * lda;
  82. boffset1 = boffset;
  83. boffset += 64;
  84. i = (n >> 3);
  85. if (i > 0){
  86. do{
  87. ctemp01 = *(aoffset1 + 0);
  88. ctemp02 = *(aoffset1 + 1);
  89. ctemp03 = *(aoffset1 + 2);
  90. ctemp04 = *(aoffset1 + 3);
  91. ctemp05 = *(aoffset1 + 4);
  92. ctemp06 = *(aoffset1 + 5);
  93. ctemp07 = *(aoffset1 + 6);
  94. ctemp08 = *(aoffset1 + 7);
  95. aoffset1 += 8;
  96. ctemp09 = *(aoffset2 + 0);
  97. ctemp10 = *(aoffset2 + 1);
  98. ctemp11 = *(aoffset2 + 2);
  99. ctemp12 = *(aoffset2 + 3);
  100. ctemp13 = *(aoffset2 + 4);
  101. ctemp14 = *(aoffset2 + 5);
  102. ctemp15 = *(aoffset2 + 6);
  103. ctemp16 = *(aoffset2 + 7);
  104. aoffset2 += 8;
  105. ctemp17 = *(aoffset3 + 0);
  106. ctemp18 = *(aoffset3 + 1);
  107. ctemp19 = *(aoffset3 + 2);
  108. ctemp20 = *(aoffset3 + 3);
  109. ctemp21 = *(aoffset3 + 4);
  110. ctemp22 = *(aoffset3 + 5);
  111. ctemp23 = *(aoffset3 + 6);
  112. ctemp24 = *(aoffset3 + 7);
  113. aoffset3 += 8;
  114. ctemp25 = *(aoffset4 + 0);
  115. ctemp26 = *(aoffset4 + 1);
  116. ctemp27 = *(aoffset4 + 2);
  117. ctemp28 = *(aoffset4 + 3);
  118. ctemp29 = *(aoffset4 + 4);
  119. ctemp30 = *(aoffset4 + 5);
  120. ctemp31 = *(aoffset4 + 6);
  121. ctemp32 = *(aoffset4 + 7);
  122. aoffset4 += 8;
  123. ctemp33 = *(aoffset5 + 0);
  124. ctemp34 = *(aoffset5 + 1);
  125. ctemp35 = *(aoffset5 + 2);
  126. ctemp36 = *(aoffset5 + 3);
  127. ctemp37 = *(aoffset5 + 4);
  128. ctemp38 = *(aoffset5 + 5);
  129. ctemp39 = *(aoffset5 + 6);
  130. ctemp40 = *(aoffset5 + 7);
  131. aoffset5 += 8;
  132. ctemp41 = *(aoffset6 + 0);
  133. ctemp42 = *(aoffset6 + 1);
  134. ctemp43 = *(aoffset6 + 2);
  135. ctemp44 = *(aoffset6 + 3);
  136. ctemp45 = *(aoffset6 + 4);
  137. ctemp46 = *(aoffset6 + 5);
  138. ctemp47 = *(aoffset6 + 6);
  139. ctemp48 = *(aoffset6 + 7);
  140. aoffset6 += 8;
  141. ctemp49 = *(aoffset7 + 0);
  142. ctemp50 = *(aoffset7 + 1);
  143. ctemp51 = *(aoffset7 + 2);
  144. ctemp52 = *(aoffset7 + 3);
  145. ctemp53 = *(aoffset7 + 4);
  146. ctemp54 = *(aoffset7 + 5);
  147. ctemp55 = *(aoffset7 + 6);
  148. ctemp56 = *(aoffset7 + 7);
  149. aoffset7 += 8;
  150. ctemp57 = *(aoffset8 + 0);
  151. ctemp58 = *(aoffset8 + 1);
  152. ctemp59 = *(aoffset8 + 2);
  153. ctemp60 = *(aoffset8 + 3);
  154. ctemp61 = *(aoffset8 + 4);
  155. ctemp62 = *(aoffset8 + 5);
  156. ctemp63 = *(aoffset8 + 6);
  157. ctemp64 = *(aoffset8 + 7);
  158. aoffset8 += 8;
  159. *(boffset1 + 0) = ctemp01;
  160. *(boffset1 + 1) = ctemp02;
  161. *(boffset1 + 2) = ctemp03;
  162. *(boffset1 + 3) = ctemp04;
  163. *(boffset1 + 4) = ctemp05;
  164. *(boffset1 + 5) = ctemp06;
  165. *(boffset1 + 6) = ctemp07;
  166. *(boffset1 + 7) = ctemp08;
  167. *(boffset1 + 8) = ctemp09;
  168. *(boffset1 + 9) = ctemp10;
  169. *(boffset1 + 10) = ctemp11;
  170. *(boffset1 + 11) = ctemp12;
  171. *(boffset1 + 12) = ctemp13;
  172. *(boffset1 + 13) = ctemp14;
  173. *(boffset1 + 14) = ctemp15;
  174. *(boffset1 + 15) = ctemp16;
  175. *(boffset1 + 16) = ctemp17;
  176. *(boffset1 + 17) = ctemp18;
  177. *(boffset1 + 18) = ctemp19;
  178. *(boffset1 + 19) = ctemp20;
  179. *(boffset1 + 20) = ctemp21;
  180. *(boffset1 + 21) = ctemp22;
  181. *(boffset1 + 22) = ctemp23;
  182. *(boffset1 + 23) = ctemp24;
  183. *(boffset1 + 24) = ctemp25;
  184. *(boffset1 + 25) = ctemp26;
  185. *(boffset1 + 26) = ctemp27;
  186. *(boffset1 + 27) = ctemp28;
  187. *(boffset1 + 28) = ctemp29;
  188. *(boffset1 + 29) = ctemp30;
  189. *(boffset1 + 30) = ctemp31;
  190. *(boffset1 + 31) = ctemp32;
  191. *(boffset1 + 32) = ctemp33;
  192. *(boffset1 + 33) = ctemp34;
  193. *(boffset1 + 34) = ctemp35;
  194. *(boffset1 + 35) = ctemp36;
  195. *(boffset1 + 36) = ctemp37;
  196. *(boffset1 + 37) = ctemp38;
  197. *(boffset1 + 38) = ctemp39;
  198. *(boffset1 + 39) = ctemp40;
  199. *(boffset1 + 40) = ctemp41;
  200. *(boffset1 + 41) = ctemp42;
  201. *(boffset1 + 42) = ctemp43;
  202. *(boffset1 + 43) = ctemp44;
  203. *(boffset1 + 44) = ctemp45;
  204. *(boffset1 + 45) = ctemp46;
  205. *(boffset1 + 46) = ctemp47;
  206. *(boffset1 + 47) = ctemp48;
  207. *(boffset1 + 48) = ctemp49;
  208. *(boffset1 + 49) = ctemp50;
  209. *(boffset1 + 50) = ctemp51;
  210. *(boffset1 + 51) = ctemp52;
  211. *(boffset1 + 52) = ctemp53;
  212. *(boffset1 + 53) = ctemp54;
  213. *(boffset1 + 54) = ctemp55;
  214. *(boffset1 + 55) = ctemp56;
  215. *(boffset1 + 56) = ctemp57;
  216. *(boffset1 + 57) = ctemp58;
  217. *(boffset1 + 58) = ctemp59;
  218. *(boffset1 + 59) = ctemp60;
  219. *(boffset1 + 60) = ctemp61;
  220. *(boffset1 + 61) = ctemp62;
  221. *(boffset1 + 62) = ctemp63;
  222. *(boffset1 + 63) = ctemp64;
  223. boffset1 += m * 8;
  224. i --;
  225. }while(i > 0);
  226. }
  227. if (n & 4){
  228. ctemp01 = *(aoffset1 + 0);
  229. ctemp02 = *(aoffset1 + 1);
  230. ctemp03 = *(aoffset1 + 2);
  231. ctemp04 = *(aoffset1 + 3);
  232. aoffset1 += 4;
  233. ctemp05 = *(aoffset2 + 0);
  234. ctemp06 = *(aoffset2 + 1);
  235. ctemp07 = *(aoffset2 + 2);
  236. ctemp08 = *(aoffset2 + 3);
  237. aoffset2 += 4;
  238. ctemp09 = *(aoffset3 + 0);
  239. ctemp10 = *(aoffset3 + 1);
  240. ctemp11 = *(aoffset3 + 2);
  241. ctemp12 = *(aoffset3 + 3);
  242. aoffset3 += 4;
  243. ctemp13 = *(aoffset4 + 0);
  244. ctemp14 = *(aoffset4 + 1);
  245. ctemp15 = *(aoffset4 + 2);
  246. ctemp16 = *(aoffset4 + 3);
  247. aoffset4 += 4;
  248. ctemp17 = *(aoffset5 + 0);
  249. ctemp18 = *(aoffset5 + 1);
  250. ctemp19 = *(aoffset5 + 2);
  251. ctemp20 = *(aoffset5 + 3);
  252. aoffset5 += 4;
  253. ctemp21 = *(aoffset6 + 0);
  254. ctemp22 = *(aoffset6 + 1);
  255. ctemp23 = *(aoffset6 + 2);
  256. ctemp24 = *(aoffset6 + 3);
  257. aoffset6 += 4;
  258. ctemp25 = *(aoffset7 + 0);
  259. ctemp26 = *(aoffset7 + 1);
  260. ctemp27 = *(aoffset7 + 2);
  261. ctemp28 = *(aoffset7 + 3);
  262. aoffset7 += 4;
  263. ctemp29 = *(aoffset8 + 0);
  264. ctemp30 = *(aoffset8 + 1);
  265. ctemp31 = *(aoffset8 + 2);
  266. ctemp32 = *(aoffset8 + 3);
  267. aoffset8 += 4;
  268. *(boffset2 + 0) = ctemp01;
  269. *(boffset2 + 1) = ctemp02;
  270. *(boffset2 + 2) = ctemp03;
  271. *(boffset2 + 3) = ctemp04;
  272. *(boffset2 + 4) = ctemp05;
  273. *(boffset2 + 5) = ctemp06;
  274. *(boffset2 + 6) = ctemp07;
  275. *(boffset2 + 7) = ctemp08;
  276. *(boffset2 + 8) = ctemp09;
  277. *(boffset2 + 9) = ctemp10;
  278. *(boffset2 + 10) = ctemp11;
  279. *(boffset2 + 11) = ctemp12;
  280. *(boffset2 + 12) = ctemp13;
  281. *(boffset2 + 13) = ctemp14;
  282. *(boffset2 + 14) = ctemp15;
  283. *(boffset2 + 15) = ctemp16;
  284. *(boffset2 + 16) = ctemp17;
  285. *(boffset2 + 17) = ctemp18;
  286. *(boffset2 + 18) = ctemp19;
  287. *(boffset2 + 19) = ctemp20;
  288. *(boffset2 + 20) = ctemp21;
  289. *(boffset2 + 21) = ctemp22;
  290. *(boffset2 + 22) = ctemp23;
  291. *(boffset2 + 23) = ctemp24;
  292. *(boffset2 + 24) = ctemp25;
  293. *(boffset2 + 25) = ctemp26;
  294. *(boffset2 + 26) = ctemp27;
  295. *(boffset2 + 27) = ctemp28;
  296. *(boffset2 + 28) = ctemp29;
  297. *(boffset2 + 29) = ctemp30;
  298. *(boffset2 + 30) = ctemp31;
  299. *(boffset2 + 31) = ctemp32;
  300. boffset2 += 32;
  301. }
  302. if (n & 2){
  303. ctemp01 = *(aoffset1 + 0);
  304. ctemp02 = *(aoffset1 + 1);
  305. aoffset1 += 2;
  306. ctemp03 = *(aoffset2 + 0);
  307. ctemp04 = *(aoffset2 + 1);
  308. aoffset2 += 2;
  309. ctemp05 = *(aoffset3 + 0);
  310. ctemp06 = *(aoffset3 + 1);
  311. aoffset3 += 2;
  312. ctemp07 = *(aoffset4 + 0);
  313. ctemp08 = *(aoffset4 + 1);
  314. aoffset4 += 2;
  315. ctemp09 = *(aoffset5 + 0);
  316. ctemp10 = *(aoffset5 + 1);
  317. aoffset5 += 2;
  318. ctemp11 = *(aoffset6 + 0);
  319. ctemp12 = *(aoffset6 + 1);
  320. aoffset6 += 2;
  321. ctemp13 = *(aoffset7 + 0);
  322. ctemp14 = *(aoffset7 + 1);
  323. aoffset7 += 2;
  324. ctemp15 = *(aoffset8 + 0);
  325. ctemp16 = *(aoffset8 + 1);
  326. aoffset8 += 2;
  327. *(boffset3 + 0) = ctemp01;
  328. *(boffset3 + 1) = ctemp02;
  329. *(boffset3 + 2) = ctemp03;
  330. *(boffset3 + 3) = ctemp04;
  331. *(boffset3 + 4) = ctemp05;
  332. *(boffset3 + 5) = ctemp06;
  333. *(boffset3 + 6) = ctemp07;
  334. *(boffset3 + 7) = ctemp08;
  335. *(boffset3 + 8) = ctemp09;
  336. *(boffset3 + 9) = ctemp10;
  337. *(boffset3 + 10) = ctemp11;
  338. *(boffset3 + 11) = ctemp12;
  339. *(boffset3 + 12) = ctemp13;
  340. *(boffset3 + 13) = ctemp14;
  341. *(boffset3 + 14) = ctemp15;
  342. *(boffset3 + 15) = ctemp16;
  343. boffset3 += 16;
  344. }
  345. if (n & 1){
  346. ctemp01 = *(aoffset1 + 0);
  347. aoffset1 ++;
  348. ctemp02 = *(aoffset2 + 0);
  349. aoffset2 ++;
  350. ctemp03 = *(aoffset3 + 0);
  351. aoffset3 ++;
  352. ctemp04 = *(aoffset4 + 0);
  353. aoffset4 ++;
  354. ctemp05 = *(aoffset5 + 0);
  355. aoffset5 ++;
  356. ctemp06 = *(aoffset6 + 0);
  357. aoffset6 ++;
  358. ctemp07 = *(aoffset7 + 0);
  359. aoffset7 ++;
  360. ctemp08 = *(aoffset8 + 0);
  361. aoffset8 ++;
  362. *(boffset4 + 0) = ctemp01;
  363. *(boffset4 + 1) = ctemp02;
  364. *(boffset4 + 2) = ctemp03;
  365. *(boffset4 + 3) = ctemp04;
  366. *(boffset4 + 4) = ctemp05;
  367. *(boffset4 + 5) = ctemp06;
  368. *(boffset4 + 6) = ctemp07;
  369. *(boffset4 + 7) = ctemp08;
  370. boffset4 += 8;
  371. }
  372. j--;
  373. }while(j > 0);
  374. }
  375. if (m & 4){
  376. aoffset1 = aoffset;
  377. aoffset2 = aoffset1 + lda;
  378. aoffset3 = aoffset2 + lda;
  379. aoffset4 = aoffset3 + lda;
  380. aoffset += 4 * lda;
  381. boffset1 = boffset;
  382. boffset += 32;
  383. i = (n >> 3);
  384. if (i > 0){
  385. do{
  386. ctemp01 = *(aoffset1 + 0);
  387. ctemp02 = *(aoffset1 + 1);
  388. ctemp03 = *(aoffset1 + 2);
  389. ctemp04 = *(aoffset1 + 3);
  390. ctemp05 = *(aoffset1 + 4);
  391. ctemp06 = *(aoffset1 + 5);
  392. ctemp07 = *(aoffset1 + 6);
  393. ctemp08 = *(aoffset1 + 7);
  394. aoffset1 += 8;
  395. ctemp09 = *(aoffset2 + 0);
  396. ctemp10 = *(aoffset2 + 1);
  397. ctemp11 = *(aoffset2 + 2);
  398. ctemp12 = *(aoffset2 + 3);
  399. ctemp13 = *(aoffset2 + 4);
  400. ctemp14 = *(aoffset2 + 5);
  401. ctemp15 = *(aoffset2 + 6);
  402. ctemp16 = *(aoffset2 + 7);
  403. aoffset2 += 8;
  404. ctemp17 = *(aoffset3 + 0);
  405. ctemp18 = *(aoffset3 + 1);
  406. ctemp19 = *(aoffset3 + 2);
  407. ctemp20 = *(aoffset3 + 3);
  408. ctemp21 = *(aoffset3 + 4);
  409. ctemp22 = *(aoffset3 + 5);
  410. ctemp23 = *(aoffset3 + 6);
  411. ctemp24 = *(aoffset3 + 7);
  412. aoffset3 += 8;
  413. ctemp25 = *(aoffset4 + 0);
  414. ctemp26 = *(aoffset4 + 1);
  415. ctemp27 = *(aoffset4 + 2);
  416. ctemp28 = *(aoffset4 + 3);
  417. ctemp29 = *(aoffset4 + 4);
  418. ctemp30 = *(aoffset4 + 5);
  419. ctemp31 = *(aoffset4 + 6);
  420. ctemp32 = *(aoffset4 + 7);
  421. aoffset4 += 8;
  422. *(boffset1 + 0) = ctemp01;
  423. *(boffset1 + 1) = ctemp02;
  424. *(boffset1 + 2) = ctemp03;
  425. *(boffset1 + 3) = ctemp04;
  426. *(boffset1 + 4) = ctemp05;
  427. *(boffset1 + 5) = ctemp06;
  428. *(boffset1 + 6) = ctemp07;
  429. *(boffset1 + 7) = ctemp08;
  430. *(boffset1 + 8) = ctemp09;
  431. *(boffset1 + 9) = ctemp10;
  432. *(boffset1 + 10) = ctemp11;
  433. *(boffset1 + 11) = ctemp12;
  434. *(boffset1 + 12) = ctemp13;
  435. *(boffset1 + 13) = ctemp14;
  436. *(boffset1 + 14) = ctemp15;
  437. *(boffset1 + 15) = ctemp16;
  438. *(boffset1 + 16) = ctemp17;
  439. *(boffset1 + 17) = ctemp18;
  440. *(boffset1 + 18) = ctemp19;
  441. *(boffset1 + 19) = ctemp20;
  442. *(boffset1 + 20) = ctemp21;
  443. *(boffset1 + 21) = ctemp22;
  444. *(boffset1 + 22) = ctemp23;
  445. *(boffset1 + 23) = ctemp24;
  446. *(boffset1 + 24) = ctemp25;
  447. *(boffset1 + 25) = ctemp26;
  448. *(boffset1 + 26) = ctemp27;
  449. *(boffset1 + 27) = ctemp28;
  450. *(boffset1 + 28) = ctemp29;
  451. *(boffset1 + 29) = ctemp30;
  452. *(boffset1 + 30) = ctemp31;
  453. *(boffset1 + 31) = ctemp32;
  454. boffset1 += 8 * m;
  455. i --;
  456. }while(i > 0);
  457. }
  458. if (n & 4) {
  459. ctemp01 = *(aoffset1 + 0);
  460. ctemp02 = *(aoffset1 + 1);
  461. ctemp03 = *(aoffset1 + 2);
  462. ctemp04 = *(aoffset1 + 3);
  463. aoffset1 += 4;
  464. ctemp05 = *(aoffset2 + 0);
  465. ctemp06 = *(aoffset2 + 1);
  466. ctemp07 = *(aoffset2 + 2);
  467. ctemp08 = *(aoffset2 + 3);
  468. aoffset2 += 4;
  469. ctemp09 = *(aoffset3 + 0);
  470. ctemp10 = *(aoffset3 + 1);
  471. ctemp11 = *(aoffset3 + 2);
  472. ctemp12 = *(aoffset3 + 3);
  473. aoffset3 += 4;
  474. ctemp13 = *(aoffset4 + 0);
  475. ctemp14 = *(aoffset4 + 1);
  476. ctemp15 = *(aoffset4 + 2);
  477. ctemp16 = *(aoffset4 + 3);
  478. aoffset4 += 4;
  479. *(boffset2 + 0) = ctemp01;
  480. *(boffset2 + 1) = ctemp02;
  481. *(boffset2 + 2) = ctemp03;
  482. *(boffset2 + 3) = ctemp04;
  483. *(boffset2 + 4) = ctemp05;
  484. *(boffset2 + 5) = ctemp06;
  485. *(boffset2 + 6) = ctemp07;
  486. *(boffset2 + 7) = ctemp08;
  487. *(boffset2 + 8) = ctemp09;
  488. *(boffset2 + 9) = ctemp10;
  489. *(boffset2 + 10) = ctemp11;
  490. *(boffset2 + 11) = ctemp12;
  491. *(boffset2 + 12) = ctemp13;
  492. *(boffset2 + 13) = ctemp14;
  493. *(boffset2 + 14) = ctemp15;
  494. *(boffset2 + 15) = ctemp16;
  495. boffset2 += 16;
  496. }
  497. if (n & 2){
  498. ctemp01 = *(aoffset1 + 0);
  499. ctemp02 = *(aoffset1 + 1);
  500. aoffset1 += 2;
  501. ctemp03 = *(aoffset2 + 0);
  502. ctemp04 = *(aoffset2 + 1);
  503. aoffset2 += 2;
  504. ctemp05 = *(aoffset3 + 0);
  505. ctemp06 = *(aoffset3 + 1);
  506. aoffset3 += 2;
  507. ctemp07 = *(aoffset4 + 0);
  508. ctemp08 = *(aoffset4 + 1);
  509. aoffset4 += 2;
  510. *(boffset3 + 0) = ctemp01;
  511. *(boffset3 + 1) = ctemp02;
  512. *(boffset3 + 2) = ctemp03;
  513. *(boffset3 + 3) = ctemp04;
  514. *(boffset3 + 4) = ctemp05;
  515. *(boffset3 + 5) = ctemp06;
  516. *(boffset3 + 6) = ctemp07;
  517. *(boffset3 + 7) = ctemp08;
  518. boffset3 += 8;
  519. }
  520. if (n & 1){
  521. ctemp01 = *(aoffset1 + 0);
  522. aoffset1 ++;
  523. ctemp02 = *(aoffset2 + 0);
  524. aoffset2 ++;
  525. ctemp03 = *(aoffset3 + 0);
  526. aoffset3 ++;
  527. ctemp04 = *(aoffset4 + 0);
  528. aoffset4 ++;
  529. *(boffset4 + 0) = ctemp01;
  530. *(boffset4 + 1) = ctemp02;
  531. *(boffset4 + 2) = ctemp03;
  532. *(boffset4 + 3) = ctemp04;
  533. boffset4 += 4;
  534. }
  535. }
  536. if (m & 2){
  537. aoffset1 = aoffset;
  538. aoffset2 = aoffset1 + lda;
  539. aoffset += 2 * lda;
  540. boffset1 = boffset;
  541. boffset += 16;
  542. i = (n >> 3);
  543. if (i > 0){
  544. do{
  545. ctemp01 = *(aoffset1 + 0);
  546. ctemp02 = *(aoffset1 + 1);
  547. ctemp03 = *(aoffset1 + 2);
  548. ctemp04 = *(aoffset1 + 3);
  549. ctemp05 = *(aoffset1 + 4);
  550. ctemp06 = *(aoffset1 + 5);
  551. ctemp07 = *(aoffset1 + 6);
  552. ctemp08 = *(aoffset1 + 7);
  553. aoffset1 += 8;
  554. ctemp09 = *(aoffset2 + 0);
  555. ctemp10 = *(aoffset2 + 1);
  556. ctemp11 = *(aoffset2 + 2);
  557. ctemp12 = *(aoffset2 + 3);
  558. ctemp13 = *(aoffset2 + 4);
  559. ctemp14 = *(aoffset2 + 5);
  560. ctemp15 = *(aoffset2 + 6);
  561. ctemp16 = *(aoffset2 + 7);
  562. aoffset2 += 8;
  563. *(boffset1 + 0) = ctemp01;
  564. *(boffset1 + 1) = ctemp02;
  565. *(boffset1 + 2) = ctemp03;
  566. *(boffset1 + 3) = ctemp04;
  567. *(boffset1 + 4) = ctemp05;
  568. *(boffset1 + 5) = ctemp06;
  569. *(boffset1 + 6) = ctemp07;
  570. *(boffset1 + 7) = ctemp08;
  571. *(boffset1 + 8) = ctemp09;
  572. *(boffset1 + 9) = ctemp10;
  573. *(boffset1 + 10) = ctemp11;
  574. *(boffset1 + 11) = ctemp12;
  575. *(boffset1 + 12) = ctemp13;
  576. *(boffset1 + 13) = ctemp14;
  577. *(boffset1 + 14) = ctemp15;
  578. *(boffset1 + 15) = ctemp16;
  579. boffset1 += 8 * m;
  580. i --;
  581. }while(i > 0);
  582. }
  583. if (n & 4){
  584. ctemp01 = *(aoffset1 + 0);
  585. ctemp02 = *(aoffset1 + 1);
  586. ctemp03 = *(aoffset1 + 2);
  587. ctemp04 = *(aoffset1 + 3);
  588. aoffset1 += 4;
  589. ctemp05 = *(aoffset2 + 0);
  590. ctemp06 = *(aoffset2 + 1);
  591. ctemp07 = *(aoffset2 + 2);
  592. ctemp08 = *(aoffset2 + 3);
  593. aoffset2 += 4;
  594. *(boffset2 + 0) = ctemp01;
  595. *(boffset2 + 1) = ctemp02;
  596. *(boffset2 + 2) = ctemp03;
  597. *(boffset2 + 3) = ctemp04;
  598. *(boffset2 + 4) = ctemp05;
  599. *(boffset2 + 5) = ctemp06;
  600. *(boffset2 + 6) = ctemp07;
  601. *(boffset2 + 7) = ctemp08;
  602. boffset2 += 8;
  603. }
  604. if (n & 2){
  605. ctemp01 = *(aoffset1 + 0);
  606. ctemp02 = *(aoffset1 + 1);
  607. aoffset1 += 2;
  608. ctemp03 = *(aoffset2 + 0);
  609. ctemp04 = *(aoffset2 + 1);
  610. aoffset2 += 2;
  611. *(boffset3 + 0) = ctemp01;
  612. *(boffset3 + 1) = ctemp02;
  613. *(boffset3 + 2) = ctemp03;
  614. *(boffset3 + 3) = ctemp04;
  615. boffset3 += 4;
  616. }
  617. if (n & 1){
  618. ctemp01 = *(aoffset1 + 0);
  619. aoffset1 ++;
  620. ctemp02 = *(aoffset2 + 0);
  621. aoffset2 ++;
  622. *(boffset4 + 0) = ctemp01;
  623. *(boffset4 + 1) = ctemp02;
  624. boffset4 += 2;
  625. }
  626. }
  627. if (m & 1){
  628. aoffset1 = aoffset;
  629. // aoffset += lda;
  630. boffset1 = boffset;
  631. // boffset += 8;
  632. i = (n >> 3);
  633. if (i > 0){
  634. do{
  635. ctemp01 = *(aoffset1 + 0);
  636. ctemp02 = *(aoffset1 + 1);
  637. ctemp03 = *(aoffset1 + 2);
  638. ctemp04 = *(aoffset1 + 3);
  639. ctemp05 = *(aoffset1 + 4);
  640. ctemp06 = *(aoffset1 + 5);
  641. ctemp07 = *(aoffset1 + 6);
  642. ctemp08 = *(aoffset1 + 7);
  643. aoffset1 += 8;
  644. *(boffset1 + 0) = ctemp01;
  645. *(boffset1 + 1) = ctemp02;
  646. *(boffset1 + 2) = ctemp03;
  647. *(boffset1 + 3) = ctemp04;
  648. *(boffset1 + 4) = ctemp05;
  649. *(boffset1 + 5) = ctemp06;
  650. *(boffset1 + 6) = ctemp07;
  651. *(boffset1 + 7) = ctemp08;
  652. boffset1 += 8 * m;
  653. i --;
  654. }while(i > 0);
  655. }
  656. if (n & 4){
  657. ctemp01 = *(aoffset1 + 0);
  658. ctemp02 = *(aoffset1 + 1);
  659. ctemp03 = *(aoffset1 + 2);
  660. ctemp04 = *(aoffset1 + 3);
  661. aoffset1 += 4;
  662. *(boffset2 + 0) = ctemp01;
  663. *(boffset2 + 1) = ctemp02;
  664. *(boffset2 + 2) = ctemp03;
  665. *(boffset2 + 3) = ctemp04;
  666. // boffset2 += 4;
  667. }
  668. if (n & 2){
  669. ctemp01 = *(aoffset1 + 0);
  670. ctemp02 = *(aoffset1 + 1);
  671. aoffset1 += 2;
  672. *(boffset3 + 0) = ctemp01;
  673. *(boffset3 + 1) = ctemp02;
  674. // boffset3 += 2;
  675. }
  676. if (n & 1){
  677. ctemp01 = *(aoffset1 + 0);
  678. aoffset1 ++;
  679. *(boffset4 + 0) = ctemp01;
  680. boffset4 ++;
  681. }
  682. }
  683. return 0;
  684. }