You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sbgemm_tcopy_8_power10.c 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. #include <altivec.h>
  /* 16-byte GCC vector of IFLOAT elements.  The copy routine in this file
   * loads one such vector per source row and then advances the row pointer
   * by 8 elements, i.e. it treats IFLOAT as 2 bytes wide -- presumably
   * bfloat16; confirm against the IFLOAT definition in common.h. */
  41. typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16)));
  42. int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  43. BLASLONG i, j;
  44. IFLOAT *aoffset;
  45. IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
  46. IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
  47. IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
  48. vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04;
  49. vec_bf16 vtemp05, vtemp06, vtemp07, vtemp08;
  50. IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
  51. IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
  52. IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
  53. IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
  54. IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
  55. IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
  56. IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
  57. IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
  58. aoffset = a;
  59. boffset = b;
  60. #if 0
  61. fprintf(stderr, "M = %d N = %d\n", m, n);
  62. #endif
  63. boffset2 = b + m * (n & ~7);
  64. boffset3 = b + m * (n & ~3);
  65. boffset4 = b + m * (n & ~1);
  66. j = (m >> 3);
  67. if (j > 0){
  68. do{
  69. aoffset1 = aoffset;
  70. aoffset2 = aoffset1 + lda;
  71. aoffset3 = aoffset2 + lda;
  72. aoffset4 = aoffset3 + lda;
  73. aoffset5 = aoffset4 + lda;
  74. aoffset6 = aoffset5 + lda;
  75. aoffset7 = aoffset6 + lda;
  76. aoffset8 = aoffset7 + lda;
  77. aoffset += 8 * lda;
  78. boffset1 = boffset;
  79. boffset += 64;
  80. i = (n >> 3);
  81. if (i > 0){
  82. do{
  83. vtemp01 = *(vec_bf16 *)(aoffset1);
  84. vtemp02 = *(vec_bf16 *)(aoffset2);
  85. vtemp03 = *(vec_bf16 *)(aoffset3);
  86. vtemp04 = *(vec_bf16 *)(aoffset4);
  87. vtemp05 = *(vec_bf16 *)(aoffset5);
  88. vtemp06 = *(vec_bf16 *)(aoffset6);
  89. vtemp07 = *(vec_bf16 *)(aoffset7);
  90. vtemp08 = *(vec_bf16 *)(aoffset8);
  91. aoffset1 += 8;
  92. aoffset2 += 8;
  93. aoffset3 += 8;
  94. aoffset4 += 8;
  95. aoffset5 += 8;
  96. aoffset6 += 8;
  97. aoffset7 += 8;
  98. aoffset8 += 8;
  99. *(vec_bf16 *)(boffset1 + 0) = vec_mergeh(vtemp01, vtemp02);
  100. *(vec_bf16 *)(boffset1 + 8) = vec_mergel(vtemp01, vtemp02);
  101. *(vec_bf16 *)(boffset1 + 16) = vec_mergeh(vtemp03, vtemp04);
  102. *(vec_bf16 *)(boffset1 + 24) = vec_mergel(vtemp03, vtemp04);
  103. *(vec_bf16 *)(boffset1 + 32) = vec_mergeh(vtemp05, vtemp06);
  104. *(vec_bf16 *)(boffset1 + 40) = vec_mergel(vtemp05, vtemp06);
  105. *(vec_bf16 *)(boffset1 + 48) = vec_mergeh(vtemp07, vtemp08);
  106. *(vec_bf16 *)(boffset1 + 56) = vec_mergel(vtemp07, vtemp08);
  107. boffset1 += m * 8;
  108. i --;
  109. }while(i > 0);
  110. }
  111. if (n & 4){
  112. ctemp01 = *(aoffset1 + 0);
  113. ctemp02 = *(aoffset1 + 1);
  114. ctemp03 = *(aoffset1 + 2);
  115. ctemp04 = *(aoffset1 + 3);
  116. aoffset1 += 4;
  117. ctemp05 = *(aoffset2 + 0);
  118. ctemp06 = *(aoffset2 + 1);
  119. ctemp07 = *(aoffset2 + 2);
  120. ctemp08 = *(aoffset2 + 3);
  121. aoffset2 += 4;
  122. ctemp09 = *(aoffset3 + 0);
  123. ctemp10 = *(aoffset3 + 1);
  124. ctemp11 = *(aoffset3 + 2);
  125. ctemp12 = *(aoffset3 + 3);
  126. aoffset3 += 4;
  127. ctemp13 = *(aoffset4 + 0);
  128. ctemp14 = *(aoffset4 + 1);
  129. ctemp15 = *(aoffset4 + 2);
  130. ctemp16 = *(aoffset4 + 3);
  131. aoffset4 += 4;
  132. ctemp17 = *(aoffset5 + 0);
  133. ctemp18 = *(aoffset5 + 1);
  134. ctemp19 = *(aoffset5 + 2);
  135. ctemp20 = *(aoffset5 + 3);
  136. aoffset5 += 4;
  137. ctemp21 = *(aoffset6 + 0);
  138. ctemp22 = *(aoffset6 + 1);
  139. ctemp23 = *(aoffset6 + 2);
  140. ctemp24 = *(aoffset6 + 3);
  141. aoffset6 += 4;
  142. ctemp25 = *(aoffset7 + 0);
  143. ctemp26 = *(aoffset7 + 1);
  144. ctemp27 = *(aoffset7 + 2);
  145. ctemp28 = *(aoffset7 + 3);
  146. aoffset7 += 4;
  147. ctemp29 = *(aoffset8 + 0);
  148. ctemp30 = *(aoffset8 + 1);
  149. ctemp31 = *(aoffset8 + 2);
  150. ctemp32 = *(aoffset8 + 3);
  151. aoffset8 += 4;
  152. *(boffset2 + 0) = ctemp01;
  153. *(boffset2 + 1) = ctemp05;
  154. *(boffset2 + 2) = ctemp02;
  155. *(boffset2 + 3) = ctemp06;
  156. *(boffset2 + 4) = ctemp03;
  157. *(boffset2 + 5) = ctemp07;
  158. *(boffset2 + 6) = ctemp04;
  159. *(boffset2 + 7) = ctemp08;
  160. *(boffset2 + 8) = ctemp09;
  161. *(boffset2 + 9) = ctemp13;
  162. *(boffset2 + 10) = ctemp10;
  163. *(boffset2 + 11) = ctemp14;
  164. *(boffset2 + 12) = ctemp11;
  165. *(boffset2 + 13) = ctemp15;
  166. *(boffset2 + 14) = ctemp12;
  167. *(boffset2 + 15) = ctemp16;
  168. *(boffset2 + 16) = ctemp17;
  169. *(boffset2 + 17) = ctemp21;
  170. *(boffset2 + 18) = ctemp18;
  171. *(boffset2 + 19) = ctemp22;
  172. *(boffset2 + 20) = ctemp19;
  173. *(boffset2 + 21) = ctemp23;
  174. *(boffset2 + 22) = ctemp20;
  175. *(boffset2 + 23) = ctemp24;
  176. *(boffset2 + 24) = ctemp25;
  177. *(boffset2 + 25) = ctemp29;
  178. *(boffset2 + 26) = ctemp26;
  179. *(boffset2 + 27) = ctemp30;
  180. *(boffset2 + 28) = ctemp27;
  181. *(boffset2 + 29) = ctemp31;
  182. *(boffset2 + 30) = ctemp28;
  183. *(boffset2 + 31) = ctemp32;
  184. boffset2 += 32;
  185. }
  186. if (n & 2){
  187. ctemp01 = *(aoffset1 + 0);
  188. ctemp02 = *(aoffset1 + 1);
  189. aoffset1 += 2;
  190. ctemp03 = *(aoffset2 + 0);
  191. ctemp04 = *(aoffset2 + 1);
  192. aoffset2 += 2;
  193. ctemp05 = *(aoffset3 + 0);
  194. ctemp06 = *(aoffset3 + 1);
  195. aoffset3 += 2;
  196. ctemp07 = *(aoffset4 + 0);
  197. ctemp08 = *(aoffset4 + 1);
  198. aoffset4 += 2;
  199. ctemp09 = *(aoffset5 + 0);
  200. ctemp10 = *(aoffset5 + 1);
  201. aoffset5 += 2;
  202. ctemp11 = *(aoffset6 + 0);
  203. ctemp12 = *(aoffset6 + 1);
  204. aoffset6 += 2;
  205. ctemp13 = *(aoffset7 + 0);
  206. ctemp14 = *(aoffset7 + 1);
  207. aoffset7 += 2;
  208. ctemp15 = *(aoffset8 + 0);
  209. ctemp16 = *(aoffset8 + 1);
  210. aoffset8 += 2;
  211. *(boffset3 + 0) = ctemp01;
  212. *(boffset3 + 1) = ctemp02;
  213. *(boffset3 + 2) = ctemp03;
  214. *(boffset3 + 3) = ctemp04;
  215. *(boffset3 + 4) = ctemp05;
  216. *(boffset3 + 5) = ctemp06;
  217. *(boffset3 + 6) = ctemp07;
  218. *(boffset3 + 7) = ctemp08;
  219. *(boffset3 + 8) = ctemp09;
  220. *(boffset3 + 9) = ctemp10;
  221. *(boffset3 + 10) = ctemp11;
  222. *(boffset3 + 11) = ctemp12;
  223. *(boffset3 + 12) = ctemp13;
  224. *(boffset3 + 13) = ctemp14;
  225. *(boffset3 + 14) = ctemp15;
  226. *(boffset3 + 15) = ctemp16;
  227. boffset3 += 16;
  228. }
  229. if (n & 1){
  230. ctemp01 = *(aoffset1 + 0);
  231. aoffset1 ++;
  232. ctemp02 = *(aoffset2 + 0);
  233. aoffset2 ++;
  234. ctemp03 = *(aoffset3 + 0);
  235. aoffset3 ++;
  236. ctemp04 = *(aoffset4 + 0);
  237. aoffset4 ++;
  238. ctemp05 = *(aoffset5 + 0);
  239. aoffset5 ++;
  240. ctemp06 = *(aoffset6 + 0);
  241. aoffset6 ++;
  242. ctemp07 = *(aoffset7 + 0);
  243. aoffset7 ++;
  244. ctemp08 = *(aoffset8 + 0);
  245. aoffset8 ++;
  246. *(boffset4 + 0) = ctemp01;
  247. *(boffset4 + 1) = ctemp02;
  248. *(boffset4 + 2) = ctemp03;
  249. *(boffset4 + 3) = ctemp04;
  250. *(boffset4 + 4) = ctemp05;
  251. *(boffset4 + 5) = ctemp06;
  252. *(boffset4 + 6) = ctemp07;
  253. *(boffset4 + 7) = ctemp08;
  254. boffset4 += 8;
  255. }
  256. j--;
  257. }while(j > 0);
  258. }
  259. if (m & 4){
  260. aoffset1 = aoffset;
  261. aoffset2 = aoffset1 + lda;
  262. aoffset3 = aoffset2 + lda;
  263. aoffset4 = aoffset3 + lda;
  264. aoffset += 4 * lda;
  265. boffset1 = boffset;
  266. boffset += 32;
  267. i = (n >> 3);
  268. if (i > 0){
  269. do{
  270. ctemp01 = *(aoffset1 + 0);
  271. ctemp02 = *(aoffset1 + 1);
  272. ctemp03 = *(aoffset1 + 2);
  273. ctemp04 = *(aoffset1 + 3);
  274. ctemp05 = *(aoffset1 + 4);
  275. ctemp06 = *(aoffset1 + 5);
  276. ctemp07 = *(aoffset1 + 6);
  277. ctemp08 = *(aoffset1 + 7);
  278. aoffset1 += 8;
  279. ctemp09 = *(aoffset2 + 0);
  280. ctemp10 = *(aoffset2 + 1);
  281. ctemp11 = *(aoffset2 + 2);
  282. ctemp12 = *(aoffset2 + 3);
  283. ctemp13 = *(aoffset2 + 4);
  284. ctemp14 = *(aoffset2 + 5);
  285. ctemp15 = *(aoffset2 + 6);
  286. ctemp16 = *(aoffset2 + 7);
  287. aoffset2 += 8;
  288. ctemp17 = *(aoffset3 + 0);
  289. ctemp18 = *(aoffset3 + 1);
  290. ctemp19 = *(aoffset3 + 2);
  291. ctemp20 = *(aoffset3 + 3);
  292. ctemp21 = *(aoffset3 + 4);
  293. ctemp22 = *(aoffset3 + 5);
  294. ctemp23 = *(aoffset3 + 6);
  295. ctemp24 = *(aoffset3 + 7);
  296. aoffset3 += 8;
  297. ctemp25 = *(aoffset4 + 0);
  298. ctemp26 = *(aoffset4 + 1);
  299. ctemp27 = *(aoffset4 + 2);
  300. ctemp28 = *(aoffset4 + 3);
  301. ctemp29 = *(aoffset4 + 4);
  302. ctemp30 = *(aoffset4 + 5);
  303. ctemp31 = *(aoffset4 + 6);
  304. ctemp32 = *(aoffset4 + 7);
  305. aoffset4 += 8;
  306. *(boffset1 + 0) = ctemp01;
  307. *(boffset1 + 1) = ctemp09;
  308. *(boffset1 + 2) = ctemp02;
  309. *(boffset1 + 3) = ctemp10;
  310. *(boffset1 + 4) = ctemp03;
  311. *(boffset1 + 5) = ctemp11;
  312. *(boffset1 + 6) = ctemp04;
  313. *(boffset1 + 7) = ctemp12;
  314. *(boffset1 + 8) = ctemp05;
  315. *(boffset1 + 9) = ctemp13;
  316. *(boffset1 + 10) = ctemp06;
  317. *(boffset1 + 11) = ctemp14;
  318. *(boffset1 + 12) = ctemp07;
  319. *(boffset1 + 13) = ctemp15;
  320. *(boffset1 + 14) = ctemp08;
  321. *(boffset1 + 15) = ctemp16;
  322. *(boffset1 + 16) = ctemp17;
  323. *(boffset1 + 17) = ctemp25;
  324. *(boffset1 + 18) = ctemp18;
  325. *(boffset1 + 19) = ctemp26;
  326. *(boffset1 + 20) = ctemp19;
  327. *(boffset1 + 21) = ctemp27;
  328. *(boffset1 + 22) = ctemp20;
  329. *(boffset1 + 23) = ctemp28;
  330. *(boffset1 + 24) = ctemp21;
  331. *(boffset1 + 25) = ctemp29;
  332. *(boffset1 + 26) = ctemp22;
  333. *(boffset1 + 27) = ctemp30;
  334. *(boffset1 + 28) = ctemp23;
  335. *(boffset1 + 29) = ctemp31;
  336. *(boffset1 + 30) = ctemp24;
  337. *(boffset1 + 31) = ctemp32;
  338. boffset1 += 8 * m;
  339. i --;
  340. }while(i > 0);
  341. }
  342. if (n & 4) {
  343. ctemp01 = *(aoffset1 + 0);
  344. ctemp02 = *(aoffset1 + 1);
  345. ctemp03 = *(aoffset1 + 2);
  346. ctemp04 = *(aoffset1 + 3);
  347. aoffset1 += 4;
  348. ctemp05 = *(aoffset2 + 0);
  349. ctemp06 = *(aoffset2 + 1);
  350. ctemp07 = *(aoffset2 + 2);
  351. ctemp08 = *(aoffset2 + 3);
  352. aoffset2 += 4;
  353. ctemp09 = *(aoffset3 + 0);
  354. ctemp10 = *(aoffset3 + 1);
  355. ctemp11 = *(aoffset3 + 2);
  356. ctemp12 = *(aoffset3 + 3);
  357. aoffset3 += 4;
  358. ctemp13 = *(aoffset4 + 0);
  359. ctemp14 = *(aoffset4 + 1);
  360. ctemp15 = *(aoffset4 + 2);
  361. ctemp16 = *(aoffset4 + 3);
  362. aoffset4 += 4;
  363. *(boffset2 + 0) = ctemp01;
  364. *(boffset2 + 1) = ctemp05;
  365. *(boffset2 + 2) = ctemp02;
  366. *(boffset2 + 3) = ctemp06;
  367. *(boffset2 + 4) = ctemp03;
  368. *(boffset2 + 5) = ctemp07;
  369. *(boffset2 + 6) = ctemp04;
  370. *(boffset2 + 7) = ctemp08;
  371. *(boffset2 + 8) = ctemp09;
  372. *(boffset2 + 9) = ctemp13;
  373. *(boffset2 + 10) = ctemp10;
  374. *(boffset2 + 11) = ctemp14;
  375. *(boffset2 + 12) = ctemp11;
  376. *(boffset2 + 13) = ctemp15;
  377. *(boffset2 + 14) = ctemp12;
  378. *(boffset2 + 15) = ctemp16;
  379. boffset2 += 16;
  380. }
  381. if (n & 2){
  382. ctemp01 = *(aoffset1 + 0);
  383. ctemp02 = *(aoffset1 + 1);
  384. aoffset1 += 2;
  385. ctemp03 = *(aoffset2 + 0);
  386. ctemp04 = *(aoffset2 + 1);
  387. aoffset2 += 2;
  388. ctemp05 = *(aoffset3 + 0);
  389. ctemp06 = *(aoffset3 + 1);
  390. aoffset3 += 2;
  391. ctemp07 = *(aoffset4 + 0);
  392. ctemp08 = *(aoffset4 + 1);
  393. aoffset4 += 2;
  394. *(boffset3 + 0) = ctemp01;
  395. *(boffset3 + 1) = ctemp02;
  396. *(boffset3 + 2) = ctemp03;
  397. *(boffset3 + 3) = ctemp04;
  398. *(boffset3 + 4) = ctemp05;
  399. *(boffset3 + 5) = ctemp06;
  400. *(boffset3 + 6) = ctemp07;
  401. *(boffset3 + 7) = ctemp08;
  402. boffset3 += 8;
  403. }
  404. if (n & 1){
  405. ctemp01 = *(aoffset1 + 0);
  406. aoffset1 ++;
  407. ctemp02 = *(aoffset2 + 0);
  408. aoffset2 ++;
  409. ctemp03 = *(aoffset3 + 0);
  410. aoffset3 ++;
  411. ctemp04 = *(aoffset4 + 0);
  412. aoffset4 ++;
  413. *(boffset4 + 0) = ctemp01;
  414. *(boffset4 + 1) = ctemp02;
  415. *(boffset4 + 2) = ctemp03;
  416. *(boffset4 + 3) = ctemp04;
  417. boffset4 += 4;
  418. }
  419. }
  420. if (m & 2){
  421. aoffset1 = aoffset;
  422. aoffset2 = aoffset1 + lda;
  423. aoffset += 2 * lda;
  424. boffset1 = boffset;
  425. boffset += 16;
  426. i = (n >> 3);
  427. if (i > 0){
  428. do{
  429. ctemp01 = *(aoffset1 + 0);
  430. ctemp02 = *(aoffset1 + 1);
  431. ctemp03 = *(aoffset1 + 2);
  432. ctemp04 = *(aoffset1 + 3);
  433. ctemp05 = *(aoffset1 + 4);
  434. ctemp06 = *(aoffset1 + 5);
  435. ctemp07 = *(aoffset1 + 6);
  436. ctemp08 = *(aoffset1 + 7);
  437. aoffset1 += 8;
  438. ctemp09 = *(aoffset2 + 0);
  439. ctemp10 = *(aoffset2 + 1);
  440. ctemp11 = *(aoffset2 + 2);
  441. ctemp12 = *(aoffset2 + 3);
  442. ctemp13 = *(aoffset2 + 4);
  443. ctemp14 = *(aoffset2 + 5);
  444. ctemp15 = *(aoffset2 + 6);
  445. ctemp16 = *(aoffset2 + 7);
  446. aoffset2 += 8;
  447. *(boffset1 + 0) = ctemp01;
  448. *(boffset1 + 1) = ctemp09;
  449. *(boffset1 + 2) = ctemp02;
  450. *(boffset1 + 3) = ctemp10;
  451. *(boffset1 + 4) = ctemp03;
  452. *(boffset1 + 5) = ctemp11;
  453. *(boffset1 + 6) = ctemp04;
  454. *(boffset1 + 7) = ctemp12;
  455. *(boffset1 + 8) = ctemp05;
  456. *(boffset1 + 9) = ctemp13;
  457. *(boffset1 + 10) = ctemp06;
  458. *(boffset1 + 11) = ctemp14;
  459. *(boffset1 + 12) = ctemp07;
  460. *(boffset1 + 13) = ctemp15;
  461. *(boffset1 + 14) = ctemp08;
  462. *(boffset1 + 15) = ctemp16;
  463. boffset1 += 8 * m;
  464. i --;
  465. }while(i > 0);
  466. }
  467. if (n & 4){
  468. ctemp01 = *(aoffset1 + 0);
  469. ctemp02 = *(aoffset1 + 1);
  470. ctemp03 = *(aoffset1 + 2);
  471. ctemp04 = *(aoffset1 + 3);
  472. aoffset1 += 4;
  473. ctemp05 = *(aoffset2 + 0);
  474. ctemp06 = *(aoffset2 + 1);
  475. ctemp07 = *(aoffset2 + 2);
  476. ctemp08 = *(aoffset2 + 3);
  477. aoffset2 += 4;
  478. *(boffset2 + 0) = ctemp01;
  479. *(boffset2 + 1) = ctemp05;
  480. *(boffset2 + 2) = ctemp02;
  481. *(boffset2 + 3) = ctemp06;
  482. *(boffset2 + 4) = ctemp03;
  483. *(boffset2 + 5) = ctemp07;
  484. *(boffset2 + 6) = ctemp04;
  485. *(boffset2 + 7) = ctemp08;
  486. boffset2 += 8;
  487. }
  488. if (n & 2){
  489. ctemp01 = *(aoffset1 + 0);
  490. ctemp02 = *(aoffset1 + 1);
  491. aoffset1 += 2;
  492. ctemp03 = *(aoffset2 + 0);
  493. ctemp04 = *(aoffset2 + 1);
  494. aoffset2 += 2;
  495. *(boffset3 + 0) = ctemp01;
  496. *(boffset3 + 1) = ctemp02;
  497. *(boffset3 + 2) = ctemp03;
  498. *(boffset3 + 3) = ctemp04;
  499. boffset3 += 4;
  500. }
  501. if (n & 1){
  502. ctemp01 = *(aoffset1 + 0);
  503. aoffset1 ++;
  504. ctemp02 = *(aoffset2 + 0);
  505. aoffset2 ++;
  506. *(boffset4 + 0) = ctemp01;
  507. *(boffset4 + 1) = ctemp02;
  508. boffset4 += 2;
  509. }
  510. }
  511. if (m & 1){
  512. aoffset1 = aoffset;
  513. // aoffset += lda;
  514. boffset1 = boffset;
  515. // boffset += 8;
  516. i = (n >> 3);
  517. if (i > 0){
  518. do{
  519. ctemp01 = *(aoffset1 + 0);
  520. ctemp02 = *(aoffset1 + 1);
  521. ctemp03 = *(aoffset1 + 2);
  522. ctemp04 = *(aoffset1 + 3);
  523. ctemp05 = *(aoffset1 + 4);
  524. ctemp06 = *(aoffset1 + 5);
  525. ctemp07 = *(aoffset1 + 6);
  526. ctemp08 = *(aoffset1 + 7);
  527. aoffset1 += 8;
  528. *(boffset1 + 0) = ctemp01;
  529. *(boffset1 + 1) = ctemp02;
  530. *(boffset1 + 2) = ctemp03;
  531. *(boffset1 + 3) = ctemp04;
  532. *(boffset1 + 4) = ctemp05;
  533. *(boffset1 + 5) = ctemp06;
  534. *(boffset1 + 6) = ctemp07;
  535. *(boffset1 + 7) = ctemp08;
  536. boffset1 += 8 * m;
  537. i --;
  538. }while(i > 0);
  539. }
  540. if (n & 4){
  541. ctemp01 = *(aoffset1 + 0);
  542. ctemp02 = *(aoffset1 + 1);
  543. ctemp03 = *(aoffset1 + 2);
  544. ctemp04 = *(aoffset1 + 3);
  545. aoffset1 += 4;
  546. *(boffset2 + 0) = ctemp01;
  547. *(boffset2 + 1) = ctemp02;
  548. *(boffset2 + 2) = ctemp03;
  549. *(boffset2 + 3) = ctemp04;
  550. // boffset2 += 4;
  551. }
  552. if (n & 2){
  553. ctemp01 = *(aoffset1 + 0);
  554. ctemp02 = *(aoffset1 + 1);
  555. aoffset1 += 2;
  556. *(boffset3 + 0) = ctemp01;
  557. *(boffset3 + 1) = ctemp02;
  558. // boffset3 += 2;
  559. }
  560. if (n & 1){
  561. ctemp01 = *(aoffset1 + 0);
  562. aoffset1 ++;
  563. *(boffset4 + 0) = ctemp01;
  564. boffset4 ++;
  565. }
  566. }
  567. return 0;
  568. }