You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

ztrmm_utcopy_8.c 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
  41. BLASLONG i, js, ii;
  42. BLASLONG X;
  43. FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08;
  44. lda *= 2;
  45. js = (n >> 3);
  46. if (js > 0){
  47. do {
  48. X = posX;
  49. if (posX <= posY) {
  50. a01 = a + posX * 2 + (posY + 0) * lda;
  51. a02 = a + posX * 2 + (posY + 1) * lda;
  52. a03 = a + posX * 2 + (posY + 2) * lda;
  53. a04 = a + posX * 2 + (posY + 3) * lda;
  54. a05 = a + posX * 2 + (posY + 4) * lda;
  55. a06 = a + posX * 2 + (posY + 5) * lda;
  56. a07 = a + posX * 2 + (posY + 6) * lda;
  57. a08 = a + posX * 2 + (posY + 7) * lda;
  58. } else {
  59. a01 = a + posY * 2 + (posX + 0) * lda;
  60. a02 = a + posY * 2 + (posX + 1) * lda;
  61. a03 = a + posY * 2 + (posX + 2) * lda;
  62. a04 = a + posY * 2 + (posX + 3) * lda;
  63. a05 = a + posY * 2 + (posX + 4) * lda;
  64. a06 = a + posY * 2 + (posX + 5) * lda;
  65. a07 = a + posY * 2 + (posX + 6) * lda;
  66. a08 = a + posY * 2 + (posX + 7) * lda;
  67. }
  68. i = (m >> 3);
  69. if (i > 0) {
  70. do {
  71. if (X < posY) {
  72. a01 += 16;
  73. a02 += 16;
  74. a03 += 16;
  75. a04 += 16;
  76. a05 += 16;
  77. a06 += 16;
  78. a07 += 16;
  79. a08 += 16;
  80. b += 128;
  81. } else
  82. if (X > posY) {
  83. for (ii = 0; ii < 8; ii++){
  84. b[ 0] = *(a01 + 0);
  85. b[ 1] = *(a01 + 1);
  86. b[ 2] = *(a01 + 2);
  87. b[ 3] = *(a01 + 3);
  88. b[ 4] = *(a01 + 4);
  89. b[ 5] = *(a01 + 5);
  90. b[ 6] = *(a01 + 6);
  91. b[ 7] = *(a01 + 7);
  92. b[ 8] = *(a01 + 8);
  93. b[ 9] = *(a01 + 9);
  94. b[ 10] = *(a01 + 10);
  95. b[ 11] = *(a01 + 11);
  96. b[ 12] = *(a01 + 12);
  97. b[ 13] = *(a01 + 13);
  98. b[ 14] = *(a01 + 14);
  99. b[ 15] = *(a01 + 15);
  100. a01 += lda;
  101. b += 16;
  102. }
  103. a02 += 8 * lda;
  104. a03 += 8 * lda;
  105. a04 += 8 * lda;
  106. a05 += 8 * lda;
  107. a06 += 8 * lda;
  108. a07 += 8 * lda;
  109. a08 += 8 * lda;
  110. } else {
  111. #ifdef UNIT
  112. b[ 0] = ONE;
  113. b[ 1] = ZERO;
  114. #else
  115. b[ 0] = *(a01 + 0);
  116. b[ 1] = *(a01 + 1);
  117. #endif
  118. b[ 2] = ZERO;
  119. b[ 3] = ZERO;
  120. b[ 4] = ZERO;
  121. b[ 5] = ZERO;
  122. b[ 6] = ZERO;
  123. b[ 7] = ZERO;
  124. b[ 8] = ZERO;
  125. b[ 9] = ZERO;
  126. b[ 10] = ZERO;
  127. b[ 11] = ZERO;
  128. b[ 12] = ZERO;
  129. b[ 13] = ZERO;
  130. b[ 14] = ZERO;
  131. b[ 15] = ZERO;
  132. b[ 16] = *(a02 + 0);
  133. b[ 17] = *(a02 + 1);
  134. #ifdef UNIT
  135. b[ 18] = ONE;
  136. b[ 19] = ZERO;
  137. #else
  138. b[ 18] = *(a02 + 2);
  139. b[ 19] = *(a02 + 3);
  140. #endif
  141. b[ 20] = ZERO;
  142. b[ 21] = ZERO;
  143. b[ 22] = ZERO;
  144. b[ 23] = ZERO;
  145. b[ 24] = ZERO;
  146. b[ 25] = ZERO;
  147. b[ 26] = ZERO;
  148. b[ 27] = ZERO;
  149. b[ 28] = ZERO;
  150. b[ 29] = ZERO;
  151. b[ 30] = ZERO;
  152. b[ 31] = ZERO;
  153. b[ 32] = *(a03 + 0);
  154. b[ 33] = *(a03 + 1);
  155. b[ 34] = *(a03 + 2);
  156. b[ 35] = *(a03 + 3);
  157. #ifdef UNIT
  158. b[ 36] = ONE;
  159. b[ 37] = ZERO;
  160. #else
  161. b[ 36] = *(a03 + 4);
  162. b[ 37] = *(a03 + 5);
  163. #endif
  164. b[ 38] = ZERO;
  165. b[ 39] = ZERO;
  166. b[ 40] = ZERO;
  167. b[ 41] = ZERO;
  168. b[ 42] = ZERO;
  169. b[ 43] = ZERO;
  170. b[ 44] = ZERO;
  171. b[ 45] = ZERO;
  172. b[ 46] = ZERO;
  173. b[ 47] = ZERO;
  174. b[ 48] = *(a04 + 0);
  175. b[ 49] = *(a04 + 1);
  176. b[ 50] = *(a04 + 2);
  177. b[ 51] = *(a04 + 3);
  178. b[ 52] = *(a04 + 4);
  179. b[ 53] = *(a04 + 5);
  180. #ifdef UNIT
  181. b[ 54] = ONE;
  182. b[ 55] = ZERO;
  183. #else
  184. b[ 54] = *(a04 + 6);
  185. b[ 55] = *(a04 + 7);
  186. #endif
  187. b[ 56] = ZERO;
  188. b[ 57] = ZERO;
  189. b[ 58] = ZERO;
  190. b[ 59] = ZERO;
  191. b[ 60] = ZERO;
  192. b[ 61] = ZERO;
  193. b[ 62] = ZERO;
  194. b[ 63] = ZERO;
  195. b[ 64] = *(a05 + 0);
  196. b[ 65] = *(a05 + 1);
  197. b[ 66] = *(a05 + 2);
  198. b[ 67] = *(a05 + 3);
  199. b[ 68] = *(a05 + 4);
  200. b[ 69] = *(a05 + 5);
  201. b[ 70] = *(a05 + 6);
  202. b[ 71] = *(a05 + 7);
  203. #ifdef UNIT
  204. b[ 72] = ONE;
  205. b[ 73] = ZERO;
  206. #else
  207. b[ 72] = *(a05 + 8);
  208. b[ 73] = *(a05 + 9);
  209. #endif
  210. b[ 74] = ZERO;
  211. b[ 75] = ZERO;
  212. b[ 76] = ZERO;
  213. b[ 77] = ZERO;
  214. b[ 78] = ZERO;
  215. b[ 79] = ZERO;
  216. b[ 80] = *(a06 + 0);
  217. b[ 81] = *(a06 + 1);
  218. b[ 82] = *(a06 + 2);
  219. b[ 83] = *(a06 + 3);
  220. b[ 84] = *(a06 + 4);
  221. b[ 85] = *(a06 + 5);
  222. b[ 86] = *(a06 + 6);
  223. b[ 87] = *(a06 + 7);
  224. b[ 88] = *(a06 + 8);
  225. b[ 89] = *(a06 + 9);
  226. #ifdef UNIT
  227. b[ 90] = ONE;
  228. b[ 91] = ZERO;
  229. #else
  230. b[ 90] = *(a06 + 10);
  231. b[ 91] = *(a06 + 11);
  232. #endif
  233. b[ 92] = ZERO;
  234. b[ 93] = ZERO;
  235. b[ 94] = ZERO;
  236. b[ 95] = ZERO;
  237. b[ 96] = *(a07 + 0);
  238. b[ 97] = *(a07 + 1);
  239. b[ 98] = *(a07 + 2);
  240. b[ 99] = *(a07 + 3);
  241. b[100] = *(a07 + 4);
  242. b[101] = *(a07 + 5);
  243. b[102] = *(a07 + 6);
  244. b[103] = *(a07 + 7);
  245. b[104] = *(a07 + 8);
  246. b[105] = *(a07 + 9);
  247. b[106] = *(a07 + 10);
  248. b[107] = *(a07 + 11);
  249. #ifdef UNIT
  250. b[108] = ONE;
  251. b[109] = ZERO;
  252. #else
  253. b[108] = *(a07 + 12);
  254. b[109] = *(a07 + 13);
  255. #endif
  256. b[110] = ZERO;
  257. b[111] = ZERO;
  258. b[112] = *(a08 + 0);
  259. b[113] = *(a08 + 1);
  260. b[114] = *(a08 + 2);
  261. b[115] = *(a08 + 3);
  262. b[116] = *(a08 + 4);
  263. b[117] = *(a08 + 5);
  264. b[118] = *(a08 + 6);
  265. b[119] = *(a08 + 7);
  266. b[120] = *(a08 + 8);
  267. b[121] = *(a08 + 9);
  268. b[122] = *(a08 + 10);
  269. b[123] = *(a08 + 11);
  270. b[124] = *(a08 + 12);
  271. b[125] = *(a08 + 13);
  272. #ifdef UNIT
  273. b[126] = ONE;
  274. b[127] = ZERO;
  275. #else
  276. b[126] = *(a08 + 14);
  277. b[127] = *(a08 + 15);
  278. #endif
  279. a01 += 8 * lda;
  280. a02 += 8 * lda;
  281. a03 += 8 * lda;
  282. a04 += 8 * lda;
  283. a05 += 8 * lda;
  284. a06 += 8 * lda;
  285. a07 += 8 * lda;
  286. a08 += 8 * lda;
  287. b += 128;
  288. }
  289. X += 8;
  290. i --;
  291. } while (i > 0);
  292. }
  293. i = (m & 7);
  294. if (i) {
  295. if (X < posY) {
  296. /* a01 += 2 * i;
  297. a02 += 2 * i;
  298. a03 += 2 * i;
  299. a04 += 2 * i;
  300. a05 += 2 * i;
  301. a06 += 2 * i;
  302. a07 += 2 * i;
  303. a08 += 2 * i; */
  304. b += 16 * i;
  305. } else
  306. if (X > posY) {
  307. for (ii = 0; ii < i; ii++){
  308. b[ 0] = *(a01 + 0);
  309. b[ 1] = *(a01 + 1);
  310. b[ 2] = *(a01 + 2);
  311. b[ 3] = *(a01 + 3);
  312. b[ 4] = *(a01 + 4);
  313. b[ 5] = *(a01 + 5);
  314. b[ 6] = *(a01 + 6);
  315. b[ 7] = *(a01 + 7);
  316. b[ 8] = *(a01 + 8);
  317. b[ 9] = *(a01 + 9);
  318. b[ 10] = *(a01 + 10);
  319. b[ 11] = *(a01 + 11);
  320. b[ 12] = *(a01 + 12);
  321. b[ 13] = *(a01 + 13);
  322. b[ 14] = *(a01 + 14);
  323. b[ 15] = *(a01 + 15);
  324. a01 += lda;
  325. a02 += lda;
  326. a03 += lda;
  327. a04 += lda;
  328. a05 += lda;
  329. a06 += lda;
  330. a07 += lda;
  331. a08 += lda;
  332. b += 16;
  333. }
  334. } else {
  335. #ifdef UNIT
  336. b[ 0] = ONE;
  337. b[ 1] = ZERO;
  338. #else
  339. b[ 0] = *(a01 + 0);
  340. b[ 1] = *(a01 + 1);
  341. #endif
  342. b[ 2] = ZERO;
  343. b[ 3] = ZERO;
  344. b[ 4] = ZERO;
  345. b[ 5] = ZERO;
  346. b[ 6] = ZERO;
  347. b[ 7] = ZERO;
  348. b[ 8] = ZERO;
  349. b[ 9] = ZERO;
  350. b[10] = ZERO;
  351. b[11] = ZERO;
  352. b[12] = ZERO;
  353. b[13] = ZERO;
  354. b[14] = ZERO;
  355. b[15] = ZERO;
  356. b += 16;
  357. if(i >= 2) {
  358. b[ 0] = *(a02 + 0);
  359. b[ 1] = *(a02 + 1);
  360. #ifdef UNIT
  361. b[ 2] = ONE;
  362. b[ 3] = ZERO;
  363. #else
  364. b[ 2] = *(a02 + 2);
  365. b[ 3] = *(a02 + 3);
  366. #endif
  367. b[ 4] = ZERO;
  368. b[ 5] = ZERO;
  369. b[ 6] = ZERO;
  370. b[ 7] = ZERO;
  371. b[ 8] = ZERO;
  372. b[ 9] = ZERO;
  373. b[10] = ZERO;
  374. b[11] = ZERO;
  375. b[12] = ZERO;
  376. b[13] = ZERO;
  377. b[14] = ZERO;
  378. b[15] = ZERO;
  379. b += 16;
  380. }
  381. if (i >= 3) {
  382. b[ 0] = *(a03 + 0);
  383. b[ 1] = *(a03 + 1);
  384. b[ 2] = *(a03 + 2);
  385. b[ 3] = *(a03 + 3);
  386. #ifdef UNIT
  387. b[ 4] = ONE;
  388. b[ 5] = ZERO;
  389. #else
  390. b[ 4] = *(a03 + 4);
  391. b[ 5] = *(a03 + 5);
  392. #endif
  393. b[ 6] = ZERO;
  394. b[ 7] = ZERO;
  395. b[ 8] = ZERO;
  396. b[ 9] = ZERO;
  397. b[10] = ZERO;
  398. b[11] = ZERO;
  399. b[12] = ZERO;
  400. b[13] = ZERO;
  401. b[14] = ZERO;
  402. b[15] = ZERO;
  403. b += 16;
  404. }
  405. if (i >= 4) {
  406. b[ 0] = *(a04 + 0);
  407. b[ 1] = *(a04 + 1);
  408. b[ 2] = *(a04 + 2);
  409. b[ 3] = *(a04 + 3);
  410. b[ 4] = *(a04 + 4);
  411. b[ 5] = *(a04 + 5);
  412. #ifdef UNIT
  413. b[ 6] = ONE;
  414. b[ 7] = ZERO;
  415. #else
  416. b[ 6] = *(a04 + 6);
  417. b[ 7] = *(a04 + 7);
  418. #endif
  419. b[ 8] = ZERO;
  420. b[ 9] = ZERO;
  421. b[10] = ZERO;
  422. b[11] = ZERO;
  423. b[12] = ZERO;
  424. b[13] = ZERO;
  425. b[14] = ZERO;
  426. b[15] = ZERO;
  427. b += 16;
  428. }
  429. if (i >= 5) {
  430. b[ 0] = *(a05 + 0);
  431. b[ 1] = *(a05 + 1);
  432. b[ 2] = *(a05 + 2);
  433. b[ 3] = *(a05 + 3);
  434. b[ 4] = *(a05 + 4);
  435. b[ 5] = *(a05 + 5);
  436. b[ 6] = *(a05 + 6);
  437. b[ 7] = *(a05 + 7);
  438. #ifdef UNIT
  439. b[ 8] = ONE;
  440. b[ 9] = ZERO;
  441. #else
  442. b[ 8] = *(a05 + 8);
  443. b[ 9] = *(a05 + 9);
  444. #endif
  445. b[10] = ZERO;
  446. b[11] = ZERO;
  447. b[12] = ZERO;
  448. b[13] = ZERO;
  449. b[14] = ZERO;
  450. b[15] = ZERO;
  451. b += 16;
  452. }
  453. if (i >= 6) {
  454. b[ 0] = *(a06 + 0);
  455. b[ 1] = *(a06 + 1);
  456. b[ 2] = *(a06 + 2);
  457. b[ 3] = *(a06 + 3);
  458. b[ 4] = *(a06 + 4);
  459. b[ 5] = *(a06 + 5);
  460. b[ 6] = *(a06 + 6);
  461. b[ 7] = *(a06 + 7);
  462. b[ 8] = *(a06 + 8);
  463. b[ 9] = *(a06 + 9);
  464. #ifdef UNIT
  465. b[10] = ONE;
  466. b[11] = ZERO;
  467. #else
  468. b[10] = *(a06 + 10);
  469. b[11] = *(a06 + 11);
  470. #endif
  471. b[12] = ZERO;
  472. b[13] = ZERO;
  473. b[14] = ZERO;
  474. b[15] = ZERO;
  475. b += 16;
  476. }
  477. if (i >= 7) {
  478. b[ 0] = *(a07 + 0);
  479. b[ 1] = *(a07 + 1);
  480. b[ 2] = *(a07 + 2);
  481. b[ 3] = *(a07 + 3);
  482. b[ 4] = *(a07 + 4);
  483. b[ 5] = *(a07 + 5);
  484. b[ 6] = *(a07 + 6);
  485. b[ 7] = *(a07 + 7);
  486. b[ 8] = *(a07 + 8);
  487. b[ 9] = *(a07 + 9);
  488. b[10] = *(a07 + 10);
  489. b[11] = *(a07 + 11);
  490. #ifdef UNIT
  491. b[12] = ONE;
  492. b[13] = ZERO;
  493. #else
  494. b[12] = *(a07 + 12);
  495. b[13] = *(a07 + 13);
  496. #endif
  497. b[14] = ZERO;
  498. b[15] = ZERO;
  499. b += 16;
  500. }
  501. }
  502. }
  503. posY += 8;
  504. js --;
  505. } while (js > 0);
  506. } /* End of main loop */
  507. if (n & 4){
  508. X = posX;
  509. if (posX <= posY) {
  510. a01 = a + posX * 2 + (posY + 0) * lda;
  511. a02 = a + posX * 2 + (posY + 1) * lda;
  512. a03 = a + posX * 2 + (posY + 2) * lda;
  513. a04 = a + posX * 2 + (posY + 3) * lda;
  514. } else {
  515. a01 = a + posY * 2 + (posX + 0) * lda;
  516. a02 = a + posY * 2 + (posX + 1) * lda;
  517. a03 = a + posY * 2 + (posX + 2) * lda;
  518. a04 = a + posY * 2 + (posX + 3) * lda;
  519. }
  520. i = (m >> 2);
  521. if (i > 0) {
  522. do {
  523. if (X < posY) {
  524. a01 += 8;
  525. a02 += 8;
  526. a03 += 8;
  527. a04 += 8;
  528. b += 32;
  529. } else
  530. if (X > posY) {
  531. for (ii = 0; ii < 4; ii++){
  532. b[ 0] = *(a01 + 0);
  533. b[ 1] = *(a01 + 1);
  534. b[ 2] = *(a01 + 2);
  535. b[ 3] = *(a01 + 3);
  536. b[ 4] = *(a01 + 4);
  537. b[ 5] = *(a01 + 5);
  538. b[ 6] = *(a01 + 6);
  539. b[ 7] = *(a01 + 7);
  540. a01 += lda;
  541. b += 8;
  542. }
  543. a02 += 4 * lda;
  544. a03 += 4 * lda;
  545. a04 += 4 * lda;
  546. } else {
  547. #ifdef UNIT
  548. b[ 0] = ONE;
  549. b[ 1] = ZERO;
  550. #else
  551. b[ 0] = *(a01 + 0);
  552. b[ 1] = *(a01 + 1);
  553. #endif
  554. b[ 2] = ZERO;
  555. b[ 3] = ZERO;
  556. b[ 4] = ZERO;
  557. b[ 5] = ZERO;
  558. b[ 6] = ZERO;
  559. b[ 7] = ZERO;
  560. b[ 8] = *(a02 + 0);
  561. b[ 9] = *(a02 + 1);
  562. #ifdef UNIT
  563. b[ 10] = ONE;
  564. b[ 11] = ZERO;
  565. #else
  566. b[ 10] = *(a02 + 2);
  567. b[ 11] = *(a02 + 3);
  568. #endif
  569. b[ 12] = ZERO;
  570. b[ 13] = ZERO;
  571. b[ 14] = ZERO;
  572. b[ 15] = ZERO;
  573. b[ 16] = *(a03 + 0);
  574. b[ 17] = *(a03 + 1);
  575. b[ 18] = *(a03 + 2);
  576. b[ 19] = *(a03 + 3);
  577. #ifdef UNIT
  578. b[ 20] = ONE;
  579. b[ 21] = ZERO;
  580. #else
  581. b[ 20] = *(a03 + 4);
  582. b[ 21] = *(a03 + 5);
  583. #endif
  584. b[ 22] = ZERO;
  585. b[ 23] = ZERO;
  586. b[ 24] = *(a04 + 0);
  587. b[ 25] = *(a04 + 1);
  588. b[ 26] = *(a04 + 2);
  589. b[ 27] = *(a04 + 3);
  590. b[ 28] = *(a04 + 4);
  591. b[ 29] = *(a04 + 5);
  592. #ifdef UNIT
  593. b[ 30] = ONE;
  594. b[ 31] = ZERO;
  595. #else
  596. b[ 30] = *(a04 + 6);
  597. b[ 31] = *(a04 + 7);
  598. #endif
  599. a01 += 4 * lda;
  600. a02 += 4 * lda;
  601. a03 += 4 * lda;
  602. a04 += 4 * lda;
  603. b += 32;
  604. }
  605. X += 4;
  606. i --;
  607. } while (i > 0);
  608. }
  609. i = (m & 3);
  610. if (i) {
  611. if (X < posY) {
  612. /* a01 += 2 * i;
  613. a02 += 2 * i;
  614. a03 += 2 * i;
  615. a04 += 2 * i; */
  616. b += 8 * i;
  617. } else
  618. if (X > posY) {
  619. for (ii = 0; ii < i; ii++){
  620. b[ 0] = *(a01 + 0);
  621. b[ 1] = *(a01 + 1);
  622. b[ 2] = *(a01 + 2);
  623. b[ 3] = *(a01 + 3);
  624. b[ 4] = *(a01 + 4);
  625. b[ 5] = *(a01 + 5);
  626. b[ 6] = *(a01 + 6);
  627. b[ 7] = *(a01 + 7);
  628. a01 += lda;
  629. a02 += lda;
  630. a03 += lda;
  631. a04 += lda;
  632. b += 8;
  633. }
  634. } else {
  635. #ifdef UNIT
  636. b[ 0] = ONE;
  637. b[ 1] = ZERO;
  638. #else
  639. b[ 0] = *(a01 + 0);
  640. b[ 1] = *(a01 + 1);
  641. #endif
  642. b[ 2] = ZERO;
  643. b[ 3] = ZERO;
  644. b[ 4] = ZERO;
  645. b[ 5] = ZERO;
  646. b[ 6] = ZERO;
  647. b[ 7] = ZERO;
  648. b += 8;
  649. if(i >= 2) {
  650. b[ 0] = *(a02 + 0);
  651. b[ 1] = *(a02 + 1);
  652. #ifdef UNIT
  653. b[ 2] = ONE;
  654. b[ 3] = ZERO;
  655. #else
  656. b[ 2] = *(a02 + 2);
  657. b[ 3] = *(a02 + 3);
  658. #endif
  659. b[ 4] = ZERO;
  660. b[ 5] = ZERO;
  661. b[ 6] = ZERO;
  662. b[ 7] = ZERO;
  663. b += 8;
  664. }
  665. if (i >= 3) {
  666. b[ 0] = *(a03 + 0);
  667. b[ 1] = *(a03 + 1);
  668. b[ 2] = *(a03 + 2);
  669. b[ 3] = *(a03 + 3);
  670. #ifdef UNIT
  671. b[ 4] = ONE;
  672. b[ 5] = ZERO;
  673. #else
  674. b[ 4] = *(a03 + 4);
  675. b[ 5] = *(a03 + 5);
  676. #endif
  677. b[ 6] = ZERO;
  678. b[ 7] = ZERO;
  679. b += 8;
  680. }
  681. }
  682. }
  683. posY += 4;
  684. }
  685. if (n & 2){
  686. X = posX;
  687. if (posX <= posY) {
  688. a01 = a + posX * 2 + (posY + 0) * lda;
  689. a02 = a + posX * 2 + (posY + 1) * lda;
  690. } else {
  691. a01 = a + posY * 2 + (posX + 0) * lda;
  692. a02 = a + posY * 2 + (posX + 1) * lda;
  693. }
  694. i = (m >> 1);
  695. if (i > 0) {
  696. do {
  697. if (X < posY) {
  698. a01 += 4;
  699. a02 += 4;
  700. b += 8;
  701. } else
  702. if (X > posY) {
  703. b[ 0] = *(a01 + 0);
  704. b[ 1] = *(a01 + 1);
  705. b[ 2] = *(a01 + 2);
  706. b[ 3] = *(a01 + 3);
  707. b[ 4] = *(a02 + 0);
  708. b[ 5] = *(a02 + 1);
  709. b[ 6] = *(a02 + 2);
  710. b[ 7] = *(a02 + 3);
  711. a01 += 2 * lda;
  712. a02 += 2 * lda;
  713. b += 8;
  714. } else {
  715. #ifdef UNIT
  716. b[ 0] = ONE;
  717. b[ 1] = ZERO;
  718. #else
  719. b[ 0] = *(a01 + 0);
  720. b[ 1] = *(a01 + 1);
  721. #endif
  722. b[ 2] = ZERO;
  723. b[ 3] = ZERO;
  724. b[ 4] = *(a02 + 0);
  725. b[ 5] = *(a02 + 1);
  726. #ifdef UNIT
  727. b[ 6] = ONE;
  728. b[ 7] = ZERO;
  729. #else
  730. b[ 6] = *(a02 + 2);
  731. b[ 7] = *(a02 + 3);
  732. #endif
  733. a01 += 2 * lda;
  734. a02 += 2 * lda;
  735. b += 8;
  736. }
  737. X += 2;
  738. i --;
  739. } while (i > 0);
  740. }
  741. i = (m & 1);
  742. if (i) {
  743. if (X < posY) {
  744. b += 4;
  745. } else
  746. if (X > posY) {
  747. b[ 0] = *(a01 + 0);
  748. b[ 1] = *(a01 + 1);
  749. b[ 2] = *(a01 + 2);
  750. b[ 3] = *(a01 + 3);
  751. b += 4;
  752. }
  753. #if 1
  754. }
  755. #else
  756. } else {
  757. #ifdef UNIT
  758. b[ 0] = ONE;
  759. b[ 1] = ZERO;
  760. #else
  761. b[ 0] = *(a01 + 0);
  762. b[ 1] = *(a01 + 1);
  763. #endif
  764. b[ 2] = *(a02 + 0);
  765. b[ 3] = *(a02 + 1);
  766. b += 4;
  767. }
  768. #endif
  769. posY += 2;
  770. }
  771. if (n & 1){
  772. X = posX;
  773. if (posX <= posY) {
  774. a01 = a + posX * 2 + (posY + 0) * lda;
  775. } else {
  776. a01 = a + posY * 2 + (posX + 0) * lda;
  777. }
  778. i = m;
  779. if (m > 0) {
  780. do {
  781. if (X < posY) {
  782. a01 += 2;
  783. } else {
  784. #ifdef UNIT
  785. if (X > posY) {
  786. #endif
  787. b[ 0] = *(a01 + 0);
  788. b[ 1] = *(a01 + 1);
  789. #ifdef UNIT
  790. } else {
  791. b[ 0] = ONE;
  792. b[ 1] = ZERO;
  793. }
  794. #endif
  795. a01 += lda;
  796. }
  797. b += 2;
  798. X ++;
  799. i --;
  800. } while (i > 0);
  801. }
  802. }
  803. return 0;
  804. }