You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_uncopy_6.c 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include "common.h"
  /*
   * INV(a): value stored on the diagonal of the packed buffer.
   * Without UNIT it is the reciprocal ONE / a; with UNIT the argument is
   * discarded (never evaluated) and ONE is stored instead.
   */
  #ifndef UNIT
  #define INV(a) (ONE / (a))
  #else
  #define INV(a) (ONE)
  #endif
  44. int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
  45. BLASLONG i, ii, j, jj;
  46. FLOAT data01, data02, data03, data04, data05, data06;
  47. FLOAT data09, data10, data11, data12, data13, data14;
  48. FLOAT data17, data18, data19, data20, data21, data22;
  49. FLOAT data25, data26, data27, data28, data29, data30;
  50. FLOAT data33, data34, data35, data36, data37, data38;
  51. FLOAT data41, data42, data43, data44, data45, data46;
  52. FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
  53. jj = offset;
  54. BLASLONG mmod6, nmod6;
  55. mmod6 = m - (m/6)*6 ;
  56. nmod6 = n - (n/6)*6 ;
  57. // j = (n >> 3);
  58. j = (n / 6);
  59. while (j > 0){
  60. a1 = a + 0 * lda;
  61. a2 = a + 1 * lda;
  62. a3 = a + 2 * lda;
  63. a4 = a + 3 * lda;
  64. a5 = a + 4 * lda;
  65. a6 = a + 5 * lda;
  66. // a7 = a + 6 * lda;
  67. // a8 = a + 7 * lda;
  68. ii = 0;
  69. // i = (m >> 3);
  70. i = (m / 6);
  71. while (i > 0) {
  72. if (ii == jj) {
  73. #ifndef UNIT
  74. data01 = *(a1 + 0);
  75. #endif
  76. data09 = *(a2 + 0);
  77. #ifndef UNIT
  78. data10 = *(a2 + 1);
  79. #endif
  80. data17 = *(a3 + 0);
  81. data18 = *(a3 + 1);
  82. #ifndef UNIT
  83. data19 = *(a3 + 2);
  84. #endif
  85. data25 = *(a4 + 0);
  86. data26 = *(a4 + 1);
  87. data27 = *(a4 + 2);
  88. #ifndef UNIT
  89. data28 = *(a4 + 3);
  90. #endif
  91. data33 = *(a5 + 0);
  92. data34 = *(a5 + 1);
  93. data35 = *(a5 + 2);
  94. data36 = *(a5 + 3);
  95. #ifndef UNIT
  96. data37 = *(a5 + 4);
  97. #endif
  98. data41 = *(a6 + 0);
  99. data42 = *(a6 + 1);
  100. data43 = *(a6 + 2);
  101. data44 = *(a6 + 3);
  102. data45 = *(a6 + 4);
  103. #ifndef UNIT
  104. data46 = *(a6 + 5);
  105. #endif
  106. // data49 = *(a7 + 0);
  107. // data50 = *(a7 + 1);
  108. // data51 = *(a7 + 2);
  109. // data52 = *(a7 + 3);
  110. // data53 = *(a7 + 4);
  111. // data54 = *(a7 + 5);
  112. // #ifndef UNIT
  113. // data55 = *(a7 + 6);
  114. // #endif
  115. //
  116. // data57 = *(a8 + 0);
  117. // data58 = *(a8 + 1);
  118. // data59 = *(a8 + 2);
  119. // data60 = *(a8 + 3);
  120. // data61 = *(a8 + 4);
  121. // data62 = *(a8 + 5);
  122. // data63 = *(a8 + 6);
  123. // #ifndef UNIT
  124. // data64 = *(a8 + 7);
  125. // #endif
  126. *(b + 0) = INV(data01);
  127. *(b + 1) = data09;
  128. *(b + 2) = data17;
  129. *(b + 3) = data25;
  130. *(b + 4) = data33;
  131. *(b + 5) = data41;
  132. // *(b + 6) = data49;
  133. // *(b + 7) = data57;
  134. *(b + 7) = INV(data10);
  135. *(b + 8) = data18;
  136. *(b + 9) = data26;
  137. *(b + 10) = data34;
  138. *(b + 11) = data42;
  139. // *(b + 14) = data50;
  140. // *(b + 15) = data58;
  141. *(b + 14) = INV(data19);
  142. *(b + 15) = data27;
  143. *(b + 16) = data35;
  144. *(b + 17) = data43;
  145. // *(b + 22) = data51;
  146. // *(b + 23) = data59;
  147. *(b + 21) = INV(data28);
  148. *(b + 22) = data36;
  149. *(b + 23) = data44;
  150. // *(b + 30) = data52;
  151. // *(b + 31) = data60;
  152. *(b + 28) = INV(data37);
  153. *(b + 29) = data45;
  154. // *(b + 38) = data53;
  155. // *(b + 39) = data61;
  156. *(b + 35) = INV(data46);
  157. // *(b + 46) = data54;
  158. // *(b + 47) = data62;
  159. // *(b + 54) = INV(data55);
  160. // *(b + 55) = data63;
  161. // *(b + 63) = INV(data64);
  162. }
  163. if (ii < jj) {
  164. data01 = *(a1 + 0);
  165. data02 = *(a1 + 1);
  166. data03 = *(a1 + 2);
  167. data04 = *(a1 + 3);
  168. data05 = *(a1 + 4);
  169. data06 = *(a1 + 5);
  170. // data07 = *(a1 + 6);
  171. // data08 = *(a1 + 7);
  172. data09 = *(a2 + 0);
  173. data10 = *(a2 + 1);
  174. data11 = *(a2 + 2);
  175. data12 = *(a2 + 3);
  176. data13 = *(a2 + 4);
  177. data14 = *(a2 + 5);
  178. // data15 = *(a2 + 6);
  179. // data16 = *(a2 + 7);
  180. data17 = *(a3 + 0);
  181. data18 = *(a3 + 1);
  182. data19 = *(a3 + 2);
  183. data20 = *(a3 + 3);
  184. data21 = *(a3 + 4);
  185. data22 = *(a3 + 5);
  186. // data23 = *(a3 + 6);
  187. // data24 = *(a3 + 7);
  188. data25 = *(a4 + 0);
  189. data26 = *(a4 + 1);
  190. data27 = *(a4 + 2);
  191. data28 = *(a4 + 3);
  192. data29 = *(a4 + 4);
  193. data30 = *(a4 + 5);
  194. // data31 = *(a4 + 6);
  195. // data32 = *(a4 + 7);
  196. data33 = *(a5 + 0);
  197. data34 = *(a5 + 1);
  198. data35 = *(a5 + 2);
  199. data36 = *(a5 + 3);
  200. data37 = *(a5 + 4);
  201. data38 = *(a5 + 5);
  202. // data39 = *(a5 + 6);
  203. // data40 = *(a5 + 7);
  204. data41 = *(a6 + 0);
  205. data42 = *(a6 + 1);
  206. data43 = *(a6 + 2);
  207. data44 = *(a6 + 3);
  208. data45 = *(a6 + 4);
  209. data46 = *(a6 + 5);
  210. // data47 = *(a6 + 6);
  211. // data48 = *(a6 + 7);
  212. // data49 = *(a7 + 0);
  213. // data50 = *(a7 + 1);
  214. // data51 = *(a7 + 2);
  215. // data52 = *(a7 + 3);
  216. // data53 = *(a7 + 4);
  217. // data54 = *(a7 + 5);
  218. // data55 = *(a7 + 6);
  219. // data56 = *(a7 + 7);
  220. // data57 = *(a8 + 0);
  221. // data58 = *(a8 + 1);
  222. // data59 = *(a8 + 2);
  223. // data60 = *(a8 + 3);
  224. // data61 = *(a8 + 4);
  225. // data62 = *(a8 + 5);
  226. // data63 = *(a8 + 6);
  227. // data64 = *(a8 + 7);
  228. *(b + 0) = data01;
  229. *(b + 1) = data09;
  230. *(b + 2) = data17;
  231. *(b + 3) = data25;
  232. *(b + 4) = data33;
  233. *(b + 5) = data41;
  234. // *(b + 6) = data49;
  235. // *(b + 7) = data57;
  236. *(b + 6) = data02;
  237. *(b + 7) = data10;
  238. *(b + 8) = data18;
  239. *(b + 9) = data26;
  240. *(b + 10) = data34;
  241. *(b + 11) = data42;
  242. // *(b + 14) = data50;
  243. // *(b + 15) = data58;
  244. *(b + 12) = data03;
  245. *(b + 13) = data11;
  246. *(b + 14) = data19;
  247. *(b + 15) = data27;
  248. *(b + 16) = data35;
  249. *(b + 17) = data43;
  250. // *(b + 22) = data51;
  251. // *(b + 23) = data59;
  252. *(b + 18) = data04;
  253. *(b + 19) = data12;
  254. *(b + 20) = data20;
  255. *(b + 21) = data28;
  256. *(b + 22) = data36;
  257. *(b + 23) = data44;
  258. // *(b + 30) = data52;
  259. // *(b + 31) = data60;
  260. *(b + 24) = data05;
  261. *(b + 25) = data13;
  262. *(b + 26) = data21;
  263. *(b + 27) = data29;
  264. *(b + 28) = data37;
  265. *(b + 29) = data45;
  266. // *(b + 38) = data53;
  267. // *(b + 39) = data61;
  268. *(b + 30) = data06;
  269. *(b + 31) = data14;
  270. *(b + 32) = data22;
  271. *(b + 33) = data30;
  272. *(b + 34) = data38;
  273. *(b + 35) = data46;
  274. // *(b + 46) = data54;
  275. // *(b + 47) = data62;
  276. // *(b + 48) = data07;
  277. // *(b + 49) = data15;
  278. // *(b + 50) = data23;
  279. // *(b + 51) = data31;
  280. // *(b + 52) = data39;
  281. // *(b + 53) = data47;
  282. // *(b + 54) = data55;
  283. // *(b + 55) = data63;
  284. // *(b + 56) = data08;
  285. // *(b + 57) = data16;
  286. // *(b + 58) = data24;
  287. // *(b + 59) = data32;
  288. // *(b + 60) = data40;
  289. // *(b + 61) = data48;
  290. // *(b + 62) = data56;
  291. // *(b + 63) = data64;
  292. }
  293. a1 += 6;
  294. a2 += 6;
  295. a3 += 6;
  296. a4 += 6;
  297. a5 += 6;
  298. a6 += 6;
  299. // a7 += 6;
  300. // a8 += 6;
  301. b += 36;
  302. i --;
  303. ii += 6;
  304. }
  305. if (mmod6 & 4) {
  306. if (ii == jj) {
  307. #ifndef UNIT
  308. data01 = *(a1 + 0);
  309. #endif
  310. data09 = *(a2 + 0);
  311. #ifndef UNIT
  312. data10 = *(a2 + 1);
  313. #endif
  314. data17 = *(a3 + 0);
  315. data18 = *(a3 + 1);
  316. #ifndef UNIT
  317. data19 = *(a3 + 2);
  318. #endif
  319. data25 = *(a4 + 0);
  320. data26 = *(a4 + 1);
  321. data27 = *(a4 + 2);
  322. #ifndef UNIT
  323. data28 = *(a4 + 3);
  324. #endif
  325. data33 = *(a5 + 0);
  326. data34 = *(a5 + 1);
  327. data35 = *(a5 + 2);
  328. data36 = *(a5 + 3);
  329. data41 = *(a6 + 0);
  330. data42 = *(a6 + 1);
  331. data43 = *(a6 + 2);
  332. data44 = *(a6 + 3);
  333. // data49 = *(a7 + 0);
  334. // data50 = *(a7 + 1);
  335. // data51 = *(a7 + 2);
  336. // data52 = *(a7 + 3);
  337. // data57 = *(a8 + 0);
  338. // data58 = *(a8 + 1);
  339. // data59 = *(a8 + 2);
  340. // data60 = *(a8 + 3);
  341. *(b + 0) = INV(data01);
  342. *(b + 1) = data09;
  343. *(b + 2) = data17;
  344. *(b + 3) = data25;
  345. *(b + 4) = data33;
  346. *(b + 5) = data41;
  347. // *(b + 6) = data49;
  348. // *(b + 7) = data57;
  349. *(b + 7) = INV(data10);
  350. *(b + 8) = data18;
  351. *(b + 9) = data26;
  352. *(b + 10) = data34;
  353. *(b + 11) = data42;
  354. // *(b + 14) = data50;
  355. // *(b + 15) = data58;
  356. *(b + 14) = INV(data19);
  357. *(b + 15) = data27;
  358. *(b + 16) = data35;
  359. *(b + 17) = data43;
  360. // *(b + 22) = data51;
  361. // *(b + 23) = data59;
  362. *(b + 21) = INV(data28);
  363. *(b + 22) = data36;
  364. *(b + 23) = data44;
  365. // *(b + 30) = data52;
  366. // *(b + 31) = data60;
  367. }
  368. if (ii < jj) {
  369. data01 = *(a1 + 0);
  370. data02 = *(a1 + 1);
  371. data03 = *(a1 + 2);
  372. data04 = *(a1 + 3);
  373. data09 = *(a2 + 0);
  374. data10 = *(a2 + 1);
  375. data11 = *(a2 + 2);
  376. data12 = *(a2 + 3);
  377. data17 = *(a3 + 0);
  378. data18 = *(a3 + 1);
  379. data19 = *(a3 + 2);
  380. data20 = *(a3 + 3);
  381. data25 = *(a4 + 0);
  382. data26 = *(a4 + 1);
  383. data27 = *(a4 + 2);
  384. data28 = *(a4 + 3);
  385. data33 = *(a5 + 0);
  386. data34 = *(a5 + 1);
  387. data35 = *(a5 + 2);
  388. data36 = *(a5 + 3);
  389. data41 = *(a6 + 0);
  390. data42 = *(a6 + 1);
  391. data43 = *(a6 + 2);
  392. data44 = *(a6 + 3);
  393. // data49 = *(a7 + 0);
  394. // data50 = *(a7 + 1);
  395. // data51 = *(a7 + 2);
  396. // data52 = *(a7 + 3);
  397. // data57 = *(a8 + 0);
  398. // data58 = *(a8 + 1);
  399. // data59 = *(a8 + 2);
  400. // data60 = *(a8 + 3);
  401. *(b + 0) = data01;
  402. *(b + 1) = data09;
  403. *(b + 2) = data17;
  404. *(b + 3) = data25;
  405. *(b + 4) = data33;
  406. *(b + 5) = data41;
  407. // *(b + 6) = data49;
  408. // *(b + 7) = data57;
  409. *(b + 6) = data02;
  410. *(b + 7) = data10;
  411. *(b + 8) = data18;
  412. *(b + 9) = data26;
  413. *(b + 10) = data34;
  414. *(b + 11) = data42;
  415. // *(b + 14) = data50;
  416. // *(b + 15) = data58;
  417. *(b + 12) = data03;
  418. *(b + 13) = data11;
  419. *(b + 14) = data19;
  420. *(b + 15) = data27;
  421. *(b + 16) = data35;
  422. *(b + 17) = data43;
  423. // *(b + 22) = data51;
  424. // *(b + 23) = data59;
  425. *(b + 18) = data04;
  426. *(b + 19) = data12;
  427. *(b + 20) = data20;
  428. *(b + 21) = data28;
  429. *(b + 22) = data36;
  430. *(b + 23) = data44;
  431. // *(b + 30) = data52;
  432. // *(b + 31) = data60;
  433. }
  434. a1 += 4;
  435. a2 += 4;
  436. a3 += 4;
  437. a4 += 4;
  438. a5 += 4;
  439. a6 += 4;
  440. // a7 += 4;
  441. // a8 += 4;
  442. b += 24;
  443. ii += 4;
  444. }
  445. if (mmod6 & 2) {
  446. if (ii == jj) {
  447. #ifndef UNIT
  448. data01 = *(a1 + 0);
  449. #endif
  450. data09 = *(a2 + 0);
  451. #ifndef UNIT
  452. data10 = *(a2 + 1);
  453. #endif
  454. data17 = *(a3 + 0);
  455. data18 = *(a3 + 1);
  456. data25 = *(a4 + 0);
  457. data26 = *(a4 + 1);
  458. data33 = *(a5 + 0);
  459. data34 = *(a5 + 1);
  460. data41 = *(a6 + 0);
  461. data42 = *(a6 + 1);
  462. // data49 = *(a7 + 0);
  463. // data50 = *(a7 + 1);
  464. // data57 = *(a8 + 0);
  465. // data58 = *(a8 + 1);
  466. *(b + 0) = INV(data01);
  467. *(b + 1) = data09;
  468. *(b + 2) = data17;
  469. *(b + 3) = data25;
  470. *(b + 4) = data33;
  471. *(b + 5) = data41;
  472. // *(b + 6) = data49;
  473. // *(b + 7) = data57;
  474. *(b + 7) = INV(data10);
  475. *(b + 8) = data18;
  476. *(b + 9) = data26;
  477. *(b + 10) = data34;
  478. *(b + 11) = data42;
  479. // *(b + 14) = data50;
  480. // *(b + 15) = data58;
  481. }
  482. if (ii < jj) {
  483. data01 = *(a1 + 0);
  484. data02 = *(a1 + 1);
  485. data09 = *(a2 + 0);
  486. data10 = *(a2 + 1);
  487. data17 = *(a3 + 0);
  488. data18 = *(a3 + 1);
  489. data25 = *(a4 + 0);
  490. data26 = *(a4 + 1);
  491. data33 = *(a5 + 0);
  492. data34 = *(a5 + 1);
  493. data41 = *(a6 + 0);
  494. data42 = *(a6 + 1);
  495. // data49 = *(a7 + 0);
  496. // data50 = *(a7 + 1);
  497. // data57 = *(a8 + 0);
  498. // data58 = *(a8 + 1);
  499. *(b + 0) = data01;
  500. *(b + 1) = data09;
  501. *(b + 2) = data17;
  502. *(b + 3) = data25;
  503. *(b + 4) = data33;
  504. *(b + 5) = data41;
  505. // *(b + 6) = data49;
  506. // *(b + 7) = data57;
  507. *(b + 6) = data02;
  508. *(b + 7) = data10;
  509. *(b + 8) = data18;
  510. *(b + 9) = data26;
  511. *(b + 10) = data34;
  512. *(b + 11) = data42;
  513. // *(b + 14) = data50;
  514. // *(b + 15) = data58;
  515. }
  516. a1 += 2;
  517. a2 += 2;
  518. a3 += 2;
  519. a4 += 2;
  520. a5 += 2;
  521. a6 += 2;
  522. a7 += 2;
  523. a8 += 2;
  524. b += 12;
  525. ii += 2;
  526. }
  527. if (mmod6 & 1) {
  528. if (ii == jj) {
  529. #ifndef UNIT
  530. data01 = *(a1 + 0);
  531. #endif
  532. data09 = *(a2 + 0);
  533. data17 = *(a3 + 0);
  534. data25 = *(a4 + 0);
  535. data33 = *(a5 + 0);
  536. data41 = *(a6 + 0);
  537. // data49 = *(a7 + 0);
  538. // data57 = *(a8 + 0);
  539. *(b + 0) = INV(data01);
  540. *(b + 1) = data09;
  541. *(b + 2) = data17;
  542. *(b + 3) = data25;
  543. *(b + 4) = data33;
  544. *(b + 5) = data41;
  545. // *(b + 6) = data49;
  546. // *(b + 7) = data57;
  547. }
  548. if (ii < jj) {
  549. data01 = *(a1 + 0);
  550. // data02 = *(a1 + 1);
  551. data09 = *(a2 + 0);
  552. // data10 = *(a2 + 1);
  553. data17 = *(a3 + 0);
  554. // data18 = *(a3 + 1);
  555. data25 = *(a4 + 0);
  556. // data26 = *(a4 + 1);
  557. // // data33 = *(a5 + 0);
  558. // data34 = *(a5 + 1);
  559. // // data41 = *(a6 + 0);
  560. // data42 = *(a6 + 1);
  561. // data49 = *(a7 + 0);
  562. // data50 = *(a7 + 1);
  563. // data57 = *(a8 + 0);
  564. // data58 = *(a8 + 1);
  565. *(b + 0) = data01;
  566. *(b + 1) = data09;
  567. *(b + 2) = data17;
  568. *(b + 3) = data25;
  569. *(b + 4) = data33;
  570. *(b + 5) = data41;
  571. // *(b + 6) = data49;
  572. // *(b + 7) = data57;
  573. }
  574. b += 6;
  575. // ii += 1;
  576. }
  577. a += 6 * lda;
  578. jj += 6;
  579. j --;
  580. }
  581. if (nmod6 & 4) {
  582. a1 = a + 0 * lda;
  583. a2 = a + 1 * lda;
  584. a3 = a + 2 * lda;
  585. a4 = a + 3 * lda;
  586. ii = 0;
  587. i = (m >> 1);
  588. while (i > 0) {
  589. if (ii == jj) {
  590. #ifndef UNIT
  591. data01 = *(a1 + 0);
  592. #endif
  593. data09 = *(a2 + 0);
  594. #ifndef UNIT
  595. data10 = *(a2 + 1);
  596. #endif
  597. data17 = *(a3 + 0);
  598. data18 = *(a3 + 1);
  599. #ifndef UNIT
  600. data19 = *(a3 + 2);
  601. #endif
  602. data25 = *(a4 + 0);
  603. data26 = *(a4 + 1);
  604. data27 = *(a4 + 2);
  605. #ifndef UNIT
  606. data28 = *(a4 + 3);
  607. #endif
  608. *(b + 0) = INV(data01);
  609. *(b + 1) = data09;
  610. *(b + 2) = data17;
  611. *(b + 3) = data25;
  612. *(b + 5) = INV(data10);
  613. *(b + 6) = data18;
  614. *(b + 7) = data26;
  615. *(b + 10) = INV(data19);
  616. *(b + 11) = data27;
  617. *(b + 15) = INV(data28);
  618. a1 += 4;
  619. a2 += 4;
  620. a3 += 4;
  621. a4 += 4;
  622. b += 16;
  623. i -= 2;
  624. ii += 4;
  625. }
  626. else if (ii < jj) {
  627. data01 = *(a1 + 0);
  628. data02 = *(a1 + 1);
  629. data09 = *(a2 + 0);
  630. data10 = *(a2 + 1);
  631. data17 = *(a3 + 0);
  632. data18 = *(a3 + 1);
  633. data25 = *(a4 + 0);
  634. data26 = *(a4 + 1);
  635. *(b + 0) = data01;
  636. *(b + 1) = data09;
  637. *(b + 2) = data17;
  638. *(b + 3) = data25;
  639. *(b + 4) = data02;
  640. *(b + 5) = data10;
  641. *(b + 6) = data18;
  642. *(b + 7) = data26;
  643. a1 += 2;
  644. a2 += 2;
  645. a3 += 2;
  646. a4 += 2;
  647. b += 8;
  648. i -- ;
  649. ii += 2;
  650. }
  651. else{
  652. a1 += 2;
  653. a2 += 2;
  654. a3 += 2;
  655. a4 += 2;
  656. b += 8;
  657. i -- ;
  658. ii += 2;
  659. }
  660. }
  661. if (m & 1) {
  662. if (ii == jj) {
  663. #ifndef UNIT
  664. data01 = *(a1 + 0);
  665. #endif
  666. data09 = *(a2 + 0);
  667. data17 = *(a3 + 0);
  668. data25 = *(a4 + 0);
  669. *(b + 0) = INV(data01);
  670. *(b + 1) = data09;
  671. *(b + 2) = data17;
  672. *(b + 3) = data25;
  673. }
  674. if (ii < jj) {
  675. data01 = *(a1 + 0);
  676. data09 = *(a2 + 0);
  677. data17 = *(a3 + 0);
  678. data25 = *(a4 + 0);
  679. *(b + 0) = data01;
  680. *(b + 1) = data09;
  681. *(b + 2) = data17;
  682. *(b + 3) = data25;
  683. }
  684. b += 4;
  685. // ii += 1;
  686. }
  687. a += 4 * lda;
  688. jj += 4;
  689. }
  690. if (nmod6 & 2) {
  691. a1 = a + 0 * lda;
  692. a2 = a + 1 * lda;
  693. ii = 0;
  694. i = (m >> 1);
  695. while (i > 0) {
  696. if (ii == jj) {
  697. #ifndef UNIT
  698. data01 = *(a1 + 0);
  699. #endif
  700. data09 = *(a2 + 0);
  701. #ifndef UNIT
  702. data10 = *(a2 + 1);
  703. #endif
  704. *(b + 0) = INV(data01);
  705. *(b + 1) = data09;
  706. *(b + 3) = INV(data10);
  707. }
  708. if (ii < jj) {
  709. data01 = *(a1 + 0);
  710. data02 = *(a1 + 1);
  711. data09 = *(a2 + 0);
  712. data10 = *(a2 + 1);
  713. *(b + 0) = data01;
  714. *(b + 1) = data09;
  715. *(b + 2) = data02;
  716. *(b + 3) = data10;
  717. }
  718. a1 += 2;
  719. a2 += 2;
  720. b += 4;
  721. i --;
  722. ii += 2;
  723. }
  724. if (m & 1) {
  725. if (ii == jj) {
  726. #ifndef UNIT
  727. data01 = *(a1 + 0);
  728. #endif
  729. data09 = *(a2 + 0);
  730. *(b + 0) = INV(data01);
  731. *(b + 1) = data09;
  732. }
  733. if (ii < jj) {
  734. data01 = *(a1 + 0);
  735. data09 = *(a2 + 0);
  736. *(b + 0) = data01;
  737. *(b + 1) = data09;
  738. }
  739. b += 2;
  740. // ii += 1;
  741. }
  742. a += 2 * lda;
  743. jj += 2;
  744. }
  745. if (nmod6 & 1) {
  746. a1 = a + 0 * lda;
  747. ii = 0;
  748. i = m;
  749. while (i > 0) {
  750. if (ii == jj) {
  751. #ifndef UNIT
  752. data01 = *(a1 + 0);
  753. #endif
  754. *(b + 0) = INV(data01);
  755. }
  756. if (ii < jj) {
  757. data01 = *(a1 + 0);
  758. *(b + 0) = data01;
  759. }
  760. a1 += 1;
  761. b += 1;
  762. i --;
  763. ii ++;
  764. }
  765. }
  766. return 0;
  767. }