You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zlaswp_ncopy_8.c 22 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. #define a2 (a1 + 2)
  41. #define a4 (a3 + 2)
  42. #define a6 (a5 + 2)
  43. #define a8 (a7 + 2)
  44. int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){
  45. BLASLONG i, j, ip1, ip2;
  46. blasint *piv;
  47. FLOAT *a1, *a3, *a5, *a7;
  48. FLOAT *b1, *b2, *b3, *b4;
  49. FLOAT *b5, *b6, *b7, *b8;
  50. FLOAT A1, A2, A3, A4, A5, A6, A7, A8;
  51. FLOAT B1, B2, B3, B4, B5, B6, B7, B8;
  52. FLOAT A9, A10, A11, A12, A13, A14, A15, A16;
  53. FLOAT B9, B10, B11, B12, B13, B14, B15, B16;
  54. a -= 2;
  55. lda *= 2;
  56. k1 --;
  57. ipiv += k1;
  58. if (n <= 0) return 0;
  59. j = (n >> 3);
  60. if (j > 0) {
  61. do {
  62. piv = ipiv;
  63. a1 = a + (k1 + 1) * 2;
  64. a3 = a1 + 1 * lda;
  65. a5 = a1 + 2 * lda;
  66. a7 = a1 + 3 * lda;
  67. ip1 = *(piv + 0) * 2;
  68. ip2 = *(piv + 1) * 2;
  69. piv += 2;
  70. b1 = a + ip1;
  71. b2 = a + ip2;
  72. b3 = b1 + 1 * lda;
  73. b4 = b2 + 1 * lda;
  74. b5 = b1 + 2 * lda;
  75. b6 = b2 + 2 * lda;
  76. b7 = b1 + 3 * lda;
  77. b8 = b2 + 3 * lda;
  78. i = ((k2 - k1) >> 1);
  79. if (i > 0) {
  80. do {
  81. ip1 = *(piv + 0) * 2;
  82. ip2 = *(piv + 1) * 2;
  83. piv += 2;
  84. for( int pass = 0; pass < 2; ++pass ) {
  85. A1 = *(a1 + 0);
  86. A9 = *(a1 + 1);
  87. A2 = *(a2 + 0);
  88. A10 = *(a2 + 1);
  89. A3 = *(a3 + 0);
  90. A11 = *(a3 + 1);
  91. A4 = *(a4 + 0);
  92. A12 = *(a4 + 1);
  93. A5 = *(a5 + 0);
  94. A13 = *(a5 + 1);
  95. A6 = *(a6 + 0);
  96. A14 = *(a6 + 1);
  97. A7 = *(a7 + 0);
  98. A15 = *(a7 + 1);
  99. A8 = *(a8 + 0);
  100. A16 = *(a8 + 1);
  101. B1 = *(b1 + 0);
  102. B9 = *(b1 + 1);
  103. B2 = *(b2 + 0);
  104. B10 = *(b2 + 1);
  105. B3 = *(b3 + 0);
  106. B11 = *(b3 + 1);
  107. B4 = *(b4 + 0);
  108. B12 = *(b4 + 1);
  109. B5 = *(b5 + 0);
  110. B13 = *(b5 + 1);
  111. B6 = *(b6 + 0);
  112. B14 = *(b6 + 1);
  113. B7 = *(b7 + 0);
  114. B15 = *(b7 + 1);
  115. B8 = *(b8 + 0);
  116. B16 = *(b8 + 1);
  117. if (b1 == a1) {
  118. if (b2 == a2) {
  119. *(buffer + 0) = A1;
  120. *(buffer + 1) = A9;
  121. *(buffer + 2) = A3;
  122. *(buffer + 3) = A11;
  123. *(buffer + 4) = A5;
  124. *(buffer + 5) = A13;
  125. *(buffer + 6) = A7;
  126. *(buffer + 7) = A15;
  127. *(buffer + 8) = A2;
  128. *(buffer + 9) = A10;
  129. *(buffer + 10) = A4;
  130. *(buffer + 11) = A12;
  131. *(buffer + 12) = A6;
  132. *(buffer + 13) = A14;
  133. *(buffer + 14) = A8;
  134. *(buffer + 15) = A16;
  135. } else {
  136. *(buffer + 0) = A1;
  137. *(buffer + 1) = A9;
  138. *(buffer + 2) = A3;
  139. *(buffer + 3) = A11;
  140. *(buffer + 4) = A5;
  141. *(buffer + 5) = A13;
  142. *(buffer + 6) = A7;
  143. *(buffer + 7) = A15;
  144. *(buffer + 8) = B2;
  145. *(buffer + 9) = B10;
  146. *(buffer + 10) = B4;
  147. *(buffer + 11) = B12;
  148. *(buffer + 12) = B6;
  149. *(buffer + 13) = B14;
  150. *(buffer + 14) = B8;
  151. *(buffer + 15) = B16;
  152. *(b2 + 0) = A2;
  153. *(b2 + 1) = A10;
  154. *(b4 + 0) = A4;
  155. *(b4 + 1) = A12;
  156. *(b6 + 0) = A6;
  157. *(b6 + 1) = A14;
  158. *(b8 + 0) = A8;
  159. *(b8 + 1) = A16;
  160. }
  161. } else
  162. if (b1 == a2) {
  163. if (b2 == a2) {
  164. *(buffer + 0) = A2;
  165. *(buffer + 1) = A10;
  166. *(buffer + 2) = A4;
  167. *(buffer + 3) = A12;
  168. *(buffer + 4) = A6;
  169. *(buffer + 5) = A14;
  170. *(buffer + 6) = A8;
  171. *(buffer + 7) = A16;
  172. *(buffer + 8) = A1;
  173. *(buffer + 9) = A9;
  174. *(buffer + 10) = A3;
  175. *(buffer + 11) = A11;
  176. *(buffer + 12) = A5;
  177. *(buffer + 13) = A13;
  178. *(buffer + 14) = A7;
  179. *(buffer + 15) = A15;
  180. } else {
  181. *(buffer + 0) = A2;
  182. *(buffer + 1) = A10;
  183. *(buffer + 2) = A4;
  184. *(buffer + 3) = A12;
  185. *(buffer + 4) = A6;
  186. *(buffer + 5) = A14;
  187. *(buffer + 6) = A8;
  188. *(buffer + 7) = A16;
  189. *(buffer + 8) = B2;
  190. *(buffer + 9) = B10;
  191. *(buffer + 10) = B4;
  192. *(buffer + 11) = B12;
  193. *(buffer + 12) = B6;
  194. *(buffer + 13) = B14;
  195. *(buffer + 14) = B8;
  196. *(buffer + 15) = B16;
  197. *(b2 + 0) = A1;
  198. *(b2 + 1) = A9;
  199. *(b4 + 0) = A3;
  200. *(b4 + 1) = A11;
  201. *(b6 + 0) = A5;
  202. *(b6 + 1) = A13;
  203. *(b8 + 0) = A7;
  204. *(b8 + 1) = A15;
  205. }
  206. } else {
  207. if (b2 == a2) {
  208. *(buffer + 0) = B1;
  209. *(buffer + 1) = B9;
  210. *(buffer + 2) = B3;
  211. *(buffer + 3) = B11;
  212. *(buffer + 4) = B5;
  213. *(buffer + 5) = B13;
  214. *(buffer + 6) = B7;
  215. *(buffer + 7) = B15;
  216. *(buffer + 8) = A2;
  217. *(buffer + 9) = A10;
  218. *(buffer + 10) = A4;
  219. *(buffer + 11) = A12;
  220. *(buffer + 12) = A6;
  221. *(buffer + 13) = A14;
  222. *(buffer + 14) = A8;
  223. *(buffer + 15) = A16;
  224. *(b1 + 0) = A1;
  225. *(b1 + 1) = A9;
  226. *(b3 + 0) = A3;
  227. *(b3 + 1) = A11;
  228. *(b5 + 0) = A5;
  229. *(b5 + 1) = A13;
  230. *(b7 + 0) = A7;
  231. *(b7 + 1) = A15;
  232. } else
  233. if (b2 == b1) {
  234. *(buffer + 0) = B1;
  235. *(buffer + 1) = B9;
  236. *(buffer + 2) = B3;
  237. *(buffer + 3) = B11;
  238. *(buffer + 4) = B5;
  239. *(buffer + 5) = B13;
  240. *(buffer + 6) = B7;
  241. *(buffer + 7) = B15;
  242. *(buffer + 8) = A1;
  243. *(buffer + 9) = A9;
  244. *(buffer + 10) = A3;
  245. *(buffer + 11) = A11;
  246. *(buffer + 12) = A5;
  247. *(buffer + 13) = A13;
  248. *(buffer + 14) = A7;
  249. *(buffer + 15) = A15;
  250. *(b1 + 0) = A2;
  251. *(b1 + 1) = A10;
  252. *(b3 + 0) = A4;
  253. *(b3 + 1) = A12;
  254. *(b5 + 0) = A6;
  255. *(b5 + 1) = A14;
  256. *(b7 + 0) = A8;
  257. *(b7 + 1) = A16;
  258. } else {
  259. *(buffer + 0) = B1;
  260. *(buffer + 1) = B9;
  261. *(buffer + 2) = B3;
  262. *(buffer + 3) = B11;
  263. *(buffer + 4) = B5;
  264. *(buffer + 5) = B13;
  265. *(buffer + 6) = B7;
  266. *(buffer + 7) = B15;
  267. *(buffer + 8) = B2;
  268. *(buffer + 9) = B10;
  269. *(buffer + 10) = B4;
  270. *(buffer + 11) = B12;
  271. *(buffer + 12) = B6;
  272. *(buffer + 13) = B14;
  273. *(buffer + 14) = B8;
  274. *(buffer + 15) = B16;
  275. *(b1 + 0) = A1;
  276. *(b1 + 1) = A9;
  277. *(b2 + 0) = A2;
  278. *(b2 + 1) = A10;
  279. *(b3 + 0) = A3;
  280. *(b3 + 1) = A11;
  281. *(b4 + 0) = A4;
  282. *(b4 + 1) = A12;
  283. *(b5 + 0) = A5;
  284. *(b5 + 1) = A13;
  285. *(b6 + 0) = A6;
  286. *(b6 + 1) = A14;
  287. *(b7 + 0) = A7;
  288. *(b7 + 1) = A15;
  289. *(b8 + 0) = A8;
  290. *(b8 + 1) = A16;
  291. }
  292. }
  293. b1 += 4*lda;
  294. b2 += 4*lda;
  295. b3 += 4*lda;
  296. b4 += 4*lda;
  297. b5 += 4*lda;
  298. b6 += 4*lda;
  299. b7 += 4*lda;
  300. b8 += 4*lda;
  301. a1 += 4;
  302. a3 += 4;
  303. a5 += 4;
  304. a7 += 4;
  305. buffer += 16;
  306. }
  307. b1 = a + ip1;
  308. b2 = a + ip2;
  309. b3 = b1 + 1 * lda;
  310. b4 = b2 + 1 * lda;
  311. b5 = b1 + 2 * lda;
  312. b6 = b2 + 2 * lda;
  313. b7 = b1 + 3 * lda;
  314. b8 = b2 + 3 * lda;
  315. i --;
  316. } while (i > 0);
  317. }
  318. i = ((k2 - k1) & 1);
  319. if (i > 0) {
  320. A1 = *(a1 + 0);
  321. A9 = *(a1 + 1);
  322. B1 = *(b1 + 0);
  323. B9 = *(b1 + 1);
  324. A3 = *(a3 + 0);
  325. A11 = *(a3 + 1);
  326. B3 = *(b3 + 0);
  327. B11 = *(b3 + 1);
  328. A5 = *(a5 + 0);
  329. A13 = *(a5 + 1);
  330. B5 = *(b5 + 0);
  331. B13 = *(b5 + 1);
  332. A7 = *(a7 + 0);
  333. A15 = *(a7 + 1);
  334. B7 = *(b7 + 0);
  335. B15 = *(b7 + 1);
  336. if (a1 == b1) {
  337. *(buffer + 0) = A1;
  338. *(buffer + 1) = A9;
  339. *(buffer + 2) = A3;
  340. *(buffer + 3) = A11;
  341. *(buffer + 4) = A5;
  342. *(buffer + 5) = A13;
  343. *(buffer + 6) = A7;
  344. *(buffer + 7) = A15;
  345. } else {
  346. *(buffer + 0) = B1;
  347. *(buffer + 1) = B9;
  348. *(buffer + 2) = B3;
  349. *(buffer + 3) = B11;
  350. *(buffer + 4) = B5;
  351. *(buffer + 5) = B13;
  352. *(buffer + 6) = B7;
  353. *(buffer + 7) = B15;
  354. *(b1 + 0) = A1;
  355. *(b1 + 1) = A9;
  356. *(b3 + 0) = A3;
  357. *(b3 + 1) = A11;
  358. *(b5 + 0) = A5;
  359. *(b5 + 1) = A13;
  360. *(b7 + 0) = A7;
  361. *(b7 + 1) = A15;
  362. }
  363. buffer += 8;
  364. }
  365. a += 4 * lda;
  366. j --;
  367. } while (j > 0);
  368. }
  369. if (n & 4) {
  370. {
  371. piv = ipiv;
  372. a1 = a + (k1 + 1) * 2;
  373. a3 = a1 + 1 * lda;
  374. a5 = a1 + 2 * lda;
  375. a7 = a1 + 3 * lda;
  376. ip1 = *(piv + 0) * 2;
  377. ip2 = *(piv + 1) * 2;
  378. piv += 2;
  379. b1 = a + ip1;
  380. b2 = a + ip2;
  381. b3 = b1 + 1 * lda;
  382. b4 = b2 + 1 * lda;
  383. b5 = b1 + 2 * lda;
  384. b6 = b2 + 2 * lda;
  385. b7 = b1 + 3 * lda;
  386. b8 = b2 + 3 * lda;
  387. i = ((k2 - k1) >> 1);
  388. if (i > 0) {
  389. do {
  390. A1 = *(a1 + 0);
  391. A9 = *(a1 + 1);
  392. A2 = *(a2 + 0);
  393. A10 = *(a2 + 1);
  394. A3 = *(a3 + 0);
  395. A11 = *(a3 + 1);
  396. A4 = *(a4 + 0);
  397. A12 = *(a4 + 1);
  398. A5 = *(a5 + 0);
  399. A13 = *(a5 + 1);
  400. A6 = *(a6 + 0);
  401. A14 = *(a6 + 1);
  402. A7 = *(a7 + 0);
  403. A15 = *(a7 + 1);
  404. A8 = *(a8 + 0);
  405. A16 = *(a8 + 1);
  406. B1 = *(b1 + 0);
  407. B9 = *(b1 + 1);
  408. B2 = *(b2 + 0);
  409. B10 = *(b2 + 1);
  410. B3 = *(b3 + 0);
  411. B11 = *(b3 + 1);
  412. B4 = *(b4 + 0);
  413. B12 = *(b4 + 1);
  414. B5 = *(b5 + 0);
  415. B13 = *(b5 + 1);
  416. B6 = *(b6 + 0);
  417. B14 = *(b6 + 1);
  418. B7 = *(b7 + 0);
  419. B15 = *(b7 + 1);
  420. B8 = *(b8 + 0);
  421. B16 = *(b8 + 1);
  422. ip1 = *(piv + 0) * 2;
  423. ip2 = *(piv + 1) * 2;
  424. piv += 2;
  425. if (b1 == a1) {
  426. if (b2 == a2) {
  427. *(buffer + 0) = A1;
  428. *(buffer + 1) = A9;
  429. *(buffer + 2) = A3;
  430. *(buffer + 3) = A11;
  431. *(buffer + 4) = A5;
  432. *(buffer + 5) = A13;
  433. *(buffer + 6) = A7;
  434. *(buffer + 7) = A15;
  435. *(buffer + 8) = A2;
  436. *(buffer + 9) = A10;
  437. *(buffer + 10) = A4;
  438. *(buffer + 11) = A12;
  439. *(buffer + 12) = A6;
  440. *(buffer + 13) = A14;
  441. *(buffer + 14) = A8;
  442. *(buffer + 15) = A16;
  443. } else {
  444. *(buffer + 0) = A1;
  445. *(buffer + 1) = A9;
  446. *(buffer + 2) = A3;
  447. *(buffer + 3) = A11;
  448. *(buffer + 4) = A5;
  449. *(buffer + 5) = A13;
  450. *(buffer + 6) = A7;
  451. *(buffer + 7) = A15;
  452. *(buffer + 8) = B2;
  453. *(buffer + 9) = B10;
  454. *(buffer + 10) = B4;
  455. *(buffer + 11) = B12;
  456. *(buffer + 12) = B6;
  457. *(buffer + 13) = B14;
  458. *(buffer + 14) = B8;
  459. *(buffer + 15) = B16;
  460. *(b2 + 0) = A2;
  461. *(b2 + 1) = A10;
  462. *(b4 + 0) = A4;
  463. *(b4 + 1) = A12;
  464. *(b6 + 0) = A6;
  465. *(b6 + 1) = A14;
  466. *(b8 + 0) = A8;
  467. *(b8 + 1) = A16;
  468. }
  469. } else
  470. if (b1 == a2) {
  471. if (b2 == a2) {
  472. *(buffer + 0) = A2;
  473. *(buffer + 1) = A10;
  474. *(buffer + 2) = A4;
  475. *(buffer + 3) = A12;
  476. *(buffer + 4) = A6;
  477. *(buffer + 5) = A14;
  478. *(buffer + 6) = A8;
  479. *(buffer + 7) = A16;
  480. *(buffer + 8) = A1;
  481. *(buffer + 9) = A9;
  482. *(buffer + 10) = A3;
  483. *(buffer + 11) = A11;
  484. *(buffer + 12) = A5;
  485. *(buffer + 13) = A13;
  486. *(buffer + 14) = A7;
  487. *(buffer + 15) = A15;
  488. } else {
  489. *(buffer + 0) = A2;
  490. *(buffer + 1) = A10;
  491. *(buffer + 2) = A4;
  492. *(buffer + 3) = A12;
  493. *(buffer + 4) = A6;
  494. *(buffer + 5) = A14;
  495. *(buffer + 6) = A8;
  496. *(buffer + 7) = A16;
  497. *(buffer + 8) = B2;
  498. *(buffer + 9) = B10;
  499. *(buffer + 10) = B4;
  500. *(buffer + 11) = B12;
  501. *(buffer + 12) = B6;
  502. *(buffer + 13) = B14;
  503. *(buffer + 14) = B8;
  504. *(buffer + 15) = B16;
  505. *(b2 + 0) = A1;
  506. *(b2 + 1) = A9;
  507. *(b4 + 0) = A3;
  508. *(b4 + 1) = A11;
  509. *(b6 + 0) = A5;
  510. *(b6 + 1) = A13;
  511. *(b8 + 0) = A7;
  512. *(b8 + 1) = A15;
  513. }
  514. } else {
  515. if (b2 == a2) {
  516. *(buffer + 0) = B1;
  517. *(buffer + 1) = B9;
  518. *(buffer + 2) = B3;
  519. *(buffer + 3) = B11;
  520. *(buffer + 4) = B5;
  521. *(buffer + 5) = B13;
  522. *(buffer + 6) = B7;
  523. *(buffer + 7) = B15;
  524. *(buffer + 8) = A2;
  525. *(buffer + 9) = A10;
  526. *(buffer + 10) = A4;
  527. *(buffer + 11) = A12;
  528. *(buffer + 12) = A6;
  529. *(buffer + 13) = A14;
  530. *(buffer + 14) = A8;
  531. *(buffer + 15) = A16;
  532. *(b1 + 0) = A1;
  533. *(b1 + 1) = A9;
  534. *(b3 + 0) = A3;
  535. *(b3 + 1) = A11;
  536. *(b5 + 0) = A5;
  537. *(b5 + 1) = A13;
  538. *(b7 + 0) = A7;
  539. *(b7 + 1) = A15;
  540. } else
  541. if (b2 == b1) {
  542. *(buffer + 0) = B1;
  543. *(buffer + 1) = B9;
  544. *(buffer + 2) = B3;
  545. *(buffer + 3) = B11;
  546. *(buffer + 4) = B5;
  547. *(buffer + 5) = B13;
  548. *(buffer + 6) = B7;
  549. *(buffer + 7) = B15;
  550. *(buffer + 8) = A1;
  551. *(buffer + 9) = A9;
  552. *(buffer + 10) = A3;
  553. *(buffer + 11) = A11;
  554. *(buffer + 12) = A5;
  555. *(buffer + 13) = A13;
  556. *(buffer + 14) = A7;
  557. *(buffer + 15) = A15;
  558. *(b1 + 0) = A2;
  559. *(b1 + 1) = A10;
  560. *(b3 + 0) = A4;
  561. *(b3 + 1) = A12;
  562. *(b5 + 0) = A6;
  563. *(b5 + 1) = A14;
  564. *(b7 + 0) = A8;
  565. *(b7 + 1) = A16;
  566. } else {
  567. *(buffer + 0) = B1;
  568. *(buffer + 1) = B9;
  569. *(buffer + 2) = B3;
  570. *(buffer + 3) = B11;
  571. *(buffer + 4) = B5;
  572. *(buffer + 5) = B13;
  573. *(buffer + 6) = B7;
  574. *(buffer + 7) = B15;
  575. *(buffer + 8) = B2;
  576. *(buffer + 9) = B10;
  577. *(buffer + 10) = B4;
  578. *(buffer + 11) = B12;
  579. *(buffer + 12) = B6;
  580. *(buffer + 13) = B14;
  581. *(buffer + 14) = B8;
  582. *(buffer + 15) = B16;
  583. *(b1 + 0) = A1;
  584. *(b1 + 1) = A9;
  585. *(b2 + 0) = A2;
  586. *(b2 + 1) = A10;
  587. *(b3 + 0) = A3;
  588. *(b3 + 1) = A11;
  589. *(b4 + 0) = A4;
  590. *(b4 + 1) = A12;
  591. *(b5 + 0) = A5;
  592. *(b5 + 1) = A13;
  593. *(b6 + 0) = A6;
  594. *(b6 + 1) = A14;
  595. *(b7 + 0) = A7;
  596. *(b7 + 1) = A15;
  597. *(b8 + 0) = A8;
  598. *(b8 + 1) = A16;
  599. }
  600. }
  601. buffer += 16;
  602. b1 = a + ip1;
  603. b2 = a + ip2;
  604. b3 = b1 + 1 * lda;
  605. b4 = b2 + 1 * lda;
  606. b5 = b1 + 2 * lda;
  607. b6 = b2 + 2 * lda;
  608. b7 = b1 + 3 * lda;
  609. b8 = b2 + 3 * lda;
  610. a1 += 4;
  611. a3 += 4;
  612. a5 += 4;
  613. a7 += 4;
  614. i --;
  615. } while (i > 0);
  616. }
  617. i = ((k2 - k1) & 1);
  618. if (i > 0) {
  619. A1 = *(a1 + 0);
  620. A9 = *(a1 + 1);
  621. B1 = *(b1 + 0);
  622. B9 = *(b1 + 1);
  623. A3 = *(a3 + 0);
  624. A11 = *(a3 + 1);
  625. B3 = *(b3 + 0);
  626. B11 = *(b3 + 1);
  627. A5 = *(a5 + 0);
  628. A13 = *(a5 + 1);
  629. B5 = *(b5 + 0);
  630. B13 = *(b5 + 1);
  631. A7 = *(a7 + 0);
  632. A15 = *(a7 + 1);
  633. B7 = *(b7 + 0);
  634. B15 = *(b7 + 1);
  635. if (a1 == b1) {
  636. *(buffer + 0) = A1;
  637. *(buffer + 1) = A9;
  638. *(buffer + 2) = A3;
  639. *(buffer + 3) = A11;
  640. *(buffer + 4) = A5;
  641. *(buffer + 5) = A13;
  642. *(buffer + 6) = A7;
  643. *(buffer + 7) = A15;
  644. } else {
  645. *(buffer + 0) = B1;
  646. *(buffer + 1) = B9;
  647. *(buffer + 2) = B3;
  648. *(buffer + 3) = B11;
  649. *(buffer + 4) = B5;
  650. *(buffer + 5) = B13;
  651. *(buffer + 6) = B7;
  652. *(buffer + 7) = B15;
  653. *(b1 + 0) = A1;
  654. *(b1 + 1) = A9;
  655. *(b3 + 0) = A3;
  656. *(b3 + 1) = A11;
  657. *(b5 + 0) = A5;
  658. *(b5 + 1) = A13;
  659. *(b7 + 0) = A7;
  660. *(b7 + 1) = A15;
  661. }
  662. buffer += 8;
  663. }
  664. a += 4 * lda;
  665. }
  666. } //if (n & 4)
  667. if (n & 2) {
  668. piv = ipiv;
  669. a1 = a + (k1 + 1) * 2;
  670. a3 = a1 + lda;
  671. ip1 = *(piv + 0) * 2;
  672. ip2 = *(piv + 1) * 2;
  673. piv += 2;
  674. b1 = a + ip1;
  675. b2 = a + ip2;
  676. b3 = b1 + lda;
  677. b4 = b2 + lda;
  678. i = ((k2 - k1) >> 1);
  679. if (i > 0) {
  680. do {
  681. A1 = *(a1 + 0);
  682. A2 = *(a1 + 1);
  683. A3 = *(a2 + 0);
  684. A4 = *(a2 + 1);
  685. A5 = *(a3 + 0);
  686. A6 = *(a3 + 1);
  687. A7 = *(a4 + 0);
  688. A8 = *(a4 + 1);
  689. B1 = *(b1 + 0);
  690. B2 = *(b1 + 1);
  691. B3 = *(b2 + 0);
  692. B4 = *(b2 + 1);
  693. B5 = *(b3 + 0);
  694. B6 = *(b3 + 1);
  695. B7 = *(b4 + 0);
  696. B8 = *(b4 + 1);
  697. ip1 = *(piv + 0) * 2;
  698. ip2 = *(piv + 1) * 2;
  699. piv += 2;
  700. if (b1 == a1) {
  701. if (b2 == a2) {
  702. *(buffer + 0) = A1;
  703. *(buffer + 1) = A2;
  704. *(buffer + 2) = A5;
  705. *(buffer + 3) = A6;
  706. *(buffer + 4) = A3;
  707. *(buffer + 5) = A4;
  708. *(buffer + 6) = A7;
  709. *(buffer + 7) = A8;
  710. } else {
  711. *(buffer + 0) = A1;
  712. *(buffer + 1) = A2;
  713. *(buffer + 2) = A5;
  714. *(buffer + 3) = A6;
  715. *(buffer + 4) = B3;
  716. *(buffer + 5) = B4;
  717. *(buffer + 6) = B7;
  718. *(buffer + 7) = B8;
  719. *(b2 + 0) = A3;
  720. *(b2 + 1) = A4;
  721. *(b4 + 0) = A7;
  722. *(b4 + 1) = A8;
  723. }
  724. } else {
  725. if (b1 == a2) {
  726. if (b2 == a2) {
  727. *(buffer + 0) = A3;
  728. *(buffer + 1) = A4;
  729. *(buffer + 2) = A7;
  730. *(buffer + 3) = A8;
  731. *(buffer + 4) = A1;
  732. *(buffer + 5) = A2;
  733. *(buffer + 6) = A5;
  734. *(buffer + 7) = A6;
  735. } else {
  736. *(buffer + 0) = A3;
  737. *(buffer + 1) = A4;
  738. *(buffer + 2) = A7;
  739. *(buffer + 3) = A8;
  740. *(buffer + 4) = B3;
  741. *(buffer + 5) = B4;
  742. *(buffer + 6) = B7;
  743. *(buffer + 7) = B8;
  744. *(b2 + 0) = A1;
  745. *(b2 + 1) = A2;
  746. *(b4 + 0) = A5;
  747. *(b4 + 1) = A6;
  748. }
  749. } else {
  750. if (b2 == a2) {
  751. *(buffer + 0) = B1;
  752. *(buffer + 1) = B2;
  753. *(buffer + 2) = B5;
  754. *(buffer + 3) = B6;
  755. *(buffer + 4) = A3;
  756. *(buffer + 5) = A4;
  757. *(buffer + 6) = A7;
  758. *(buffer + 7) = A8;
  759. *(b1 + 0) = A1;
  760. *(b1 + 1) = A2;
  761. *(b3 + 0) = A5;
  762. *(b3 + 1) = A6;
  763. } else {
  764. if (b2 == b1) {
  765. *(buffer + 0) = B1;
  766. *(buffer + 1) = B2;
  767. *(buffer + 2) = B5;
  768. *(buffer + 3) = B6;
  769. *(buffer + 4) = A1;
  770. *(buffer + 5) = A2;
  771. *(buffer + 6) = A5;
  772. *(buffer + 7) = A6;
  773. *(b1 + 0) = A3;
  774. *(b1 + 1) = A4;
  775. *(b3 + 0) = A7;
  776. *(b3 + 1) = A8;
  777. } else {
  778. *(buffer + 0) = B1;
  779. *(buffer + 1) = B2;
  780. *(buffer + 2) = B5;
  781. *(buffer + 3) = B6;
  782. *(buffer + 4) = B3;
  783. *(buffer + 5) = B4;
  784. *(buffer + 6) = B7;
  785. *(buffer + 7) = B8;
  786. *(b1 + 0) = A1;
  787. *(b1 + 1) = A2;
  788. *(b2 + 0) = A3;
  789. *(b2 + 1) = A4;
  790. *(b3 + 0) = A5;
  791. *(b3 + 1) = A6;
  792. *(b4 + 0) = A7;
  793. *(b4 + 1) = A8;
  794. }
  795. }
  796. }
  797. }
  798. buffer += 8;
  799. b1 = a + ip1;
  800. b2 = a + ip2;
  801. b3 = b1 + lda;
  802. b4 = b2 + lda;
  803. a1 += 4;
  804. a3 += 4;
  805. i --;
  806. } while (i > 0);
  807. }
  808. i = ((k2 - k1) & 1);
  809. if (i > 0) {
  810. A1 = *(a1 + 0);
  811. A2 = *(a1 + 1);
  812. B1 = *(b1 + 0);
  813. B2 = *(b1 + 1);
  814. A3 = *(a3 + 0);
  815. A4 = *(a3 + 1);
  816. B3 = *(b3 + 0);
  817. B4 = *(b3 + 1);
  818. if (a1 == b1) {
  819. *(buffer + 0) = A1;
  820. *(buffer + 1) = A2;
  821. *(buffer + 2) = A3;
  822. *(buffer + 3) = A4;
  823. } else {
  824. *(buffer + 0) = B1;
  825. *(buffer + 1) = B2;
  826. *(buffer + 2) = B3;
  827. *(buffer + 3) = B4;
  828. *(b1 + 0) = A1;
  829. *(b1 + 1) = A2;
  830. *(b3 + 0) = A3;
  831. *(b3 + 1) = A4;
  832. }
  833. buffer += 4;
  834. }
  835. a += 2 * lda;
  836. }
  837. if (n & 1) {
  838. piv = ipiv;
  839. a1 = a + (k1 + 1) * 2;
  840. ip1 = *(piv + 0) * 2;
  841. ip2 = *(piv + 1) * 2;
  842. piv += 2;
  843. b1 = a + ip1;
  844. b2 = a + ip2;
  845. i = ((k2 - k1) >> 1);
  846. if (i > 0) {
  847. do {
  848. A1 = *(a1 + 0);
  849. A2 = *(a1 + 1);
  850. A3 = *(a2 + 0);
  851. A4 = *(a2 + 1);
  852. B1 = *(b1 + 0);
  853. B2 = *(b1 + 1);
  854. B3 = *(b2 + 0);
  855. B4 = *(b2 + 1);
  856. ip1 = *(piv + 0) * 2;
  857. ip2 = *(piv + 1) * 2;
  858. piv += 2;
  859. if (b1 == a1) {
  860. if (b2 == a2) {
  861. *(buffer + 0) = A1;
  862. *(buffer + 1) = A2;
  863. *(buffer + 2) = A3;
  864. *(buffer + 3) = A4;
  865. } else {
  866. *(buffer + 0) = A1;
  867. *(buffer + 1) = A2;
  868. *(buffer + 2) = B3;
  869. *(buffer + 3) = B4;
  870. *(b2 + 0) = A3;
  871. *(b2 + 1) = A4;
  872. }
  873. } else
  874. if (b1 == a2) {
  875. if (b2 == a2) {
  876. *(buffer + 0) = A3;
  877. *(buffer + 1) = A4;
  878. *(buffer + 2) = A1;
  879. *(buffer + 3) = A2;
  880. } else {
  881. *(buffer + 0) = A3;
  882. *(buffer + 1) = A4;
  883. *(buffer + 2) = B3;
  884. *(buffer + 3) = B4;
  885. *(b2 + 0) = A1;
  886. *(b2 + 1) = A2;
  887. }
  888. } else {
  889. if (b2 == a2) {
  890. *(buffer + 0) = B1;
  891. *(buffer + 1) = B2;
  892. *(buffer + 2) = A3;
  893. *(buffer + 3) = A4;
  894. *(b1 + 0) = A1;
  895. *(b1 + 1) = A2;
  896. } else
  897. if (b2 == b1) {
  898. *(buffer + 0) = B1;
  899. *(buffer + 1) = B2;
  900. *(buffer + 2) = A1;
  901. *(buffer + 3) = A2;
  902. *(b1 + 0) = A3;
  903. *(b1 + 1) = A4;
  904. } else {
  905. *(buffer + 0) = B1;
  906. *(buffer + 1) = B2;
  907. *(buffer + 2) = B3;
  908. *(buffer + 3) = B4;
  909. *(b1 + 0) = A1;
  910. *(b1 + 1) = A2;
  911. *(b2 + 0) = A3;
  912. *(b2 + 1) = A4;
  913. }
  914. }
  915. buffer += 4;
  916. b1 = a + ip1;
  917. b2 = a + ip2;
  918. a1 += 4;
  919. i --;
  920. } while (i > 0);
  921. }
  922. i = ((k2 - k1) & 1);
  923. if (i > 0) {
  924. A1 = *(a1 + 0);
  925. A2 = *(a1 + 1);
  926. B1 = *(b1 + 0);
  927. B2 = *(b1 + 1);
  928. if (a1 == b1) {
  929. *(buffer + 0) = A1;
  930. *(buffer + 1) = A2;
  931. } else {
  932. *(buffer + 0) = B1;
  933. *(buffer + 1) = B2;
  934. *(b1 + 0) = A1;
  935. *(b1 + 1) = A2;
  936. }
  937. // buffer += 2;
  938. }
  939. }
  940. return 0;
  941. }