You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zlaswp_k_4.c 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  /*
   * Companion-row shorthands used by the swap kernel below.  a2/a4/a6/a8 name
   * the element two FLOAT slots away from a1/a3/a5/a7: two slots forward in
   * the default build, two slots backward when MINUS is defined.
   */
  #ifdef MINUS
  #define a2 (a1 - 2)
  #define a4 (a3 - 2)
  #define a6 (a5 - 2)
  #define a8 (a7 - 2)
  #else
  #define a2 (a1 + 2)
  #define a4 (a3 + 2)
  #define a6 (a5 + 2)
  #define a8 (a7 + 2)
  #endif
  51. int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
  52. FLOAT *a, BLASLONG lda,
  53. FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
  54. BLASLONG i, j, ip1, ip2, rows;
  55. blasint *piv;
  56. FLOAT *a1, *a3, *a5, *a7;
  57. FLOAT *b1, *b2, *b3, *b4;
  58. FLOAT *b5, *b6, *b7, *b8;
  59. FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
  60. FLOAT A5, A6, B5, B6, A7, A8, B7, B8;
  61. FLOAT A9, A10, B9, B10, A11, A12, B11, B12;
  62. FLOAT A13, A14, B13, B14, A15, A16, B15, B16;
  63. a -= 2;
  64. lda *= 2;
  65. k1 --;
  66. ipiv += k1;
  67. #ifdef MINUS
  68. ipiv -= (k2 - k1 - 1) * incx;
  69. #endif
  70. if (n <= 0) return 0;
  71. rows = k2-k1;
  72. if (rows <=0) return 0;
  73. if (rows == 1) {
  74. //Only have 1 row
  75. ip1 = *ipiv * 2;
  76. #ifndef MINUS
  77. a1 = a + (k1 + 1) * 2;
  78. #else
  79. a1 = a + k2 * 2;
  80. #endif
  81. b1 = a + ip1;
  82. if(a1 == b1) return 0;
  83. for(j=0; j<n; j++){
  84. A1 = *(a1 + 0);
  85. A2 = *(a1 + 1);
  86. B1 = *(b1 + 0);
  87. B2 = *(b1 + 1);
  88. *(a1 + 0) = B1;
  89. *(a1 + 1) = B2;
  90. *(b1 + 0) = A1;
  91. *(b1 + 1) = A2;
  92. a1 += lda;
  93. b1 += lda;
  94. }
  95. return 0;
  96. }
  97. j = (n >> 2);
  98. if (j > 0) {
  99. do {
  100. piv = ipiv;
  101. #ifndef MINUS
  102. a1 = a + (k1 + 1) * 2;
  103. #else
  104. a1 = a + k2 * 2;
  105. #endif
  106. a3 = a1 + 1 * lda;
  107. a5 = a1 + 2 * lda;
  108. a7 = a1 + 3 * lda;
  109. ip1 = *piv * 2;
  110. piv += incx;
  111. ip2 = *piv * 2;
  112. piv += incx;
  113. b1 = a + ip1;
  114. b2 = a + ip2;
  115. b3 = b1 + 1 * lda;
  116. b4 = b2 + 1 * lda;
  117. b5 = b1 + 2 * lda;
  118. b6 = b2 + 2 * lda;
  119. b7 = b1 + 3 * lda;
  120. b8 = b2 + 3 * lda;
  121. i = (rows >> 1);
  122. i--;
  123. //Loop pipeline
  124. //Main Loop
  125. while (i > 0) {
  126. A1 = *(a1 + 0);
  127. A2 = *(a1 + 1);
  128. A3 = *(a2 + 0);
  129. A4 = *(a2 + 1);
  130. A5 = *(a3 + 0);
  131. A6 = *(a3 + 1);
  132. A7 = *(a4 + 0);
  133. A8 = *(a4 + 1);
  134. A9 = *(a5 + 0);
  135. A10 = *(a5 + 1);
  136. A11 = *(a6 + 0);
  137. A12 = *(a6 + 1);
  138. A13 = *(a7 + 0);
  139. A14 = *(a7 + 1);
  140. A15 = *(a8 + 0);
  141. A16 = *(a8 + 1);
  142. B1 = *(b1 + 0);
  143. B2 = *(b1 + 1);
  144. B3 = *(b2 + 0);
  145. B4 = *(b2 + 1);
  146. B5 = *(b3 + 0);
  147. B6 = *(b3 + 1);
  148. B7 = *(b4 + 0);
  149. B8 = *(b4 + 1);
  150. B9 = *(b5 + 0);
  151. B10 = *(b5 + 1);
  152. B11 = *(b6 + 0);
  153. B12 = *(b6 + 1);
  154. B13 = *(b7 + 0);
  155. B14 = *(b7 + 1);
  156. B15 = *(b8 + 0);
  157. B16 = *(b8 + 1);
  158. ip1 = *piv * 2;
  159. piv += incx;
  160. ip2 = *piv * 2;
  161. piv += incx;
  162. if (b1 == a1) {
  163. if (b2 == a1) {
  164. *(a1 + 0) = A3;
  165. *(a1 + 1) = A4;
  166. *(a2 + 0) = A1;
  167. *(a2 + 1) = A2;
  168. *(a3 + 0) = A7;
  169. *(a3 + 1) = A8;
  170. *(a4 + 0) = A5;
  171. *(a4 + 1) = A6;
  172. *(a5 + 0) = A11;
  173. *(a5 + 1) = A12;
  174. *(a6 + 0) = A9;
  175. *(a6 + 1) = A10;
  176. *(a7 + 0) = A15;
  177. *(a7 + 1) = A16;
  178. *(a8 + 0) = A13;
  179. *(a8 + 1) = A14;
  180. } else
  181. if (b2 != a2) {
  182. *(a2 + 0) = B3;
  183. *(a2 + 1) = B4;
  184. *(b2 + 0) = A3;
  185. *(b2 + 1) = A4;
  186. *(a4 + 0) = B7;
  187. *(a4 + 1) = B8;
  188. *(b4 + 0) = A7;
  189. *(b4 + 1) = A8;
  190. *(a6 + 0) = B11;
  191. *(a6 + 1) = B12;
  192. *(b6 + 0) = A11;
  193. *(b6 + 1) = A12;
  194. *(a8 + 0) = B15;
  195. *(a8 + 1) = B16;
  196. *(b8 + 0) = A15;
  197. *(b8 + 1) = A16;
  198. }
  199. } else
  200. if (b1 == a2) {
  201. if (b2 != a1) {
  202. if (b2 == a2) {
  203. *(a1 + 0) = A3;
  204. *(a1 + 1) = A4;
  205. *(a2 + 0) = A1;
  206. *(a2 + 1) = A2;
  207. *(a3 + 0) = A7;
  208. *(a3 + 1) = A8;
  209. *(a4 + 0) = A5;
  210. *(a4 + 1) = A6;
  211. *(a5 + 0) = A11;
  212. *(a5 + 1) = A12;
  213. *(a6 + 0) = A9;
  214. *(a6 + 1) = A10;
  215. *(a7 + 0) = A15;
  216. *(a7 + 1) = A16;
  217. *(a8 + 0) = A13;
  218. *(a8 + 1) = A14;
  219. } else {
  220. *(a1 + 0) = A3;
  221. *(a1 + 1) = A4;
  222. *(a2 + 0) = B3;
  223. *(a2 + 1) = B4;
  224. *(b2 + 0) = A1;
  225. *(b2 + 1) = A2;
  226. *(a3 + 0) = A7;
  227. *(a3 + 1) = A8;
  228. *(a4 + 0) = B7;
  229. *(a4 + 1) = B8;
  230. *(b4 + 0) = A5;
  231. *(b4 + 1) = A6;
  232. *(a5 + 0) = A11;
  233. *(a5 + 1) = A12;
  234. *(a6 + 0) = B11;
  235. *(a6 + 1) = B12;
  236. *(b6 + 0) = A9;
  237. *(b6 + 1) = A10;
  238. *(a7 + 0) = A15;
  239. *(a7 + 1) = A16;
  240. *(a8 + 0) = B15;
  241. *(a8 + 1) = B16;
  242. *(b8 + 0) = A13;
  243. *(b8 + 1) = A14;
  244. }
  245. }
  246. } else {
  247. if (b2 == a1) {
  248. *(a1 + 0) = A3;
  249. *(a1 + 1) = A4;
  250. *(a2 + 0) = B1;
  251. *(a2 + 1) = B2;
  252. *(b1 + 0) = A1;
  253. *(b1 + 1) = A2;
  254. *(a3 + 0) = A7;
  255. *(a3 + 1) = A8;
  256. *(a4 + 0) = B5;
  257. *(a4 + 1) = B6;
  258. *(b3 + 0) = A5;
  259. *(b3 + 1) = A6;
  260. *(a5 + 0) = A11;
  261. *(a5 + 1) = A12;
  262. *(a6 + 0) = B9;
  263. *(a6 + 1) = B10;
  264. *(b5 + 0) = A9;
  265. *(b5 + 1) = A10;
  266. *(a7 + 0) = A15;
  267. *(a7 + 1) = A16;
  268. *(a8 + 0) = B13;
  269. *(a8 + 1) = B14;
  270. *(b7 + 0) = A13;
  271. *(b7 + 1) = A14;
  272. } else
  273. if (b2 == a2) {
  274. *(a1 + 0) = B1;
  275. *(a1 + 1) = B2;
  276. *(b1 + 0) = A1;
  277. *(b1 + 1) = A2;
  278. *(a3 + 0) = B5;
  279. *(a3 + 1) = B6;
  280. *(b3 + 0) = A5;
  281. *(b3 + 1) = A6;
  282. *(a5 + 0) = B9;
  283. *(a5 + 1) = B10;
  284. *(b5 + 0) = A9;
  285. *(b5 + 1) = A10;
  286. *(a7 + 0) = B13;
  287. *(a7 + 1) = B14;
  288. *(b7 + 0) = A13;
  289. *(b7 + 1) = A14;
  290. } else
  291. if (b2 == b1) {
  292. *(a1 + 0) = B1;
  293. *(a1 + 1) = B2;
  294. *(a2 + 0) = A1;
  295. *(a2 + 1) = A2;
  296. *(b1 + 0) = A3;
  297. *(b1 + 1) = A4;
  298. *(a3 + 0) = B5;
  299. *(a3 + 1) = B6;
  300. *(a4 + 0) = A5;
  301. *(a4 + 1) = A6;
  302. *(b3 + 0) = A7;
  303. *(b3 + 1) = A8;
  304. *(a5 + 0) = B9;
  305. *(a5 + 1) = B10;
  306. *(a6 + 0) = A9;
  307. *(a6 + 1) = A10;
  308. *(b5 + 0) = A11;
  309. *(b5 + 1) = A12;
  310. *(a7 + 0) = B13;
  311. *(a7 + 1) = B14;
  312. *(a8 + 0) = A13;
  313. *(a8 + 1) = A14;
  314. *(b7 + 0) = A15;
  315. *(b7 + 1) = A16;
  316. } else {
  317. *(a1 + 0) = B1;
  318. *(a1 + 1) = B2;
  319. *(a2 + 0) = B3;
  320. *(a2 + 1) = B4;
  321. *(b1 + 0) = A1;
  322. *(b1 + 1) = A2;
  323. *(b2 + 0) = A3;
  324. *(b2 + 1) = A4;
  325. *(a3 + 0) = B5;
  326. *(a3 + 1) = B6;
  327. *(a4 + 0) = B7;
  328. *(a4 + 1) = B8;
  329. *(b3 + 0) = A5;
  330. *(b3 + 1) = A6;
  331. *(b4 + 0) = A7;
  332. *(b4 + 1) = A8;
  333. *(a5 + 0) = B9;
  334. *(a5 + 1) = B10;
  335. *(a6 + 0) = B11;
  336. *(a6 + 1) = B12;
  337. *(b5 + 0) = A9;
  338. *(b5 + 1) = A10;
  339. *(b6 + 0) = A11;
  340. *(b6 + 1) = A12;
  341. *(a7 + 0) = B13;
  342. *(a7 + 1) = B14;
  343. *(a8 + 0) = B15;
  344. *(a8 + 1) = B16;
  345. *(b7 + 0) = A13;
  346. *(b7 + 1) = A14;
  347. *(b8 + 0) = A15;
  348. *(b8 + 1) = A16;
  349. }
  350. }
  351. b1 = a + ip1;
  352. b2 = a + ip2;
  353. b3 = b1 + 1 * lda;
  354. b4 = b2 + 1 * lda;
  355. b5 = b1 + 2 * lda;
  356. b6 = b2 + 2 * lda;
  357. b7 = b1 + 3 * lda;
  358. b8 = b2 + 3 * lda;
  359. #ifndef MINUS
  360. a1 += 4;
  361. a3 += 4;
  362. a5 += 4;
  363. a7 += 4;
  364. #else
  365. a1 -= 4;
  366. a3 -= 4;
  367. a5 -= 4;
  368. a7 -= 4;
  369. #endif
  370. i --;
  371. }
  372. //Loop Ending
  373. A1 = *(a1 + 0);
  374. A2 = *(a1 + 1);
  375. A3 = *(a2 + 0);
  376. A4 = *(a2 + 1);
  377. A5 = *(a3 + 0);
  378. A6 = *(a3 + 1);
  379. A7 = *(a4 + 0);
  380. A8 = *(a4 + 1);
  381. A9 = *(a5 + 0);
  382. A10 = *(a5 + 1);
  383. A11 = *(a6 + 0);
  384. A12 = *(a6 + 1);
  385. A13 = *(a7 + 0);
  386. A14 = *(a7 + 1);
  387. A15 = *(a8 + 0);
  388. A16 = *(a8 + 1);
  389. B1 = *(b1 + 0);
  390. B2 = *(b1 + 1);
  391. B3 = *(b2 + 0);
  392. B4 = *(b2 + 1);
  393. B5 = *(b3 + 0);
  394. B6 = *(b3 + 1);
  395. B7 = *(b4 + 0);
  396. B8 = *(b4 + 1);
  397. B9 = *(b5 + 0);
  398. B10 = *(b5 + 1);
  399. B11 = *(b6 + 0);
  400. B12 = *(b6 + 1);
  401. B13 = *(b7 + 0);
  402. B14 = *(b7 + 1);
  403. B15 = *(b8 + 0);
  404. B16 = *(b8 + 1);
  405. if (b1 == a1) {
  406. if (b2 == a1) {
  407. *(a1 + 0) = A3;
  408. *(a1 + 1) = A4;
  409. *(a2 + 0) = A1;
  410. *(a2 + 1) = A2;
  411. *(a3 + 0) = A7;
  412. *(a3 + 1) = A8;
  413. *(a4 + 0) = A5;
  414. *(a4 + 1) = A6;
  415. *(a5 + 0) = A11;
  416. *(a5 + 1) = A12;
  417. *(a6 + 0) = A9;
  418. *(a6 + 1) = A10;
  419. *(a7 + 0) = A15;
  420. *(a7 + 1) = A16;
  421. *(a8 + 0) = A13;
  422. *(a8 + 1) = A14;
  423. } else
  424. if (b2 != a2) {
  425. *(a2 + 0) = B3;
  426. *(a2 + 1) = B4;
  427. *(b2 + 0) = A3;
  428. *(b2 + 1) = A4;
  429. *(a4 + 0) = B7;
  430. *(a4 + 1) = B8;
  431. *(b4 + 0) = A7;
  432. *(b4 + 1) = A8;
  433. *(a6 + 0) = B11;
  434. *(a6 + 1) = B12;
  435. *(b6 + 0) = A11;
  436. *(b6 + 1) = A12;
  437. *(a8 + 0) = B15;
  438. *(a8 + 1) = B16;
  439. *(b8 + 0) = A15;
  440. *(b8 + 1) = A16;
  441. }
  442. } else
  443. if (b1 == a2) {
  444. if (b2 != a1) {
  445. if (b2 == a2) {
  446. *(a1 + 0) = A3;
  447. *(a1 + 1) = A4;
  448. *(a2 + 0) = A1;
  449. *(a2 + 1) = A2;
  450. *(a3 + 0) = A7;
  451. *(a3 + 1) = A8;
  452. *(a4 + 0) = A5;
  453. *(a4 + 1) = A6;
  454. *(a5 + 0) = A11;
  455. *(a5 + 1) = A12;
  456. *(a6 + 0) = A9;
  457. *(a6 + 1) = A10;
  458. *(a7 + 0) = A15;
  459. *(a7 + 1) = A16;
  460. *(a8 + 0) = A13;
  461. *(a8 + 1) = A14;
  462. } else {
  463. *(a1 + 0) = A3;
  464. *(a1 + 1) = A4;
  465. *(a2 + 0) = B3;
  466. *(a2 + 1) = B4;
  467. *(b2 + 0) = A1;
  468. *(b2 + 1) = A2;
  469. *(a3 + 0) = A7;
  470. *(a3 + 1) = A8;
  471. *(a4 + 0) = B7;
  472. *(a4 + 1) = B8;
  473. *(b4 + 0) = A5;
  474. *(b4 + 1) = A6;
  475. *(a5 + 0) = A11;
  476. *(a5 + 1) = A12;
  477. *(a6 + 0) = B11;
  478. *(a6 + 1) = B12;
  479. *(b6 + 0) = A9;
  480. *(b6 + 1) = A10;
  481. *(a7 + 0) = A15;
  482. *(a7 + 1) = A16;
  483. *(a8 + 0) = B15;
  484. *(a8 + 1) = B16;
  485. *(b8 + 0) = A13;
  486. *(b8 + 1) = A14;
  487. }
  488. }
  489. } else {
  490. if (b2 == a1) {
  491. *(a1 + 0) = A3;
  492. *(a1 + 1) = A4;
  493. *(a2 + 0) = B1;
  494. *(a2 + 1) = B2;
  495. *(b1 + 0) = A1;
  496. *(b1 + 1) = A2;
  497. *(a3 + 0) = A7;
  498. *(a3 + 1) = A8;
  499. *(a4 + 0) = B5;
  500. *(a4 + 1) = B6;
  501. *(b3 + 0) = A5;
  502. *(b3 + 1) = A6;
  503. *(a5 + 0) = A11;
  504. *(a5 + 1) = A12;
  505. *(a6 + 0) = B9;
  506. *(a6 + 1) = B10;
  507. *(b5 + 0) = A9;
  508. *(b5 + 1) = A10;
  509. *(a7 + 0) = A15;
  510. *(a7 + 1) = A16;
  511. *(a8 + 0) = B13;
  512. *(a8 + 1) = B14;
  513. *(b7 + 0) = A13;
  514. *(b7 + 1) = A14;
  515. } else
  516. if (b2 == a2) {
  517. *(a1 + 0) = B1;
  518. *(a1 + 1) = B2;
  519. *(b1 + 0) = A1;
  520. *(b1 + 1) = A2;
  521. *(a3 + 0) = B5;
  522. *(a3 + 1) = B6;
  523. *(b3 + 0) = A5;
  524. *(b3 + 1) = A6;
  525. *(a5 + 0) = B9;
  526. *(a5 + 1) = B10;
  527. *(b5 + 0) = A9;
  528. *(b5 + 1) = A10;
  529. *(a7 + 0) = B13;
  530. *(a7 + 1) = B14;
  531. *(b7 + 0) = A13;
  532. *(b7 + 1) = A14;
  533. } else
  534. if (b2 == b1) {
  535. *(a1 + 0) = B1;
  536. *(a1 + 1) = B2;
  537. *(a2 + 0) = A1;
  538. *(a2 + 1) = A2;
  539. *(b1 + 0) = A3;
  540. *(b1 + 1) = A4;
  541. *(a3 + 0) = B5;
  542. *(a3 + 1) = B6;
  543. *(a4 + 0) = A5;
  544. *(a4 + 1) = A6;
  545. *(b3 + 0) = A7;
  546. *(b3 + 1) = A8;
  547. *(a5 + 0) = B9;
  548. *(a5 + 1) = B10;
  549. *(a6 + 0) = A9;
  550. *(a6 + 1) = A10;
  551. *(b5 + 0) = A11;
  552. *(b5 + 1) = A12;
  553. *(a7 + 0) = B13;
  554. *(a7 + 1) = B14;
  555. *(a8 + 0) = A13;
  556. *(a8 + 1) = A14;
  557. *(b7 + 0) = A15;
  558. *(b7 + 1) = A16;
  559. } else {
  560. *(a1 + 0) = B1;
  561. *(a1 + 1) = B2;
  562. *(a2 + 0) = B3;
  563. *(a2 + 1) = B4;
  564. *(b1 + 0) = A1;
  565. *(b1 + 1) = A2;
  566. *(b2 + 0) = A3;
  567. *(b2 + 1) = A4;
  568. *(a3 + 0) = B5;
  569. *(a3 + 1) = B6;
  570. *(a4 + 0) = B7;
  571. *(a4 + 1) = B8;
  572. *(b3 + 0) = A5;
  573. *(b3 + 1) = A6;
  574. *(b4 + 0) = A7;
  575. *(b4 + 1) = A8;
  576. *(a5 + 0) = B9;
  577. *(a5 + 1) = B10;
  578. *(a6 + 0) = B11;
  579. *(a6 + 1) = B12;
  580. *(b5 + 0) = A9;
  581. *(b5 + 1) = A10;
  582. *(b6 + 0) = A11;
  583. *(b6 + 1) = A12;
  584. *(a7 + 0) = B13;
  585. *(a7 + 1) = B14;
  586. *(a8 + 0) = B15;
  587. *(a8 + 1) = B16;
  588. *(b7 + 0) = A13;
  589. *(b7 + 1) = A14;
  590. *(b8 + 0) = A15;
  591. *(b8 + 1) = A16;
  592. }
  593. }
  594. #ifndef MINUS
  595. a1 += 4;
  596. a3 += 4;
  597. a5 += 4;
  598. a7 += 4;
  599. #else
  600. a1 -= 4;
  601. a3 -= 4;
  602. a5 -= 4;
  603. a7 -= 4;
  604. #endif
  605. //Remain
  606. i = (rows & 1);
  607. if (i > 0) {
  608. ip1 = *piv * 2;
  609. b1 = a + ip1;
  610. b3 = b1 + 1 * lda;
  611. b5 = b1 + 2 * lda;
  612. b7 = b1 + 3 * lda;
  613. A1 = *(a1 + 0);
  614. A2 = *(a1 + 1);
  615. A3 = *(a3 + 0);
  616. A4 = *(a3 + 1);
  617. B1 = *(b1 + 0);
  618. B2 = *(b1 + 1);
  619. B3 = *(b3 + 0);
  620. B4 = *(b3 + 1);
  621. A5 = *(a5 + 0);
  622. A6 = *(a5 + 1);
  623. A7 = *(a7 + 0);
  624. A8 = *(a7 + 1);
  625. B5 = *(b5 + 0);
  626. B6 = *(b5 + 1);
  627. B7 = *(b7 + 0);
  628. B8 = *(b7 + 1);
  629. *(a1 + 0) = B1;
  630. *(a1 + 1) = B2;
  631. *(a3 + 0) = B3;
  632. *(a3 + 1) = B4;
  633. *(b1 + 0) = A1;
  634. *(b1 + 1) = A2;
  635. *(b3 + 0) = A3;
  636. *(b3 + 1) = A4;
  637. *(a5 + 0) = B5;
  638. *(a5 + 1) = B6;
  639. *(a7 + 0) = B7;
  640. *(a7 + 1) = B8;
  641. *(b5 + 0) = A5;
  642. *(b5 + 1) = A6;
  643. *(b7 + 0) = A7;
  644. *(b7 + 1) = A8;
  645. }
  646. a += 4 * lda;
  647. j --;
  648. } while (j > 0);
  649. }
  650. if (n & 2) {
  651. piv = ipiv;
  652. #ifndef MINUS
  653. a1 = a + (k1 + 1) * 2;
  654. #else
  655. a1 = a + k2 * 2;
  656. #endif
  657. a3 = a1 + lda;
  658. ip1 = *piv * 2;
  659. piv += incx;
  660. ip2 = *piv * 2;
  661. piv += incx;
  662. b1 = a + ip1;
  663. b2 = a + ip2;
  664. b3 = b1 + lda;
  665. b4 = b2 + lda;
  666. i = (rows >> 1);
  667. i--;
  668. //Loop pipeline
  669. //Main Loop
  670. while (i > 0) {
  671. A1 = *(a1 + 0);
  672. A2 = *(a1 + 1);
  673. A3 = *(a2 + 0);
  674. A4 = *(a2 + 1);
  675. A5 = *(a3 + 0);
  676. A6 = *(a3 + 1);
  677. A7 = *(a4 + 0);
  678. A8 = *(a4 + 1);
  679. B1 = *(b1 + 0);
  680. B2 = *(b1 + 1);
  681. B3 = *(b2 + 0);
  682. B4 = *(b2 + 1);
  683. B5 = *(b3 + 0);
  684. B6 = *(b3 + 1);
  685. B7 = *(b4 + 0);
  686. B8 = *(b4 + 1);
  687. ip1 = *piv * 2;
  688. piv += incx;
  689. ip2 = *piv * 2;
  690. piv += incx;
  691. if (b1 == a1) {
  692. if (b2 == a1) {
  693. *(a1 + 0) = A3;
  694. *(a1 + 1) = A4;
  695. *(a2 + 0) = A1;
  696. *(a2 + 1) = A2;
  697. *(a3 + 0) = A7;
  698. *(a3 + 1) = A8;
  699. *(a4 + 0) = A5;
  700. *(a4 + 1) = A6;
  701. } else
  702. if (b2 != a2) {
  703. *(a2 + 0) = B3;
  704. *(a2 + 1) = B4;
  705. *(b2 + 0) = A3;
  706. *(b2 + 1) = A4;
  707. *(a4 + 0) = B7;
  708. *(a4 + 1) = B8;
  709. *(b4 + 0) = A7;
  710. *(b4 + 1) = A8;
  711. }
  712. } else
  713. if (b1 == a2) {
  714. if (b2 != a1) {
  715. if (b2 == a2) {
  716. *(a1 + 0) = A3;
  717. *(a1 + 1) = A4;
  718. *(a2 + 0) = A1;
  719. *(a2 + 1) = A2;
  720. *(a3 + 0) = A7;
  721. *(a3 + 1) = A8;
  722. *(a4 + 0) = A5;
  723. *(a4 + 1) = A6;
  724. } else {
  725. *(a1 + 0) = A3;
  726. *(a1 + 1) = A4;
  727. *(a2 + 0) = B3;
  728. *(a2 + 1) = B4;
  729. *(b2 + 0) = A1;
  730. *(b2 + 1) = A2;
  731. *(a3 + 0) = A7;
  732. *(a3 + 1) = A8;
  733. *(a4 + 0) = B7;
  734. *(a4 + 1) = B8;
  735. *(b4 + 0) = A5;
  736. *(b4 + 1) = A6;
  737. }
  738. }
  739. } else {
  740. if (b2 == a1) {
  741. *(a1 + 0) = A3;
  742. *(a1 + 1) = A4;
  743. *(a2 + 0) = B1;
  744. *(a2 + 1) = B2;
  745. *(b1 + 0) = A1;
  746. *(b1 + 1) = A2;
  747. *(a3 + 0) = A7;
  748. *(a3 + 1) = A8;
  749. *(a4 + 0) = B5;
  750. *(a4 + 1) = B6;
  751. *(b3 + 0) = A5;
  752. *(b3 + 1) = A6;
  753. } else
  754. if (b2 == a2) {
  755. *(a1 + 0) = B1;
  756. *(a1 + 1) = B2;
  757. *(b1 + 0) = A1;
  758. *(b1 + 1) = A2;
  759. *(a3 + 0) = B5;
  760. *(a3 + 1) = B6;
  761. *(b3 + 0) = A5;
  762. *(b3 + 1) = A6;
  763. } else
  764. if (b2 == b1) {
  765. *(a1 + 0) = B1;
  766. *(a1 + 1) = B2;
  767. *(a2 + 0) = A1;
  768. *(a2 + 1) = A2;
  769. *(b1 + 0) = A3;
  770. *(b1 + 1) = A4;
  771. *(a3 + 0) = B5;
  772. *(a3 + 1) = B6;
  773. *(a4 + 0) = A5;
  774. *(a4 + 1) = A6;
  775. *(b3 + 0) = A7;
  776. *(b3 + 1) = A8;
  777. } else {
  778. *(a1 + 0) = B1;
  779. *(a1 + 1) = B2;
  780. *(a2 + 0) = B3;
  781. *(a2 + 1) = B4;
  782. *(b1 + 0) = A1;
  783. *(b1 + 1) = A2;
  784. *(b2 + 0) = A3;
  785. *(b2 + 1) = A4;
  786. *(a3 + 0) = B5;
  787. *(a3 + 1) = B6;
  788. *(a4 + 0) = B7;
  789. *(a4 + 1) = B8;
  790. *(b3 + 0) = A5;
  791. *(b3 + 1) = A6;
  792. *(b4 + 0) = A7;
  793. *(b4 + 1) = A8;
  794. }
  795. }
  796. b1 = a + ip1;
  797. b2 = a + ip2;
  798. b3 = b1 + lda;
  799. b4 = b2 + lda;
  800. #ifndef MINUS
  801. a1 += 4;
  802. a3 += 4;
  803. #else
  804. a1 -= 4;
  805. a3 -= 4;
  806. #endif
  807. i --;
  808. }
  809. //Loop Ending
  810. A1 = *(a1 + 0);
  811. A2 = *(a1 + 1);
  812. A3 = *(a2 + 0);
  813. A4 = *(a2 + 1);
  814. A5 = *(a3 + 0);
  815. A6 = *(a3 + 1);
  816. A7 = *(a4 + 0);
  817. A8 = *(a4 + 1);
  818. B1 = *(b1 + 0);
  819. B2 = *(b1 + 1);
  820. B3 = *(b2 + 0);
  821. B4 = *(b2 + 1);
  822. B5 = *(b3 + 0);
  823. B6 = *(b3 + 1);
  824. B7 = *(b4 + 0);
  825. B8 = *(b4 + 1);
  826. if (b1 == a1) {
  827. if (b2 == a1) {
  828. *(a1 + 0) = A3;
  829. *(a1 + 1) = A4;
  830. *(a2 + 0) = A1;
  831. *(a2 + 1) = A2;
  832. *(a3 + 0) = A7;
  833. *(a3 + 1) = A8;
  834. *(a4 + 0) = A5;
  835. *(a4 + 1) = A6;
  836. } else
  837. if (b2 != a2) {
  838. *(a2 + 0) = B3;
  839. *(a2 + 1) = B4;
  840. *(b2 + 0) = A3;
  841. *(b2 + 1) = A4;
  842. *(a4 + 0) = B7;
  843. *(a4 + 1) = B8;
  844. *(b4 + 0) = A7;
  845. *(b4 + 1) = A8;
  846. }
  847. } else
  848. if (b1 == a2) {
  849. if (b2 != a1) {
  850. if (b2 == a2) {
  851. *(a1 + 0) = A3;
  852. *(a1 + 1) = A4;
  853. *(a2 + 0) = A1;
  854. *(a2 + 1) = A2;
  855. *(a3 + 0) = A7;
  856. *(a3 + 1) = A8;
  857. *(a4 + 0) = A5;
  858. *(a4 + 1) = A6;
  859. } else {
  860. *(a1 + 0) = A3;
  861. *(a1 + 1) = A4;
  862. *(a2 + 0) = B3;
  863. *(a2 + 1) = B4;
  864. *(b2 + 0) = A1;
  865. *(b2 + 1) = A2;
  866. *(a3 + 0) = A7;
  867. *(a3 + 1) = A8;
  868. *(a4 + 0) = B7;
  869. *(a4 + 1) = B8;
  870. *(b4 + 0) = A5;
  871. *(b4 + 1) = A6;
  872. }
  873. }
  874. } else {
  875. if (b2 == a1) {
  876. *(a1 + 0) = A3;
  877. *(a1 + 1) = A4;
  878. *(a2 + 0) = B1;
  879. *(a2 + 1) = B2;
  880. *(b1 + 0) = A1;
  881. *(b1 + 1) = A2;
  882. *(a3 + 0) = A7;
  883. *(a3 + 1) = A8;
  884. *(a4 + 0) = B5;
  885. *(a4 + 1) = B6;
  886. *(b3 + 0) = A5;
  887. *(b3 + 1) = A6;
  888. } else
  889. if (b2 == a2) {
  890. *(a1 + 0) = B1;
  891. *(a1 + 1) = B2;
  892. *(b1 + 0) = A1;
  893. *(b1 + 1) = A2;
  894. *(a3 + 0) = B5;
  895. *(a3 + 1) = B6;
  896. *(b3 + 0) = A5;
  897. *(b3 + 1) = A6;
  898. } else
  899. if (b2 == b1) {
  900. *(a1 + 0) = B1;
  901. *(a1 + 1) = B2;
  902. *(a2 + 0) = A1;
  903. *(a2 + 1) = A2;
  904. *(b1 + 0) = A3;
  905. *(b1 + 1) = A4;
  906. *(a3 + 0) = B5;
  907. *(a3 + 1) = B6;
  908. *(a4 + 0) = A5;
  909. *(a4 + 1) = A6;
  910. *(b3 + 0) = A7;
  911. *(b3 + 1) = A8;
  912. } else {
  913. *(a1 + 0) = B1;
  914. *(a1 + 1) = B2;
  915. *(a2 + 0) = B3;
  916. *(a2 + 1) = B4;
  917. *(b1 + 0) = A1;
  918. *(b1 + 1) = A2;
  919. *(b2 + 0) = A3;
  920. *(b2 + 1) = A4;
  921. *(a3 + 0) = B5;
  922. *(a3 + 1) = B6;
  923. *(a4 + 0) = B7;
  924. *(a4 + 1) = B8;
  925. *(b3 + 0) = A5;
  926. *(b3 + 1) = A6;
  927. *(b4 + 0) = A7;
  928. *(b4 + 1) = A8;
  929. }
  930. }
  931. #ifndef MINUS
  932. a1 += 4;
  933. a3 += 4;
  934. #else
  935. a1 -= 4;
  936. a3 -= 4;
  937. #endif
  938. //Remain
  939. i = (rows & 1);
  940. if (i > 0) {
  941. ip1 = *piv * 2;
  942. b1 = a + ip1;
  943. b3 = b1 + lda;
  944. A1 = *(a1 + 0);
  945. A2 = *(a1 + 1);
  946. A3 = *(a3 + 0);
  947. A4 = *(a3 + 1);
  948. B1 = *(b1 + 0);
  949. B2 = *(b1 + 1);
  950. B3 = *(b3 + 0);
  951. B4 = *(b3 + 1);
  952. *(a1 + 0) = B1;
  953. *(a1 + 1) = B2;
  954. *(a3 + 0) = B3;
  955. *(a3 + 1) = B4;
  956. *(b1 + 0) = A1;
  957. *(b1 + 1) = A2;
  958. *(b3 + 0) = A3;
  959. *(b3 + 1) = A4;
  960. }
  961. a += 2 * lda;
  962. }
  963. if (n & 1) {
  964. piv = ipiv;
  965. #ifndef MINUS
  966. a1 = a + (k1 + 1) * 2;
  967. #else
  968. a1 = a + k2 * 2;
  969. #endif
  970. ip1 = *piv * 2;
  971. piv += incx;
  972. ip2 = *piv * 2;
  973. piv += incx;
  974. b1 = a + ip1;
  975. b2 = a + ip2;
  976. i = (rows >> 1);
  977. i--;
  978. //Loop pipeline
  979. //Main Loop
  980. while (i > 0) {
  981. A1 = *(a1 + 0);
  982. A2 = *(a1 + 1);
  983. A3 = *(a2 + 0);
  984. A4 = *(a2 + 1);
  985. B1 = *(b1 + 0);
  986. B2 = *(b1 + 1);
  987. B3 = *(b2 + 0);
  988. B4 = *(b2 + 1);
  989. ip1 = *piv * 2;
  990. piv += incx;
  991. ip2 = *piv * 2;
  992. piv += incx;
  993. if (b1 == a1) {
  994. if (b2 == a1) {
  995. *(a1 + 0) = A3;
  996. *(a1 + 1) = A4;
  997. *(a2 + 0) = A1;
  998. *(a2 + 1) = A2;
  999. } else
  1000. if (b2 != a2) {
  1001. *(a2 + 0) = B3;
  1002. *(a2 + 1) = B4;
  1003. *(b2 + 0) = A3;
  1004. *(b2 + 1) = A4;
  1005. }
  1006. } else
  1007. if (b1 == a2) {
  1008. if (b2 != a1) {
  1009. if (b2 == a2) {
  1010. *(a1 + 0) = A3;
  1011. *(a1 + 1) = A4;
  1012. *(a2 + 0) = A1;
  1013. *(a2 + 1) = A2;
  1014. } else {
  1015. *(a1 + 0) = A3;
  1016. *(a1 + 1) = A4;
  1017. *(a2 + 0) = B3;
  1018. *(a2 + 1) = B4;
  1019. *(b2 + 0) = A1;
  1020. *(b2 + 1) = A2;
  1021. }
  1022. }
  1023. } else {
  1024. if (b2 == a1) {
  1025. *(a1 + 0) = A3;
  1026. *(a1 + 1) = A4;
  1027. *(a2 + 0) = B1;
  1028. *(a2 + 1) = B2;
  1029. *(b1 + 0) = A1;
  1030. *(b1 + 1) = A2;
  1031. } else
  1032. if (b2 == a2) {
  1033. *(a1 + 0) = B1;
  1034. *(a1 + 1) = B2;
  1035. *(b1 + 0) = A1;
  1036. *(b1 + 1) = A2;
  1037. } else
  1038. if (b2 == b1) {
  1039. *(a1 + 0) = B1;
  1040. *(a1 + 1) = B2;
  1041. *(a2 + 0) = A1;
  1042. *(a2 + 1) = A2;
  1043. *(b1 + 0) = A3;
  1044. *(b1 + 1) = A4;
  1045. } else {
  1046. *(a1 + 0) = B1;
  1047. *(a1 + 1) = B2;
  1048. *(a2 + 0) = B3;
  1049. *(a2 + 1) = B4;
  1050. *(b1 + 0) = A1;
  1051. *(b1 + 1) = A2;
  1052. *(b2 + 0) = A3;
  1053. *(b2 + 1) = A4;
  1054. }
  1055. }
  1056. b1 = a + ip1;
  1057. b2 = a + ip2;
  1058. #ifndef MINUS
  1059. a1 += 4;
  1060. #else
  1061. a1 -= 4;
  1062. #endif
  1063. i --;
  1064. }
  1065. //Loop Ending
  1066. A1 = *(a1 + 0);
  1067. A2 = *(a1 + 1);
  1068. A3 = *(a2 + 0);
  1069. A4 = *(a2 + 1);
  1070. B1 = *(b1 + 0);
  1071. B2 = *(b1 + 1);
  1072. B3 = *(b2 + 0);
  1073. B4 = *(b2 + 1);
  1074. if (b1 == a1) {
  1075. if (b2 == a1) {
  1076. *(a1 + 0) = A3;
  1077. *(a1 + 1) = A4;
  1078. *(a2 + 0) = A1;
  1079. *(a2 + 1) = A2;
  1080. } else
  1081. if (b2 != a2) {
  1082. *(a2 + 0) = B3;
  1083. *(a2 + 1) = B4;
  1084. *(b2 + 0) = A3;
  1085. *(b2 + 1) = A4;
  1086. }
  1087. } else
  1088. if (b1 == a2) {
  1089. if (b2 != a1) {
  1090. if (b2 == a2) {
  1091. *(a1 + 0) = A3;
  1092. *(a1 + 1) = A4;
  1093. *(a2 + 0) = A1;
  1094. *(a2 + 1) = A2;
  1095. } else {
  1096. *(a1 + 0) = A3;
  1097. *(a1 + 1) = A4;
  1098. *(a2 + 0) = B3;
  1099. *(a2 + 1) = B4;
  1100. *(b2 + 0) = A1;
  1101. *(b2 + 1) = A2;
  1102. }
  1103. }
  1104. } else {
  1105. if (b2 == a1) {
  1106. *(a1 + 0) = A3;
  1107. *(a1 + 1) = A4;
  1108. *(a2 + 0) = B1;
  1109. *(a2 + 1) = B2;
  1110. *(b1 + 0) = A1;
  1111. *(b1 + 1) = A2;
  1112. } else
  1113. if (b2 == a2) {
  1114. *(a1 + 0) = B1;
  1115. *(a1 + 1) = B2;
  1116. *(b1 + 0) = A1;
  1117. *(b1 + 1) = A2;
  1118. } else
  1119. if (b2 == b1) {
  1120. *(a1 + 0) = B1;
  1121. *(a1 + 1) = B2;
  1122. *(a2 + 0) = A1;
  1123. *(a2 + 1) = A2;
  1124. *(b1 + 0) = A3;
  1125. *(b1 + 1) = A4;
  1126. } else {
  1127. *(a1 + 0) = B1;
  1128. *(a1 + 1) = B2;
  1129. *(a2 + 0) = B3;
  1130. *(a2 + 1) = B4;
  1131. *(b1 + 0) = A1;
  1132. *(b1 + 1) = A2;
  1133. *(b2 + 0) = A3;
  1134. *(b2 + 1) = A4;
  1135. }
  1136. }
  1137. #ifndef MINUS
  1138. a1 += 4;
  1139. #else
  1140. a1 -= 4;
  1141. #endif
  1142. //Remain
  1143. i = (rows & 1);
  1144. if (i > 0) {
  1145. ip1 = *piv * 2;
  1146. b1 = a + ip1;
  1147. A1 = *(a1 + 0);
  1148. A2 = *(a1 + 1);
  1149. B1 = *(b1 + 0);
  1150. B2 = *(b1 + 1);
  1151. *(a1 + 0) = B1;
  1152. *(a1 + 1) = B2;
  1153. *(b1 + 0) = A1;
  1154. *(b1 + 1) = A2;
  1155. }
  1156. }
  1157. return 0;
  1158. }