You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zlaswp_k_4.c 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  /*
   * Partner-row shorthands used by the swap kernel below.
   *
   * a1, a3, a5 and a7 are locals of the kernel that point at the first row of
   * the current row pair; a2, a4, a6 and a8 name the second row of that pair.
   * The partner lies two FLOATs (one matrix element) after the first row, or
   * two FLOATs before it when MINUS is defined and rows are walked backwards.
   */
  #if defined(MINUS)
  #define a2 (a1 - 2)
  #define a4 (a3 - 2)
  #define a6 (a5 - 2)
  #define a8 (a7 - 2)
  #else
  #define a2 (a1 + 2)
  #define a4 (a3 + 2)
  #define a6 (a5 + 2)
  #define a8 (a7 + 2)
  #endif
  /*
   * CNAME -- apply a run of row interchanges to n columns of the matrix `a`.
   *
   * Every matrix element occupies two consecutive FLOATs: lda is doubled on
   * entry and the values at offsets +0 and +1 are always moved together
   * (presumably the real and imaginary parts -- confirm against callers).
   *
   * Parameters
   *   n        number of columns to process; returns immediately if n <= 0.
   *   k1, k2   1-based first and last row positions of the interchange range.
   *            rows = k2 - k1 + 1 positions are handled; nothing is done if
   *            that count is <= 0.
   *   dummy1, dummy4, dummy2, dumy3
   *            not referenced by this function.
   *   a        start of the matrix; modified in place.
   *   lda      distance between consecutive columns, in elements.
   *   ipiv     pivot array; each entry is the 1-based row that the
   *            corresponding row position is exchanged with.
   *   incx     step between consecutive pivot entries that are read.
   *
   * Without MINUS the row positions are visited from k1 upwards.  With MINUS
   * they are visited from k2 downwards (a1 starts at row k2 and moves by -4
   * FLOATs per pair) while pivots are still fetched with `piv += incx`.
   *
   * Columns are handled in groups of four, then an optional group of two,
   * then an optional single column.  Inside a group, row positions are
   * handled two at a time; the branch ladder covers every way the two pivot
   * rows (b1, b2) can coincide with the pair itself (a1, a2) or with each
   * other, so that the stored result matches doing swap(a1, b1) followed by
   * swap(a2, b2).
   *
   * Returns 0 in all cases.
   *
   * NOTE(review): `*piv * 2` is evaluated in the (promoted) type of blasint
   * before being assigned to a BLASLONG.  If blasint is narrower than
   * BLASLONG, very large pivot values could overflow -- confirm in common.h.
   */
  51. int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
  52. FLOAT *a, BLASLONG lda,
  53. FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
  54. BLASLONG i, j, ip1, ip2, rows;
  55. blasint *piv;
  56. FLOAT *a1, *a3, *a5, *a7;
  57. FLOAT *b1, *b2, *b3, *b4;
  58. FLOAT *b5, *b6, *b7, *b8;
  59. FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
  60. FLOAT A5, A6, B5, B6, A7, A8, B7, B8;
  61. FLOAT A9, A10, B9, B10, A11, A12, B11, B12;
  62. FLOAT A13, A14, B13, B14, A15, A16, B15, B16;
  63. a -= 2; /* shift the base back one element so 1-based row numbers index directly */
  64. lda *= 2; /* column stride in FLOATs (two FLOATs per element) */
  65. k1 --; /* make the first row position 0-based */
  66. #ifndef MINUS
  67. ipiv += k1; /* skip the first k1 pivot entries */
  68. #else
  69. ipiv -= (k2 - 1) * incx; /* rebase the pivot pointer; incx is presumably negative here -- confirm */
  70. #endif
  71. if (n <= 0) return 0;
  72. rows = k2-k1; /* number of row positions to process */
  73. if (rows <=0) return 0;
  74. if (rows == 1) {
  75. // Single row position: exchange it with its pivot row in every column.
  76. ip1 = *ipiv * 2;
  77. #ifndef MINUS
  78. a1 = a + (k1 + 1) * 2;
  79. #else
  80. a1 = a + k2 * 2;
  81. #endif
  82. b1 = a + ip1;
  83. if(a1 == b1) return 0; /* row is its own pivot: nothing to move */
  84. for(j=0; j<n; j++){
  85. A1 = *(a1 + 0);
  86. A2 = *(a1 + 1);
  87. B1 = *(b1 + 0);
  88. B2 = *(b1 + 1);
  89. *(a1 + 0) = B1;
  90. *(a1 + 1) = B2;
  91. *(b1 + 0) = A1;
  92. *(b1 + 1) = A2;
  93. a1 += lda;
  94. b1 += lda;
  95. }
  96. return 0;
  97. }
  /* ---- groups of four columns ---- */
  98. j = (n >> 2);
  99. if (j > 0) {
  100. do {
  101. piv = ipiv;
  102. #ifndef MINUS
  103. a1 = a + (k1 + 1) * 2;
  104. #else
  105. a1 = a + k2 * 2;
  106. #endif
  107. a3 = a1 + 1 * lda; /* a1/a3/a5/a7: same row in columns 0..3 of the group */
  108. a5 = a1 + 2 * lda;
  109. a7 = a1 + 3 * lda;
  110. ip1 = *piv * 2; /* pivots of the first row pair */
  111. piv += incx;
  112. ip2 = *piv * 2;
  113. piv += incx;
  114. b1 = a + ip1; /* b1/b3/b5/b7: pivot row of a1 in columns 0..3 */
  115. b2 = a + ip2; /* b2/b4/b6/b8: pivot row of a2 in columns 0..3 */
  116. b3 = b1 + 1 * lda;
  117. b4 = b2 + 1 * lda;
  118. b5 = b1 + 2 * lda;
  119. b6 = b2 + 2 * lda;
  120. b7 = b1 + 3 * lda;
  121. b8 = b2 + 3 * lda;
  122. i = (rows >> 1);
  123. i--; /* the last full pair is handled after the loop */
  124. // Pipelined loop: each pass loads the current pair, fetches the pivots
  125. // of the next pair, then stores the current pair.
  126. while (i > 0) {
  127. A1 = *(a1 + 0);
  128. A2 = *(a1 + 1);
  129. A3 = *(a2 + 0);
  130. A4 = *(a2 + 1);
  131. A5 = *(a3 + 0);
  132. A6 = *(a3 + 1);
  133. A7 = *(a4 + 0);
  134. A8 = *(a4 + 1);
  135. A9 = *(a5 + 0);
  136. A10 = *(a5 + 1);
  137. A11 = *(a6 + 0);
  138. A12 = *(a6 + 1);
  139. A13 = *(a7 + 0);
  140. A14 = *(a7 + 1);
  141. A15 = *(a8 + 0);
  142. A16 = *(a8 + 1);
  143. B1 = *(b1 + 0);
  144. B2 = *(b1 + 1);
  145. B3 = *(b2 + 0);
  146. B4 = *(b2 + 1);
  147. B5 = *(b3 + 0);
  148. B6 = *(b3 + 1);
  149. B7 = *(b4 + 0);
  150. B8 = *(b4 + 1);
  151. B9 = *(b5 + 0);
  152. B10 = *(b5 + 1);
  153. B11 = *(b6 + 0);
  154. B12 = *(b6 + 1);
  155. B13 = *(b7 + 0);
  156. B14 = *(b7 + 1);
  157. B15 = *(b8 + 0);
  158. B16 = *(b8 + 1);
  159. ip1 = *piv * 2; /* pivots for the next pair, fetched before storing */
  160. piv += incx;
  161. ip2 = *piv * 2;
  162. piv += incx;
  163. if (b1 == a1) { /* first swap is a no-op */
  164. if (b2 == a1) { /* second swap exchanges the two rows of the pair */
  165. *(a1 + 0) = A3;
  166. *(a1 + 1) = A4;
  167. *(a2 + 0) = A1;
  168. *(a2 + 1) = A2;
  169. *(a3 + 0) = A7;
  170. *(a3 + 1) = A8;
  171. *(a4 + 0) = A5;
  172. *(a4 + 1) = A6;
  173. *(a5 + 0) = A11;
  174. *(a5 + 1) = A12;
  175. *(a6 + 0) = A9;
  176. *(a6 + 1) = A10;
  177. *(a7 + 0) = A15;
  178. *(a7 + 1) = A16;
  179. *(a8 + 0) = A13;
  180. *(a8 + 1) = A14;
  181. } else
  182. if (b2 != a2) { /* plain second swap; if b2 == a2 nothing moves */
  183. *(a2 + 0) = B3;
  184. *(a2 + 1) = B4;
  185. *(b2 + 0) = A3;
  186. *(b2 + 1) = A4;
  187. *(a4 + 0) = B7;
  188. *(a4 + 1) = B8;
  189. *(b4 + 0) = A7;
  190. *(b4 + 1) = A8;
  191. *(a6 + 0) = B11;
  192. *(a6 + 1) = B12;
  193. *(b6 + 0) = A11;
  194. *(b6 + 1) = A12;
  195. *(a8 + 0) = B15;
  196. *(a8 + 1) = B16;
  197. *(b8 + 0) = A15;
  198. *(b8 + 1) = A16;
  199. }
  200. } else
  201. if (b1 == a2) { /* first swap exchanges the two rows of the pair */
  202. if (b2 != a1) { /* b2 == a1 would swap them back: nothing is written */
  203. if (b2 == a2) { /* second swap is a no-op */
  204. *(a1 + 0) = A3;
  205. *(a1 + 1) = A4;
  206. *(a2 + 0) = A1;
  207. *(a2 + 1) = A2;
  208. *(a3 + 0) = A7;
  209. *(a3 + 1) = A8;
  210. *(a4 + 0) = A5;
  211. *(a4 + 1) = A6;
  212. *(a5 + 0) = A11;
  213. *(a5 + 1) = A12;
  214. *(a6 + 0) = A9;
  215. *(a6 + 1) = A10;
  216. *(a7 + 0) = A15;
  217. *(a7 + 1) = A16;
  218. *(a8 + 0) = A13;
  219. *(a8 + 1) = A14;
  220. } else { /* old first row, now in a2, moves on to b2 */
  221. *(a1 + 0) = A3;
  222. *(a1 + 1) = A4;
  223. *(a2 + 0) = B3;
  224. *(a2 + 1) = B4;
  225. *(b2 + 0) = A1;
  226. *(b2 + 1) = A2;
  227. *(a3 + 0) = A7;
  228. *(a3 + 1) = A8;
  229. *(a4 + 0) = B7;
  230. *(a4 + 1) = B8;
  231. *(b4 + 0) = A5;
  232. *(b4 + 1) = A6;
  233. *(a5 + 0) = A11;
  234. *(a5 + 1) = A12;
  235. *(a6 + 0) = B11;
  236. *(a6 + 1) = B12;
  237. *(b6 + 0) = A9;
  238. *(b6 + 1) = A10;
  239. *(a7 + 0) = A15;
  240. *(a7 + 1) = A16;
  241. *(a8 + 0) = B15;
  242. *(a8 + 1) = B16;
  243. *(b8 + 0) = A13;
  244. *(b8 + 1) = A14;
  245. }
  246. }
  247. } else { /* b1 lies outside the current pair */
  248. if (b2 == a1) { /* a1 receives B from swap 1, which swap 2 moves to a2 */
  249. *(a1 + 0) = A3;
  250. *(a1 + 1) = A4;
  251. *(a2 + 0) = B1;
  252. *(a2 + 1) = B2;
  253. *(b1 + 0) = A1;
  254. *(b1 + 1) = A2;
  255. *(a3 + 0) = A7;
  256. *(a3 + 1) = A8;
  257. *(a4 + 0) = B5;
  258. *(a4 + 1) = B6;
  259. *(b3 + 0) = A5;
  260. *(b3 + 1) = A6;
  261. *(a5 + 0) = A11;
  262. *(a5 + 1) = A12;
  263. *(a6 + 0) = B9;
  264. *(a6 + 1) = B10;
  265. *(b5 + 0) = A9;
  266. *(b5 + 1) = A10;
  267. *(a7 + 0) = A15;
  268. *(a7 + 1) = A16;
  269. *(a8 + 0) = B13;
  270. *(a8 + 1) = B14;
  271. *(b7 + 0) = A13;
  272. *(b7 + 1) = A14;
  273. } else
  274. if (b2 == a2) { /* only the first swap has an effect */
  275. *(a1 + 0) = B1;
  276. *(a1 + 1) = B2;
  277. *(b1 + 0) = A1;
  278. *(b1 + 1) = A2;
  279. *(a3 + 0) = B5;
  280. *(a3 + 1) = B6;
  281. *(b3 + 0) = A5;
  282. *(b3 + 1) = A6;
  283. *(a5 + 0) = B9;
  284. *(a5 + 1) = B10;
  285. *(b5 + 0) = A9;
  286. *(b5 + 1) = A10;
  287. *(a7 + 0) = B13;
  288. *(a7 + 1) = B14;
  289. *(b7 + 0) = A13;
  290. *(b7 + 1) = A14;
  291. } else
  292. if (b2 == b1) { /* both pivots name the same outside row */
  293. *(a1 + 0) = B1;
  294. *(a1 + 1) = B2;
  295. *(a2 + 0) = A1;
  296. *(a2 + 1) = A2;
  297. *(b1 + 0) = A3;
  298. *(b1 + 1) = A4;
  299. *(a3 + 0) = B5;
  300. *(a3 + 1) = B6;
  301. *(a4 + 0) = A5;
  302. *(a4 + 1) = A6;
  303. *(b3 + 0) = A7;
  304. *(b3 + 1) = A8;
  305. *(a5 + 0) = B9;
  306. *(a5 + 1) = B10;
  307. *(a6 + 0) = A9;
  308. *(a6 + 1) = A10;
  309. *(b5 + 0) = A11;
  310. *(b5 + 1) = A12;
  311. *(a7 + 0) = B13;
  312. *(a7 + 1) = B14;
  313. *(a8 + 0) = A13;
  314. *(a8 + 1) = A14;
  315. *(b7 + 0) = A15;
  316. *(b7 + 1) = A16;
  317. } else { /* four distinct rows: two independent swaps */
  318. *(a1 + 0) = B1;
  319. *(a1 + 1) = B2;
  320. *(a2 + 0) = B3;
  321. *(a2 + 1) = B4;
  322. *(b1 + 0) = A1;
  323. *(b1 + 1) = A2;
  324. *(b2 + 0) = A3;
  325. *(b2 + 1) = A4;
  326. *(a3 + 0) = B5;
  327. *(a3 + 1) = B6;
  328. *(a4 + 0) = B7;
  329. *(a4 + 1) = B8;
  330. *(b3 + 0) = A5;
  331. *(b3 + 1) = A6;
  332. *(b4 + 0) = A7;
  333. *(b4 + 1) = A8;
  334. *(a5 + 0) = B9;
  335. *(a5 + 1) = B10;
  336. *(a6 + 0) = B11;
  337. *(a6 + 1) = B12;
  338. *(b5 + 0) = A9;
  339. *(b5 + 1) = A10;
  340. *(b6 + 0) = A11;
  341. *(b6 + 1) = A12;
  342. *(a7 + 0) = B13;
  343. *(a7 + 1) = B14;
  344. *(a8 + 0) = B15;
  345. *(a8 + 1) = B16;
  346. *(b7 + 0) = A13;
  347. *(b7 + 1) = A14;
  348. *(b8 + 0) = A15;
  349. *(b8 + 1) = A16;
  350. }
  351. }
  352. b1 = a + ip1; /* pivot rows for the next pair */
  353. b2 = a + ip2;
  354. b3 = b1 + 1 * lda;
  355. b4 = b2 + 1 * lda;
  356. b5 = b1 + 2 * lda;
  357. b6 = b2 + 2 * lda;
  358. b7 = b1 + 3 * lda;
  359. b8 = b2 + 3 * lda;
  360. #ifndef MINUS
  361. a1 += 4; /* advance two rows (4 FLOATs) */
  362. a3 += 4;
  363. a5 += 4;
  364. a7 += 4;
  365. #else
  366. a1 -= 4;
  367. a3 -= 4;
  368. a5 -= 4;
  369. a7 -= 4;
  370. #endif
  371. i --;
  372. }
  373. // Pipeline drain: last full pair, same case analysis, no pivot prefetch.
  374. A1 = *(a1 + 0);
  375. A2 = *(a1 + 1);
  376. A3 = *(a2 + 0);
  377. A4 = *(a2 + 1);
  378. A5 = *(a3 + 0);
  379. A6 = *(a3 + 1);
  380. A7 = *(a4 + 0);
  381. A8 = *(a4 + 1);
  382. A9 = *(a5 + 0);
  383. A10 = *(a5 + 1);
  384. A11 = *(a6 + 0);
  385. A12 = *(a6 + 1);
  386. A13 = *(a7 + 0);
  387. A14 = *(a7 + 1);
  388. A15 = *(a8 + 0);
  389. A16 = *(a8 + 1);
  390. B1 = *(b1 + 0);
  391. B2 = *(b1 + 1);
  392. B3 = *(b2 + 0);
  393. B4 = *(b2 + 1);
  394. B5 = *(b3 + 0);
  395. B6 = *(b3 + 1);
  396. B7 = *(b4 + 0);
  397. B8 = *(b4 + 1);
  398. B9 = *(b5 + 0);
  399. B10 = *(b5 + 1);
  400. B11 = *(b6 + 0);
  401. B12 = *(b6 + 1);
  402. B13 = *(b7 + 0);
  403. B14 = *(b7 + 1);
  404. B15 = *(b8 + 0);
  405. B16 = *(b8 + 1);
  406. if (b1 == a1) {
  407. if (b2 == a1) {
  408. *(a1 + 0) = A3;
  409. *(a1 + 1) = A4;
  410. *(a2 + 0) = A1;
  411. *(a2 + 1) = A2;
  412. *(a3 + 0) = A7;
  413. *(a3 + 1) = A8;
  414. *(a4 + 0) = A5;
  415. *(a4 + 1) = A6;
  416. *(a5 + 0) = A11;
  417. *(a5 + 1) = A12;
  418. *(a6 + 0) = A9;
  419. *(a6 + 1) = A10;
  420. *(a7 + 0) = A15;
  421. *(a7 + 1) = A16;
  422. *(a8 + 0) = A13;
  423. *(a8 + 1) = A14;
  424. } else
  425. if (b2 != a2) {
  426. *(a2 + 0) = B3;
  427. *(a2 + 1) = B4;
  428. *(b2 + 0) = A3;
  429. *(b2 + 1) = A4;
  430. *(a4 + 0) = B7;
  431. *(a4 + 1) = B8;
  432. *(b4 + 0) = A7;
  433. *(b4 + 1) = A8;
  434. *(a6 + 0) = B11;
  435. *(a6 + 1) = B12;
  436. *(b6 + 0) = A11;
  437. *(b6 + 1) = A12;
  438. *(a8 + 0) = B15;
  439. *(a8 + 1) = B16;
  440. *(b8 + 0) = A15;
  441. *(b8 + 1) = A16;
  442. }
  443. } else
  444. if (b1 == a2) {
  445. if (b2 != a1) {
  446. if (b2 == a2) {
  447. *(a1 + 0) = A3;
  448. *(a1 + 1) = A4;
  449. *(a2 + 0) = A1;
  450. *(a2 + 1) = A2;
  451. *(a3 + 0) = A7;
  452. *(a3 + 1) = A8;
  453. *(a4 + 0) = A5;
  454. *(a4 + 1) = A6;
  455. *(a5 + 0) = A11;
  456. *(a5 + 1) = A12;
  457. *(a6 + 0) = A9;
  458. *(a6 + 1) = A10;
  459. *(a7 + 0) = A15;
  460. *(a7 + 1) = A16;
  461. *(a8 + 0) = A13;
  462. *(a8 + 1) = A14;
  463. } else {
  464. *(a1 + 0) = A3;
  465. *(a1 + 1) = A4;
  466. *(a2 + 0) = B3;
  467. *(a2 + 1) = B4;
  468. *(b2 + 0) = A1;
  469. *(b2 + 1) = A2;
  470. *(a3 + 0) = A7;
  471. *(a3 + 1) = A8;
  472. *(a4 + 0) = B7;
  473. *(a4 + 1) = B8;
  474. *(b4 + 0) = A5;
  475. *(b4 + 1) = A6;
  476. *(a5 + 0) = A11;
  477. *(a5 + 1) = A12;
  478. *(a6 + 0) = B11;
  479. *(a6 + 1) = B12;
  480. *(b6 + 0) = A9;
  481. *(b6 + 1) = A10;
  482. *(a7 + 0) = A15;
  483. *(a7 + 1) = A16;
  484. *(a8 + 0) = B15;
  485. *(a8 + 1) = B16;
  486. *(b8 + 0) = A13;
  487. *(b8 + 1) = A14;
  488. }
  489. }
  490. } else {
  491. if (b2 == a1) {
  492. *(a1 + 0) = A3;
  493. *(a1 + 1) = A4;
  494. *(a2 + 0) = B1;
  495. *(a2 + 1) = B2;
  496. *(b1 + 0) = A1;
  497. *(b1 + 1) = A2;
  498. *(a3 + 0) = A7;
  499. *(a3 + 1) = A8;
  500. *(a4 + 0) = B5;
  501. *(a4 + 1) = B6;
  502. *(b3 + 0) = A5;
  503. *(b3 + 1) = A6;
  504. *(a5 + 0) = A11;
  505. *(a5 + 1) = A12;
  506. *(a6 + 0) = B9;
  507. *(a6 + 1) = B10;
  508. *(b5 + 0) = A9;
  509. *(b5 + 1) = A10;
  510. *(a7 + 0) = A15;
  511. *(a7 + 1) = A16;
  512. *(a8 + 0) = B13;
  513. *(a8 + 1) = B14;
  514. *(b7 + 0) = A13;
  515. *(b7 + 1) = A14;
  516. } else
  517. if (b2 == a2) {
  518. *(a1 + 0) = B1;
  519. *(a1 + 1) = B2;
  520. *(b1 + 0) = A1;
  521. *(b1 + 1) = A2;
  522. *(a3 + 0) = B5;
  523. *(a3 + 1) = B6;
  524. *(b3 + 0) = A5;
  525. *(b3 + 1) = A6;
  526. *(a5 + 0) = B9;
  527. *(a5 + 1) = B10;
  528. *(b5 + 0) = A9;
  529. *(b5 + 1) = A10;
  530. *(a7 + 0) = B13;
  531. *(a7 + 1) = B14;
  532. *(b7 + 0) = A13;
  533. *(b7 + 1) = A14;
  534. } else
  535. if (b2 == b1) {
  536. *(a1 + 0) = B1;
  537. *(a1 + 1) = B2;
  538. *(a2 + 0) = A1;
  539. *(a2 + 1) = A2;
  540. *(b1 + 0) = A3;
  541. *(b1 + 1) = A4;
  542. *(a3 + 0) = B5;
  543. *(a3 + 1) = B6;
  544. *(a4 + 0) = A5;
  545. *(a4 + 1) = A6;
  546. *(b3 + 0) = A7;
  547. *(b3 + 1) = A8;
  548. *(a5 + 0) = B9;
  549. *(a5 + 1) = B10;
  550. *(a6 + 0) = A9;
  551. *(a6 + 1) = A10;
  552. *(b5 + 0) = A11;
  553. *(b5 + 1) = A12;
  554. *(a7 + 0) = B13;
  555. *(a7 + 1) = B14;
  556. *(a8 + 0) = A13;
  557. *(a8 + 1) = A14;
  558. *(b7 + 0) = A15;
  559. *(b7 + 1) = A16;
  560. } else {
  561. *(a1 + 0) = B1;
  562. *(a1 + 1) = B2;
  563. *(a2 + 0) = B3;
  564. *(a2 + 1) = B4;
  565. *(b1 + 0) = A1;
  566. *(b1 + 1) = A2;
  567. *(b2 + 0) = A3;
  568. *(b2 + 1) = A4;
  569. *(a3 + 0) = B5;
  570. *(a3 + 1) = B6;
  571. *(a4 + 0) = B7;
  572. *(a4 + 1) = B8;
  573. *(b3 + 0) = A5;
  574. *(b3 + 1) = A6;
  575. *(b4 + 0) = A7;
  576. *(b4 + 1) = A8;
  577. *(a5 + 0) = B9;
  578. *(a5 + 1) = B10;
  579. *(a6 + 0) = B11;
  580. *(a6 + 1) = B12;
  581. *(b5 + 0) = A9;
  582. *(b5 + 1) = A10;
  583. *(b6 + 0) = A11;
  584. *(b6 + 1) = A12;
  585. *(a7 + 0) = B13;
  586. *(a7 + 1) = B14;
  587. *(a8 + 0) = B15;
  588. *(a8 + 1) = B16;
  589. *(b7 + 0) = A13;
  590. *(b7 + 1) = A14;
  591. *(b8 + 0) = A15;
  592. *(b8 + 1) = A16;
  593. }
  594. }
  595. #ifndef MINUS
  596. a1 += 4;
  597. a3 += 4;
  598. a5 += 4;
  599. a7 += 4;
  600. #else
  601. a1 -= 4;
  602. a3 -= 4;
  603. a5 -= 4;
  604. a7 -= 4;
  605. #endif
  606. // Odd row count: one leftover row, swapped unconditionally with its pivot.
  607. i = (rows & 1);
  608. if (i > 0) {
  609. ip1 = *piv * 2;
  610. b1 = a + ip1;
  611. b3 = b1 + 1 * lda;
  612. b5 = b1 + 2 * lda;
  613. b7 = b1 + 3 * lda;
  614. A1 = *(a1 + 0); /* here A1..A4 / B1..B4 hold columns 0 and 1 */
  615. A2 = *(a1 + 1);
  616. A3 = *(a3 + 0);
  617. A4 = *(a3 + 1);
  618. B1 = *(b1 + 0);
  619. B2 = *(b1 + 1);
  620. B3 = *(b3 + 0);
  621. B4 = *(b3 + 1);
  622. A5 = *(a5 + 0); /* A5..A8 / B5..B8 hold columns 2 and 3 */
  623. A6 = *(a5 + 1);
  624. A7 = *(a7 + 0);
  625. A8 = *(a7 + 1);
  626. B5 = *(b5 + 0);
  627. B6 = *(b5 + 1);
  628. B7 = *(b7 + 0);
  629. B8 = *(b7 + 1);
  630. *(a1 + 0) = B1;
  631. *(a1 + 1) = B2;
  632. *(a3 + 0) = B3;
  633. *(a3 + 1) = B4;
  634. *(b1 + 0) = A1;
  635. *(b1 + 1) = A2;
  636. *(b3 + 0) = A3;
  637. *(b3 + 1) = A4;
  638. *(a5 + 0) = B5;
  639. *(a5 + 1) = B6;
  640. *(a7 + 0) = B7;
  641. *(a7 + 1) = B8;
  642. *(b5 + 0) = A5;
  643. *(b5 + 1) = A6;
  644. *(b7 + 0) = A7;
  645. *(b7 + 1) = A8;
  646. }
  647. a += 4 * lda; /* move on to the next group of four columns */
  648. j --;
  649. } while (j > 0);
  650. }
  /* ---- optional group of two columns (same scheme, columns a1 and a3) ---- */
  651. if (n & 2) {
  652. piv = ipiv;
  653. #ifndef MINUS
  654. a1 = a + (k1 + 1) * 2;
  655. #else
  656. a1 = a + k2 * 2;
  657. #endif
  658. a3 = a1 + lda;
  659. ip1 = *piv * 2;
  660. piv += incx;
  661. ip2 = *piv * 2;
  662. piv += incx;
  663. b1 = a + ip1;
  664. b2 = a + ip2;
  665. b3 = b1 + lda;
  666. b4 = b2 + lda;
  667. i = (rows >> 1);
  668. i--;
  669. // Pipelined loop over row pairs, as in the four-column case:
  670. // load, prefetch next pivots, store.
  671. while (i > 0) {
  672. A1 = *(a1 + 0);
  673. A2 = *(a1 + 1);
  674. A3 = *(a2 + 0);
  675. A4 = *(a2 + 1);
  676. A5 = *(a3 + 0);
  677. A6 = *(a3 + 1);
  678. A7 = *(a4 + 0);
  679. A8 = *(a4 + 1);
  680. B1 = *(b1 + 0);
  681. B2 = *(b1 + 1);
  682. B3 = *(b2 + 0);
  683. B4 = *(b2 + 1);
  684. B5 = *(b3 + 0);
  685. B6 = *(b3 + 1);
  686. B7 = *(b4 + 0);
  687. B8 = *(b4 + 1);
  688. ip1 = *piv * 2;
  689. piv += incx;
  690. ip2 = *piv * 2;
  691. piv += incx;
  692. if (b1 == a1) {
  693. if (b2 == a1) {
  694. *(a1 + 0) = A3;
  695. *(a1 + 1) = A4;
  696. *(a2 + 0) = A1;
  697. *(a2 + 1) = A2;
  698. *(a3 + 0) = A7;
  699. *(a3 + 1) = A8;
  700. *(a4 + 0) = A5;
  701. *(a4 + 1) = A6;
  702. } else
  703. if (b2 != a2) {
  704. *(a2 + 0) = B3;
  705. *(a2 + 1) = B4;
  706. *(b2 + 0) = A3;
  707. *(b2 + 1) = A4;
  708. *(a4 + 0) = B7;
  709. *(a4 + 1) = B8;
  710. *(b4 + 0) = A7;
  711. *(b4 + 1) = A8;
  712. }
  713. } else
  714. if (b1 == a2) {
  715. if (b2 != a1) {
  716. if (b2 == a2) {
  717. *(a1 + 0) = A3;
  718. *(a1 + 1) = A4;
  719. *(a2 + 0) = A1;
  720. *(a2 + 1) = A2;
  721. *(a3 + 0) = A7;
  722. *(a3 + 1) = A8;
  723. *(a4 + 0) = A5;
  724. *(a4 + 1) = A6;
  725. } else {
  726. *(a1 + 0) = A3;
  727. *(a1 + 1) = A4;
  728. *(a2 + 0) = B3;
  729. *(a2 + 1) = B4;
  730. *(b2 + 0) = A1;
  731. *(b2 + 1) = A2;
  732. *(a3 + 0) = A7;
  733. *(a3 + 1) = A8;
  734. *(a4 + 0) = B7;
  735. *(a4 + 1) = B8;
  736. *(b4 + 0) = A5;
  737. *(b4 + 1) = A6;
  738. }
  739. }
  740. } else {
  741. if (b2 == a1) {
  742. *(a1 + 0) = A3;
  743. *(a1 + 1) = A4;
  744. *(a2 + 0) = B1;
  745. *(a2 + 1) = B2;
  746. *(b1 + 0) = A1;
  747. *(b1 + 1) = A2;
  748. *(a3 + 0) = A7;
  749. *(a3 + 1) = A8;
  750. *(a4 + 0) = B5;
  751. *(a4 + 1) = B6;
  752. *(b3 + 0) = A5;
  753. *(b3 + 1) = A6;
  754. } else
  755. if (b2 == a2) {
  756. *(a1 + 0) = B1;
  757. *(a1 + 1) = B2;
  758. *(b1 + 0) = A1;
  759. *(b1 + 1) = A2;
  760. *(a3 + 0) = B5;
  761. *(a3 + 1) = B6;
  762. *(b3 + 0) = A5;
  763. *(b3 + 1) = A6;
  764. } else
  765. if (b2 == b1) {
  766. *(a1 + 0) = B1;
  767. *(a1 + 1) = B2;
  768. *(a2 + 0) = A1;
  769. *(a2 + 1) = A2;
  770. *(b1 + 0) = A3;
  771. *(b1 + 1) = A4;
  772. *(a3 + 0) = B5;
  773. *(a3 + 1) = B6;
  774. *(a4 + 0) = A5;
  775. *(a4 + 1) = A6;
  776. *(b3 + 0) = A7;
  777. *(b3 + 1) = A8;
  778. } else {
  779. *(a1 + 0) = B1;
  780. *(a1 + 1) = B2;
  781. *(a2 + 0) = B3;
  782. *(a2 + 1) = B4;
  783. *(b1 + 0) = A1;
  784. *(b1 + 1) = A2;
  785. *(b2 + 0) = A3;
  786. *(b2 + 1) = A4;
  787. *(a3 + 0) = B5;
  788. *(a3 + 1) = B6;
  789. *(a4 + 0) = B7;
  790. *(a4 + 1) = B8;
  791. *(b3 + 0) = A5;
  792. *(b3 + 1) = A6;
  793. *(b4 + 0) = A7;
  794. *(b4 + 1) = A8;
  795. }
  796. }
  797. b1 = a + ip1;
  798. b2 = a + ip2;
  799. b3 = b1 + lda;
  800. b4 = b2 + lda;
  801. #ifndef MINUS
  802. a1 += 4;
  803. a3 += 4;
  804. #else
  805. a1 -= 4;
  806. a3 -= 4;
  807. #endif
  808. i --;
  809. }
  810. // Pipeline drain: last full pair of rows.
  811. A1 = *(a1 + 0);
  812. A2 = *(a1 + 1);
  813. A3 = *(a2 + 0);
  814. A4 = *(a2 + 1);
  815. A5 = *(a3 + 0);
  816. A6 = *(a3 + 1);
  817. A7 = *(a4 + 0);
  818. A8 = *(a4 + 1);
  819. B1 = *(b1 + 0);
  820. B2 = *(b1 + 1);
  821. B3 = *(b2 + 0);
  822. B4 = *(b2 + 1);
  823. B5 = *(b3 + 0);
  824. B6 = *(b3 + 1);
  825. B7 = *(b4 + 0);
  826. B8 = *(b4 + 1);
  827. if (b1 == a1) {
  828. if (b2 == a1) {
  829. *(a1 + 0) = A3;
  830. *(a1 + 1) = A4;
  831. *(a2 + 0) = A1;
  832. *(a2 + 1) = A2;
  833. *(a3 + 0) = A7;
  834. *(a3 + 1) = A8;
  835. *(a4 + 0) = A5;
  836. *(a4 + 1) = A6;
  837. } else
  838. if (b2 != a2) {
  839. *(a2 + 0) = B3;
  840. *(a2 + 1) = B4;
  841. *(b2 + 0) = A3;
  842. *(b2 + 1) = A4;
  843. *(a4 + 0) = B7;
  844. *(a4 + 1) = B8;
  845. *(b4 + 0) = A7;
  846. *(b4 + 1) = A8;
  847. }
  848. } else
  849. if (b1 == a2) {
  850. if (b2 != a1) {
  851. if (b2 == a2) {
  852. *(a1 + 0) = A3;
  853. *(a1 + 1) = A4;
  854. *(a2 + 0) = A1;
  855. *(a2 + 1) = A2;
  856. *(a3 + 0) = A7;
  857. *(a3 + 1) = A8;
  858. *(a4 + 0) = A5;
  859. *(a4 + 1) = A6;
  860. } else {
  861. *(a1 + 0) = A3;
  862. *(a1 + 1) = A4;
  863. *(a2 + 0) = B3;
  864. *(a2 + 1) = B4;
  865. *(b2 + 0) = A1;
  866. *(b2 + 1) = A2;
  867. *(a3 + 0) = A7;
  868. *(a3 + 1) = A8;
  869. *(a4 + 0) = B7;
  870. *(a4 + 1) = B8;
  871. *(b4 + 0) = A5;
  872. *(b4 + 1) = A6;
  873. }
  874. }
  875. } else {
  876. if (b2 == a1) {
  877. *(a1 + 0) = A3;
  878. *(a1 + 1) = A4;
  879. *(a2 + 0) = B1;
  880. *(a2 + 1) = B2;
  881. *(b1 + 0) = A1;
  882. *(b1 + 1) = A2;
  883. *(a3 + 0) = A7;
  884. *(a3 + 1) = A8;
  885. *(a4 + 0) = B5;
  886. *(a4 + 1) = B6;
  887. *(b3 + 0) = A5;
  888. *(b3 + 1) = A6;
  889. } else
  890. if (b2 == a2) {
  891. *(a1 + 0) = B1;
  892. *(a1 + 1) = B2;
  893. *(b1 + 0) = A1;
  894. *(b1 + 1) = A2;
  895. *(a3 + 0) = B5;
  896. *(a3 + 1) = B6;
  897. *(b3 + 0) = A5;
  898. *(b3 + 1) = A6;
  899. } else
  900. if (b2 == b1) {
  901. *(a1 + 0) = B1;
  902. *(a1 + 1) = B2;
  903. *(a2 + 0) = A1;
  904. *(a2 + 1) = A2;
  905. *(b1 + 0) = A3;
  906. *(b1 + 1) = A4;
  907. *(a3 + 0) = B5;
  908. *(a3 + 1) = B6;
  909. *(a4 + 0) = A5;
  910. *(a4 + 1) = A6;
  911. *(b3 + 0) = A7;
  912. *(b3 + 1) = A8;
  913. } else {
  914. *(a1 + 0) = B1;
  915. *(a1 + 1) = B2;
  916. *(a2 + 0) = B3;
  917. *(a2 + 1) = B4;
  918. *(b1 + 0) = A1;
  919. *(b1 + 1) = A2;
  920. *(b2 + 0) = A3;
  921. *(b2 + 1) = A4;
  922. *(a3 + 0) = B5;
  923. *(a3 + 1) = B6;
  924. *(a4 + 0) = B7;
  925. *(a4 + 1) = B8;
  926. *(b3 + 0) = A5;
  927. *(b3 + 1) = A6;
  928. *(b4 + 0) = A7;
  929. *(b4 + 1) = A8;
  930. }
  931. }
  932. #ifndef MINUS
  933. a1 += 4;
  934. a3 += 4;
  935. #else
  936. a1 -= 4;
  937. a3 -= 4;
  938. #endif
  939. // Odd row count: one leftover row, swapped unconditionally with its pivot.
  940. i = (rows & 1);
  941. if (i > 0) {
  942. ip1 = *piv * 2;
  943. b1 = a + ip1;
  944. b3 = b1 + lda;
  945. A1 = *(a1 + 0);
  946. A2 = *(a1 + 1);
  947. A3 = *(a3 + 0);
  948. A4 = *(a3 + 1);
  949. B1 = *(b1 + 0);
  950. B2 = *(b1 + 1);
  951. B3 = *(b3 + 0);
  952. B4 = *(b3 + 1);
  953. *(a1 + 0) = B1;
  954. *(a1 + 1) = B2;
  955. *(a3 + 0) = B3;
  956. *(a3 + 1) = B4;
  957. *(b1 + 0) = A1;
  958. *(b1 + 1) = A2;
  959. *(b3 + 0) = A3;
  960. *(b3 + 1) = A4;
  961. }
  962. a += 2 * lda; /* step past the two columns just handled */
  963. }
  /* ---- optional single trailing column ---- */
  964. if (n & 1) {
  965. piv = ipiv;
  966. #ifndef MINUS
  967. a1 = a + (k1 + 1) * 2;
  968. #else
  969. a1 = a + k2 * 2;
  970. #endif
  971. ip1 = *piv * 2;
  972. piv += incx;
  973. ip2 = *piv * 2;
  974. piv += incx;
  975. b1 = a + ip1;
  976. b2 = a + ip2;
  977. i = (rows >> 1);
  978. i--;
  979. // Pipelined loop over row pairs, as in the four-column case:
  980. // load, prefetch next pivots, store.
  981. while (i > 0) {
  982. A1 = *(a1 + 0);
  983. A2 = *(a1 + 1);
  984. A3 = *(a2 + 0);
  985. A4 = *(a2 + 1);
  986. B1 = *(b1 + 0);
  987. B2 = *(b1 + 1);
  988. B3 = *(b2 + 0);
  989. B4 = *(b2 + 1);
  990. ip1 = *piv * 2;
  991. piv += incx;
  992. ip2 = *piv * 2;
  993. piv += incx;
  994. if (b1 == a1) {
  995. if (b2 == a1) {
  996. *(a1 + 0) = A3;
  997. *(a1 + 1) = A4;
  998. *(a2 + 0) = A1;
  999. *(a2 + 1) = A2;
  1000. } else
  1001. if (b2 != a2) {
  1002. *(a2 + 0) = B3;
  1003. *(a2 + 1) = B4;
  1004. *(b2 + 0) = A3;
  1005. *(b2 + 1) = A4;
  1006. }
  1007. } else
  1008. if (b1 == a2) {
  1009. if (b2 != a1) {
  1010. if (b2 == a2) {
  1011. *(a1 + 0) = A3;
  1012. *(a1 + 1) = A4;
  1013. *(a2 + 0) = A1;
  1014. *(a2 + 1) = A2;
  1015. } else {
  1016. *(a1 + 0) = A3;
  1017. *(a1 + 1) = A4;
  1018. *(a2 + 0) = B3;
  1019. *(a2 + 1) = B4;
  1020. *(b2 + 0) = A1;
  1021. *(b2 + 1) = A2;
  1022. }
  1023. }
  1024. } else {
  1025. if (b2 == a1) {
  1026. *(a1 + 0) = A3;
  1027. *(a1 + 1) = A4;
  1028. *(a2 + 0) = B1;
  1029. *(a2 + 1) = B2;
  1030. *(b1 + 0) = A1;
  1031. *(b1 + 1) = A2;
  1032. } else
  1033. if (b2 == a2) {
  1034. *(a1 + 0) = B1;
  1035. *(a1 + 1) = B2;
  1036. *(b1 + 0) = A1;
  1037. *(b1 + 1) = A2;
  1038. } else
  1039. if (b2 == b1) {
  1040. *(a1 + 0) = B1;
  1041. *(a1 + 1) = B2;
  1042. *(a2 + 0) = A1;
  1043. *(a2 + 1) = A2;
  1044. *(b1 + 0) = A3;
  1045. *(b1 + 1) = A4;
  1046. } else {
  1047. *(a1 + 0) = B1;
  1048. *(a1 + 1) = B2;
  1049. *(a2 + 0) = B3;
  1050. *(a2 + 1) = B4;
  1051. *(b1 + 0) = A1;
  1052. *(b1 + 1) = A2;
  1053. *(b2 + 0) = A3;
  1054. *(b2 + 1) = A4;
  1055. }
  1056. }
  1057. b1 = a + ip1;
  1058. b2 = a + ip2;
  1059. #ifndef MINUS
  1060. a1 += 4;
  1061. #else
  1062. a1 -= 4;
  1063. #endif
  1064. i --;
  1065. }
  1066. // Pipeline drain: last full pair of rows.
  1067. A1 = *(a1 + 0);
  1068. A2 = *(a1 + 1);
  1069. A3 = *(a2 + 0);
  1070. A4 = *(a2 + 1);
  1071. B1 = *(b1 + 0);
  1072. B2 = *(b1 + 1);
  1073. B3 = *(b2 + 0);
  1074. B4 = *(b2 + 1);
  1075. if (b1 == a1) {
  1076. if (b2 == a1) {
  1077. *(a1 + 0) = A3;
  1078. *(a1 + 1) = A4;
  1079. *(a2 + 0) = A1;
  1080. *(a2 + 1) = A2;
  1081. } else
  1082. if (b2 != a2) {
  1083. *(a2 + 0) = B3;
  1084. *(a2 + 1) = B4;
  1085. *(b2 + 0) = A3;
  1086. *(b2 + 1) = A4;
  1087. }
  1088. } else
  1089. if (b1 == a2) {
  1090. if (b2 != a1) {
  1091. if (b2 == a2) {
  1092. *(a1 + 0) = A3;
  1093. *(a1 + 1) = A4;
  1094. *(a2 + 0) = A1;
  1095. *(a2 + 1) = A2;
  1096. } else {
  1097. *(a1 + 0) = A3;
  1098. *(a1 + 1) = A4;
  1099. *(a2 + 0) = B3;
  1100. *(a2 + 1) = B4;
  1101. *(b2 + 0) = A1;
  1102. *(b2 + 1) = A2;
  1103. }
  1104. }
  1105. } else {
  1106. if (b2 == a1) {
  1107. *(a1 + 0) = A3;
  1108. *(a1 + 1) = A4;
  1109. *(a2 + 0) = B1;
  1110. *(a2 + 1) = B2;
  1111. *(b1 + 0) = A1;
  1112. *(b1 + 1) = A2;
  1113. } else
  1114. if (b2 == a2) {
  1115. *(a1 + 0) = B1;
  1116. *(a1 + 1) = B2;
  1117. *(b1 + 0) = A1;
  1118. *(b1 + 1) = A2;
  1119. } else
  1120. if (b2 == b1) {
  1121. *(a1 + 0) = B1;
  1122. *(a1 + 1) = B2;
  1123. *(a2 + 0) = A1;
  1124. *(a2 + 1) = A2;
  1125. *(b1 + 0) = A3;
  1126. *(b1 + 1) = A4;
  1127. } else {
  1128. *(a1 + 0) = B1;
  1129. *(a1 + 1) = B2;
  1130. *(a2 + 0) = B3;
  1131. *(a2 + 1) = B4;
  1132. *(b1 + 0) = A1;
  1133. *(b1 + 1) = A2;
  1134. *(b2 + 0) = A3;
  1135. *(b2 + 1) = A4;
  1136. }
  1137. }
  1138. #ifndef MINUS
  1139. a1 += 4;
  1140. #else
  1141. a1 -= 4;
  1142. #endif
  1143. // Odd row count: one leftover row, swapped unconditionally with its pivot.
  1144. i = (rows & 1);
  1145. if (i > 0) {
  1146. ip1 = *piv * 2;
  1147. b1 = a + ip1;
  1148. A1 = *(a1 + 0);
  1149. A2 = *(a1 + 1);
  1150. B1 = *(b1 + 0);
  1151. B2 = *(b1 + 1);
  1152. *(a1 + 0) = B1;
  1153. *(a1 + 1) = B2;
  1154. *(b1 + 0) = A1;
  1155. *(b1 + 1) = A2;
  1156. }
  1157. }
  1158. return 0;
  1159. }