You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

laswp_k_2.c 9.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. #ifndef MINUS
  41. #define a2 (a1 + 1)
  42. #define a4 (a3 + 1)
  43. #else
  44. #define a2 (a1 - 1)
  45. #define a4 (a3 - 1)
  46. #endif
  47. int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
  48. FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
  49. BLASLONG i, j, ip1, ip2, rows;
  50. blasint *piv;
  51. FLOAT *a1, *a3;
  52. FLOAT *b1, *b2, *b3, *b4;
  53. FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
  54. a--;
  55. k1 --;
  56. #ifndef MINUS
  57. ipiv += k1;
  58. #else
  59. ipiv -= (k2 - 1) * incx;
  60. #endif
  61. if (n <= 0) return 0;
  62. j = (n >> 1);
  63. rows = k2-k1;
  64. if (rows <=0) return 0;
  65. if (rows == 1) {
  66. //Only have 1 row
  67. ip1 = *ipiv;
  68. a1 = a + k1 + 1;
  69. b1 = a + ip1;
  70. if(a1 == b1) return 0;
  71. for(j=0; j<n; j++){
  72. A1 = *a1;
  73. B1 = *b1;
  74. *a1 = B1;
  75. *b1 = A1;
  76. a1 += lda;
  77. b1 += lda;
  78. }
  79. return 0;
  80. }
  81. if (j > 0) {
  82. do {
  83. piv = ipiv;
  84. #ifndef MINUS
  85. a1 = a + k1 + 1;
  86. #else
  87. a1 = a + k2;
  88. #endif
  89. a3 = a1 + 1 * lda;
  90. ip1 = *piv;
  91. piv += incx;
  92. ip2 = *piv;
  93. piv += incx;
  94. b1 = a + ip1;
  95. b2 = a + ip2;
  96. b3 = b1 + 1 * lda;
  97. b4 = b2 + 1 * lda;
  98. i = ((rows) >> 1);
  99. // Loop pipeline
  100. i--;
  101. //Main Loop
  102. while (i > 0) {
  103. #ifdef CORE2
  104. #ifndef MINUS
  105. asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1));
  106. asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b3));
  107. asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1));
  108. asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a3));
  109. #else
  110. asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1));
  111. asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b3));
  112. asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1));
  113. asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a3));
  114. #endif
  115. #endif
  116. B1 = *b1;
  117. B2 = *b2;
  118. B3 = *b3;
  119. B4 = *b4;
  120. A1 = *a1;
  121. A2 = *a2;
  122. A3 = *a3;
  123. A4 = *a4;
  124. ip1 = *piv;
  125. piv += incx;
  126. ip2 = *piv;
  127. piv += incx;
  128. if (b1 == a1) {
  129. if (b2 == a1) {
  130. *a1 = A2;
  131. *a2 = A1;
  132. *a3 = A4;
  133. *a4 = A3;
  134. } else
  135. if (b2 != a2) {
  136. *a2 = B2;
  137. *b2 = A2;
  138. *a4 = B4;
  139. *b4 = A4;
  140. }
  141. } else
  142. if (b1 == a2) {
  143. if (b2 != a1) {
  144. if (b2 == a2) {
  145. *a1 = A2;
  146. *a2 = A1;
  147. *a3 = A4;
  148. *a4 = A3;
  149. } else {
  150. *a1 = A2;
  151. *a2 = B2;
  152. *b2 = A1;
  153. *a3 = A4;
  154. *a4 = B4;
  155. *b4 = A3;
  156. }
  157. }
  158. } else {
  159. if (b2 == a1) {
  160. *a1 = A2;
  161. *a2 = B1;
  162. *b1 = A1;
  163. *a3 = A4;
  164. *a4 = B3;
  165. *b3 = A3;
  166. } else
  167. if (b2 == a2) {
  168. *a1 = B1;
  169. *b1 = A1;
  170. *a3 = B3;
  171. *b3 = A3;
  172. } else
  173. if (b2 == b1) {
  174. *a1 = B1;
  175. *a2 = A1;
  176. *b1 = A2;
  177. *a3 = B3;
  178. *a4 = A3;
  179. *b3 = A4;
  180. } else {
  181. *a1 = B1;
  182. *a2 = B2;
  183. *b1 = A1;
  184. *b2 = A2;
  185. *a3 = B3;
  186. *a4 = B4;
  187. *b3 = A3;
  188. *b4 = A4;
  189. }
  190. }
  191. b1 = a + ip1;
  192. b2 = a + ip2;
  193. b3 = b1 + 1 * lda;
  194. b4 = b2 + 1 * lda;
  195. #ifndef MINUS
  196. a1 += 2;
  197. a3 += 2;
  198. #else
  199. a1 -= 2;
  200. a3 -= 2;
  201. #endif
  202. i --;
  203. }
  204. //Loop Ending
  205. B1 = *b1;
  206. B2 = *b2;
  207. B3 = *b3;
  208. B4 = *b4;
  209. A1 = *a1;
  210. A2 = *a2;
  211. A3 = *a3;
  212. A4 = *a4;
  213. if (b1 == a1) {
  214. if (b2 == a1) {
  215. *a1 = A2;
  216. *a2 = A1;
  217. *a3 = A4;
  218. *a4 = A3;
  219. } else
  220. if (b2 != a2) {
  221. *a2 = B2;
  222. *b2 = A2;
  223. *a4 = B4;
  224. *b4 = A4;
  225. }
  226. } else
  227. if (b1 == a2) {
  228. if (b2 != a1) {
  229. if (b2 == a2) {
  230. *a1 = A2;
  231. *a2 = A1;
  232. *a3 = A4;
  233. *a4 = A3;
  234. } else {
  235. *a1 = A2;
  236. *a2 = B2;
  237. *b2 = A1;
  238. *a3 = A4;
  239. *a4 = B4;
  240. *b4 = A3;
  241. }
  242. }
  243. } else {
  244. if (b2 == a1) {
  245. *a1 = A2;
  246. *a2 = B1;
  247. *b1 = A1;
  248. *a3 = A4;
  249. *a4 = B3;
  250. *b3 = A3;
  251. } else
  252. if (b2 == a2) {
  253. *a1 = B1;
  254. *b1 = A1;
  255. *a3 = B3;
  256. *b3 = A3;
  257. } else
  258. if (b2 == b1) {
  259. *a1 = B1;
  260. *a2 = A1;
  261. *b1 = A2;
  262. *a3 = B3;
  263. *a4 = A3;
  264. *b3 = A4;
  265. } else {
  266. *a1 = B1;
  267. *a2 = B2;
  268. *b1 = A1;
  269. *b2 = A2;
  270. *a3 = B3;
  271. *a4 = B4;
  272. *b3 = A3;
  273. *b4 = A4;
  274. }
  275. }
  276. #ifndef MINUS
  277. a1 += 2;
  278. a3 += 2;
  279. #else
  280. a1 -= 2;
  281. a3 -= 2;
  282. #endif
  283. //Remain
  284. i = ((rows) & 1);
  285. if (i > 0) {
  286. ip1 = *piv;
  287. b1 = a + ip1;
  288. b3 = b1 + 1 * lda;
  289. A1 = *a1;
  290. B1 = *b1;
  291. A3 = *a3;
  292. B3 = *b3;
  293. *a1 = B1;
  294. *b1 = A1;
  295. *a3 = B3;
  296. *b3 = A3;
  297. }
  298. a += 2 * lda;
  299. j --;
  300. } while (j > 0);
  301. }
  302. if (n & 1) {
  303. piv = ipiv;
  304. #ifndef MINUS
  305. a1 = a + k1 + 1;
  306. #else
  307. a1 = a + k2;
  308. #endif
  309. ip1 = *piv;
  310. piv += incx;
  311. ip2 = *piv;
  312. piv += incx;
  313. b1 = a + ip1;
  314. b2 = a + ip2;
  315. i = ((rows) >> 1);
  316. i --;
  317. while (i > 0) {
  318. A1 = *a1;
  319. A2 = *a2;
  320. B1 = *b1;
  321. B2 = *b2;
  322. ip1 = *piv;
  323. piv += incx;
  324. ip2 = *piv;
  325. piv += incx;
  326. if (b1 == a1) {
  327. if (b2 == a1) {
  328. *a1 = A2;
  329. *a2 = A1;
  330. } else
  331. if (b2 != a2) {
  332. *a2 = B2;
  333. *b2 = A2;
  334. }
  335. } else
  336. if (b1 == a2) {
  337. if (b2 != a1) {
  338. if (b2 == a2) {
  339. *a1 = A2;
  340. *a2 = A1;
  341. } else {
  342. *a1 = A2;
  343. *a2 = B2;
  344. *b2 = A1;
  345. }
  346. }
  347. } else {
  348. if (b2 == a1) {
  349. *a1 = A2;
  350. *a2 = B1;
  351. *b1 = A1;
  352. } else
  353. if (b2 == a2) {
  354. *a1 = B1;
  355. *b1 = A1;
  356. } else
  357. if (b2 == b1) {
  358. *a1 = B1;
  359. *a2 = A1;
  360. *b1 = A2;
  361. } else {
  362. *a1 = B1;
  363. *a2 = B2;
  364. *b1 = A1;
  365. *b2 = A2;
  366. }
  367. }
  368. b1 = a + ip1;
  369. b2 = a + ip2;
  370. #ifndef MINUS
  371. a1 += 2;
  372. #else
  373. a1 -= 2;
  374. #endif
  375. i --;
  376. }
  377. //Loop Ending (n=1)
  378. A1 = *a1;
  379. A2 = *a2;
  380. B1 = *b1;
  381. B2 = *b2;
  382. if (b1 == a1) {
  383. if (b2 == a1) {
  384. *a1 = A2;
  385. *a2 = A1;
  386. } else
  387. if (b2 != a2) {
  388. *a2 = B2;
  389. *b2 = A2;
  390. }
  391. } else
  392. if (b1 == a2) {
  393. if (b2 != a1) {
  394. if (b2 == a2) {
  395. *a1 = A2;
  396. *a2 = A1;
  397. } else {
  398. *a1 = A2;
  399. *a2 = B2;
  400. *b2 = A1;
  401. }
  402. }
  403. } else {
  404. if (b2 == a1) {
  405. *a1 = A2;
  406. *a2 = B1;
  407. *b1 = A1;
  408. } else
  409. if (b2 == a2) {
  410. *a1 = B1;
  411. *b1 = A1;
  412. } else
  413. if (b2 == b1) {
  414. *a1 = B1;
  415. *a2 = A1;
  416. *b1 = A2;
  417. } else {
  418. *a1 = B1;
  419. *a2 = B2;
  420. *b1 = A1;
  421. *b2 = A2;
  422. }
  423. }
  424. #ifndef MINUS
  425. a1 += 2;
  426. #else
  427. a1 -= 2;
  428. #endif
  429. //Remain
  430. i = (rows & 1);
  431. if (i > 0) {
  432. ip1 = *piv;
  433. b1 = a + ip1;
  434. A1 = *a1;
  435. B1 = *b1;
  436. *a1 = B1;
  437. *b1 = A1;
  438. }
  439. }
  440. return 0;
  441. }