/* laswp_k_2.c */
  /*********************************************************************/
  /* Copyright 2009, 2010 The University of Texas at Austin. */
  /* All rights reserved. */
  /* */
  /* Redistribution and use in source and binary forms, with or */
  /* without modification, are permitted provided that the following */
  /* conditions are met: */
  /* */
  /* 1. Redistributions of source code must retain the above */
  /* copyright notice, this list of conditions and the following */
  /* disclaimer. */
  /* */
  /* 2. Redistributions in binary form must reproduce the above */
  /* copyright notice, this list of conditions and the following */
  /* disclaimer in the documentation and/or other materials */
  /* provided with the distribution. */
  /* */
  /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  /* POSSIBILITY OF SUCH DAMAGE. */
  /* */
  /* The views and conclusions contained in the software and */
  /* documentation are those of the authors and should not be */
  /* interpreted as representing official policies, either expressed */
  /* or implied, of The University of Texas at Austin. */
  /*********************************************************************/
  #include <stdio.h>
  #include "common.h"
  /*
   * a2 / a4 name the element one row after a1 / a3 in processing order:
   * the next row (+1) in the default build, the previous row (-1) when
   * MINUS is defined. They expand in terms of local pointers called a1 and
   * a3, so they are only usable where those names are in scope.
   */
  #ifndef MINUS
  #define a2 (a1 + 1)
  #define a4 (a3 + 1)
  #else
  #define a2 (a1 - 1)
  #define a4 (a3 - 1)
  #endif
  47. int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
  48. FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
  49. BLASLONG i, j, ip1, ip2, rows;
  50. blasint *piv;
  51. FLOAT *a1, *a3;
  52. FLOAT *b1, *b2, *b3, *b4;
  53. FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
  54. a--;
  55. k1 --;
  56. ipiv += k1;
  57. #ifdef MINUS
  58. ipiv -= (k2 - k1 - 1) * incx;
  59. #endif
  60. if (n <= 0) return 0;
  61. j = (n >> 1);
  62. rows = k2-k1;
  63. if (rows <=0) return 0;
  64. if (rows == 1) {
  65. //Only have 1 row
  66. ip1 = *ipiv;
  67. a1 = a + k1 + 1;
  68. b1 = a + ip1;
  69. if(a1 == b1) return 0;
  70. for(j=0; j<n; j++){
  71. A1 = *a1;
  72. B1 = *b1;
  73. *a1 = B1;
  74. *b1 = A1;
  75. a1 += lda;
  76. b1 += lda;
  77. }
  78. return 0;
  79. }
  80. if (j > 0) {
  81. do {
  82. piv = ipiv;
  83. #ifndef MINUS
  84. a1 = a + k1 + 1;
  85. #else
  86. a1 = a + k2;
  87. #endif
  88. a3 = a1 + 1 * lda;
  89. ip1 = *piv;
  90. piv += incx;
  91. ip2 = *piv;
  92. piv += incx;
  93. b1 = a + ip1;
  94. b2 = a + ip2;
  95. b3 = b1 + 1 * lda;
  96. b4 = b2 + 1 * lda;
  97. i = ((rows) >> 1);
  98. // Loop pipeline
  99. i--;
  100. //Main Loop
  101. while (i > 0) {
  102. #ifdef CORE2
  103. #ifndef MINUS
  104. asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1));
  105. asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b3));
  106. asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1));
  107. asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a3));
  108. #else
  109. asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1));
  110. asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b3));
  111. asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1));
  112. asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a3));
  113. #endif
  114. #endif
  115. B1 = *b1;
  116. B2 = *b2;
  117. B3 = *b3;
  118. B4 = *b4;
  119. A1 = *a1;
  120. A2 = *a2;
  121. A3 = *a3;
  122. A4 = *a4;
  123. ip1 = *piv;
  124. piv += incx;
  125. ip2 = *piv;
  126. piv += incx;
  127. if (b1 == a1) {
  128. if (b2 == a1) {
  129. *a1 = A2;
  130. *a2 = A1;
  131. *a3 = A4;
  132. *a4 = A3;
  133. } else
  134. if (b2 != a2) {
  135. *a2 = B2;
  136. *b2 = A2;
  137. *a4 = B4;
  138. *b4 = A4;
  139. }
  140. } else
  141. if (b1 == a2) {
  142. if (b2 != a1) {
  143. if (b2 == a2) {
  144. *a1 = A2;
  145. *a2 = A1;
  146. *a3 = A4;
  147. *a4 = A3;
  148. } else {
  149. *a1 = A2;
  150. *a2 = B2;
  151. *b2 = A1;
  152. *a3 = A4;
  153. *a4 = B4;
  154. *b4 = A3;
  155. }
  156. }
  157. } else {
  158. if (b2 == a1) {
  159. *a1 = A2;
  160. *a2 = B1;
  161. *b1 = A1;
  162. *a3 = A4;
  163. *a4 = B3;
  164. *b3 = A3;
  165. } else
  166. if (b2 == a2) {
  167. *a1 = B1;
  168. *b1 = A1;
  169. *a3 = B3;
  170. *b3 = A3;
  171. } else
  172. if (b2 == b1) {
  173. *a1 = B1;
  174. *a2 = A1;
  175. *b1 = A2;
  176. *a3 = B3;
  177. *a4 = A3;
  178. *b3 = A4;
  179. } else {
  180. *a1 = B1;
  181. *a2 = B2;
  182. *b1 = A1;
  183. *b2 = A2;
  184. *a3 = B3;
  185. *a4 = B4;
  186. *b3 = A3;
  187. *b4 = A4;
  188. }
  189. }
  190. b1 = a + ip1;
  191. b2 = a + ip2;
  192. b3 = b1 + 1 * lda;
  193. b4 = b2 + 1 * lda;
  194. #ifndef MINUS
  195. a1 += 2;
  196. a3 += 2;
  197. #else
  198. a1 -= 2;
  199. a3 -= 2;
  200. #endif
  201. i --;
  202. }
  203. //Loop Ending
  204. B1 = *b1;
  205. B2 = *b2;
  206. B3 = *b3;
  207. B4 = *b4;
  208. A1 = *a1;
  209. A2 = *a2;
  210. A3 = *a3;
  211. A4 = *a4;
  212. if (b1 == a1) {
  213. if (b2 == a1) {
  214. *a1 = A2;
  215. *a2 = A1;
  216. *a3 = A4;
  217. *a4 = A3;
  218. } else
  219. if (b2 != a2) {
  220. *a2 = B2;
  221. *b2 = A2;
  222. *a4 = B4;
  223. *b4 = A4;
  224. }
  225. } else
  226. if (b1 == a2) {
  227. if (b2 != a1) {
  228. if (b2 == a2) {
  229. *a1 = A2;
  230. *a2 = A1;
  231. *a3 = A4;
  232. *a4 = A3;
  233. } else {
  234. *a1 = A2;
  235. *a2 = B2;
  236. *b2 = A1;
  237. *a3 = A4;
  238. *a4 = B4;
  239. *b4 = A3;
  240. }
  241. }
  242. } else {
  243. if (b2 == a1) {
  244. *a1 = A2;
  245. *a2 = B1;
  246. *b1 = A1;
  247. *a3 = A4;
  248. *a4 = B3;
  249. *b3 = A3;
  250. } else
  251. if (b2 == a2) {
  252. *a1 = B1;
  253. *b1 = A1;
  254. *a3 = B3;
  255. *b3 = A3;
  256. } else
  257. if (b2 == b1) {
  258. *a1 = B1;
  259. *a2 = A1;
  260. *b1 = A2;
  261. *a3 = B3;
  262. *a4 = A3;
  263. *b3 = A4;
  264. } else {
  265. *a1 = B1;
  266. *a2 = B2;
  267. *b1 = A1;
  268. *b2 = A2;
  269. *a3 = B3;
  270. *a4 = B4;
  271. *b3 = A3;
  272. *b4 = A4;
  273. }
  274. }
  275. #ifndef MINUS
  276. a1 += 2;
  277. a3 += 2;
  278. #else
  279. a1 -= 2;
  280. a3 -= 2;
  281. #endif
  282. //Remain
  283. i = ((rows) & 1);
  284. if (i > 0) {
  285. ip1 = *piv;
  286. b1 = a + ip1;
  287. b3 = b1 + 1 * lda;
  288. A1 = *a1;
  289. B1 = *b1;
  290. A3 = *a3;
  291. B3 = *b3;
  292. *a1 = B1;
  293. *b1 = A1;
  294. *a3 = B3;
  295. *b3 = A3;
  296. }
  297. a += 2 * lda;
  298. j --;
  299. } while (j > 0);
  300. }
  301. if (n & 1) {
  302. piv = ipiv;
  303. #ifndef MINUS
  304. a1 = a + k1 + 1;
  305. #else
  306. a1 = a + k2;
  307. #endif
  308. ip1 = *piv;
  309. piv += incx;
  310. ip2 = *piv;
  311. piv += incx;
  312. b1 = a + ip1;
  313. b2 = a + ip2;
  314. i = ((rows) >> 1);
  315. i --;
  316. while (i > 0) {
  317. A1 = *a1;
  318. A2 = *a2;
  319. B1 = *b1;
  320. B2 = *b2;
  321. ip1 = *piv;
  322. piv += incx;
  323. ip2 = *piv;
  324. piv += incx;
  325. if (b1 == a1) {
  326. if (b2 == a1) {
  327. *a1 = A2;
  328. *a2 = A1;
  329. } else
  330. if (b2 != a2) {
  331. *a2 = B2;
  332. *b2 = A2;
  333. }
  334. } else
  335. if (b1 == a2) {
  336. if (b2 != a1) {
  337. if (b2 == a2) {
  338. *a1 = A2;
  339. *a2 = A1;
  340. } else {
  341. *a1 = A2;
  342. *a2 = B2;
  343. *b2 = A1;
  344. }
  345. }
  346. } else {
  347. if (b2 == a1) {
  348. *a1 = A2;
  349. *a2 = B1;
  350. *b1 = A1;
  351. } else
  352. if (b2 == a2) {
  353. *a1 = B1;
  354. *b1 = A1;
  355. } else
  356. if (b2 == b1) {
  357. *a1 = B1;
  358. *a2 = A1;
  359. *b1 = A2;
  360. } else {
  361. *a1 = B1;
  362. *a2 = B2;
  363. *b1 = A1;
  364. *b2 = A2;
  365. }
  366. }
  367. b1 = a + ip1;
  368. b2 = a + ip2;
  369. #ifndef MINUS
  370. a1 += 2;
  371. #else
  372. a1 -= 2;
  373. #endif
  374. i --;
  375. }
  376. //Loop Ending (n=1)
  377. A1 = *a1;
  378. A2 = *a2;
  379. B1 = *b1;
  380. B2 = *b2;
  381. if (b1 == a1) {
  382. if (b2 == a1) {
  383. *a1 = A2;
  384. *a2 = A1;
  385. } else
  386. if (b2 != a2) {
  387. *a2 = B2;
  388. *b2 = A2;
  389. }
  390. } else
  391. if (b1 == a2) {
  392. if (b2 != a1) {
  393. if (b2 == a2) {
  394. *a1 = A2;
  395. *a2 = A1;
  396. } else {
  397. *a1 = A2;
  398. *a2 = B2;
  399. *b2 = A1;
  400. }
  401. }
  402. } else {
  403. if (b2 == a1) {
  404. *a1 = A2;
  405. *a2 = B1;
  406. *b1 = A1;
  407. } else
  408. if (b2 == a2) {
  409. *a1 = B1;
  410. *b1 = A1;
  411. } else
  412. if (b2 == b1) {
  413. *a1 = B1;
  414. *a2 = A1;
  415. *b1 = A2;
  416. } else {
  417. *a1 = B1;
  418. *a2 = B2;
  419. *b1 = A1;
  420. *b2 = A2;
  421. }
  422. }
  423. #ifndef MINUS
  424. a1 += 2;
  425. #else
  426. a1 -= 2;
  427. #endif
  428. //Remain
  429. i = (rows & 1);
  430. if (i > 0) {
  431. ip1 = *piv;
  432. b1 = a + ip1;
  433. A1 = *a1;
  434. B1 = *b1;
  435. *a1 = B1;
  436. *b1 = A1;
  437. }
  438. }
  439. return 0;
  440. }