You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zlaswp_k_1.c 7.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  /*********************************************************************/
  /* Copyright 2009, 2010 The University of Texas at Austin. */
  /* All rights reserved. */
  /* */
  /* Redistribution and use in source and binary forms, with or */
  /* without modification, are permitted provided that the following */
  /* conditions are met: */
  /* */
  /* 1. Redistributions of source code must retain the above */
  /* copyright notice, this list of conditions and the following */
  /* disclaimer. */
  /* */
  /* 2. Redistributions in binary form must reproduce the above */
  /* copyright notice, this list of conditions and the following */
  /* disclaimer in the documentation and/or other materials */
  /* provided with the distribution. */
  /* */
  /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  /* POSSIBILITY OF SUCH DAMAGE. */
  /* */
  /* The views and conclusions contained in the software and */
  /* documentation are those of the authors and should not be */
  /* interpreted as representing official policies, either expressed */
  /* or implied, of The University of Texas at Austin. */
  /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  /*
   * a2 addresses the second row of the pair that starts at a1: the entry
   * two FLOATs after a1, or two FLOATs before it when MINUS is defined.
   * The macro expands textually, so a pointer named a1 must be in scope
   * wherever a2 is used, and no identifier may itself be called a2.
   */
  #ifndef MINUS
  #define a2 (a1 + 2)
  #else
  #define a2 (a1 - 2)
  #endif
  45. int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
  46. FLOAT *a, BLASLONG lda,
  47. FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
  48. BLASLONG i, j, ip1, ip2, rows;
  49. blasint *piv;
  50. FLOAT *a1;
  51. FLOAT *b1, *b2;
  52. FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
  53. a -= 2;
  54. lda *= 2;
  55. k1 --;
  56. ipiv += k1;
  57. #ifdef MINUS
  58. ipiv -= (k2 - k1 - 1) * incx;
  59. #endif
  60. if (n <= 0) return 0;
  61. rows = k2-k1;
  62. if (rows <=0) return 0;
  63. if (rows == 1) {
  64. //Only have 1 row
  65. ip1 = *ipiv * 2;
  66. #ifndef MINUS
  67. a1 = a + (k1 + 1) * 2;
  68. #else
  69. a1 = a + k2 * 2;
  70. #endif
  71. b1 = a + ip1;
  72. if(a1 == b1) return 0;
  73. for(j=0; j<n; j++){
  74. A1 = *(a1 + 0);
  75. A2 = *(a1 + 1);
  76. B1 = *(b1 + 0);
  77. B2 = *(b1 + 1);
  78. *(a1 + 0) = B1;
  79. *(a1 + 1) = B2;
  80. *(b1 + 0) = A1;
  81. *(b1 + 1) = A2;
  82. a1 += lda;
  83. b1 += lda;
  84. }
  85. return 0;
  86. }
  87. j = n;
  88. if (j > 0) {
  89. do {
  90. piv = ipiv;
  91. #ifndef MINUS
  92. a1 = a + (k1 + 1) * 2;
  93. #else
  94. a1 = a + k2 * 2;
  95. #endif
  96. ip1 = *piv * 2;
  97. piv += incx;
  98. ip2 = *piv * 2;
  99. piv += incx;
  100. b1 = a + ip1;
  101. b2 = a + ip2;
  102. i = ((k2 - k1) >> 1);
  103. i --;
  104. //Loop pipeline
  105. //Main Loop
  106. while (i > 0) {
  107. #ifdef OPTERON
  108. #ifndef MINUS
  109. asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1));
  110. asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(b1));
  111. #else
  112. asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(a1));
  113. asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(b1));
  114. #endif
  115. #endif
  116. #ifdef CORE2
  117. #ifndef MINUS
  118. asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(a1));
  119. asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b1));
  120. asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b2));
  121. #else
  122. asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(a1));
  123. asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b1));
  124. asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b2));
  125. #endif
  126. #endif
  127. A1 = *(a1 + 0);
  128. A2 = *(a1 + 1);
  129. A3 = *(a2 + 0);
  130. A4 = *(a2 + 1);
  131. B1 = *(b1 + 0);
  132. B2 = *(b1 + 1);
  133. B3 = *(b2 + 0);
  134. B4 = *(b2 + 1);
  135. ip1 = *piv * 2;
  136. piv += incx;
  137. ip2 = *piv * 2;
  138. piv += incx;
  139. if (b1 == a1) {
  140. if (b2 == a1) {
  141. *(a1 + 0) = A3;
  142. *(a1 + 1) = A4;
  143. *(a2 + 0) = A1;
  144. *(a2 + 1) = A2;
  145. } else
  146. if (b2 != a2) {
  147. *(a2 + 0) = B3;
  148. *(a2 + 1) = B4;
  149. *(b2 + 0) = A3;
  150. *(b2 + 1) = A4;
  151. }
  152. } else
  153. if (b1 == a2) {
  154. if (b2 != a1) {
  155. if (b2 == a2) {
  156. *(a1 + 0) = A3;
  157. *(a1 + 1) = A4;
  158. *(a2 + 0) = A1;
  159. *(a2 + 1) = A2;
  160. } else {
  161. *(a1 + 0) = A3;
  162. *(a1 + 1) = A4;
  163. *(a2 + 0) = B3;
  164. *(a2 + 1) = B4;
  165. *(b2 + 0) = A1;
  166. *(b2 + 1) = A2;
  167. }
  168. }
  169. } else {
  170. if (b2 == a1) {
  171. *(a1 + 0) = A3;
  172. *(a1 + 1) = A4;
  173. *(a2 + 0) = B1;
  174. *(a2 + 1) = B2;
  175. *(b1 + 0) = A1;
  176. *(b1 + 1) = A2;
  177. } else
  178. if (b2 == a2) {
  179. *(a1 + 0) = B1;
  180. *(a1 + 1) = B2;
  181. *(b1 + 0) = A1;
  182. *(b1 + 1) = A2;
  183. } else
  184. if (b2 == b1) {
  185. *(a1 + 0) = B1;
  186. *(a1 + 1) = B2;
  187. *(a2 + 0) = A1;
  188. *(a2 + 1) = A2;
  189. *(b1 + 0) = A3;
  190. *(b1 + 1) = A4;
  191. } else {
  192. *(a1 + 0) = B1;
  193. *(a1 + 1) = B2;
  194. *(a2 + 0) = B3;
  195. *(a2 + 1) = B4;
  196. *(b1 + 0) = A1;
  197. *(b1 + 1) = A2;
  198. *(b2 + 0) = A3;
  199. *(b2 + 1) = A4;
  200. }
  201. }
  202. b1 = a + ip1;
  203. b2 = a + ip2;
  204. #ifndef MINUS
  205. a1 += 4;
  206. #else
  207. a1 -= 4;
  208. #endif
  209. i --;
  210. }
  211. //Loop Ending
  212. A1 = *(a1 + 0);
  213. A2 = *(a1 + 1);
  214. A3 = *(a2 + 0);
  215. A4 = *(a2 + 1);
  216. B1 = *(b1 + 0);
  217. B2 = *(b1 + 1);
  218. B3 = *(b2 + 0);
  219. B4 = *(b2 + 1);
  220. if (b1 == a1) {
  221. if (b2 == a1) {
  222. *(a1 + 0) = A3;
  223. *(a1 + 1) = A4;
  224. *(a2 + 0) = A1;
  225. *(a2 + 1) = A2;
  226. } else
  227. if (b2 != a2) {
  228. *(a2 + 0) = B3;
  229. *(a2 + 1) = B4;
  230. *(b2 + 0) = A3;
  231. *(b2 + 1) = A4;
  232. }
  233. } else
  234. if (b1 == a2) {
  235. if (b2 != a1) {
  236. if (b2 == a2) {
  237. *(a1 + 0) = A3;
  238. *(a1 + 1) = A4;
  239. *(a2 + 0) = A1;
  240. *(a2 + 1) = A2;
  241. } else {
  242. *(a1 + 0) = A3;
  243. *(a1 + 1) = A4;
  244. *(a2 + 0) = B3;
  245. *(a2 + 1) = B4;
  246. *(b2 + 0) = A1;
  247. *(b2 + 1) = A2;
  248. }
  249. }
  250. } else {
  251. if (b2 == a1) {
  252. *(a1 + 0) = A3;
  253. *(a1 + 1) = A4;
  254. *(a2 + 0) = B1;
  255. *(a2 + 1) = B2;
  256. *(b1 + 0) = A1;
  257. *(b1 + 1) = A2;
  258. } else
  259. if (b2 == a2) {
  260. *(a1 + 0) = B1;
  261. *(a1 + 1) = B2;
  262. *(b1 + 0) = A1;
  263. *(b1 + 1) = A2;
  264. } else
  265. if (b2 == b1) {
  266. *(a1 + 0) = B1;
  267. *(a1 + 1) = B2;
  268. *(a2 + 0) = A1;
  269. *(a2 + 1) = A2;
  270. *(b1 + 0) = A3;
  271. *(b1 + 1) = A4;
  272. } else {
  273. *(a1 + 0) = B1;
  274. *(a1 + 1) = B2;
  275. *(a2 + 0) = B3;
  276. *(a2 + 1) = B4;
  277. *(b1 + 0) = A1;
  278. *(b1 + 1) = A2;
  279. *(b2 + 0) = A3;
  280. *(b2 + 1) = A4;
  281. }
  282. }
  283. #ifndef MINUS
  284. a1 += 4;
  285. #else
  286. a1 -= 4;
  287. #endif
  288. //Remain
  289. i = (rows & 1);
  290. if (i > 0) {
  291. ip1 = *piv * 2;
  292. b1 = a + ip1;
  293. A1 = *(a1 + 0);
  294. A2 = *(a1 + 1);
  295. B1 = *(b1 + 0);
  296. B2 = *(b1 + 1);
  297. *(a1 + 0) = B1;
  298. *(a1 + 1) = B2;
  299. *(b1 + 0) = A1;
  300. *(b1 + 1) = A2;
  301. }
  302. a += lda;
  303. j --;
  304. } while (j > 0);
  305. }
  306. return 0;
  307. }