You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zlaswp_k_1.c 7.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  /*
   * a2 is the row entry handled together with a1 when rows are swapped two at
   * a time: two FLOATs before a1 when MINUS is defined, two FLOATs after it
   * otherwise. It expands to an expression over the local pointer a1, so it
   * is only usable where a1 is in scope.
   */
  #ifdef MINUS
  #define a2 (a1 - 2)
  #else
  #define a2 (a1 + 2)
  #endif
  45. int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
  46. FLOAT *a, BLASLONG lda,
  47. FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
  48. BLASLONG i, j, ip1, ip2, rows;
  49. blasint *piv;
  50. FLOAT *a1;
  51. FLOAT *b1, *b2;
  52. FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
  53. a -= 2;
  54. lda *= 2;
  55. k1 --;
  56. #ifndef MINUS
  57. ipiv += k1;
  58. #else
  59. ipiv -= (k2 - 1) * incx;
  60. #endif
  61. if (n <= 0) return 0;
  62. rows = k2-k1;
  63. if (rows <=0) return 0;
  64. if (rows == 1) {
  65. //Only have 1 row
  66. ip1 = *ipiv * 2;
  67. #ifndef MINUS
  68. a1 = a + (k1 + 1) * 2;
  69. #else
  70. a1 = a + k2 * 2;
  71. #endif
  72. b1 = a + ip1;
  73. if(a1 == b1) return 0;
  74. for(j=0; j<n; j++){
  75. A1 = *(a1 + 0);
  76. A2 = *(a1 + 1);
  77. B1 = *(b1 + 0);
  78. B2 = *(b1 + 1);
  79. *(a1 + 0) = B1;
  80. *(a1 + 1) = B2;
  81. *(b1 + 0) = A1;
  82. *(b1 + 1) = A2;
  83. a1 += lda;
  84. b1 += lda;
  85. }
  86. return 0;
  87. }
  88. j = n;
  89. if (j > 0) {
  90. do {
  91. piv = ipiv;
  92. #ifndef MINUS
  93. a1 = a + (k1 + 1) * 2;
  94. #else
  95. a1 = a + k2 * 2;
  96. #endif
  97. ip1 = *piv * 2;
  98. piv += incx;
  99. ip2 = *piv * 2;
  100. piv += incx;
  101. b1 = a + ip1;
  102. b2 = a + ip2;
  103. i = ((k2 - k1) >> 1);
  104. i --;
  105. //Loop pipeline
  106. //Main Loop
  107. while (i > 0) {
  108. #ifdef OPTERON
  109. #ifndef MINUS
  110. asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1));
  111. asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(b1));
  112. #else
  113. asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(a1));
  114. asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(b1));
  115. #endif
  116. #endif
  117. #ifdef CORE2
  118. #ifndef MINUS
  119. asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(a1));
  120. asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b1));
  121. asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b2));
  122. #else
  123. asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(a1));
  124. asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b1));
  125. asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b2));
  126. #endif
  127. #endif
  128. A1 = *(a1 + 0);
  129. A2 = *(a1 + 1);
  130. A3 = *(a2 + 0);
  131. A4 = *(a2 + 1);
  132. B1 = *(b1 + 0);
  133. B2 = *(b1 + 1);
  134. B3 = *(b2 + 0);
  135. B4 = *(b2 + 1);
  136. ip1 = *piv * 2;
  137. piv += incx;
  138. ip2 = *piv * 2;
  139. piv += incx;
  140. if (b1 == a1) {
  141. if (b2 == a1) {
  142. *(a1 + 0) = A3;
  143. *(a1 + 1) = A4;
  144. *(a2 + 0) = A1;
  145. *(a2 + 1) = A2;
  146. } else
  147. if (b2 != a2) {
  148. *(a2 + 0) = B3;
  149. *(a2 + 1) = B4;
  150. *(b2 + 0) = A3;
  151. *(b2 + 1) = A4;
  152. }
  153. } else
  154. if (b1 == a2) {
  155. if (b2 != a1) {
  156. if (b2 == a2) {
  157. *(a1 + 0) = A3;
  158. *(a1 + 1) = A4;
  159. *(a2 + 0) = A1;
  160. *(a2 + 1) = A2;
  161. } else {
  162. *(a1 + 0) = A3;
  163. *(a1 + 1) = A4;
  164. *(a2 + 0) = B3;
  165. *(a2 + 1) = B4;
  166. *(b2 + 0) = A1;
  167. *(b2 + 1) = A2;
  168. }
  169. }
  170. } else {
  171. if (b2 == a1) {
  172. *(a1 + 0) = A3;
  173. *(a1 + 1) = A4;
  174. *(a2 + 0) = B1;
  175. *(a2 + 1) = B2;
  176. *(b1 + 0) = A1;
  177. *(b1 + 1) = A2;
  178. } else
  179. if (b2 == a2) {
  180. *(a1 + 0) = B1;
  181. *(a1 + 1) = B2;
  182. *(b1 + 0) = A1;
  183. *(b1 + 1) = A2;
  184. } else
  185. if (b2 == b1) {
  186. *(a1 + 0) = B1;
  187. *(a1 + 1) = B2;
  188. *(a2 + 0) = A1;
  189. *(a2 + 1) = A2;
  190. *(b1 + 0) = A3;
  191. *(b1 + 1) = A4;
  192. } else {
  193. *(a1 + 0) = B1;
  194. *(a1 + 1) = B2;
  195. *(a2 + 0) = B3;
  196. *(a2 + 1) = B4;
  197. *(b1 + 0) = A1;
  198. *(b1 + 1) = A2;
  199. *(b2 + 0) = A3;
  200. *(b2 + 1) = A4;
  201. }
  202. }
  203. b1 = a + ip1;
  204. b2 = a + ip2;
  205. #ifndef MINUS
  206. a1 += 4;
  207. #else
  208. a1 -= 4;
  209. #endif
  210. i --;
  211. }
  212. //Loop Ending
  213. A1 = *(a1 + 0);
  214. A2 = *(a1 + 1);
  215. A3 = *(a2 + 0);
  216. A4 = *(a2 + 1);
  217. B1 = *(b1 + 0);
  218. B2 = *(b1 + 1);
  219. B3 = *(b2 + 0);
  220. B4 = *(b2 + 1);
  221. if (b1 == a1) {
  222. if (b2 == a1) {
  223. *(a1 + 0) = A3;
  224. *(a1 + 1) = A4;
  225. *(a2 + 0) = A1;
  226. *(a2 + 1) = A2;
  227. } else
  228. if (b2 != a2) {
  229. *(a2 + 0) = B3;
  230. *(a2 + 1) = B4;
  231. *(b2 + 0) = A3;
  232. *(b2 + 1) = A4;
  233. }
  234. } else
  235. if (b1 == a2) {
  236. if (b2 != a1) {
  237. if (b2 == a2) {
  238. *(a1 + 0) = A3;
  239. *(a1 + 1) = A4;
  240. *(a2 + 0) = A1;
  241. *(a2 + 1) = A2;
  242. } else {
  243. *(a1 + 0) = A3;
  244. *(a1 + 1) = A4;
  245. *(a2 + 0) = B3;
  246. *(a2 + 1) = B4;
  247. *(b2 + 0) = A1;
  248. *(b2 + 1) = A2;
  249. }
  250. }
  251. } else {
  252. if (b2 == a1) {
  253. *(a1 + 0) = A3;
  254. *(a1 + 1) = A4;
  255. *(a2 + 0) = B1;
  256. *(a2 + 1) = B2;
  257. *(b1 + 0) = A1;
  258. *(b1 + 1) = A2;
  259. } else
  260. if (b2 == a2) {
  261. *(a1 + 0) = B1;
  262. *(a1 + 1) = B2;
  263. *(b1 + 0) = A1;
  264. *(b1 + 1) = A2;
  265. } else
  266. if (b2 == b1) {
  267. *(a1 + 0) = B1;
  268. *(a1 + 1) = B2;
  269. *(a2 + 0) = A1;
  270. *(a2 + 1) = A2;
  271. *(b1 + 0) = A3;
  272. *(b1 + 1) = A4;
  273. } else {
  274. *(a1 + 0) = B1;
  275. *(a1 + 1) = B2;
  276. *(a2 + 0) = B3;
  277. *(a2 + 1) = B4;
  278. *(b1 + 0) = A1;
  279. *(b1 + 1) = A2;
  280. *(b2 + 0) = A3;
  281. *(b2 + 1) = A4;
  282. }
  283. }
  284. #ifndef MINUS
  285. a1 += 4;
  286. #else
  287. a1 -= 4;
  288. #endif
  289. //Remain
  290. i = (rows & 1);
  291. if (i > 0) {
  292. ip1 = *piv * 2;
  293. b1 = a + ip1;
  294. A1 = *(a1 + 0);
  295. A2 = *(a1 + 1);
  296. B1 = *(b1 + 0);
  297. B2 = *(b1 + 1);
  298. *(a1 + 0) = B1;
  299. *(a1 + 1) = B2;
  300. *(b1 + 0) = A1;
  301. *(b1 + 1) = A2;
  302. }
  303. a += lda;
  304. j --;
  305. } while (j > 0);
  306. }
  307. return 0;
  308. }