/*********************************************************************/ /* Copyright 2009, 2010, 2024 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #define PREFETCHSIZE 4 int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ BLASLONG i, j, ip; blasint *piv; FLOAT *dx1, *dy1; FLOAT *dx2, *dy2; FLOAT *dx3, *dy3; FLOAT *dx4, *dy4; FLOAT *dx5, *dy5; FLOAT *dx6, *dy6; FLOAT atemp1, btemp1; FLOAT atemp2, btemp2; FLOAT atemp3, btemp3; FLOAT atemp4, btemp4; FLOAT atemp5, btemp5; FLOAT atemp6, btemp6; a--; ipiv += k1 - 1; if (n <= 0) return 0; if (k1 > k2) return 0; j = (n / 6); if (j > 0) { do { piv = ipiv; i = k1; do { ip = *piv; piv ++; dx1 = a + i; dy1 = a + ip; dx2 = a + i + lda * 1; dy2 = a + ip + lda * 1; dx3 = a + i + lda * 2; dy3 = a + ip + lda * 2; dx4 = a + i + lda * 3; dy4 = a + ip + lda * 3; dx5 = a + i + lda * 4; dy5 = a + ip + lda * 4; dx6 = a + i + lda * 5; dy6 = a + ip + lda * 5; #ifdef __GNUC__ __builtin_prefetch(dx1 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx2 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx3 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx4 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx5 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx6 + PREFETCHSIZE, 0, 1); #endif atemp1 = *dx1; btemp1 = *dy1; atemp2 = *dx2; btemp2 = *dy2; atemp3 = *dx3; btemp3 = *dy3; atemp4 = *dx4; btemp4 = *dy4; atemp5 = *dx5; btemp5 = *dy5; atemp6 = *dx6; btemp6 = *dy6; if (ip != i) { *dy1 = atemp1; *dy2 = atemp2; *dy3 = atemp3; *dy4 = atemp4; *dy5 = atemp5; *dy6 = atemp6; *(buffer + 0) = btemp1; *(buffer + 1) = btemp2; *(buffer + 2) = btemp3; *(buffer + 3) = btemp4; *(buffer + 4) = btemp5; *(buffer + 5) = btemp6; } else { *(buffer + 0) = atemp1; *(buffer + 1) = atemp2; *(buffer + 2) = atemp3; *(buffer + 3) = atemp4; *(buffer + 4) = atemp5; *(buffer + 5) = atemp6; } buffer += 6; i++; } while (i <= k2); a += 6 * lda; j --; } while (j > 0); } if ((n % 6) & 4) { piv = ipiv; ip = *piv; piv ++; dx1 = a + k1; dy1 = a + ip; dx2 = a + k1 + lda * 1; dy2 = a + ip + lda * 1; dx3 = a + k1 + lda * 2; dy3 = a + ip + lda * 2; dx4 = a + k1 + lda * 3; dy4 = a + ip + lda * 3; i = k1; do { atemp1 = *dx1; atemp2 = *dx2; atemp3 = *dx3; atemp4 = *dx4; btemp1 = *dy1; btemp2 = *dy2; btemp3 = *dy3; btemp4 = *dy4; if (ip != i) { *dy1 = atemp1; *dy2 = atemp2; *dy3 = atemp3; *dy4 = atemp4; *(buffer + 0) = btemp1; *(buffer + 1) = btemp2; *(buffer + 2) = btemp3; *(buffer + 3) = btemp4; } else { *(buffer + 0) = atemp1; *(buffer + 1) = atemp2; *(buffer + 2) = atemp3; *(buffer + 3) = atemp4; } ip = *piv; piv ++; i++; dx1 = a + i; dy1 = a + ip; dx2 = a + i + lda * 1; dy2 = a + ip + lda * 1; dx3 = a + i + lda * 2; dy3 = a + ip + lda * 2; dx4 = a + i + lda * 3; dy4 = a + ip + lda * 3; buffer += 4; } while (i <= k2); a += 4 * lda; } if ((n % 6) & 2) { piv = ipiv; i = k1; do { ip = *piv; piv ++; dx1 = a + i; dy1 = a + ip; dx2 = a + i + lda; dy2 = a + ip + lda; atemp1 = *dx1; btemp1 = *dy1; atemp2 = *dx2; btemp2 = *dy2; if (ip != i) { *dy1 = atemp1; *dy2 = atemp2; *(buffer + 0) = btemp1; *(buffer + 1) = btemp2; } else { *(buffer + 0) = atemp1; *(buffer + 1) = atemp2; } buffer += 2; i++; } while (i <= k2); a += 2 * lda; } if ((n % 6) & 1) { piv = ipiv; i = k1; do { ip = *piv; piv ++; dx1 = a + i; dy1 = a + ip; atemp1 = *dx1; btemp1 = *dy1; if (ip != i) { *dy1 = atemp1; *buffer = btemp1; } else { *buffer = atemp1; } buffer ++; i++; } while (i <= k2); // a += lda; } return 0; }