|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368 |
- /***************************************************************************
- Copyright (c) 2017, The OpenBLAS Project
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
-
-
- #include "common.h"
- #include <math.h>
-
- #define ABS fabs
- #define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
-
-
-
- #if defined(__VEC__) || defined(__ALTIVEC__)
-
- /**
- * Find maximum index
- * Warning: requirements n>0 and n % 16 == 0
- * @param n
- * @param x pointer to the vector
- * @param maxf (out) maximum absolute value .( only for output )
- * @return index
- */
- static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
-
- BLASLONG index;
- register __vector long long start = {1,0};
- register __vector long long temp_add_index = {2, 2};
- __asm__(
-
- "lxvd2x 44, 0,%[ptr_tmp] \n\t"
- "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
- "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
- "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
- "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
- "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
- "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
- "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
-
- "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
- "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
- "xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
- "vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
- "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_max_index
- "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
- "xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
- "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
- XXSPLTD_S(36,36,0)
-
-
-
- "xvabsdp 44, 44 \n\t"
- "xvabsdp 45, 45 \n\t"
- "xvabsdp 46, 46 \n\t"
- "xvabsdp 47, 47 \n\t"
- "xvabsdp 48, 48 \n\t"
- "xvabsdp 49, 49 \n\t"
- "xvabsdp 6, 6 \n\t"
- "xvabsdp 7, 7 \n\t"
-
- //jump first half forward
- "b two%= \n\t"
-
- ".align 5 \n\t"
- "one%=: \n\t"
-
-
- XXMRGHD_S(0,44,45)
- XXMRGLD_S(1,44,45)
- XXMRGHD_S(2,46,47)
- XXMRGLD_S(3,46,47)
- XXMRGHD_S(4,48,49)
- XXMRGLD_S(5,48,49)
- XXMRGHD_S(44,6,7)
- XXMRGLD_S(45,6,7)
-
- "xvadddp 46, 0,1 \n\t"
- "xvadddp 47, 2,3 \n\t"
- "xvadddp 48, 4,5 \n\t"
- "xvadddp 49, 44,45 \n\t"
-
-
-
- "xvcmpgtdp 6,47,46 \n\t "
- "xvcmpgtdp 7,49,48 \n\t "
-
- "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
-
- "xxsel 32,40,41,6 \n\t"
- "xxsel 0,46,47,6 \n\t"
- "xxsel 33,42,43,7 \n\t"
- "xxsel 1,48,49,7 \n\t"
-
- "lxvd2x 44, 0,%[ptr_tmp] \n\t"
- "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
-
- "xvcmpgtdp 2,1,0 \n\t "
- "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
- "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
-
-
- "xxsel 32,32,33,2 \n\t"
- "xxsel 3,0,1,2 \n\t"
-
- "vaddudm 0,0,5 \n\t"
-
- //cmp with previous
-
- "xvcmpgtdp 4,3,39 \n\t "
- "vaddudm 5,5,4 \n\t"
-
- "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
- "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
- "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
- "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
- //select with previous
- "xxsel 38,38,32,4 \n\t"
- "xxsel 39,39,3,4 \n\t"
-
-
-
-
- "xvabsdp 44, 44 \n\t"
- "xvabsdp 45, 45 \n\t"
- "xvabsdp 46, 46 \n\t"
- "xvabsdp 47, 47 \n\t"
- "xvabsdp 48, 48 \n\t"
- "xvabsdp 49, 49 \n\t"
- "xvabsdp 6, 6 \n\t"
- "xvabsdp 7, 7 \n\t"
-
-
- //>>/////////////////////////////// half start
- "two%=: \n\t"
- XXMRGHD_S(0,44,45)
- XXMRGLD_S(1,44,45)
- XXMRGHD_S(2,46,47)
- XXMRGLD_S(3,46,47)
- XXMRGHD_S(4,48,49)
- XXMRGLD_S(5,48,49)
- XXMRGHD_S(44,6,7)
- XXMRGLD_S(45,6,7)
-
- "xvadddp 46, 0,1 \n\t"
- "xvadddp 47, 2,3 \n\t"
- "xvadddp 48, 4,5 \n\t"
- "xvadddp 49, 44,45 \n\t"
-
- "xvcmpgtdp 6,47,46 \n\t "
- "xvcmpgtdp 7,49,48 \n\t "
-
- "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
-
- "xxsel 32,40,41,6 \n\t"
- "xxsel 0,46,47,6 \n\t"
- "xxsel 33,42,43,7 \n\t"
- "xxsel 1,48,49,7 \n\t"
-
- "lxvd2x 44, 0,%[ptr_tmp] \n\t"
- "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
-
- "xvcmpgtdp 2,1,0 \n\t "
- "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
- "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
-
-
- "xxsel 32,32,33,2 \n\t"
- "xxsel 3,0,1,2 \n\t"
-
- "vaddudm 0,0,5 \n\t"
-
- //cmp with previous
-
- "xvcmpgtdp 4,3,39 \n\t "
- "vaddudm 5,5,4 \n\t"
-
- "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
- "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
- "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
- "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
- //select with previous
- "xxsel 38,38,32,4 \n\t"
- "xxsel 39,39,3,4 \n\t"
-
-
- "xvabsdp 44, 44 \n\t"
- "xvabsdp 45, 45 \n\t"
- "xvabsdp 46, 46 \n\t"
- "xvabsdp 47, 47 \n\t"
- "xvabsdp 48, 48 \n\t"
- "xvabsdp 49, 49 \n\t"
- "xvabsdp 6, 6 \n\t"
- "xvabsdp 7, 7 \n\t"
-
-
- //decrement n
- "addic. %[n], %[n], -16 \n\t"
- //Loop back if >0
- "bgt+ one%= \n\t"
-
-
- XXMRGHD_S(0,44,45)
- XXMRGLD_S(1,44,45)
- XXMRGHD_S(2,46,47)
- XXMRGLD_S(3,46,47)
- XXMRGHD_S(4,48,49)
- XXMRGLD_S(5,48,49)
- XXMRGHD_S(44,6,7)
- XXMRGLD_S(45,6,7)
-
- "xvadddp 46, 0,1 \n\t"
- "xvadddp 47, 2,3 \n\t"
- "xvadddp 48, 4,5 \n\t"
- "xvadddp 49, 44,45 \n\t"
-
-
-
- "xvcmpgtdp 6,47,46 \n\t "
- "xvcmpgtdp 7,49,48 \n\t "
-
- "xxsel 32,40,41,6 \n\t"
- "xxsel 0,46,47,6 \n\t"
- "xxsel 33,42,43,7 \n\t"
- "xxsel 1,48,49,7 \n\t"
-
- "xvcmpgtdp 2,1,0 \n\t "
- "xxsel 32,32,33,2 \n\t"
- "xxsel 3,0,1,2 \n\t"
-
- "vaddudm 0,0,5 \n\t"
-
- "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
- //cmp with previous
-
- "xvcmpgtdp 4,3,39 \n\t "
- "vaddudm 5,5,4 \n\t"
- "xxsel 38,38,32,4 \n\t"
- "xxsel 39,39,3,4 \n\t"
-
-
- ///////extract max value and max index from vector
-
- XXSPLTD_S(32,38,1)
- XXSPLTD_S(40,39,1)
- "xvcmpeqdp. 2, 40,39 \n\t"
-
- //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
- //0b001110=14
- "bc 14,24, three%= \n\t"
- "xvcmpgtdp 4, 40,39 \n\t"
- "xxsel 0,39,40,4 \n\t"
- "xxsel 1,38,32,4 \n\t"
- "stxsdx 0,0,%[ptr_maxf] \n\t"
- "b four%= \n\t"
-
- "three%=: \n\t"
- //if elements value are equal then choose minimum index
- XXSPLTD_S(0,40,0)
- "vminud 0,0,6 \n\t" //vs32 vs38
- "xxlor 1,32,32 \n\t"
- "stxsdx 0,0,%[ptr_maxf] \n\t"
-
-
- "four%=: \n\t"
- "mfvsrd %[index],1 \n\t"
-
- : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
- : [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) ,
- [i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
- [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
- [start] "v"(start), [adder] "v"(temp_add_index)
- : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
- "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
- );
-
- return index;
-
- }
-
- #endif
-
-
-
-
- BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
- {
- BLASLONG i = 0;
- BLASLONG ix = 0;
- FLOAT maxf = 0;
- BLASLONG max = 0;
- BLASLONG inc_x2;
-
- if (n <= 0 || inc_x <= 0) return(max);
-
- if (inc_x == 1) {
-
- #if defined(_CALL_ELF) && (_CALL_ELF == 2)
- #if defined(__VEC__) || defined(__ALTIVEC__)
-
- BLASLONG n1 = n & -16;
- if (n1 > 0) {
-
- max = ziamax_kernel_16(n1, x, &maxf);
- i = n1;
- ix = n1 << 1;
- }
- #endif
- #endif
-
- while(i < n)
- {
- if( CABS1(x,ix) > maxf )
- {
- max = i;
- maxf = CABS1(x,ix);
- }
- ix += 2;
- i++;
- }
- return (max + 1);
-
- } else {
-
- inc_x2 = 2 * inc_x;
-
- maxf = CABS1(x,0);
- ix += inc_x2;
- i++;
-
- while(i < n)
- {
- if( CABS1(x,ix) > maxf )
- {
- max = i;
- maxf = CABS1(x,ix);
- }
- ix += inc_x2;
- i++;
- }
- return (max + 1);
- }
-
- }
-
-
|