@@ -1,333 +1,333 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2016, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define M x0 | |||||
#define N x1 | |||||
#define A00 x2 | |||||
#define LDA x3 | |||||
#define B00 x4 | |||||
#define A01 x5 | |||||
#define A02 x6 | |||||
#define A03 x7 | |||||
#define A04 x8 | |||||
#define I x9 | |||||
#define J x10 | |||||
#define TEMP1 x11 | |||||
#define TEMP2 x12 | |||||
#define A_PREFETCH 2560 | |||||
/************************************************************************************** | |||||
* Macro definitions | |||||
**************************************************************************************/ | |||||
.macro SAVE_REGS | |||||
add sp, sp, #-(11 * 16) | |||||
stp d8, d9, [sp, #(0 * 16)] | |||||
stp d10, d11, [sp, #(1 * 16)] | |||||
stp d12, d13, [sp, #(2 * 16)] | |||||
stp d14, d15, [sp, #(3 * 16)] | |||||
stp d16, d17, [sp, #(4 * 16)] | |||||
stp x18, x19, [sp, #(5 * 16)] | |||||
stp x20, x21, [sp, #(6 * 16)] | |||||
stp x22, x23, [sp, #(7 * 16)] | |||||
stp x24, x25, [sp, #(8 * 16)] | |||||
stp x26, x27, [sp, #(9 * 16)] | |||||
str x28, [sp, #(10 * 16)] | |||||
.endm | |||||
.macro RESTORE_REGS | |||||
ldp d8, d9, [sp, #(0 * 16)] | |||||
ldp d10, d11, [sp, #(1 * 16)] | |||||
ldp d12, d13, [sp, #(2 * 16)] | |||||
ldp d14, d15, [sp, #(3 * 16)] | |||||
ldp d16, d17, [sp, #(4 * 16)] | |||||
ldp x18, x19, [sp, #(5 * 16)] | |||||
ldp x20, x21, [sp, #(6 * 16)] | |||||
ldp x22, x23, [sp, #(7 * 16)] | |||||
ldp x24, x25, [sp, #(8 * 16)] | |||||
ldp x26, x27, [sp, #(9 * 16)] | |||||
ldr x28, [sp, #(10 * 16)] | |||||
add sp, sp, #(11*16) | |||||
.endm | |||||
.macro COPY4x4 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
ins v8.s[0], v0.s[0] | |||||
ins v9.s[0], v0.s[1] | |||||
ins v10.s[0], v0.s[2] | |||||
ins v11.s[0], v0.s[3] | |||||
ldr q1, [A02], #16 | |||||
ins v8.s[1], v1.s[0] | |||||
ins v9.s[1], v1.s[1] | |||||
ins v10.s[1], v1.s[2] | |||||
ins v11.s[1], v1.s[3] | |||||
ldr q2, [A03], #16 | |||||
ins v8.s[2], v2.s[0] | |||||
ins v9.s[2], v2.s[1] | |||||
ins v10.s[2], v2.s[2] | |||||
ins v11.s[2], v2.s[3] | |||||
ldr q3, [A04], #16 | |||||
ins v8.s[3], v3.s[0] | |||||
ins v9.s[3], v3.s[1] | |||||
ins v10.s[3], v3.s[2] | |||||
ins v11.s[3], v3.s[3] | |||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] | |||||
add B00, B00, #64 | |||||
.endm | |||||
.macro COPY1x4 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr s0, [A01], #4 | |||||
ldr s1, [A02], #4 | |||||
ldr s2, [A03], #4 | |||||
ldr s3, [A04], #4 | |||||
stp s0, s1, [B00] | |||||
add B00, B00, #8 | |||||
stp s2, s3, [B00] | |||||
add B00, B00, #8 | |||||
.endm | |||||
.macro COPY4x2 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
ins v8.s[0], v0.s[0] | |||||
ins v9.s[0], v0.s[1] | |||||
ins v10.s[0], v0.s[2] | |||||
ins v11.s[0], v0.s[3] | |||||
ldr q1, [A02], #16 | |||||
ins v8.s[1], v1.s[0] | |||||
ins v9.s[1], v1.s[1] | |||||
ins v10.s[1], v1.s[2] | |||||
ins v11.s[1], v1.s[3] | |||||
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] | |||||
add B00, B00, #32 | |||||
.endm | |||||
.macro COPY1x2 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr s0, [A01], #4 | |||||
ldr s1, [A02], #4 | |||||
stp s0, s1, [B00] | |||||
add B00, B00, #8 | |||||
.endm | |||||
.macro COPY4x1 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
str q0, [B00], #16 | |||||
.endm | |||||
.macro COPY1x1 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr s0, [A01], #4 | |||||
str s0, [B00], #4 | |||||
.endm | |||||
/************************************************************************************** | |||||
* End of macro definitions | |||||
**************************************************************************************/ | |||||
PROLOGUE | |||||
.align 5 | |||||
SAVE_REGS | |||||
lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||||
.Ldgemm_ncopy_L4_BEGIN: | |||||
asr J, N, #2 // J = N / 4 | |||||
cmp J, #0 | |||||
ble .Ldgemm_ncopy_L2_BEGIN | |||||
.align 5 | |||||
.Ldgemm_ncopy_L4_M4_BEGIN: | |||||
mov A01, A00 | |||||
add A02, A01, LDA | |||||
add A03, A02, LDA | |||||
add A04, A03, LDA | |||||
add A00, A04, LDA | |||||
asr I, M, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L4_M4_40 | |||||
.align 5 | |||||
.Ldgemm_ncopy_L4_M4_20: | |||||
COPY4x4 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L4_M4_20 | |||||
.Ldgemm_ncopy_L4_M4_40: | |||||
and I, M , #3 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L4_M4_END | |||||
.align 5 | |||||
.Ldgemm_ncopy_L4_M4_60: | |||||
COPY1x4 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L4_M4_60 | |||||
.Ldgemm_ncopy_L4_M4_END: | |||||
subs J , J, #1 // j-- | |||||
bne .Ldgemm_ncopy_L4_M4_BEGIN | |||||
/*********************************************************************************************/ | |||||
.Ldgemm_ncopy_L2_BEGIN: | |||||
tst N, #3 | |||||
ble .Ldgemm_ncopy_L999 | |||||
tst N, #2 | |||||
ble .Ldgemm_ncopy_L1_BEGIN | |||||
.Ldgemm_ncopy_L2_M4_BEGIN: | |||||
mov A01, A00 | |||||
add A02, A01, LDA | |||||
add A00, A02, LDA | |||||
asr I, M, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L2_M4_40 | |||||
.align 5 | |||||
.Ldgemm_ncopy_L2_M4_20: | |||||
COPY4x2 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L2_M4_20 | |||||
.Ldgemm_ncopy_L2_M4_40: | |||||
and I, M , #3 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L2_M4_END | |||||
.align 5 | |||||
.Ldgemm_ncopy_L2_M4_60: | |||||
COPY1x2 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L2_M4_60 | |||||
.Ldgemm_ncopy_L2_M4_END: | |||||
/*********************************************************************************************/ | |||||
.Ldgemm_ncopy_L1_BEGIN: | |||||
tst N, #1 | |||||
ble .Ldgemm_ncopy_L999 | |||||
.Ldgemm_ncopy_L1_M4_BEGIN: | |||||
mov A01, A00 | |||||
asr I, M, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L1_M4_40 | |||||
.align 5 | |||||
.Ldgemm_ncopy_L1_M4_20: | |||||
COPY4x1 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L1_M4_20 | |||||
.Ldgemm_ncopy_L1_M4_40: | |||||
and I, M , #3 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L1_M4_END | |||||
.align 5 | |||||
.Ldgemm_ncopy_L1_M4_60: | |||||
COPY1x1 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L1_M4_60 | |||||
.Ldgemm_ncopy_L1_M4_END: | |||||
.Ldgemm_ncopy_L999: | |||||
mov x0, #0 | |||||
RESTORE_REGS | |||||
ret | |||||
EPILOGUE | |||||
/*************************************************************************** | |||||
Copyright (c) 2016, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define M x0 | |||||
#define N x1 | |||||
#define A00 x2 | |||||
#define LDA x3 | |||||
#define B00 x4 | |||||
#define A01 x5 | |||||
#define A02 x6 | |||||
#define A03 x7 | |||||
#define A04 x8 | |||||
#define I x9 | |||||
#define J x10 | |||||
#define TEMP1 x11 | |||||
#define TEMP2 x12 | |||||
#define A_PREFETCH 2560 | |||||
/************************************************************************************** | |||||
* Macro definitions | |||||
**************************************************************************************/ | |||||
.macro SAVE_REGS | |||||
add sp, sp, #-(11 * 16) | |||||
stp d8, d9, [sp, #(0 * 16)] | |||||
stp d10, d11, [sp, #(1 * 16)] | |||||
stp d12, d13, [sp, #(2 * 16)] | |||||
stp d14, d15, [sp, #(3 * 16)] | |||||
stp d16, d17, [sp, #(4 * 16)] | |||||
stp x18, x19, [sp, #(5 * 16)] | |||||
stp x20, x21, [sp, #(6 * 16)] | |||||
stp x22, x23, [sp, #(7 * 16)] | |||||
stp x24, x25, [sp, #(8 * 16)] | |||||
stp x26, x27, [sp, #(9 * 16)] | |||||
str x28, [sp, #(10 * 16)] | |||||
.endm | |||||
.macro RESTORE_REGS | |||||
ldp d8, d9, [sp, #(0 * 16)] | |||||
ldp d10, d11, [sp, #(1 * 16)] | |||||
ldp d12, d13, [sp, #(2 * 16)] | |||||
ldp d14, d15, [sp, #(3 * 16)] | |||||
ldp d16, d17, [sp, #(4 * 16)] | |||||
ldp x18, x19, [sp, #(5 * 16)] | |||||
ldp x20, x21, [sp, #(6 * 16)] | |||||
ldp x22, x23, [sp, #(7 * 16)] | |||||
ldp x24, x25, [sp, #(8 * 16)] | |||||
ldp x26, x27, [sp, #(9 * 16)] | |||||
ldr x28, [sp, #(10 * 16)] | |||||
add sp, sp, #(11*16) | |||||
.endm | |||||
.macro COPY4x4 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
ins v8.s[0], v0.s[0] | |||||
ins v9.s[0], v0.s[1] | |||||
ins v10.s[0], v0.s[2] | |||||
ins v11.s[0], v0.s[3] | |||||
ldr q1, [A02], #16 | |||||
ins v8.s[1], v1.s[0] | |||||
ins v9.s[1], v1.s[1] | |||||
ins v10.s[1], v1.s[2] | |||||
ins v11.s[1], v1.s[3] | |||||
ldr q2, [A03], #16 | |||||
ins v8.s[2], v2.s[0] | |||||
ins v9.s[2], v2.s[1] | |||||
ins v10.s[2], v2.s[2] | |||||
ins v11.s[2], v2.s[3] | |||||
ldr q3, [A04], #16 | |||||
ins v8.s[3], v3.s[0] | |||||
ins v9.s[3], v3.s[1] | |||||
ins v10.s[3], v3.s[2] | |||||
ins v11.s[3], v3.s[3] | |||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] | |||||
add B00, B00, #64 | |||||
.endm | |||||
.macro COPY1x4 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr s0, [A01], #4 | |||||
ldr s1, [A02], #4 | |||||
ldr s2, [A03], #4 | |||||
ldr s3, [A04], #4 | |||||
stp s0, s1, [B00] | |||||
add B00, B00, #8 | |||||
stp s2, s3, [B00] | |||||
add B00, B00, #8 | |||||
.endm | |||||
.macro COPY4x2 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
ins v8.s[0], v0.s[0] | |||||
ins v9.s[0], v0.s[1] | |||||
ins v10.s[0], v0.s[2] | |||||
ins v11.s[0], v0.s[3] | |||||
ldr q1, [A02], #16 | |||||
ins v8.s[1], v1.s[0] | |||||
ins v9.s[1], v1.s[1] | |||||
ins v10.s[1], v1.s[2] | |||||
ins v11.s[1], v1.s[3] | |||||
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] | |||||
add B00, B00, #32 | |||||
.endm | |||||
.macro COPY1x2 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr s0, [A01], #4 | |||||
ldr s1, [A02], #4 | |||||
stp s0, s1, [B00] | |||||
add B00, B00, #8 | |||||
.endm | |||||
.macro COPY4x1 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
str q0, [B00], #16 | |||||
.endm | |||||
.macro COPY1x1 | |||||
prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr s0, [A01], #4 | |||||
str s0, [B00], #4 | |||||
.endm | |||||
/************************************************************************************** | |||||
* End of macro definitions | |||||
**************************************************************************************/ | |||||
PROLOGUE | |||||
.align 5 | |||||
SAVE_REGS | |||||
lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||||
.Ldgemm_ncopy_L4_BEGIN: | |||||
asr J, N, #2 // J = N / 4 | |||||
cmp J, #0 | |||||
ble .Ldgemm_ncopy_L2_BEGIN | |||||
.align 5 | |||||
.Ldgemm_ncopy_L4_M4_BEGIN: | |||||
mov A01, A00 | |||||
add A02, A01, LDA | |||||
add A03, A02, LDA | |||||
add A04, A03, LDA | |||||
add A00, A04, LDA | |||||
asr I, M, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L4_M4_40 | |||||
.align 5 | |||||
.Ldgemm_ncopy_L4_M4_20: | |||||
COPY4x4 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L4_M4_20 | |||||
.Ldgemm_ncopy_L4_M4_40: | |||||
and I, M , #3 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L4_M4_END | |||||
.align 5 | |||||
.Ldgemm_ncopy_L4_M4_60: | |||||
COPY1x4 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L4_M4_60 | |||||
.Ldgemm_ncopy_L4_M4_END: | |||||
subs J , J, #1 // j-- | |||||
bne .Ldgemm_ncopy_L4_M4_BEGIN | |||||
/*********************************************************************************************/ | |||||
.Ldgemm_ncopy_L2_BEGIN: | |||||
tst N, #3 | |||||
ble .Ldgemm_ncopy_L999 | |||||
tst N, #2 | |||||
ble .Ldgemm_ncopy_L1_BEGIN | |||||
.Ldgemm_ncopy_L2_M4_BEGIN: | |||||
mov A01, A00 | |||||
add A02, A01, LDA | |||||
add A00, A02, LDA | |||||
asr I, M, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L2_M4_40 | |||||
.align 5 | |||||
.Ldgemm_ncopy_L2_M4_20: | |||||
COPY4x2 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L2_M4_20 | |||||
.Ldgemm_ncopy_L2_M4_40: | |||||
and I, M , #3 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L2_M4_END | |||||
.align 5 | |||||
.Ldgemm_ncopy_L2_M4_60: | |||||
COPY1x2 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L2_M4_60 | |||||
.Ldgemm_ncopy_L2_M4_END: | |||||
/*********************************************************************************************/ | |||||
.Ldgemm_ncopy_L1_BEGIN: | |||||
tst N, #1 | |||||
ble .Ldgemm_ncopy_L999 | |||||
.Ldgemm_ncopy_L1_M4_BEGIN: | |||||
mov A01, A00 | |||||
asr I, M, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L1_M4_40 | |||||
.align 5 | |||||
.Ldgemm_ncopy_L1_M4_20: | |||||
COPY4x1 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L1_M4_20 | |||||
.Ldgemm_ncopy_L1_M4_40: | |||||
and I, M , #3 | |||||
cmp I, #0 | |||||
ble .Ldgemm_ncopy_L1_M4_END | |||||
.align 5 | |||||
.Ldgemm_ncopy_L1_M4_60: | |||||
COPY1x1 | |||||
subs I , I , #1 | |||||
bne .Ldgemm_ncopy_L1_M4_60 | |||||
.Ldgemm_ncopy_L1_M4_END: | |||||
.Ldgemm_ncopy_L999: | |||||
mov x0, #0 | |||||
RESTORE_REGS | |||||
ret | |||||
EPILOGUE | |||||
@@ -1,293 +1,293 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
/************************************************************************************** | |||||
* Abdelrauf(quickwritereader@gmail.com) | |||||
* BLASTEST : OK | |||||
* CTEST : OK | |||||
* TEST : OK | |||||
* LAPACK-TEST : OK | |||||
**************************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "def_vsx.h" | |||||
#define LOAD ld | |||||
#define STACKSIZE (512 ) | |||||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||||
#define M r3 | |||||
#define N r4 | |||||
#define K r5 | |||||
#define A r8 | |||||
#define B r9 | |||||
#define C r10 | |||||
#define LDC r6 | |||||
#define OFFSET r7 | |||||
#define alpha_r vs19 | |||||
#define alpha_i vs20 | |||||
#define save_permute_1 vs21 | |||||
#define permute_mask vs22 | |||||
#define o0 0 | |||||
#define T1 r11 | |||||
#define T2 r12 | |||||
#define T3 r14 | |||||
#define T4 r15 | |||||
#define T5 r16 | |||||
#define T6 r17 | |||||
#define L r18 | |||||
#define T7 r19 | |||||
#define T8 r20 | |||||
#define TEMP_REG r21 | |||||
#define I r22 | |||||
#define J r23 | |||||
#define AO r24 | |||||
#define BO r25 | |||||
#define CO r26 | |||||
#define T9 r27 | |||||
#define T10 r28 | |||||
#define PRE r29 | |||||
#define T12 r30 | |||||
#define T13 r31 | |||||
#include "cgemm_macros_power9.S" | |||||
.equ perm_const1, 0x0405060700010203 | |||||
.equ perm_const2, 0x0c0d0e0f08090a0b | |||||
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||||
.equ save_permute_11, 0x0405060714151617 | |||||
#ifndef NEEDPARAM | |||||
PROLOGUE | |||||
PROFCODE | |||||
addi SP, SP, -STACKSIZE | |||||
mflr r0 | |||||
stfd f14, 0(SP) | |||||
stfd f15, 8(SP) | |||||
stfd f16, 16(SP) | |||||
stfd f17, 24(SP) | |||||
stfd f18, 32(SP) | |||||
stfd f19, 40(SP) | |||||
stfd f20, 48(SP) | |||||
stfd f21, 56(SP) | |||||
stfd f22, 64(SP) | |||||
stfd f23, 72(SP) | |||||
stfd f24, 80(SP) | |||||
stfd f25, 88(SP) | |||||
stfd f26, 96(SP) | |||||
stfd f27, 104(SP) | |||||
stfd f28, 112(SP) | |||||
stfd f29, 120(SP) | |||||
stfd f30, 128(SP) | |||||
stfd f31, 136(SP) | |||||
std r31, 144(SP) | |||||
std r30, 152(SP) | |||||
std r29, 160(SP) | |||||
std r28, 168(SP) | |||||
std r27, 176(SP) | |||||
std r26, 184(SP) | |||||
std r25, 192(SP) | |||||
std r24, 200(SP) | |||||
std r23, 208(SP) | |||||
std r22, 216(SP) | |||||
std r21, 224(SP) | |||||
std r20, 232(SP) | |||||
std r19, 240(SP) | |||||
std r18, 248(SP) | |||||
std r17, 256(SP) | |||||
std r16, 264(SP) | |||||
std r15, 272(SP) | |||||
std r14, 280(SP) | |||||
stxv vs52, 288(SP) | |||||
stxv vs53, 304(SP) | |||||
stxv vs54, 320(SP) | |||||
stxv vs55, 336(SP) | |||||
stxv vs56, 352(SP) | |||||
stxv vs57, 368(SP) | |||||
stxv vs58, 384(SP) | |||||
stxv vs59, 400(SP) | |||||
stxv vs60, 416(SP) | |||||
stxv vs61, 432(SP) | |||||
stxv vs62, 448(SP) | |||||
stxv vs63, 464(SP) | |||||
std r0, FLINK_SAVE(SP) | |||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||||
#ifdef TRMMKERNEL | |||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
#endif | |||||
slwi LDC, LDC, ZBASE_SHIFT | |||||
/*alpha is stored in f1. convert to single and splat*/ | |||||
xscvdpspn alpha_r,vs1 | |||||
xscvdpspn alpha_i,vs2 | |||||
xxspltw alpha_r,alpha_r,0 | |||||
xxspltw alpha_i,alpha_i,0 | |||||
/*load reverse permute mask for big endian | |||||
uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||||
*/ | |||||
lis T2, perm_const2@highest | |||||
lis T1, perm_const1@highest | |||||
lis T3, save_permute_12@highest | |||||
lis T4, save_permute_11@highest | |||||
ori T2, T2, perm_const2@higher | |||||
ori T1, T1, perm_const1@higher | |||||
ori T3, T3, save_permute_12@higher | |||||
ori T4, T4, save_permute_11@higher | |||||
rldicr T2, T2, 32, 31 | |||||
rldicr T1, T1, 32, 31 | |||||
rldicr T3, T3, 32, 31 | |||||
rldicr T4, T4, 32, 31 | |||||
oris T2, T2, perm_const2@h | |||||
oris T1, T1, perm_const1@h | |||||
oris T3, T3, save_permute_12@h | |||||
oris T4, T4, save_permute_11@h | |||||
ori T2, T2, perm_const2@l | |||||
ori T1, T1, perm_const1@l | |||||
ori T3, T3, save_permute_12@l | |||||
ori T4, T4, save_permute_11@l | |||||
li r0,0 | |||||
li PRE,512 | |||||
#if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||||
/*negate for this case as we will use addition -1*(a+b) */ | |||||
xvnegsp alpha_r,alpha_r | |||||
xvnegsp alpha_i,alpha_i | |||||
#endif | |||||
mtvsrdd permute_mask,T2,T1 | |||||
mtvsrdd save_permute_1,T3,T4 | |||||
/*mask is reverse permute so we have to make it inner permute */ | |||||
xxpermdi permute_mask, permute_mask, permute_mask,2 | |||||
#include "cgemm_logic_power9.S" | |||||
.L999: | |||||
lfd f14, 0(SP) | |||||
lfd f15, 8(SP) | |||||
lfd f16, 16(SP) | |||||
lfd f17, 24(SP) | |||||
lfd f18, 32(SP) | |||||
lfd f19, 40(SP) | |||||
lfd f20, 48(SP) | |||||
lfd f21, 56(SP) | |||||
lfd f22, 64(SP) | |||||
lfd f23, 72(SP) | |||||
lfd f24, 80(SP) | |||||
lfd f25, 88(SP) | |||||
lfd f26, 96(SP) | |||||
lfd f27, 104(SP) | |||||
lfd f28, 112(SP) | |||||
lfd f29, 120(SP) | |||||
lfd f30, 128(SP) | |||||
lfd f31, 136(SP) | |||||
ld r31, 144(SP) | |||||
ld r30, 152(SP) | |||||
ld r29, 160(SP) | |||||
ld r28, 168(SP) | |||||
ld r27, 176(SP) | |||||
ld r26, 184(SP) | |||||
ld r25, 192(SP) | |||||
ld r24, 200(SP) | |||||
ld r23, 208(SP) | |||||
ld r22, 216(SP) | |||||
ld r21, 224(SP) | |||||
ld r20, 232(SP) | |||||
ld r19, 240(SP) | |||||
ld r18, 248(SP) | |||||
ld r17, 256(SP) | |||||
ld r16, 264(SP) | |||||
ld r15, 272(SP) | |||||
ld r14, 280(SP) | |||||
ld r0, FLINK_SAVE(SP) | |||||
lxv vs52, 288(SP) | |||||
lxv vs53, 304(SP) | |||||
lxv vs54, 320(SP) | |||||
lxv vs55, 336(SP) | |||||
lxv vs56, 352(SP) | |||||
lxv vs57, 368(SP) | |||||
lxv vs58, 384(SP) | |||||
lxv vs59, 400(SP) | |||||
mtlr r0 | |||||
lxv vs60, 416(SP) | |||||
lxv vs61, 432(SP) | |||||
lxv vs62, 448(SP) | |||||
lxv vs63, 464(SP) | |||||
addi SP, SP, STACKSIZE | |||||
blr | |||||
EPILOGUE | |||||
#endif | |||||
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
/************************************************************************************** | |||||
* Abdelrauf(quickwritereader@gmail.com) | |||||
* BLASTEST : OK | |||||
* CTEST : OK | |||||
* TEST : OK | |||||
* LAPACK-TEST : OK | |||||
**************************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "def_vsx.h" | |||||
#define LOAD ld | |||||
#define STACKSIZE (512 ) | |||||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||||
#define M r3 | |||||
#define N r4 | |||||
#define K r5 | |||||
#define A r8 | |||||
#define B r9 | |||||
#define C r10 | |||||
#define LDC r6 | |||||
#define OFFSET r7 | |||||
#define alpha_r vs19 | |||||
#define alpha_i vs20 | |||||
#define save_permute_1 vs21 | |||||
#define permute_mask vs22 | |||||
#define o0 0 | |||||
#define T1 r11 | |||||
#define T2 r12 | |||||
#define T3 r14 | |||||
#define T4 r15 | |||||
#define T5 r16 | |||||
#define T6 r17 | |||||
#define L r18 | |||||
#define T7 r19 | |||||
#define T8 r20 | |||||
#define TEMP_REG r21 | |||||
#define I r22 | |||||
#define J r23 | |||||
#define AO r24 | |||||
#define BO r25 | |||||
#define CO r26 | |||||
#define T9 r27 | |||||
#define T10 r28 | |||||
#define PRE r29 | |||||
#define T12 r30 | |||||
#define T13 r31 | |||||
#include "cgemm_macros_power9.S" | |||||
.equ perm_const1, 0x0405060700010203 | |||||
.equ perm_const2, 0x0c0d0e0f08090a0b | |||||
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||||
.equ save_permute_11, 0x0405060714151617 | |||||
#ifndef NEEDPARAM | |||||
PROLOGUE | |||||
PROFCODE | |||||
addi SP, SP, -STACKSIZE | |||||
mflr r0 | |||||
stfd f14, 0(SP) | |||||
stfd f15, 8(SP) | |||||
stfd f16, 16(SP) | |||||
stfd f17, 24(SP) | |||||
stfd f18, 32(SP) | |||||
stfd f19, 40(SP) | |||||
stfd f20, 48(SP) | |||||
stfd f21, 56(SP) | |||||
stfd f22, 64(SP) | |||||
stfd f23, 72(SP) | |||||
stfd f24, 80(SP) | |||||
stfd f25, 88(SP) | |||||
stfd f26, 96(SP) | |||||
stfd f27, 104(SP) | |||||
stfd f28, 112(SP) | |||||
stfd f29, 120(SP) | |||||
stfd f30, 128(SP) | |||||
stfd f31, 136(SP) | |||||
std r31, 144(SP) | |||||
std r30, 152(SP) | |||||
std r29, 160(SP) | |||||
std r28, 168(SP) | |||||
std r27, 176(SP) | |||||
std r26, 184(SP) | |||||
std r25, 192(SP) | |||||
std r24, 200(SP) | |||||
std r23, 208(SP) | |||||
std r22, 216(SP) | |||||
std r21, 224(SP) | |||||
std r20, 232(SP) | |||||
std r19, 240(SP) | |||||
std r18, 248(SP) | |||||
std r17, 256(SP) | |||||
std r16, 264(SP) | |||||
std r15, 272(SP) | |||||
std r14, 280(SP) | |||||
stxv vs52, 288(SP) | |||||
stxv vs53, 304(SP) | |||||
stxv vs54, 320(SP) | |||||
stxv vs55, 336(SP) | |||||
stxv vs56, 352(SP) | |||||
stxv vs57, 368(SP) | |||||
stxv vs58, 384(SP) | |||||
stxv vs59, 400(SP) | |||||
stxv vs60, 416(SP) | |||||
stxv vs61, 432(SP) | |||||
stxv vs62, 448(SP) | |||||
stxv vs63, 464(SP) | |||||
std r0, FLINK_SAVE(SP) | |||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||||
#ifdef TRMMKERNEL | |||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
#endif | |||||
slwi LDC, LDC, ZBASE_SHIFT | |||||
/*alpha is stored in f1. convert to single and splat*/ | |||||
xscvdpspn alpha_r,vs1 | |||||
xscvdpspn alpha_i,vs2 | |||||
xxspltw alpha_r,alpha_r,0 | |||||
xxspltw alpha_i,alpha_i,0 | |||||
/*load reverse permute mask for big endian | |||||
uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||||
*/ | |||||
lis T2, perm_const2@highest | |||||
lis T1, perm_const1@highest | |||||
lis T3, save_permute_12@highest | |||||
lis T4, save_permute_11@highest | |||||
ori T2, T2, perm_const2@higher | |||||
ori T1, T1, perm_const1@higher | |||||
ori T3, T3, save_permute_12@higher | |||||
ori T4, T4, save_permute_11@higher | |||||
rldicr T2, T2, 32, 31 | |||||
rldicr T1, T1, 32, 31 | |||||
rldicr T3, T3, 32, 31 | |||||
rldicr T4, T4, 32, 31 | |||||
oris T2, T2, perm_const2@h | |||||
oris T1, T1, perm_const1@h | |||||
oris T3, T3, save_permute_12@h | |||||
oris T4, T4, save_permute_11@h | |||||
ori T2, T2, perm_const2@l | |||||
ori T1, T1, perm_const1@l | |||||
ori T3, T3, save_permute_12@l | |||||
ori T4, T4, save_permute_11@l | |||||
li r0,0 | |||||
li PRE,512 | |||||
#if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||||
/*negate for this case as we will use addition -1*(a+b) */ | |||||
xvnegsp alpha_r,alpha_r | |||||
xvnegsp alpha_i,alpha_i | |||||
#endif | |||||
mtvsrdd permute_mask,T2,T1 | |||||
mtvsrdd save_permute_1,T3,T4 | |||||
/*mask is reverse permute so we have to make it inner permute */ | |||||
xxpermdi permute_mask, permute_mask, permute_mask,2 | |||||
#include "cgemm_logic_power9.S" | |||||
.L999: | |||||
lfd f14, 0(SP) | |||||
lfd f15, 8(SP) | |||||
lfd f16, 16(SP) | |||||
lfd f17, 24(SP) | |||||
lfd f18, 32(SP) | |||||
lfd f19, 40(SP) | |||||
lfd f20, 48(SP) | |||||
lfd f21, 56(SP) | |||||
lfd f22, 64(SP) | |||||
lfd f23, 72(SP) | |||||
lfd f24, 80(SP) | |||||
lfd f25, 88(SP) | |||||
lfd f26, 96(SP) | |||||
lfd f27, 104(SP) | |||||
lfd f28, 112(SP) | |||||
lfd f29, 120(SP) | |||||
lfd f30, 128(SP) | |||||
lfd f31, 136(SP) | |||||
ld r31, 144(SP) | |||||
ld r30, 152(SP) | |||||
ld r29, 160(SP) | |||||
ld r28, 168(SP) | |||||
ld r27, 176(SP) | |||||
ld r26, 184(SP) | |||||
ld r25, 192(SP) | |||||
ld r24, 200(SP) | |||||
ld r23, 208(SP) | |||||
ld r22, 216(SP) | |||||
ld r21, 224(SP) | |||||
ld r20, 232(SP) | |||||
ld r19, 240(SP) | |||||
ld r18, 248(SP) | |||||
ld r17, 256(SP) | |||||
ld r16, 264(SP) | |||||
ld r15, 272(SP) | |||||
ld r14, 280(SP) | |||||
ld r0, FLINK_SAVE(SP) | |||||
lxv vs52, 288(SP) | |||||
lxv vs53, 304(SP) | |||||
lxv vs54, 320(SP) | |||||
lxv vs55, 336(SP) | |||||
lxv vs56, 352(SP) | |||||
lxv vs57, 368(SP) | |||||
lxv vs58, 384(SP) | |||||
lxv vs59, 400(SP) | |||||
mtlr r0 | |||||
lxv vs60, 416(SP) | |||||
lxv vs61, 432(SP) | |||||
lxv vs62, 448(SP) | |||||
lxv vs63, 464(SP) | |||||
addi SP, SP, STACKSIZE | |||||
blr | |||||
EPILOGUE | |||||
#endif |
/* ==================== next kernel source file ==================== */
/*************************************************************************** | |||||
Copyright (c) 2013-2018, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||||
/* crot_kernel_8: apply the complex plane (Givens) rotation
 *     x[i] <- c*x[i] + s*y[i]
 *     y[i] <- c*y[i] - s*x[i]
 * to n single-precision complex elements of the contiguous arrays x and y.
 * Caller guarantees n > 0 and n % 8 == 0 (CNAME passes n & -8).
 * c and s are real scalars, splatted into all four lanes of vs36/vs37.
 * The main loop is software-pipelined: the loads for the next group of 8
 * elements are interleaved with the arithmetic of the current group; the
 * "two" label is the drain block that finishes the last group without
 * issuing further loads. */
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
{
__vector float t0;   /* t0..t7 ([x0]..[x7] in the asm) are compiler-    */
__vector float t1;   /* allocated VSX scratch registers holding the     */
__vector float t2;   /* c*y and s*y partial products                    */
__vector float t3;
__vector float t4;
__vector float t5;
__vector float t6;
__vector float t7;
__asm__
(
"xscvdpspn 36, %x[cos] \n\t" // load c to all words
"xxspltw 36, 36, 0 \n\t"
"xscvdpspn 37, %x[sin] \n\t" // load s to all words
"xxspltw 37, 37, 0 \n\t"
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
"addi %[x_ptr], %[x_ptr], 64 \n\t"
"addi %[y_ptr], %[y_ptr], 64 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
"ble two%= \n\t" // <= 8 elements total: go straight to the drain block
".align 5 \n\t"
"one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
"xvmulsp 43, 35, 36 \n\t"
"xvmulsp %x[x0], 48, 36 \n\t" // c * y
"xvmulsp %x[x2], 49, 36 \n\t"
"xvmulsp %x[x1], 50, 36 \n\t"
"xvmulsp %x[x3], 51, 36 \n\t"
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x (next group, interleaved)
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
"xvmulsp %x[x4], 48, 37 \n\t" // s * y
"xvmulsp %x[x5], 49, 37 \n\t"
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y (next group, interleaved)
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
"xvmulsp %x[x6], 50, 37 \n\t"
"xvmulsp %x[x7], 51, 37 \n\t"
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y
"addi %[x_ptr], %[x_ptr], -64 \n\t" // rewind to the group being stored
"addi %[y_ptr], %[y_ptr], -64 \n\t"
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t"
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t"
"stxvd2x %x[x3], %[i48], %[y_ptr] \n\t"
"addi %[x_ptr], %[x_ptr], 128 \n\t" // past stored group + preloaded group
"addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
"bgt one%= \n\t"
"two%=: \n\t" // drain: rotate the already-loaded final group
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
"xvmulsp 43, 35, 36 \n\t"
"xvmulsp %x[x0], 48, 36 \n\t" // c * y
"xvmulsp %x[x2], 49, 36 \n\t"
"xvmulsp %x[x1], 50, 36 \n\t"
"xvmulsp %x[x3], 51, 36 \n\t"
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"xvmulsp %x[x4], 48, 37 \n\t" // s * y
"xvmulsp %x[x5], 49, 37 \n\t"
"xvmulsp %x[x6], 50, 37 \n\t"
"xvmulsp %x[x7], 51, 37 \n\t"
"addi %[x_ptr], %[x_ptr], -64 \n\t"
"addi %[y_ptr], %[y_ptr], -64 \n\t"
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t"
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t"
"stxvd2x %x[x3], %[i48], %[y_ptr] "
: /* outputs: memory operands tell the optimizer x[0..2n)/y[0..2n) change */
[mem_x] "+m" (*(float (*)[2*n])x),
[mem_y] "+m" (*(float (*)[2*n])y),
[temp_n] "+r" (n),
[x_ptr] "+&b" (x),
[y_ptr] "+&b" (y),
[x0] "=wa" (t0),
[x1] "=wa" (t2),
[x2] "=wa" (t1),
[x3] "=wa" (t3),
[x4] "=wa" (t4),
[x5] "=wa" (t5),
[x6] "=wa" (t6),
[x7] "=wa" (t7)
:
[cos] "f" (c),
[sin] "f" (s),
[i16] "b" (16),
[i32] "b" (32),
[i48] "b" (48)
: /* addic. writes cr0; vs32-51 are used as scratch without operands */
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51"
);
}
#endif | |||||
#endif | |||||
/* Complex single-precision plane rotation (BLAS csrot):
 *   x[i] <- c*x[i] + s*y[i]
 *   y[i] <- c*y[i] - s*x[i]
 * n      : number of complex elements
 * x, y   : interleaved (re,im) arrays
 * inc_x/y: strides in complex elements
 * c, s   : real rotation coefficients
 * Returns 0 in all cases (including n <= 0, which is a no-op).
 * For unit strides the vector kernel handles the largest multiple of 8
 * elements; the remainder and every strided case run scalar. */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT temp[2];
    BLASLONG inc_x2;
    BLASLONG inc_y2;

    if (n <= 0) return (0);

    if ((inc_x == 1) && (inc_y == 1)) {
#if defined(__VEC__) || defined(__ALTIVEC__)
        /* Vector kernel requires a multiple of 8 complex elements. */
        BLASLONG n1 = n & -8;
        if (n1 > 0) {
            crot_kernel_8(n1, x, y, c, s);
            i = n1;
            ix = 2 * n1;   /* 2 floats per complex element */
        }
#endif
        /* Scalar tail: contiguous case shares one index for x and y. */
        for (; i < n; i++, ix += 2) {
            FLOAT xr = x[ix], xi = x[ix + 1];
            FLOAT yr = y[ix], yi = y[ix + 1];
            temp[0]   = c * xr + s * yr;
            temp[1]   = c * xi + s * yi;
            y[ix]     = c * yr - s * xr;
            y[ix + 1] = c * yi - s * xi;
            x[ix]     = temp[0];
            x[ix + 1] = temp[1];
        }
    } else {
        /* Strided case: step each array by its own stride (in floats). */
        inc_x2 = 2 * inc_x;
        inc_y2 = 2 * inc_y;
        for (; i < n; i++, ix += inc_x2, iy += inc_y2) {
            FLOAT xr = x[ix], xi = x[ix + 1];
            FLOAT yr = y[iy], yi = y[iy + 1];
            temp[0]   = c * xr + s * yr;
            temp[1]   = c * xi + s * yi;
            y[iy]     = c * yr - s * xr;
            y[iy + 1] = c * yi - s * xi;
            x[ix]     = temp[0];
            x[ix + 1] = temp[1];
        }
    }
    return (0);
}
/*************************************************************************** | |||||
Copyright (c) 2013-2018, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||||
/* crot_kernel_8: apply the complex plane (Givens) rotation
 *     x[i] <- c*x[i] + s*y[i]
 *     y[i] <- c*y[i] - s*x[i]
 * to n single-precision complex elements of the contiguous arrays x and y.
 * Caller guarantees n > 0 and n % 8 == 0 (CNAME passes n & -8).
 * c and s are real scalars, splatted into all four lanes of vs36/vs37.
 * The main loop is software-pipelined: the loads for the next group of 8
 * elements are interleaved with the arithmetic of the current group; the
 * "two" label is the drain block that finishes the last group without
 * issuing further loads. */
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
{
__vector float t0;   /* t0..t7 ([x0]..[x7] in the asm) are compiler-    */
__vector float t1;   /* allocated VSX scratch registers holding the     */
__vector float t2;   /* c*y and s*y partial products                    */
__vector float t3;
__vector float t4;
__vector float t5;
__vector float t6;
__vector float t7;
__asm__
(
"xscvdpspn 36, %x[cos] \n\t" // load c to all words
"xxspltw 36, 36, 0 \n\t"
"xscvdpspn 37, %x[sin] \n\t" // load s to all words
"xxspltw 37, 37, 0 \n\t"
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
"addi %[x_ptr], %[x_ptr], 64 \n\t"
"addi %[y_ptr], %[y_ptr], 64 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
"ble two%= \n\t" // <= 8 elements total: go straight to the drain block
".align 5 \n\t"
"one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
"xvmulsp 43, 35, 36 \n\t"
"xvmulsp %x[x0], 48, 36 \n\t" // c * y
"xvmulsp %x[x2], 49, 36 \n\t"
"xvmulsp %x[x1], 50, 36 \n\t"
"xvmulsp %x[x3], 51, 36 \n\t"
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x (next group, interleaved)
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
"xvmulsp %x[x4], 48, 37 \n\t" // s * y
"xvmulsp %x[x5], 49, 37 \n\t"
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y (next group, interleaved)
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
"xvmulsp %x[x6], 50, 37 \n\t"
"xvmulsp %x[x7], 51, 37 \n\t"
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y
"addi %[x_ptr], %[x_ptr], -64 \n\t" // rewind to the group being stored
"addi %[y_ptr], %[y_ptr], -64 \n\t"
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t"
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t"
"stxvd2x %x[x3], %[i48], %[y_ptr] \n\t"
"addi %[x_ptr], %[x_ptr], 128 \n\t" // past stored group + preloaded group
"addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
"bgt one%= \n\t"
"two%=: \n\t" // drain: rotate the already-loaded final group
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
"xvmulsp 43, 35, 36 \n\t"
"xvmulsp %x[x0], 48, 36 \n\t" // c * y
"xvmulsp %x[x2], 49, 36 \n\t"
"xvmulsp %x[x1], 50, 36 \n\t"
"xvmulsp %x[x3], 51, 36 \n\t"
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"xvmulsp %x[x4], 48, 37 \n\t" // s * y
"xvmulsp %x[x5], 49, 37 \n\t"
"xvmulsp %x[x6], 50, 37 \n\t"
"xvmulsp %x[x7], 51, 37 \n\t"
"addi %[x_ptr], %[x_ptr], -64 \n\t"
"addi %[y_ptr], %[y_ptr], -64 \n\t"
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t"
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t"
"stxvd2x %x[x3], %[i48], %[y_ptr] "
: /* outputs: memory operands tell the optimizer x[0..2n)/y[0..2n) change */
[mem_x] "+m" (*(float (*)[2*n])x),
[mem_y] "+m" (*(float (*)[2*n])y),
[temp_n] "+r" (n),
[x_ptr] "+&b" (x),
[y_ptr] "+&b" (y),
[x0] "=wa" (t0),
[x1] "=wa" (t2),
[x2] "=wa" (t1),
[x3] "=wa" (t3),
[x4] "=wa" (t4),
[x5] "=wa" (t5),
[x6] "=wa" (t6),
[x7] "=wa" (t7)
:
[cos] "f" (c),
[sin] "f" (s),
[i16] "b" (16),
[i32] "b" (32),
[i48] "b" (48)
: /* addic. writes cr0; vs32-51 are used as scratch without operands */
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51"
);
}
#endif | |||||
#endif | |||||
/* Complex single-precision plane rotation (BLAS csrot):
 *   x[i] <- c*x[i] + s*y[i]
 *   y[i] <- c*y[i] - s*x[i]
 * n      : number of complex elements
 * x, y   : interleaved (re,im) arrays
 * inc_x/y: strides in complex elements
 * c, s   : real rotation coefficients
 * Returns 0 in all cases (including n <= 0, which is a no-op).
 * For unit strides the vector kernel handles the largest multiple of 8
 * elements; the remainder and every strided case run scalar. */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT temp[2];
    BLASLONG inc_x2;
    BLASLONG inc_y2;

    if (n <= 0) return (0);

    if ((inc_x == 1) && (inc_y == 1)) {
#if defined(__VEC__) || defined(__ALTIVEC__)
        /* Vector kernel requires a multiple of 8 complex elements. */
        BLASLONG n1 = n & -8;
        if (n1 > 0) {
            crot_kernel_8(n1, x, y, c, s);
            i = n1;
            ix = 2 * n1;   /* 2 floats per complex element */
        }
#endif
        /* Scalar tail: contiguous case shares one index for x and y. */
        for (; i < n; i++, ix += 2) {
            FLOAT xr = x[ix], xi = x[ix + 1];
            FLOAT yr = y[ix], yi = y[ix + 1];
            temp[0]   = c * xr + s * yr;
            temp[1]   = c * xi + s * yi;
            y[ix]     = c * yr - s * xr;
            y[ix + 1] = c * yi - s * xi;
            x[ix]     = temp[0];
            x[ix + 1] = temp[1];
        }
    } else {
        /* Strided case: step each array by its own stride (in floats). */
        inc_x2 = 2 * inc_x;
        inc_y2 = 2 * inc_y;
        for (; i < n; i++, ix += inc_x2, iy += inc_y2) {
            FLOAT xr = x[ix], xi = x[ix + 1];
            FLOAT yr = y[iy], yi = y[iy + 1];
            temp[0]   = c * xr + s * yr;
            temp[1]   = c * xi + s * yi;
            y[iy]     = c * yr - s * xr;
            y[iy + 1] = c * yi - s * xi;
            x[ix]     = temp[0];
            x[ix + 1] = temp[1];
        }
    }
    return (0);
}
/* ==================== next kernel source file ==================== */
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "def_vsx.h" | |||||
#define LOAD ld | |||||
#define STACKSIZE (512 ) | |||||
#define ALPHA_SP (296+192)(SP) | |||||
#define FZERO (304+192)(SP) | |||||
#define M r3 | |||||
#define N r4 | |||||
#define K r5 | |||||
#define A r7 | |||||
#define B r8 | |||||
#define C r9 | |||||
#define LDC r10 | |||||
#define OFFSET r6 | |||||
#define alpha_r vs18 | |||||
#define o0 0 | |||||
#define T4 r12 | |||||
#define T3 r11 | |||||
#define C4 r14 | |||||
#define o8 r15 | |||||
#define o24 r16 | |||||
#define C2 r17 | |||||
#define L r18 | |||||
#define T1 r19 | |||||
#define C3 r20 | |||||
#define TEMP_REG r21 | |||||
#define I r22 | |||||
#define J r23 | |||||
#define AO r24 | |||||
#define BO r25 | |||||
#define CO r26 | |||||
#define o16 r27 | |||||
#define o32 r28 | |||||
#define o48 r29 | |||||
#define PRE r30 | |||||
#define T2 r31 | |||||
#include "dgemm_macros_power9.S" | |||||
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
/* Prologue: carve out the 512-byte frame and spill every callee-saved
   register this kernel touches: FPRs f14-f31, GPRs r14-r31, VSX vs52-vs63. */
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
/* Stash alpha (f1) and a zero word in the frame for later vector loads. */
stfd f1, ALPHA_SP
stw r0, FZERO
/* Scale the leading dimension of C from elements to bytes. */
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
/* TRMM variant receives OFFSET as a stack parameter above our frame. */
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
/* Nothing to do when any of M, N, K is zero. */
cmpwi cr0, M, 0
ble .L999_H1
cmpwi cr0, N, 0
ble .L999_H1
cmpwi cr0, K, 0
ble .L999_H1
/* T1 -> saved alpha; preload the byte offsets used by the GEMM macros
   and the prefetch distance PRE. */
addi T1, SP, 296+192
li PRE, 384
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
/* Splat alpha into both doubleword lanes of alpha_r (vs18). */
lxvdsx alpha_r, 0, T1
/* The actual GEMM driver loops are textually included here. */
#include "dgemm_logic_power9.S"
.L999:
/* Epilogue: return 0 in r3, restore all saved state, pop the frame. */
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "def_vsx.h" | |||||
#define LOAD ld | |||||
#define STACKSIZE (512 ) | |||||
#define ALPHA_SP (296+192)(SP) | |||||
#define FZERO (304+192)(SP) | |||||
#define M r3 | |||||
#define N r4 | |||||
#define K r5 | |||||
#define A r7 | |||||
#define B r8 | |||||
#define C r9 | |||||
#define LDC r10 | |||||
#define OFFSET r6 | |||||
#define alpha_r vs18 | |||||
#define o0 0 | |||||
#define T4 r12 | |||||
#define T3 r11 | |||||
#define C4 r14 | |||||
#define o8 r15 | |||||
#define o24 r16 | |||||
#define C2 r17 | |||||
#define L r18 | |||||
#define T1 r19 | |||||
#define C3 r20 | |||||
#define TEMP_REG r21 | |||||
#define I r22 | |||||
#define J r23 | |||||
#define AO r24 | |||||
#define BO r25 | |||||
#define CO r26 | |||||
#define o16 r27 | |||||
#define o32 r28 | |||||
#define o48 r29 | |||||
#define PRE r30 | |||||
#define T2 r31 | |||||
#include "dgemm_macros_power9.S" | |||||
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
/* Prologue: carve out the 512-byte frame and spill every callee-saved
   register this kernel touches: FPRs f14-f31, GPRs r14-r31, VSX vs52-vs63. */
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
/* Stash alpha (f1) and a zero word in the frame for later vector loads. */
stfd f1, ALPHA_SP
stw r0, FZERO
/* Scale the leading dimension of C from elements to bytes. */
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
/* TRMM variant receives OFFSET as a stack parameter above our frame. */
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
/* Nothing to do when any of M, N, K is zero. */
cmpwi cr0, M, 0
ble .L999_H1
cmpwi cr0, N, 0
ble .L999_H1
cmpwi cr0, K, 0
ble .L999_H1
/* T1 -> saved alpha; preload the byte offsets used by the GEMM macros
   and the prefetch distance PRE. */
addi T1, SP, 296+192
li PRE, 384
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
/* Splat alpha into both doubleword lanes of alpha_r (vs18). */
lxvdsx alpha_r, 0, T1
/* The actual GEMM driver loops are textually included here. */
#include "dgemm_logic_power9.S"
.L999:
/* Epilogue: return 0 in r3, restore all saved state, pop the frame. */
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
/* ==================== next kernel source file ==================== */
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#include <altivec.h> | |||||
#if defined(DOUBLE) | |||||
#define ABS fabs | |||||
#else | |||||
#define ABS fabsf | |||||
#endif | |||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||||
#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code | |||||
#if !defined(USE_MASK_PERMUTATIONS) | |||||
/* Merge the even-indexed words of vectors a and b via the vmrgew
 * instruction; inline-asm fallback used when mask permutations
 * (vec_perm with an explicit mask) are disabled above. */
static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){
__vector float result;
__asm__ (
"vmrgew %0,%1,%2;\n"
: "=v" (result)
: "v" (a),
"v" (b)
: );
return result;
}
/* Interleave the odd-indexed words of two float vectors using the
 * POWER8 vmrgow instruction (Vector Merge Odd Word). */
static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){
__vector float merged;
__asm__ ("vmrgow %0,%1,%2;\n"
         : "=v" (merged)
         : "v" (a), "v" (b)
         : );
return merged;
}
#endif
/**
 * Find maximum index: zero-based index of the complex element with the
 * largest |Re|+|Im| among the first n complex elements of x.
 * Processes 32 complex elements (16 vector registers) per loop iteration.
 * Warning: requirements n>0 and n % 32 == 0
 * @param n number of complex elements to scan (must be a multiple of 32)
 * @param x pointer to the vector of interleaved re/im single-precision pairs
 * @param maxf (out) maximum absolute value .( only for output )
 * @return zero-based index of the maximum (ties resolve to the lowest index)
 */
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
BLASLONG i=0;
#if defined(USE_MASK_PERMUTATIONS)
//per-lane base indices of the 4 packed |Re|+|Im| sums of one vector pair
register __vector unsigned int static_index0 = {0,1,2,3};
#else
//vmrgew/vmrgow packing leaves the sums in a permuted lane order,
//so the per-lane base indices are permuted to match
register __vector unsigned int static_index0 = {2,0,3,1};
#endif
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
register __vector unsigned int static_index1=static_index0 +temp0; //lane indices 4..7
register __vector unsigned int static_index2=static_index0 +temp1; //lane indices 8..11
register __vector unsigned int static_index3=static_index1 +temp1; //lane indices 12..15
temp0=vec_xor(temp0,temp0); //temp0 becomes the running base offset {0,0,0,0}
temp1=temp1 <<1 ; //{16,16,16,16}
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} = elements consumed per iteration
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} per-lane running best indices
register __vector float quadruple_values={0,0,0,0}; //per-lane running best |Re|+|Im| values
register __vector float * v_ptrx=(__vector float *)x; //each vector holds 2 complex elements (re,im,re,im)
#if defined(USE_MASK_PERMUTATIONS)
//real_pack_mask gathers words {a[0],a[2],b[0],b[2]} — the 4 real parts of a
//vector pair; image_pack_mask gathers the 4 imaginary parts
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
#endif
for(; i<n; i+=32 ){
//absolute temporary complex vectors
register __vector float v0=vec_abs(v_ptrx[0]);
register __vector float v1=vec_abs(v_ptrx[1]);
register __vector float v2=vec_abs(v_ptrx[2]);
register __vector float v3=vec_abs(v_ptrx[3]);
register __vector float v4=vec_abs(v_ptrx[4]);
register __vector float v5=vec_abs(v_ptrx[5]);
register __vector float v6=vec_abs(v_ptrx[6]);
register __vector float v7=vec_abs(v_ptrx[7]);
//pack complex real and imaginary parts together to sum real+image
#if defined(USE_MASK_PERMUTATIONS)
register __vector float t1=vec_perm(v0,v1,real_pack_mask);
register __vector float ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple image
register __vector float t2=vec_perm(v2,v3,real_pack_mask);
register __vector float ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
#else
register __vector float t1=mvec_mergee(v0,v1);
register __vector float ti=mvec_mergeo(v0,v1);
v0=t1+ti; //sum quadruple real with quadruple image
register __vector float t2= mvec_mergee(v2,v3);
register __vector float ti2=mvec_mergeo(v2,v3);
v1=t2+ti2;
t1=mvec_mergee(v4,v5);
ti=mvec_mergeo(v4,v5);
v2=t1+ti; //sum
t2=mvec_mergee(v6,v7);
ti2=mvec_mergeo(v6,v7);
v3=t2+ti2;
#endif
// now we have 16 summed elements . lets compare them
v_ptrx+=8;
//tournament: per lane keep the larger of {v0,v1} and of {v2,v3}, carrying indices along
register __vector bool int r1=vec_cmpgt(v1,v0);
register __vector bool int r2=vec_cmpgt(v3,v2);
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for first 16 values
r1=vec_cmpgt(v1,v0);
register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1);
register __vector float vf0= vec_sel(v0,v1,r1);
//absolute temporary complex vectors
v0=vec_abs(v_ptrx[0]);
v1=vec_abs(v_ptrx[1]);
v2=vec_abs(v_ptrx[2]);
v3=vec_abs(v_ptrx[3]);
v4=vec_abs(v_ptrx[4]);
v5=vec_abs(v_ptrx[5]);
v6=vec_abs(v_ptrx[6]);
v7=vec_abs(v_ptrx[7]);
//pack complex real and imaginary parts together to sum real+image
#if defined(USE_MASK_PERMUTATIONS)
t1=vec_perm(v0,v1,real_pack_mask);
ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple image
t2=vec_perm(v2,v3,real_pack_mask);
ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
#else
t1=mvec_mergee(v0,v1);
ti=mvec_mergeo(v0,v1);
v0=t1+ti; //sum quadruple real with quadruple image
t2=mvec_mergee(v2,v3);
ti2=mvec_mergeo(v2,v3);
v1=t2+ti2;
t1=mvec_mergee(v4,v5);
ti=mvec_mergeo(v4,v5);
v2=t1+ti; //sum
t2=mvec_mergee(v6,v7);
ti2=mvec_mergeo(v6,v7);
v3=t2+ti2;
#endif
// now we have 16 summed elements {from 16 to 31} . lets compare them
v_ptrx+=8;
r1=vec_cmpgt(v1,v0);
r2=vec_cmpgt(v3,v2);
ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for the second 16 values
r1=vec_cmpgt(v1,v0);
register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1);
register __vector float vv0= vec_sel(v0,v1,r1);
indv0+=temp1; //make index from 16->31
//find final quadruple from 32 elements
r2=vec_cmpgt(vv0,vf0);
ind2 = vec_sel( indf0,indv0,r2);
vv0= vec_sel(vf0,vv0,r2);
//get absolute index: add this iteration's running base offset
ind2+=temp0;
//compare with old quadruple and update (strict > keeps the earlier, lower index on ties)
r1=vec_cmpgt(vv0,quadruple_values);
quadruple_indices = vec_sel( quadruple_indices,ind2,r1);
quadruple_values= vec_sel(quadruple_values,vv0,r1);
temp0+=temp_add; //advance base offset past the 32 elements just consumed
}
//now we have to choose from 4 values and 4 different indices
// we will compare pairwise: if a pair is exactly equal we choose the minimum index,
// otherwise we take the index of the maximum value
float a1,a2,a3,a4;
unsigned int i1,i2,i3,i4;
a1=vec_extract(quadruple_values,0);
a2=vec_extract(quadruple_values,1);
a3=vec_extract(quadruple_values,2);
a4=vec_extract(quadruple_values,3);
i1=vec_extract(quadruple_indices,0);
i2=vec_extract(quadruple_indices,1);
i3=vec_extract(quadruple_indices,2);
i4=vec_extract(quadruple_indices,3);
//reduce lanes 0 and 1 into (a1,index)
if(a1==a2){
index=i1>i2?i2:i1;
}else if(a2>a1){
index=i2;
a1=a2;
}else{
index= i1;
}
//reduce lanes 2 and 3 into (a3,i1)
if(a4==a3){
i1=i3>i4?i4:i3;
}else if(a4>a3){
i1=i4;
a3=a4;
}else{
i1= i3;
}
//final reduction of the two pair winners
if(a1==a3){
index=i1>index?index:i1;
*maxf=a1;
}else if(a3>a1){
index=i1;
*maxf=a3;
}else{
*maxf=a1;
}
return index;
}
/* ICAMAX: 1-based index of the complex element with the largest
 * |Re|+|Im| among n elements with stride inc_x; returns 0 when
 * n or inc_x is not positive. */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;

    if (n <= 0 || inc_x <= 0) return (best);

    if (inc_x == 1) {
        FLOAT best_val = 0;
        BLASLONG i = 0;
        BLASLONG ix = 0;
        /* vectorized pass over the largest multiple of 32 elements */
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            best = ciamax_kernel_32(n1, x, &best_val);
            i = n1;
            ix = n1 << 1; /* two floats per complex element */
        }
        /* scalar tail */
        for (; i < n; i++, ix += 2) {
            if (CABS1(x,ix) > best_val) {
                best = i;
                best_val = CABS1(x,ix);
            }
        }
        return (best + 1);
    } else {
        /* strided scalar path, seeded with element 0 */
        BLASLONG inc_x2 = 2 * inc_x;
        FLOAT best_val = CABS1(x,0);
        BLASLONG ix = inc_x2;
        for (BLASLONG i = 1; i < n; i++, ix += inc_x2) {
            if (CABS1(x,ix) > best_val) {
                best = i;
                best_val = CABS1(x,ix);
            }
        }
        return (best + 1);
    }
}
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#include <altivec.h> | |||||
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/* |Re| + |Im| of the complex element whose real part is x[i].
 * Fully parenthesized so the expansion stays a single term inside
 * larger expressions (e.g. multiplication or comparison contexts). */
#define CABS1(x,i) (ABS(x[i])+ABS(x[i+1]))
#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code | |||||
#if !defined(USE_MASK_PERMUTATIONS) | |||||
static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ | |||||
__vector float result; | |||||
__asm__ ( | |||||
"vmrgew %0,%1,%2;\n" | |||||
: "=v" (result) | |||||
: "v" (a), | |||||
"v" (b) | |||||
: ); | |||||
return result; | |||||
} | |||||
/* Interleave the odd-indexed words of two float vectors using the
 * POWER8 vmrgow instruction (Vector Merge Odd Word). */
static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){
__vector float merged;
__asm__ ("vmrgow %0,%1,%2;\n"
         : "=v" (merged)
         : "v" (a), "v" (b)
         : );
return merged;
}
#endif
/**
 * Find maximum index: zero-based index of the complex element with the
 * largest |Re|+|Im| among the first n complex elements of x.
 * Processes 32 complex elements (16 vector registers) per loop iteration.
 * Warning: requirements n>0 and n % 32 == 0
 * @param n number of complex elements to scan (must be a multiple of 32)
 * @param x pointer to the vector of interleaved re/im single-precision pairs
 * @param maxf (out) maximum absolute value .( only for output )
 * @return zero-based index of the maximum (ties resolve to the lowest index)
 */
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
BLASLONG i=0;
#if defined(USE_MASK_PERMUTATIONS)
//per-lane base indices of the 4 packed |Re|+|Im| sums of one vector pair
register __vector unsigned int static_index0 = {0,1,2,3};
#else
//vmrgew/vmrgow packing leaves the sums in a permuted lane order,
//so the per-lane base indices are permuted to match
register __vector unsigned int static_index0 = {2,0,3,1};
#endif
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
register __vector unsigned int static_index1=static_index0 +temp0; //lane indices 4..7
register __vector unsigned int static_index2=static_index0 +temp1; //lane indices 8..11
register __vector unsigned int static_index3=static_index1 +temp1; //lane indices 12..15
temp0=vec_xor(temp0,temp0); //temp0 becomes the running base offset {0,0,0,0}
temp1=temp1 <<1 ; //{16,16,16,16}
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} = elements consumed per iteration
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} per-lane running best indices
register __vector float quadruple_values={0,0,0,0}; //per-lane running best |Re|+|Im| values
register __vector float * v_ptrx=(__vector float *)x; //each vector holds 2 complex elements (re,im,re,im)
#if defined(USE_MASK_PERMUTATIONS)
//real_pack_mask gathers words {a[0],a[2],b[0],b[2]} — the 4 real parts of a
//vector pair; image_pack_mask gathers the 4 imaginary parts
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
#endif
for(; i<n; i+=32 ){
//absolute temporary complex vectors
register __vector float v0=vec_abs(v_ptrx[0]);
register __vector float v1=vec_abs(v_ptrx[1]);
register __vector float v2=vec_abs(v_ptrx[2]);
register __vector float v3=vec_abs(v_ptrx[3]);
register __vector float v4=vec_abs(v_ptrx[4]);
register __vector float v5=vec_abs(v_ptrx[5]);
register __vector float v6=vec_abs(v_ptrx[6]);
register __vector float v7=vec_abs(v_ptrx[7]);
//pack complex real and imaginary parts together to sum real+image
#if defined(USE_MASK_PERMUTATIONS)
register __vector float t1=vec_perm(v0,v1,real_pack_mask);
register __vector float ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple image
register __vector float t2=vec_perm(v2,v3,real_pack_mask);
register __vector float ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
#else
register __vector float t1=mvec_mergee(v0,v1);
register __vector float ti=mvec_mergeo(v0,v1);
v0=t1+ti; //sum quadruple real with quadruple image
register __vector float t2= mvec_mergee(v2,v3);
register __vector float ti2=mvec_mergeo(v2,v3);
v1=t2+ti2;
t1=mvec_mergee(v4,v5);
ti=mvec_mergeo(v4,v5);
v2=t1+ti; //sum
t2=mvec_mergee(v6,v7);
ti2=mvec_mergeo(v6,v7);
v3=t2+ti2;
#endif
// now we have 16 summed elements . lets compare them
v_ptrx+=8;
//tournament: per lane keep the larger of {v0,v1} and of {v2,v3}, carrying indices along
register __vector bool int r1=vec_cmpgt(v1,v0);
register __vector bool int r2=vec_cmpgt(v3,v2);
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for first 16 values
r1=vec_cmpgt(v1,v0);
register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1);
register __vector float vf0= vec_sel(v0,v1,r1);
//absolute temporary complex vectors
v0=vec_abs(v_ptrx[0]);
v1=vec_abs(v_ptrx[1]);
v2=vec_abs(v_ptrx[2]);
v3=vec_abs(v_ptrx[3]);
v4=vec_abs(v_ptrx[4]);
v5=vec_abs(v_ptrx[5]);
v6=vec_abs(v_ptrx[6]);
v7=vec_abs(v_ptrx[7]);
//pack complex real and imaginary parts together to sum real+image
#if defined(USE_MASK_PERMUTATIONS)
t1=vec_perm(v0,v1,real_pack_mask);
ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple image
t2=vec_perm(v2,v3,real_pack_mask);
ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
#else
t1=mvec_mergee(v0,v1);
ti=mvec_mergeo(v0,v1);
v0=t1+ti; //sum quadruple real with quadruple image
t2=mvec_mergee(v2,v3);
ti2=mvec_mergeo(v2,v3);
v1=t2+ti2;
t1=mvec_mergee(v4,v5);
ti=mvec_mergeo(v4,v5);
v2=t1+ti; //sum
t2=mvec_mergee(v6,v7);
ti2=mvec_mergeo(v6,v7);
v3=t2+ti2;
#endif
// now we have 16 summed elements {from 16 to 31} . lets compare them
v_ptrx+=8;
r1=vec_cmpgt(v1,v0);
r2=vec_cmpgt(v3,v2);
ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for the second 16 values
r1=vec_cmpgt(v1,v0);
register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1);
register __vector float vv0= vec_sel(v0,v1,r1);
indv0+=temp1; //make index from 16->31
//find final quadruple from 32 elements
r2=vec_cmpgt(vv0,vf0);
ind2 = vec_sel( indf0,indv0,r2);
vv0= vec_sel(vf0,vv0,r2);
//get absolute index: add this iteration's running base offset
ind2+=temp0;
//compare with old quadruple and update (strict > keeps the earlier, lower index on ties)
r1=vec_cmpgt(vv0,quadruple_values);
quadruple_indices = vec_sel( quadruple_indices,ind2,r1);
quadruple_values= vec_sel(quadruple_values,vv0,r1);
temp0+=temp_add; //advance base offset past the 32 elements just consumed
}
//now we have to choose from 4 values and 4 different indices
// we will compare pairwise: if a pair is exactly equal we choose the minimum index,
// otherwise we take the index of the maximum value
float a1,a2,a3,a4;
unsigned int i1,i2,i3,i4;
a1=vec_extract(quadruple_values,0);
a2=vec_extract(quadruple_values,1);
a3=vec_extract(quadruple_values,2);
a4=vec_extract(quadruple_values,3);
i1=vec_extract(quadruple_indices,0);
i2=vec_extract(quadruple_indices,1);
i3=vec_extract(quadruple_indices,2);
i4=vec_extract(quadruple_indices,3);
//reduce lanes 0 and 1 into (a1,index)
if(a1==a2){
index=i1>i2?i2:i1;
}else if(a2>a1){
index=i2;
a1=a2;
}else{
index= i1;
}
//reduce lanes 2 and 3 into (a3,i1)
if(a4==a3){
i1=i3>i4?i4:i3;
}else if(a4>a3){
i1=i4;
a3=a4;
}else{
i1= i3;
}
//final reduction of the two pair winners
if(a1==a3){
index=i1>index?index:i1;
*maxf=a1;
}else if(a3>a1){
index=i1;
*maxf=a3;
}else{
*maxf=a1;
}
return index;
}
/* ICAMAX: 1-based index of the complex element with the largest
 * |Re|+|Im| among n elements with stride inc_x; returns 0 when
 * n or inc_x is not positive. */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;

    if (n <= 0 || inc_x <= 0) return (best);

    if (inc_x == 1) {
        FLOAT best_val = 0;
        BLASLONG i = 0;
        BLASLONG ix = 0;
        /* vectorized pass over the largest multiple of 32 elements */
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            best = ciamax_kernel_32(n1, x, &best_val);
            i = n1;
            ix = n1 << 1; /* two floats per complex element */
        }
        /* scalar tail */
        for (; i < n; i++, ix += 2) {
            if (CABS1(x,ix) > best_val) {
                best = i;
                best_val = CABS1(x,ix);
            }
        }
        return (best + 1);
    } else {
        /* strided scalar path, seeded with element 0 */
        BLASLONG inc_x2 = 2 * inc_x;
        FLOAT best_val = CABS1(x,0);
        BLASLONG ix = inc_x2;
        for (BLASLONG i = 1; i < n; i++, ix += inc_x2) {
            if (CABS1(x,ix) > best_val) {
                best = i;
                best_val = CABS1(x,ix);
            }
        }
        return (best + 1);
    }
}
/* ==== next concatenated kernel file: icamin (POWER/VSX) ==== */
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#include <altivec.h> | |||||
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/* |Re| + |Im| of the complex element whose real part is x[i].
 * Fully parenthesized so the expansion stays a single term inside
 * larger expressions (e.g. multiplication or comparison contexts). */
#define CABS1(x,i) (ABS(x[i])+ABS(x[i+1]))
/**
 * Find minimum index: zero-based index of the complex element with the
 * smallest |Re|+|Im| among the first n complex elements of x.
 * Processes 32 complex elements (16 vector registers) per loop iteration.
 * Warning: requirements n>0 and n % 32 == 0
 * @param n number of complex elements to scan (must be a multiple of 32)
 * @param x pointer to the vector of interleaved re/im single-precision pairs
 * @param minf (out) minimum absolute value .( only for output )
 * @return zero-based index of the minimum (ties resolve to the lowest index)
 */
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
BLASLONG i=0;
//per-lane base indices of the 4 packed |Re|+|Im| sums of one vector pair
register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
temp0=vec_xor(temp0,temp0); //temp0 becomes the running base offset {0,0,0,0}
temp1=temp1 <<1 ; //{16,16,16,16}
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} = elements consumed per iteration
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} per-lane running best indices
//seed the running minima with element 0 so the strict-less comparisons start valid
float first_min=CABS1(x,0);
register __vector float quadruple_values={first_min,first_min,first_min,first_min};
register __vector float * v_ptrx=(__vector float *)x; //each vector holds 2 complex elements (re,im,re,im)
//real_pack_mask gathers words {a[0],a[2],b[0],b[2]} — the 4 real parts of a
//vector pair; image_pack_mask gathers the 4 imaginary parts
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
for(; i<n; i+=32){
//absolute temporary complex vectors
register __vector float v0=vec_abs(v_ptrx[0]);
register __vector float v1=vec_abs(v_ptrx[1]);
register __vector float v2=vec_abs(v_ptrx[2]);
register __vector float v3=vec_abs(v_ptrx[3]);
register __vector float v4=vec_abs(v_ptrx[4]);
register __vector float v5=vec_abs(v_ptrx[5]);
register __vector float v6=vec_abs(v_ptrx[6]);
register __vector float v7=vec_abs(v_ptrx[7]);
//pack complex real and imaginary parts together to sum real+image
register __vector float t1=vec_perm(v0,v1,real_pack_mask);
register __vector float ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple image
register __vector float t2=vec_perm(v2,v3,real_pack_mask);
register __vector float ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
// now we have 16 summed elements . lets compare them
v_ptrx+=8;
//tournament: per lane keep the smaller of {v0,v1} and of {v2,v3}, carrying indices along
register __vector bool int r1=vec_cmpgt(v0,v1);
register __vector bool int r2=vec_cmpgt(v2,v3);
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for first 16 values
r1=vec_cmpgt(v0,v1);
register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1);
register __vector float vf0= vec_sel(v0,v1,r1);
//absolute temporary complex vectors
v0=vec_abs(v_ptrx[0]);
v1=vec_abs(v_ptrx[1]);
v2=vec_abs(v_ptrx[2]);
v3=vec_abs(v_ptrx[3]);
v4=vec_abs(v_ptrx[4]);
v5=vec_abs(v_ptrx[5]);
v6=vec_abs(v_ptrx[6]);
v7=vec_abs(v_ptrx[7]);
//pack complex real and imaginary parts together to sum real+image
t1=vec_perm(v0,v1,real_pack_mask);
ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple image
t2=vec_perm(v2,v3,real_pack_mask);
ti2=vec_perm(v2,v3,image_pack_mask);
v1=t2+ti2;
t1=vec_perm(v4,v5,real_pack_mask);
ti=vec_perm(v4,v5,image_pack_mask);
v2=t1+ti; //sum
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
// now we have 16 summed elements {from 16 to 31} . lets compare them
v_ptrx+=8;
r1=vec_cmpgt(v0,v1);
r2=vec_cmpgt(v2,v3);
ind2= vec_sel(static_index0,static_index1,r1);
v0=vec_sel(v0,v1,r1);
ind3= vec_sel(static_index2,static_index3,r2);
v1=vec_sel(v2,v3,r2);
//final cmp and select index and value for the second 16 values
r1=vec_cmpgt(v0,v1);
register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1);
register __vector float vv0= vec_sel(v0,v1,r1);
indv0+=temp1; //make index from 16->31
//find final quadruple from 32 elements
r2=vec_cmpgt(vf0,vv0);
ind2 = vec_sel( indf0,indv0,r2);
vv0= vec_sel(vf0,vv0,r2);
//get absolute index: add this iteration's running base offset
ind2+=temp0;
//compare with old quadruple and update (strict < keeps the earlier, lower index on ties)
r1=vec_cmpgt(quadruple_values,vv0);
quadruple_indices = vec_sel( quadruple_indices,ind2,r1);
quadruple_values= vec_sel(quadruple_values,vv0,r1);
temp0+=temp_add; //advance base offset past the 32 elements just consumed
}
//now we have to choose from 4 values and 4 different indices
// we will compare pairwise: if a pair is exactly equal we choose the minimum index,
// otherwise we take the index of the minimum value
float a1,a2,a3,a4;
unsigned int i1,i2,i3,i4;
a1=vec_extract(quadruple_values,0);
a2=vec_extract(quadruple_values,1);
a3=vec_extract(quadruple_values,2);
a4=vec_extract(quadruple_values,3);
i1=vec_extract(quadruple_indices,0);
i2=vec_extract(quadruple_indices,1);
i3=vec_extract(quadruple_indices,2);
i4=vec_extract(quadruple_indices,3);
//reduce lanes 0 and 1 into (a1,index)
if(a1==a2){
index=i1>i2?i2:i1;
}else if(a2<a1){
index=i2;
a1=a2;
}else{
index= i1;
}
//reduce lanes 2 and 3 into (a3,i1)
if(a4==a3){
i1=i3>i4?i4:i3;
}else if(a4<a3){
i1=i4;
a3=a4;
}else{
i1= i3;
}
//final reduction of the two pair winners
if(a1==a3){
index=i1>index?index:i1;
*minf=a1;
}else if(a3<a1){
index=i1;
*minf=a3;
}else{
*minf=a1;
}
return index;
}
/* ICAMIN: 1-based index of the complex element with the smallest
 * |Re|+|Im| among n elements with stride inc_x; returns 0 when
 * n or inc_x is not positive. */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;

    if (n <= 0 || inc_x <= 0) return (best);

    if (inc_x == 1) {
        FLOAT best_val = CABS1(x,0); /* seed with element 0; index stays 0 */
        BLASLONG i = 0;
        BLASLONG ix = 0;
        /* vectorized pass over the largest multiple of 32 elements */
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            best = ciamin_kernel_32(n1, x, &best_val);
            i = n1;
            ix = n1 << 1; /* two floats per complex element */
        }
        /* scalar tail */
        for (; i < n; i++, ix += 2) {
            if (CABS1(x,ix) < best_val) {
                best = i;
                best_val = CABS1(x,ix);
            }
        }
        return (best + 1);
    } else {
        /* strided scalar path, seeded with element 0 */
        BLASLONG inc_x2 = 2 * inc_x;
        FLOAT best_val = CABS1(x,0);
        BLASLONG ix = inc_x2;
        for (BLASLONG i = 1; i < n; i++, ix += inc_x2) {
            if (CABS1(x,ix) < best_val) {
                best = i;
                best_val = CABS1(x,ix);
            }
        }
        return (best + 1);
    }
}
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#include <altivec.h> | |||||
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/* |Re| + |Im| of the complex element whose real part is x[i].
 * Fully parenthesized so the expansion stays a single term inside
 * larger expressions (e.g. multiplication or comparison contexts). */
#define CABS1(x,i) (ABS(x[i])+ABS(x[i+1]))
/** | |||||
* Find minimum index | |||||
* Warning: requirements n>0 and n % 32 == 0 | |||||
* @param n | |||||
* @param x pointer to the vector | |||||
* @param minf (out) minimum absolute value .( only for output ) | |||||
* @return index | |||||
*/ | |||||
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
    BLASLONG index;
    BLASLONG i=0;
    // Lane labels 0..15 for one batch of 16 packed |re|+|im| sums.
    register __vector unsigned int static_index0 = {0,1,2,3};
    register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
    register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
    register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
    register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
    register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
    temp0=vec_xor(temp0,temp0); // temp0 = {0,0,0,0}: running base offset into x
    temp1=temp1 <<1 ; //{16,16,16,16}
    register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} = complex elements consumed per iteration
    register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}
    // Seed the running minima with element 0's |re|+|im| so a later candidate
    // replaces it only when strictly smaller (keeps the first occurrence).
    float first_min=CABS1(x,0);
    register __vector float quadruple_values={first_min,first_min,first_min,first_min};
    register __vector float * v_ptrx=(__vector float *)x;
    // vec_perm masks: gather the 4 real parts (even float slots) resp. the 4
    // imaginary parts (odd float slots) of two complex vectors into one vector.
    register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
    register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
    for(; i<n; i+=32){
        //absolute temporary complex vectors (8 vectors = 16 complex elements)
        register __vector float v0=vec_abs(v_ptrx[0]);
        register __vector float v1=vec_abs(v_ptrx[1]);
        register __vector float v2=vec_abs(v_ptrx[2]);
        register __vector float v3=vec_abs(v_ptrx[3]);
        register __vector float v4=vec_abs(v_ptrx[4]);
        register __vector float v5=vec_abs(v_ptrx[5]);
        register __vector float v6=vec_abs(v_ptrx[6]);
        register __vector float v7=vec_abs(v_ptrx[7]);
        //pack complex real and imaginary parts together to sum real+image
        register __vector float t1=vec_perm(v0,v1,real_pack_mask);
        register __vector float ti=vec_perm(v0,v1,image_pack_mask);
        v0=t1+ti; //sum quadruple real with quadruple image -> CABS1 of elements 0..3
        register __vector float t2=vec_perm(v2,v3,real_pack_mask);
        register __vector float ti2=vec_perm(v2,v3,image_pack_mask);
        v1=t2+ti2;
        t1=vec_perm(v4,v5,real_pack_mask);
        ti=vec_perm(v4,v5,image_pack_mask);
        v2=t1+ti; //sum
        t2=vec_perm(v6,v7,real_pack_mask);
        ti2=vec_perm(v6,v7,image_pack_mask);
        v3=t2+ti2;
        // now we have 16 summed elements . lets compare them
        v_ptrx+=8;
        // Strict > keeps the earlier (lower-index) element on ties.
        register __vector bool int r1=vec_cmpgt(v0,v1);
        register __vector bool int r2=vec_cmpgt(v2,v3);
        register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1);
        v0=vec_sel(v0,v1,r1);
        register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2);
        v1=vec_sel(v2,v3,r2);
        //final cmp and select index and value for first 16 values
        r1=vec_cmpgt(v0,v1);
        register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1);
        register __vector float vf0= vec_sel(v0,v1,r1);
        //absolute temporary complex vectors (second batch of 16 elements)
        v0=vec_abs(v_ptrx[0]);
        v1=vec_abs(v_ptrx[1]);
        v2=vec_abs(v_ptrx[2]);
        v3=vec_abs(v_ptrx[3]);
        v4=vec_abs(v_ptrx[4]);
        v5=vec_abs(v_ptrx[5]);
        v6=vec_abs(v_ptrx[6]);
        v7=vec_abs(v_ptrx[7]);
        //pack complex real and imaginary parts together to sum real+image
        t1=vec_perm(v0,v1,real_pack_mask);
        ti=vec_perm(v0,v1,image_pack_mask);
        v0=t1+ti; //sum quadruple real with quadruple image
        t2=vec_perm(v2,v3,real_pack_mask);
        ti2=vec_perm(v2,v3,image_pack_mask);
        v1=t2+ti2;
        t1=vec_perm(v4,v5,real_pack_mask);
        ti=vec_perm(v4,v5,image_pack_mask);
        v2=t1+ti; //sum
        t2=vec_perm(v6,v7,real_pack_mask);
        ti2=vec_perm(v6,v7,image_pack_mask);
        v3=t2+ti2;
        // now we have 16 summed elements {from 16 to 31} . lets compare them
        v_ptrx+=8;
        r1=vec_cmpgt(v0,v1);
        r2=vec_cmpgt(v2,v3);
        ind2= vec_sel(static_index0,static_index1,r1);
        v0=vec_sel(v0,v1,r1);
        ind3= vec_sel(static_index2,static_index3,r2);
        v1=vec_sel(v2,v3,r2);
        //final cmp and select index and value for the second 16 values
        r1=vec_cmpgt(v0,v1);
        register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1);
        register __vector float vv0= vec_sel(v0,v1,r1);
        indv0+=temp1; //make index from 16->31
        //find final quadruple from 32 elements
        r2=vec_cmpgt(vf0,vv0);
        ind2 = vec_sel( indf0,indv0,r2);
        vv0= vec_sel(vf0,vv0,r2);
        //get absolute index by adding the batch base offset
        ind2+=temp0;
        //compare with old quadruple minima and update where strictly smaller
        r1=vec_cmpgt(quadruple_values,vv0);
        quadruple_indices = vec_sel( quadruple_indices,ind2,r1);
        quadruple_values= vec_sel(quadruple_values,vv0,r1);
        temp0+=temp_add; // advance base offset by 32 complex elements
    }
    //now we have to choose from 4 values and 4 different indices
    // we will compare pairwise; if pairs are exactly the same we will choose minimum between index
    // otherwise we will assign index of the minimum value
    float a1,a2,a3,a4;
    unsigned int i1,i2,i3,i4;
    a1=vec_extract(quadruple_values,0);
    a2=vec_extract(quadruple_values,1);
    a3=vec_extract(quadruple_values,2);
    a4=vec_extract(quadruple_values,3);
    i1=vec_extract(quadruple_indices,0);
    i2=vec_extract(quadruple_indices,1);
    i3=vec_extract(quadruple_indices,2);
    i4=vec_extract(quadruple_indices,3);
    // Reduce lanes 0/1 into (a1, index).
    if(a1==a2){
        index=i1>i2?i2:i1;
    }else if(a2<a1){
        index=i2;
        a1=a2;
    }else{
        index= i1;
    }
    // Reduce lanes 2/3 into (a3, i1).
    if(a4==a3){
        i1=i3>i4?i4:i3;
    }else if(a4<a3){
        i1=i4;
        a3=a4;
    }else{
        i1= i3;
    }
    // Final reduction: write the minimum CABS1 value and return its index.
    if(a1==a3){
        index=i1>index?index:i1;
        *minf=a1;
    }else if(a3<a1){
        index=i1;
        *minf=a3;
    }else{
        *minf=a1;
    }
    return index;
}
/* icamin: return the 1-based index of the complex element with the smallest
 * |re|+|im|. Returns 0 for empty input or a non-positive stride. */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;      /* 0-based index of the current minimum */
    FLOAT best_val;         /* current minimum of CABS1 */
    BLASLONG k;             /* element counter */
    BLASLONG pos;           /* float offset of element k within x */

    if (n <= 0 || inc_x <= 0) return (best);

    if (inc_x == 1) {
        /* Hand the multiple-of-32 prefix to the vectorized kernel, then
         * finish the remainder with a scalar sweep. */
        BLASLONG head = n & -32;
        best_val = CABS1(x, 0);
        k = 0;
        pos = 0;
        if (head > 0) {
            best = ciamin_kernel_32(head, x, &best_val);
            k = head;
            pos = head << 1; /* two floats per complex element */
        }
        for (; k < n; k++, pos += 2) {
            if (CABS1(x, pos) < best_val) {
                best_val = CABS1(x, pos);
                best = k;
            }
        }
    } else {
        /* Strided path: element k lives at float offset k * 2 * inc_x. */
        BLASLONG step = 2 * inc_x;
        best_val = CABS1(x, 0);
        for (k = 1, pos = step; k < n; k++, pos += step) {
            if (CABS1(x, pos) < best_val) {
                best_val = CABS1(x, pos);
                best = k;
            }
        }
    }
    /* BLAS convention: 1-based result. */
    return (best + 1);
}
@@ -1,288 +1,288 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#include <altivec.h> | |||||
#if defined(DOUBLE) | |||||
#define ABS fabs | |||||
#else | |||||
#define ABS fabsf | |||||
#endif | |||||
/** | |||||
* Find maximum index | |||||
* Warning: requirements n>0 and n % 64 == 0 | |||||
* @param n | |||||
* @param x pointer to the vector | |||||
* @param maxf (out) maximum absolute value .( only for output ) | |||||
* @return index | |||||
*/ | |||||
static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) {
    BLASLONG index;
    BLASLONG i=0;
    // Lane labels 0..15 used to tag elements inside a 16-float batch.
    register __vector unsigned int static_index0 = {0,1,2,3};
    register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
    register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
    register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
    register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
    register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
    temp0=vec_xor(temp0,temp0); // temp0 = {0,0,0,0}: running absolute base offset
    temp1=temp1 <<1 ; //{16,16,16,16}
    register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}
    // Running per-lane maxima; 0 is a safe seed because values are absolute.
    register __vector float quadruple_values={0,0,0,0};
    register __vector float * v_ptrx=(__vector float *)x;
    for(; i<n; i+=64){
        //absolute temporary vectors (8 vectors = 32 floats, first half of batch)
        register __vector float v0=vec_abs(v_ptrx[0]);
        register __vector float v1=vec_abs(v_ptrx[1]);
        register __vector float v2=vec_abs(v_ptrx[2]);
        register __vector float v3=vec_abs(v_ptrx[3]);
        register __vector float v4=vec_abs(v_ptrx[4]);
        register __vector float v5=vec_abs(v_ptrx[5]);
        register __vector float v6=vec_abs(v_ptrx[6]);
        register __vector float v7=vec_abs(v_ptrx[7]);
        //cmp quadruple pairs; strict > keeps the earlier index on ties
        register __vector bool int r1=vec_cmpgt(v1,v0);
        register __vector bool int r2=vec_cmpgt(v3,v2);
        register __vector bool int r3=vec_cmpgt(v5,v4);
        register __vector bool int r4=vec_cmpgt(v7,v6);
        //select winners and their lane indices
        register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1);
        register __vector float vf0= vec_sel(v0,v1,r1);
        register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2);
        register __vector float vf1= vec_sel(v2,v3,r2);
        register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3);
        v0=vec_sel(v4,v5,r3);
        register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4);
        v1=vec_sel(v6,v7,r4);
        // cmp selected
        r1=vec_cmpgt(vf1,vf0);
        r2=vec_cmpgt(v1,v0);
        v_ptrx+=8;
        //select from above
        ind0_first= vec_sel(ind0_first,ind1,r1);
        vf0= vec_sel(vf0,vf1,r1) ;
        ind2= vec_sel(ind2,ind3,r2);
        vf1= vec_sel(v0,v1,r2);
        //second indices actually should be within [16,31] so ind2+16
        ind2 +=temp1;
        //final cmp and select index and value for the first 32 values
        r1=vec_cmpgt(vf1,vf0);
        ind0_first = vec_sel(ind0_first,ind2,r1);
        vf0= vec_sel(vf0,vf1,r1);
        ind0_first+=temp0; //get absolute index
        temp0+=temp1;
        temp0+=temp1; //temp0+32: base offset for the second half
        //second part of 32
        // absolute temporary vectors
        v0=vec_abs(v_ptrx[0]);
        v1=vec_abs(v_ptrx[1]);
        v2=vec_abs(v_ptrx[2]);
        v3=vec_abs(v_ptrx[3]);
        v4=vec_abs(v_ptrx[4]);
        v5=vec_abs(v_ptrx[5]);
        v6=vec_abs(v_ptrx[6]);
        v7=vec_abs(v_ptrx[7]);
        //cmp quadruple pairs
        r1=vec_cmpgt(v1,v0);
        r2=vec_cmpgt(v3,v2);
        r3=vec_cmpgt(v5,v4);
        r4=vec_cmpgt(v7,v6);
        //select
        register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1);
        register __vector float vv0= vec_sel(v0,v1,r1);
        ind1= vec_sel(static_index2,static_index3,r2);
        register __vector float vv1= vec_sel(v2,v3,r2);
        ind2= vec_sel(static_index0,static_index1,r3);
        v0=vec_sel(v4,v5,r3);
        ind3= vec_sel(static_index2,static_index3,r4);
        v1=vec_sel(v6,v7,r4);
        // cmp selected
        r1=vec_cmpgt(vv1,vv0);
        r2=vec_cmpgt(v1,v0);
        v_ptrx+=8;
        //select from above
        ind0_second= vec_sel(ind0_second,ind1,r1);
        vv0= vec_sel(vv0,vv1,r1) ;
        ind2= vec_sel(ind2,ind3,r2);
        vv1= vec_sel(v0,v1,r2) ;
        //second indices actually should be within [16,31] so ind2+16
        ind2 +=temp1;
        //final cmp and select index and value for the second 32 values
        r1=vec_cmpgt(vv1,vv0);
        ind0_second = vec_sel(ind0_second,ind2,r1);
        vv0= vec_sel(vv0,vv1,r1);
        ind0_second+=temp0; //get absolute index
        //find final quadruple from 64 elements
        r2=vec_cmpgt(vv0,vf0);
        ind2 = vec_sel( ind0_first,ind0_second,r2);
        vv0= vec_sel(vf0,vv0,r2);
        //compare with old quadruple maxima and update where strictly greater
        r3=vec_cmpgt(vv0,quadruple_values);
        quadruple_indices = vec_sel( quadruple_indices,ind2,r3);
        quadruple_values= vec_sel(quadruple_values,vv0,r3);
        temp0+=temp1;
        temp0+=temp1; //temp0+32: advance to the next 64-element batch
    }
    //now we have to choose from 4 values and 4 different indices
    // we will compare pairwise; if pairs are exactly the same we will choose minimum between index
    // otherwise we will assign index of the maximum value
    float a1,a2,a3,a4;
    unsigned int i1,i2,i3,i4;
    a1=vec_extract(quadruple_values,0);
    a2=vec_extract(quadruple_values,1);
    a3=vec_extract(quadruple_values,2);
    a4=vec_extract(quadruple_values,3);
    i1=vec_extract(quadruple_indices,0);
    i2=vec_extract(quadruple_indices,1);
    i3=vec_extract(quadruple_indices,2);
    i4=vec_extract(quadruple_indices,3);
    // Reduce lanes 0/1 into (a1, index).
    if(a1==a2){
        index=i1>i2?i2:i1;
    }else if(a2>a1){
        index=i2;
        a1=a2;
    }else{
        index= i1;
    }
    // Reduce lanes 2/3 into (a3, i1).
    if(a4==a3){
        i1=i3>i4?i4:i3;
    }else if(a4>a3){
        i1=i4;
        a3=a4;
    }else{
        i1= i3;
    }
    // Final reduction: write the maximum |x[i]| and return its index.
    if(a1==a3){
        index=i1>index?index:i1;
        *maxf=a1;
    }else if(a3>a1){
        index=i1;
        *maxf=a3;
    }else{
        *maxf=a1;
    }
    return index;
}
/* isamax: return the 1-based index of the element with the largest |x[i]|.
 * Returns 0 for empty input or a non-positive stride.
 * Note: the de-unrolled strided loop below is exactly equivalent to the
 * original 4x-unrolled version, since each unrolled step compared and
 * updated the running maximum sequentially with strict >. */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;      /* 0-based index of the current maximum */
    FLOAT best_val = 0.0;   /* current maximum of |x[i]| */

    if (n <= 0 || inc_x <= 0) return (best);

    if (inc_x == 1) {
        /* Vectorized kernel handles the multiple-of-64 prefix; the scalar
         * sweep finishes the remainder. */
        BLASLONG head = n & -64;
        BLASLONG k = 0;
        if (head > 0) {
            best = siamax_kernel_64(head, x, &best_val);
            k = head;
        }
        for (; k < n; k++) {
            if (ABS(x[k]) > best_val) {
                best_val = ABS(x[k]);
                best = k;
            }
        }
    } else {
        BLASLONG k;
        BLASLONG pos = 0;   /* offset of element k within x */
        for (k = 0; k < n; k++, pos += inc_x) {
            if (ABS(x[pos]) > best_val) {
                best_val = ABS(x[pos]);
                best = k;
            }
        }
    }
    /* BLAS convention: 1-based result. */
    return (best + 1);
}
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#include <altivec.h> | |||||
#if defined(DOUBLE) | |||||
#define ABS fabs | |||||
#else | |||||
#define ABS fabsf | |||||
#endif | |||||
/** | |||||
* Find maximum index | |||||
* Warning: requirements n>0 and n % 64 == 0 | |||||
* @param n | |||||
* @param x pointer to the vector | |||||
* @param maxf (out) maximum absolute value .( only for output ) | |||||
* @return index | |||||
*/ | |||||
static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) {
    BLASLONG index;
    BLASLONG i=0;
    // Lane labels 0..15 used to tag elements inside a 16-float batch.
    register __vector unsigned int static_index0 = {0,1,2,3};
    register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
    register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
    register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
    register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
    register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
    temp0=vec_xor(temp0,temp0); // temp0 = {0,0,0,0}: running absolute base offset
    temp1=temp1 <<1 ; //{16,16,16,16}
    register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}
    // Running per-lane maxima; 0 is a safe seed because values are absolute.
    register __vector float quadruple_values={0,0,0,0};
    register __vector float * v_ptrx=(__vector float *)x;
    for(; i<n; i+=64){
        //absolute temporary vectors (8 vectors = 32 floats, first half of batch)
        register __vector float v0=vec_abs(v_ptrx[0]);
        register __vector float v1=vec_abs(v_ptrx[1]);
        register __vector float v2=vec_abs(v_ptrx[2]);
        register __vector float v3=vec_abs(v_ptrx[3]);
        register __vector float v4=vec_abs(v_ptrx[4]);
        register __vector float v5=vec_abs(v_ptrx[5]);
        register __vector float v6=vec_abs(v_ptrx[6]);
        register __vector float v7=vec_abs(v_ptrx[7]);
        //cmp quadruple pairs; strict > keeps the earlier index on ties
        register __vector bool int r1=vec_cmpgt(v1,v0);
        register __vector bool int r2=vec_cmpgt(v3,v2);
        register __vector bool int r3=vec_cmpgt(v5,v4);
        register __vector bool int r4=vec_cmpgt(v7,v6);
        //select winners and their lane indices
        register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1);
        register __vector float vf0= vec_sel(v0,v1,r1);
        register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2);
        register __vector float vf1= vec_sel(v2,v3,r2);
        register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3);
        v0=vec_sel(v4,v5,r3);
        register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4);
        v1=vec_sel(v6,v7,r4);
        // cmp selected
        r1=vec_cmpgt(vf1,vf0);
        r2=vec_cmpgt(v1,v0);
        v_ptrx+=8;
        //select from above
        ind0_first= vec_sel(ind0_first,ind1,r1);
        vf0= vec_sel(vf0,vf1,r1) ;
        ind2= vec_sel(ind2,ind3,r2);
        vf1= vec_sel(v0,v1,r2);
        //second indices actually should be within [16,31] so ind2+16
        ind2 +=temp1;
        //final cmp and select index and value for the first 32 values
        r1=vec_cmpgt(vf1,vf0);
        ind0_first = vec_sel(ind0_first,ind2,r1);
        vf0= vec_sel(vf0,vf1,r1);
        ind0_first+=temp0; //get absolute index
        temp0+=temp1;
        temp0+=temp1; //temp0+32: base offset for the second half
        //second part of 32
        // absolute temporary vectors
        v0=vec_abs(v_ptrx[0]);
        v1=vec_abs(v_ptrx[1]);
        v2=vec_abs(v_ptrx[2]);
        v3=vec_abs(v_ptrx[3]);
        v4=vec_abs(v_ptrx[4]);
        v5=vec_abs(v_ptrx[5]);
        v6=vec_abs(v_ptrx[6]);
        v7=vec_abs(v_ptrx[7]);
        //cmp quadruple pairs
        r1=vec_cmpgt(v1,v0);
        r2=vec_cmpgt(v3,v2);
        r3=vec_cmpgt(v5,v4);
        r4=vec_cmpgt(v7,v6);
        //select
        register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1);
        register __vector float vv0= vec_sel(v0,v1,r1);
        ind1= vec_sel(static_index2,static_index3,r2);
        register __vector float vv1= vec_sel(v2,v3,r2);
        ind2= vec_sel(static_index0,static_index1,r3);
        v0=vec_sel(v4,v5,r3);
        ind3= vec_sel(static_index2,static_index3,r4);
        v1=vec_sel(v6,v7,r4);
        // cmp selected
        r1=vec_cmpgt(vv1,vv0);
        r2=vec_cmpgt(v1,v0);
        v_ptrx+=8;
        //select from above
        ind0_second= vec_sel(ind0_second,ind1,r1);
        vv0= vec_sel(vv0,vv1,r1) ;
        ind2= vec_sel(ind2,ind3,r2);
        vv1= vec_sel(v0,v1,r2) ;
        //second indices actually should be within [16,31] so ind2+16
        ind2 +=temp1;
        //final cmp and select index and value for the second 32 values
        r1=vec_cmpgt(vv1,vv0);
        ind0_second = vec_sel(ind0_second,ind2,r1);
        vv0= vec_sel(vv0,vv1,r1);
        ind0_second+=temp0; //get absolute index
        //find final quadruple from 64 elements
        r2=vec_cmpgt(vv0,vf0);
        ind2 = vec_sel( ind0_first,ind0_second,r2);
        vv0= vec_sel(vf0,vv0,r2);
        //compare with old quadruple maxima and update where strictly greater
        r3=vec_cmpgt(vv0,quadruple_values);
        quadruple_indices = vec_sel( quadruple_indices,ind2,r3);
        quadruple_values= vec_sel(quadruple_values,vv0,r3);
        temp0+=temp1;
        temp0+=temp1; //temp0+32: advance to the next 64-element batch
    }
    //now we have to choose from 4 values and 4 different indices
    // we will compare pairwise; if pairs are exactly the same we will choose minimum between index
    // otherwise we will assign index of the maximum value
    float a1,a2,a3,a4;
    unsigned int i1,i2,i3,i4;
    a1=vec_extract(quadruple_values,0);
    a2=vec_extract(quadruple_values,1);
    a3=vec_extract(quadruple_values,2);
    a4=vec_extract(quadruple_values,3);
    i1=vec_extract(quadruple_indices,0);
    i2=vec_extract(quadruple_indices,1);
    i3=vec_extract(quadruple_indices,2);
    i4=vec_extract(quadruple_indices,3);
    // Reduce lanes 0/1 into (a1, index).
    if(a1==a2){
        index=i1>i2?i2:i1;
    }else if(a2>a1){
        index=i2;
        a1=a2;
    }else{
        index= i1;
    }
    // Reduce lanes 2/3 into (a3, i1).
    if(a4==a3){
        i1=i3>i4?i4:i3;
    }else if(a4>a3){
        i1=i4;
        a3=a4;
    }else{
        i1= i3;
    }
    // Final reduction: write the maximum |x[i]| and return its index.
    if(a1==a3){
        index=i1>index?index:i1;
        *maxf=a1;
    }else if(a3>a1){
        index=i1;
        *maxf=a3;
    }else{
        *maxf=a1;
    }
    return index;
}
/* isamax: return the 1-based index of the element with the largest |x[i]|.
 * Returns 0 for empty input or a non-positive stride.
 * Note: the de-unrolled strided loop below is exactly equivalent to the
 * original 4x-unrolled version, since each unrolled step compared and
 * updated the running maximum sequentially with strict >. */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;      /* 0-based index of the current maximum */
    FLOAT best_val = 0.0;   /* current maximum of |x[i]| */

    if (n <= 0 || inc_x <= 0) return (best);

    if (inc_x == 1) {
        /* Vectorized kernel handles the multiple-of-64 prefix; the scalar
         * sweep finishes the remainder. */
        BLASLONG head = n & -64;
        BLASLONG k = 0;
        if (head > 0) {
            best = siamax_kernel_64(head, x, &best_val);
            k = head;
        }
        for (; k < n; k++) {
            if (ABS(x[k]) > best_val) {
                best_val = ABS(x[k]);
                best = k;
            }
        }
    } else {
        BLASLONG k;
        BLASLONG pos = 0;   /* offset of element k within x */
        for (k = 0; k < n; k++, pos += inc_x) {
            if (ABS(x[pos]) > best_val) {
                best_val = ABS(x[pos]);
                best = k;
            }
        }
    }
    /* BLAS convention: 1-based result. */
    return (best + 1);
}
@@ -1,288 +1,288 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#include <altivec.h> | |||||
#if defined(DOUBLE) | |||||
#define ABS fabs | |||||
#else | |||||
#define ABS fabsf | |||||
#endif | |||||
/** | |||||
* Find minimum index | |||||
* Warning: requirements n>0 and n % 64 == 0 | |||||
* @param n | |||||
* @param x pointer to the vector | |||||
* @param minf (out) minimum absolute value .( only for output ) | |||||
* @return index | |||||
*/ | |||||
static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||||
BLASLONG index; | |||||
BLASLONG i=0; | |||||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||||
temp0=vec_xor(temp0,temp0); | |||||
temp1=temp1 <<1 ; //{16,16,16,16} | |||||
register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; | |||||
register __vector float * v_ptrx=(__vector float *)x; | |||||
register __vector float quadruple_values=vec_abs(v_ptrx[0]); | |||||
for(; i<n; i+=64){ | |||||
//absolute temporary vectors | |||||
register __vector float v0=vec_abs(v_ptrx[0]); | |||||
register __vector float v1=vec_abs(v_ptrx[1]); | |||||
register __vector float v2=vec_abs(v_ptrx[2]); | |||||
register __vector float v3=vec_abs(v_ptrx[3]); | |||||
register __vector float v4=vec_abs(v_ptrx[4]); | |||||
register __vector float v5=vec_abs(v_ptrx[5]); | |||||
register __vector float v6=vec_abs(v_ptrx[6]); | |||||
register __vector float v7=vec_abs(v_ptrx[7]); | |||||
//cmp quadruple pairs | |||||
register __vector bool int r1=vec_cmpgt(v0,v1); | |||||
register __vector bool int r2=vec_cmpgt(v2,v3); | |||||
register __vector bool int r3=vec_cmpgt(v4,v5); | |||||
register __vector bool int r4=vec_cmpgt(v6,v7); | |||||
//select | |||||
register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||||
register __vector float vf0= vec_sel(v0,v1,r1); | |||||
register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||||
register __vector float vf1= vec_sel(v2,v3,r2); | |||||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||||
v0=vec_sel(v4,v5,r3); | |||||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||||
v1=vec_sel(v6,v7,r4); | |||||
// cmp selected | |||||
r1=vec_cmpgt(vf0,vf1); | |||||
r2=vec_cmpgt(v0,v1); | |||||
v_ptrx+=8; | |||||
//select from above | |||||
ind0_first= vec_sel(ind0_first,ind1,r1); | |||||
vf0= vec_sel(vf0,vf1,r1) ; | |||||
ind2= vec_sel(ind2,ind3,r2); | |||||
vf1= vec_sel(v0,v1,r2); | |||||
//second indices actually should be within [16,31] so ind2+16 | |||||
ind2 +=temp1; | |||||
//final cmp and select index and value for the first 32 values | |||||
r1=vec_cmpgt(vf0,vf1); | |||||
ind0_first = vec_sel(ind0_first,ind2,r1); | |||||
vf0= vec_sel(vf0,vf1,r1); | |||||
ind0_first+=temp0; //get absolute index | |||||
temp0+=temp1; | |||||
temp0+=temp1; //temp0+32 | |||||
//second part of 32 | |||||
// absolute temporary vectors | |||||
v0=vec_abs(v_ptrx[0]); | |||||
v1=vec_abs(v_ptrx[1]); | |||||
v2=vec_abs(v_ptrx[2]); | |||||
v3=vec_abs(v_ptrx[3]); | |||||
v4=vec_abs(v_ptrx[4]); | |||||
v5=vec_abs(v_ptrx[5]); | |||||
v6=vec_abs(v_ptrx[6]); | |||||
v7=vec_abs(v_ptrx[7]); | |||||
//cmp quadruple pairs | |||||
r1=vec_cmpgt(v0,v1); | |||||
r2=vec_cmpgt(v2,v3); | |||||
r3=vec_cmpgt(v4,v5); | |||||
r4=vec_cmpgt(v6,v7); | |||||
//select | |||||
register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||||
register __vector float vv0= vec_sel(v0,v1,r1); | |||||
ind1= vec_sel(static_index2,static_index3,r2); | |||||
register __vector float vv1= vec_sel(v2,v3,r2); | |||||
ind2= vec_sel(static_index0,static_index1,r3); | |||||
v0=vec_sel(v4,v5,r3); | |||||
ind3= vec_sel(static_index2,static_index3,r4); | |||||
v1=vec_sel(v6,v7,r4); | |||||
// cmp selected | |||||
r1=vec_cmpgt(vv0,vv1); | |||||
r2=vec_cmpgt(v0,v1); | |||||
v_ptrx+=8; | |||||
//select from above | |||||
ind0_second= vec_sel(ind0_second,ind1,r1); | |||||
vv0= vec_sel(vv0,vv1,r1) ; | |||||
ind2= vec_sel(ind2,ind3,r2); | |||||
vv1= vec_sel(v0,v1,r2) ; | |||||
//second indices actually should be within [16,31] so ind2+16 | |||||
ind2 +=temp1; | |||||
//final cmp and select index and value for the second 32 values | |||||
r1=vec_cmpgt(vv0,vv1); | |||||
ind0_second = vec_sel(ind0_second,ind2,r1); | |||||
vv0= vec_sel(vv0,vv1,r1); | |||||
ind0_second+=temp0; //get absolute index | |||||
//find final quadruple from 64 elements | |||||
r2=vec_cmpgt(vf0,vv0); | |||||
ind2 = vec_sel( ind0_first,ind0_second,r2); | |||||
vv0= vec_sel(vf0,vv0,r2); | |||||
//compare with old quadruple and update | |||||
r3=vec_cmpgt( quadruple_values,vv0); | |||||
quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||||
quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||||
temp0+=temp1; | |||||
temp0+=temp1; //temp0+32 | |||||
} | |||||
//now we have to chose from 4 values and 4 different indices | |||||
// we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||||
// otherwise we will assign index of the minimum value | |||||
float a1,a2,a3,a4; | |||||
unsigned int i1,i2,i3,i4; | |||||
a1=vec_extract(quadruple_values,0); | |||||
a2=vec_extract(quadruple_values,1); | |||||
a3=vec_extract(quadruple_values,2); | |||||
a4=vec_extract(quadruple_values,3); | |||||
i1=vec_extract(quadruple_indices,0); | |||||
i2=vec_extract(quadruple_indices,1); | |||||
i3=vec_extract(quadruple_indices,2); | |||||
i4=vec_extract(quadruple_indices,3); | |||||
if(a1==a2){ | |||||
index=i1>i2?i2:i1; | |||||
}else if(a2<a1){ | |||||
index=i2; | |||||
a1=a2; | |||||
}else{ | |||||
index= i1; | |||||
} | |||||
if(a4==a3){ | |||||
i1=i3>i4?i4:i3; | |||||
}else if(a4<a3){ | |||||
i1=i4; | |||||
a3=a4; | |||||
}else{ | |||||
i1= i3; | |||||
} | |||||
if(a1==a3){ | |||||
index=i1>index?index:i1; | |||||
*minf=a1; | |||||
}else if(a3<a1){ | |||||
index=i1; | |||||
*minf=a3; | |||||
}else{ | |||||
*minf=a1; | |||||
} | |||||
return index; | |||||
} | |||||
/* ISAMIN driver: return the 1-based index of the element of x with the
 * smallest absolute value, or 0 when the arguments are invalid
 * (n <= 0 or inc_x <= 0).  For unit stride the bulk of the vector is
 * delegated to the 64-wide SIMD kernel and a scalar loop finishes the
 * tail; any other stride is handled entirely in scalar code, unrolled
 * by four logical elements. */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG min = 0;
    FLOAT minf = 0.0;

    if (n <= 0 || inc_x <= 0) return (min);

    minf = ABS(x[0]); /* seed the running minimum with the first element */

    if (inc_x == 1) {
        BLASLONG i = 0;
        BLASLONG n1 = n & -64; /* largest multiple of 64 not exceeding n */
        if (n1 > 0) {
            min = siamin_kernel_64(n1, x, &minf);
            i = n1;
        }
        /* scalar tail: last n % 64 elements */
        for (; i < n; i++) {
            if (ABS(x[i]) < minf) {
                min = i;
                minf = ABS(x[i]);
            }
        }
        return (min + 1);
    }

    /* strided path: i is the storage offset, j the logical element index */
    {
        BLASLONG i = 0;
        BLASLONG j = 0;
        BLASLONG n1 = n & -4; /* unrolled portion, four elements per pass */
        while (j < n1) {
            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            if (ABS(x[i + inc_x]) < minf) {
                min = j + 1;
                minf = ABS(x[i + inc_x]);
            }
            if (ABS(x[i + 2 * inc_x]) < minf) {
                min = j + 2;
                minf = ABS(x[i + 2 * inc_x]);
            }
            if (ABS(x[i + 3 * inc_x]) < minf) {
                min = j + 3;
                minf = ABS(x[i + 3 * inc_x]);
            }
            i += inc_x * 4;
            j += 4;
        }
        /* remaining n % 4 elements */
        while (j < n) {
            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
#include <altivec.h> | |||||
#if defined(DOUBLE) | |||||
#define ABS fabs | |||||
#else | |||||
#define ABS fabsf | |||||
#endif | |||||
/** | |||||
* Find minimum index | |||||
* Warning: requirements n>0 and n % 64 == 0 | |||||
* @param n | |||||
* @param x pointer to the vector | |||||
 * @param minf (out) receives the minimum absolute value (output only)
* @return index | |||||
*/ | |||||
static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, BLASLONG *minf_dummy_see_below) {
    BLASLONG index;
    BLASLONG i=0;
    /* Per-lane index constants for one 16-float group; the four vectors
     * together label lanes 0..15 of a group. */
    register __vector unsigned int static_index0 = {0,1,2,3};
    register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
    register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
    register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
    register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
    register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
    /* temp0 becomes the running absolute base offset of the current 32-element
     * half; temp1 becomes the constant group stride {16,16,16,16}. */
    temp0=vec_xor(temp0,temp0);
    temp1=temp1 <<1 ; //{16,16,16,16}
    /* Per-lane best index/value seen so far; seeded from the first 4 floats. */
    register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3};
    register __vector float * v_ptrx=(__vector float *)x;
    register __vector float quadruple_values=vec_abs(v_ptrx[0]);
    /* Each iteration consumes 64 floats as two 32-element tournaments.
     * Throughout: vec_cmpgt(a,b) followed by vec_sel(a,b,mask) keeps the
     * SMALLER of a and b per lane, and the matching index select keeps
     * the index of that smaller value. */
    for(; i<n; i+=64){
        //absolute temporary vectors
        register __vector float v0=vec_abs(v_ptrx[0]);
        register __vector float v1=vec_abs(v_ptrx[1]);
        register __vector float v2=vec_abs(v_ptrx[2]);
        register __vector float v3=vec_abs(v_ptrx[3]);
        register __vector float v4=vec_abs(v_ptrx[4]);
        register __vector float v5=vec_abs(v_ptrx[5]);
        register __vector float v6=vec_abs(v_ptrx[6]);
        register __vector float v7=vec_abs(v_ptrx[7]);
        //cmp quadruple pairs
        register __vector bool int r1=vec_cmpgt(v0,v1);
        register __vector bool int r2=vec_cmpgt(v2,v3);
        register __vector bool int r3=vec_cmpgt(v4,v5);
        register __vector bool int r4=vec_cmpgt(v6,v7);
        //select
        register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1);
        register __vector float vf0= vec_sel(v0,v1,r1);
        register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2);
        register __vector float vf1= vec_sel(v2,v3,r2);
        register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3);
        v0=vec_sel(v4,v5,r3);
        register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4);
        v1=vec_sel(v6,v7,r4);
        // cmp selected
        r1=vec_cmpgt(vf0,vf1);
        r2=vec_cmpgt(v0,v1);
        v_ptrx+=8;
        //select from above
        ind0_first= vec_sel(ind0_first,ind1,r1);
        vf0= vec_sel(vf0,vf1,r1) ;
        ind2= vec_sel(ind2,ind3,r2);
        vf1= vec_sel(v0,v1,r2);
        //second indices actually should be within [16,31] so ind2+16
        ind2 +=temp1;
        //final cmp and select index and value for the first 32 values
        r1=vec_cmpgt(vf0,vf1);
        ind0_first = vec_sel(ind0_first,ind2,r1);
        vf0= vec_sel(vf0,vf1,r1);
        ind0_first+=temp0; //get absolute index
        temp0+=temp1;
        temp0+=temp1; //temp0+32
        //second part of 32
        // absolute temporary vectors
        v0=vec_abs(v_ptrx[0]);
        v1=vec_abs(v_ptrx[1]);
        v2=vec_abs(v_ptrx[2]);
        v3=vec_abs(v_ptrx[3]);
        v4=vec_abs(v_ptrx[4]);
        v5=vec_abs(v_ptrx[5]);
        v6=vec_abs(v_ptrx[6]);
        v7=vec_abs(v_ptrx[7]);
        //cmp quadruple pairs
        r1=vec_cmpgt(v0,v1);
        r2=vec_cmpgt(v2,v3);
        r3=vec_cmpgt(v4,v5);
        r4=vec_cmpgt(v6,v7);
        //select
        register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1);
        register __vector float vv0= vec_sel(v0,v1,r1);
        ind1= vec_sel(static_index2,static_index3,r2);
        register __vector float vv1= vec_sel(v2,v3,r2);
        ind2= vec_sel(static_index0,static_index1,r3);
        v0=vec_sel(v4,v5,r3);
        ind3= vec_sel(static_index2,static_index3,r4);
        v1=vec_sel(v6,v7,r4);
        // cmp selected
        r1=vec_cmpgt(vv0,vv1);
        r2=vec_cmpgt(v0,v1);
        v_ptrx+=8;
        //select from above
        ind0_second= vec_sel(ind0_second,ind1,r1);
        vv0= vec_sel(vv0,vv1,r1) ;
        ind2= vec_sel(ind2,ind3,r2);
        vv1= vec_sel(v0,v1,r2) ;
        //second indices actually should be within [16,31] so ind2+16
        ind2 +=temp1;
        //final cmp and select index and value for the second 32 values
        r1=vec_cmpgt(vv0,vv1);
        ind0_second = vec_sel(ind0_second,ind2,r1);
        vv0= vec_sel(vv0,vv1,r1);
        ind0_second+=temp0; //get absolute index
        //find final quadruple from 64 elements
        r2=vec_cmpgt(vf0,vv0);
        ind2 = vec_sel( ind0_first,ind0_second,r2);
        vv0= vec_sel(vf0,vv0,r2);
        //compare with old quadruple and update
        r3=vec_cmpgt( quadruple_values,vv0);
        quadruple_indices = vec_sel( quadruple_indices,ind2,r3);
        quadruple_values= vec_sel(quadruple_values,vv0,r3);
        temp0+=temp1;
        temp0+=temp1; //temp0+32
    }
    //now we have to chose from 4 values and 4 different indices
    // we will compare pairwise if pairs are exactly the same we will choose minimum between index
    // otherwise we will assign index of the minimum value
    float a1,a2,a3,a4;
    unsigned int i1,i2,i3,i4;
    a1=vec_extract(quadruple_values,0);
    a2=vec_extract(quadruple_values,1);
    a3=vec_extract(quadruple_values,2);
    a4=vec_extract(quadruple_values,3);
    i1=vec_extract(quadruple_indices,0);
    i2=vec_extract(quadruple_indices,1);
    i3=vec_extract(quadruple_indices,2);
    i4=vec_extract(quadruple_indices,3);
    /* Scalar reduction of the four surviving lanes; ties always resolve
     * toward the lower index (BLAS first-occurrence convention). */
    if(a1==a2){
        index=i1>i2?i2:i1;
    }else if(a2<a1){
        index=i2;
        a1=a2;
    }else{
        index= i1;
    }
    /* reduce lanes 2/3 into (a3, i1) */
    if(a4==a3){
        i1=i3>i4?i4:i3;
    }else if(a4<a3){
        i1=i4;
        a3=a4;
    }else{
        i1= i3;
    }
    /* final merge of the two pair winners; also publishes the minimum value */
    if(a1==a3){
        index=i1>index?index:i1;
        *minf=a1;
    }else if(a3<a1){
        index=i1;
        *minf=a3;
    }else{
        *minf=a1;
    }
    return index;
}
/* ISAMIN entry point: returns the 1-based index of the element of x with
 * minimum absolute value, or 0 for invalid arguments (n <= 0 or inc_x <= 0). */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0; /* storage offset into x */
    BLASLONG j = 0; /* logical element counter (strided path only) */
    BLASLONG min = 0;
    FLOAT minf = 0.0;
    if (n <= 0 || inc_x <= 0) return (min);
    minf = ABS(x[0]); //index's not incremented
    if (inc_x == 1) {
        /* unit stride: vector kernel handles the largest multiple-of-64 prefix */
        BLASLONG n1 = n & -64;
        if (n1 > 0) {
            min = siamin_kernel_64(n1, x, &minf);
            i = n1;
        }
        /* scalar tail for the remaining n % 64 elements */
        while (i < n) {
            if (ABS(x[i]) < minf) {
                min = i;
                minf = ABS(x[i]);
            }
            i++;
        }
        return (min + 1); /* convert to the 1-based BLAS index */
    } else {
        /* general stride: scalar scan, unrolled by four logical elements */
        BLASLONG n1 = n & -4;
        while (j < n1) {
            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            if (ABS(x[i + inc_x]) < minf) {
                min = j + 1;
                minf = ABS(x[i + inc_x]);
            }
            if (ABS(x[i + 2 * inc_x]) < minf) {
                min = j + 2;
                minf = ABS(x[i + 2 * inc_x]);
            }
            if (ABS(x[i + 3 * inc_x]) < minf) {
                min = j + 3;
                minf = ABS(x[i + 3 * inc_x]);
            }
            i += inc_x * 4;
            j += 4;
        }
        /* remaining n % 4 elements */
        while (j < n) {
            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}
@@ -1,272 +1,272 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

#define LOAD ld
/* size of the callee-saved register spill area allocated below entry SP */
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */

/* SGEMM argument registers per the OpenBLAS kernel calling convention */
#define M r3
#define N r4
#define K r5
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6

/* VSX registers holding the splatted alpha and the store-permute masks
 * built in the prologue below */
#define alpha_r vs20
#define save_permute_1 vs21
#define save_permute_2 vs22
#define permute_mask vs23
#define o0 0

/* scratch and loop-control GPRs used by the included macro/logic files */
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define T11 r29
#define T12 r30
#define T13 r31

#include "sgemm_macros_power9.S"

/* 64-bit halves of the 128-bit permute constants; materialized into VSRs
 * piecewise via lis/ori/rldicr/oris/ori below */
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_11, 0x1415161718191a1b
.equ save_permute_12, 0x0405060708090a0b
.equ save_permute_21, 0x101112131c1d1e1f
.equ save_permute_22, 0x000102030c0d0e0f

#ifndef NEEDPARAM

PROLOGUE
PROFCODE

/* allocate the spill frame and save the link register */
addi SP, SP, -STACKSIZE
mflr r0

/* save callee-saved FPRs f14..f31 */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)

/* save callee-saved GPRs r14..r31 */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)

/* save callee-saved vector-scalar registers vs52..vs63 (v20..v31) */
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)

#if defined(TRMMKERNEL)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
/* scale LDC by sizeof(float) */
slwi LDC, LDC, 2

/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xxspltw alpha_r,alpha_r,0

/*load reverse permute mask for big endian
  uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
/* build the four 128-bit permute/save masks: each 64-bit half is
 * assembled 16 bits at a time (lis/ori for the high word, shift,
 * oris/ori for the low word), then paired with mtvsrdd */
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
lis T5, save_permute_22@highest
lis T6, save_permute_21@highest

ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
ori T5, T5, save_permute_22@higher
ori T6, T6, save_permute_21@higher

rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
rldicr T5, T5, 32, 31
rldicr T6, T6, 32, 31

oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
oris T5, T5, save_permute_22@h
oris T6, T6, save_permute_21@h

ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
ori T5, T5, save_permute_22@l
ori T6, T6, save_permute_21@l

li r0,0
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
mtvsrdd save_permute_2,T5,T6

/* main GEMM loops live in the included logic file */
#include "sgemm_logic_power9.S"

.L999:
/* epilogue: restore FPRs, GPRs, link register and VSRs, then return */
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)

ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)

ld r0, FLINK_SAVE(SP)

lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)

addi SP, SP, STACKSIZE
blr

EPILOGUE
#endif
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

#define LOAD ld
/* size of the callee-saved register spill area allocated below entry SP */
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */

/* SGEMM argument registers per the OpenBLAS kernel calling convention */
#define M r3
#define N r4
#define K r5
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6

/* VSX registers holding the splatted alpha and the store-permute masks
 * built in the prologue below */
#define alpha_r vs20
#define save_permute_1 vs21
#define save_permute_2 vs22
#define permute_mask vs23
#define o0 0

/* scratch and loop-control GPRs used by the included macro/logic files */
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define T11 r29
#define T12 r30
#define T13 r31

#include "sgemm_macros_power9.S"

/* 64-bit halves of the 128-bit permute constants; materialized into VSRs
 * piecewise via lis/ori/rldicr/oris/ori below */
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_11, 0x1415161718191a1b
.equ save_permute_12, 0x0405060708090a0b
.equ save_permute_21, 0x101112131c1d1e1f
.equ save_permute_22, 0x000102030c0d0e0f

#ifndef NEEDPARAM

PROLOGUE
PROFCODE

/* allocate the spill frame and save the link register */
addi SP, SP, -STACKSIZE
mflr r0

/* save callee-saved FPRs f14..f31 */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)

/* save callee-saved GPRs r14..r31 */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)

/* save callee-saved vector-scalar registers vs52..vs63 (v20..v31) */
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)

#if defined(TRMMKERNEL)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
/* scale LDC by sizeof(float) */
slwi LDC, LDC, 2

/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xxspltw alpha_r,alpha_r,0

/*load reverse permute mask for big endian
  uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
/* build the four 128-bit permute/save masks: each 64-bit half is
 * assembled 16 bits at a time (lis/ori for the high word, shift,
 * oris/ori for the low word), then paired with mtvsrdd */
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
lis T5, save_permute_22@highest
lis T6, save_permute_21@highest

ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
ori T5, T5, save_permute_22@higher
ori T6, T6, save_permute_21@higher

rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
rldicr T5, T5, 32, 31
rldicr T6, T6, 32, 31

oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
oris T5, T5, save_permute_22@h
oris T6, T6, save_permute_21@h

ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
ori T5, T5, save_permute_22@l
ori T6, T6, save_permute_21@l

li r0,0
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
mtvsrdd save_permute_2,T5,T6

/* main GEMM loops live in the included logic file */
#include "sgemm_logic_power9.S"

.L999:
/* epilogue: restore FPRs, GPRs, link register and VSRs, then return */
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)

ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)

ld r0, FLINK_SAVE(SP)

lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)

addi SP, SP, STACKSIZE
blr

EPILOGUE
#endif
@@ -1,470 +1,470 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#if !defined(__VEC__) || !defined(__ALTIVEC__) | |||||
#include "../arm/gemv_n.c" | |||||
#else | |||||
#include "common.h" | |||||
#define NBMAX 4096 | |||||
/* y[0:n] += alpha * A * x for an 8-column panel of A.
 * ap holds pointers to the first four columns; the next four columns are
 * reached at offset lda4 from each.  n is assumed to be a multiple of 4
 * and the column/y buffers suitably aligned for vector loads. */
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
    BLASLONG i;
    FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3;
    FLOAT x0,x1,x2,x3,x4,x5,x6,x7;
    a0 = ap[0];
    a1 = ap[1];
    a2 = ap[2];
    a3 = ap[3];
    /* columns 4..7 sit lda4 floats past columns 0..3 */
    b0 = a0 + lda4 ;
    b1 = a1 + lda4 ;
    b2 = a2 + lda4 ;
    b3 = a3 + lda4 ;
    /* fold alpha into the eight x entries once, up front */
    x0 = xo[0] * *alpha;
    x1 = xo[1] * *alpha;
    x2 = xo[2] * *alpha;
    x3 = xo[3] * *alpha;
    x4 = xo[4] * *alpha;
    x5 = xo[5] * *alpha;
    x6 = xo[6] * *alpha;
    x7 = xo[7] * *alpha;
    __vector float* va0 = (__vector float*)a0;
    __vector float* va1 = (__vector float*)a1;
    __vector float* va2 = (__vector float*)a2;
    __vector float* va3 = (__vector float*)a3;
    __vector float* vb0 = (__vector float*)b0;
    __vector float* vb1 = (__vector float*)b1;
    __vector float* vb2 = (__vector float*)b2;
    __vector float* vb3 = (__vector float*)b3;
    /* splat each scaled x entry across all four vector lanes */
    __vector float v_x0 = {x0,x0,x0,x0};
    __vector float v_x1 = {x1,x1,x1,x1};
    __vector float v_x2 = {x2,x2,x2,x2};
    __vector float v_x3 = {x3,x3,x3,x3};
    __vector float v_x4 = {x4,x4,x4,x4};
    __vector float v_x5 = {x5,x5,x5,x5};
    __vector float v_x6 = {x6,x6,x6,x6};
    __vector float v_x7 = {x7,x7,x7,x7};
    __vector float* v_y =(__vector float*)y;
    /* four rows of y per iteration: accumulate all eight scaled columns */
    for ( i=0; i< n/4; i++)
    {
        register __vector float vy=v_y[i];
        vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
        vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ;
        v_y[i] =vy;
    }
}
/* y[0:n] += alpha * A(:,0:4) * x[0:4] for the four columns ap[0..3].
 * n must be a multiple of 4; pointers must be 16-byte aligned. */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    BLASLONG j, k;
    __vector float *col[4];
    __vector float xv[4];
    __vector float *vy = (__vector float*)y;

    for (k = 0; k < 4; k++) {
        FLOAT s = xo[k] * *alpha;      /* pre-scale x by alpha */
        xv[k]  = (__vector float){s, s, s, s};
        col[k] = (__vector float*)ap[k];
    }
    for (j = 0; j < n / 4; j++) {
        register __vector float acc = vy[j];
        acc += xv[0] * col[0][j] + xv[1] * col[1][j] + xv[2] * col[2][j] + xv[3] * col[3][j];
        vy[j] = acc;
    }
}
/* y[0:n] += alpha * (x[0]*A(:,0) + x[1]*A(:,1)) for columns ap[0..1].
 * n must be a multiple of 4; pointers must be 16-byte aligned. */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG j;
    FLOAT s0 = x[0] * *alpha;
    FLOAT s1 = x[1] * *alpha;
    __vector float xv0 = {s0, s0, s0, s0};
    __vector float xv1 = {s1, s1, s1, s1};
    __vector float *vy = (__vector float*)y;
    __vector float *c0 = (__vector float*)ap[0];
    __vector float *c1 = (__vector float*)ap[1];

    for (j = 0; j < n / 4; j++)
        vy[j] += xv0 * c0[j] + xv1 * c1[j];
}
/* y[0:n] += alpha * x[0] * A(:,0) for the single column at ap.
 * n must be a multiple of 4; pointers must be 16-byte aligned. */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG j;
    FLOAT s = x[0] * *alpha;
    __vector float xv   = {s, s, s, s};
    __vector float *vy  = (__vector float*)y;
    __vector float *col = (__vector float*)ap;

    for (j = 0; j < n / 4; j++)
        vy[j] += xv * col[j];
}
/* Scatter-accumulate: add the n contiguous values in src onto dest,
 * where dest elements are inc_dest floats apart. */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k;
    for (k = 0; k < n; k++, dest += inc_dest)
        *dest += src[k];
}
/*
 * SGEMV, no-transpose:  y := y + alpha * A * x
 *
 *   m, n     dimensions of A (column-major, leading dimension lda)
 *   dummy1   unused; kept to match the shared GEMV kernel signature
 *   alpha    scalar multiplier
 *   x, inc_x input vector and its stride
 *   y, inc_y output vector and its stride (updated in place)
 *   buffer   caller-supplied workspace used to accumulate a contiguous
 *            slice of y when inc_y != 1
 *
 * Strategy: process rows in blocks of up to NBMAX so the active y slice
 * stays cache-resident; inside a block, feed the vector kernels 8 (or 4)
 * columns at a time.  The last m % 4 rows fall through to scalar code.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
/* ap[0..3]: pointers to four consecutive columns of the current block */
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* element offsets spanning 4 and 8 columns, respectively */
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
/* staging area for strided x (aligned for the vector kernels) */
FLOAT xbuffer[8] __attribute__((aligned(16)));
FLOAT *ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
/* contiguous x: 8 columns per pass; strided x: 4 per pass via xbuffer */
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
/* m3: trailing rows (m % 4) for the scalar tail; m1: rows rounded down to a
   multiple of 4; m2: size of the final partial row block */
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
/* one iteration per row block; the last (partial) block shrinks NB,
   which terminates the loop */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
/* strided y: accumulate into a zeroed contiguous buffer, scatter later.
   NB*4 bytes == NB floats; assumes sizeof(FLOAT) == 4 (single precision). */
if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
/* full 8-column passes */
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
/* leftover columns: 4, then 2, then 1 */
if ( n2 & 4 )
{
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
/* strided x: gather 4 values at a time into the aligned xbuffer */
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
/* remaining columns one at a time */
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
/* advance to the next row block; scatter buffered y if it was strided */
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
/* scalar tail: the last m3 = m % 4 rows, each a dot product with x */
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
/* packed case: lda == 3 lets us walk A linearly, 4 columns per step */
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
/* general strides */
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
/* packed case: lda == 2, walk A linearly, 4 columns per step */
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
/* general strides */
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
/* fully contiguous: plain 4-way unrolled dot product */
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
/* general strides */
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}
#endif | |||||
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#if !defined(__VEC__) || !defined(__ALTIVEC__) | |||||
#include "../arm/gemv_n.c" | |||||
#else | |||||
#include "common.h" | |||||
#define NBMAX 4096 | |||||
/* y[0:n] += alpha * A(:,0:8) * x[0:8] for eight columns at once.
 * ap[0..3] point at the first four columns; the other four columns sit
 * lda4 floats further on.  Caller guarantees n is a multiple of 4 and
 * all pointers are 16-byte aligned for the vector loads/stores. */
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
    BLASLONG j, k;
    __vector float *col[8];
    __vector float xv[8];
    __vector float *vy = (__vector float*)y;

    /* first four columns come straight from ap[]; the second four are
       the same pointers advanced by lda4 floats */
    for (k = 0; k < 4; k++) {
        col[k]     = (__vector float*)ap[k];
        col[k + 4] = (__vector float*)(ap[k] + lda4);
    }
    /* splat alpha*x[k] across all four vector lanes */
    for (k = 0; k < 8; k++) {
        FLOAT s = xo[k] * *alpha;
        xv[k] = (__vector float){s, s, s, s};
    }
    for (j = 0; j < n / 4; j++) {
        register __vector float acc = vy[j];
        acc += xv[0] * col[0][j] + xv[1] * col[1][j] + xv[2] * col[2][j] + xv[3] * col[3][j];
        acc += xv[4] * col[4][j] + xv[5] * col[5][j] + xv[6] * col[6][j] + xv[7] * col[7][j];
        vy[j] = acc;
    }
}
/* y[0:n] += alpha * A(:,0:4) * x[0:4] for the four columns ap[0..3].
 * n must be a multiple of 4; pointers must be 16-byte aligned. */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    BLASLONG j, k;
    __vector float *col[4];
    __vector float xv[4];
    __vector float *vy = (__vector float*)y;

    for (k = 0; k < 4; k++) {
        FLOAT s = xo[k] * *alpha;      /* pre-scale x by alpha */
        xv[k]  = (__vector float){s, s, s, s};
        col[k] = (__vector float*)ap[k];
    }
    for (j = 0; j < n / 4; j++) {
        register __vector float acc = vy[j];
        acc += xv[0] * col[0][j] + xv[1] * col[1][j] + xv[2] * col[2][j] + xv[3] * col[3][j];
        vy[j] = acc;
    }
}
/* y[0:n] += alpha * (x[0]*A(:,0) + x[1]*A(:,1)) for columns ap[0..1].
 * n must be a multiple of 4; pointers must be 16-byte aligned. */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG j;
    FLOAT s0 = x[0] * *alpha;
    FLOAT s1 = x[1] * *alpha;
    __vector float xv0 = {s0, s0, s0, s0};
    __vector float xv1 = {s1, s1, s1, s1};
    __vector float *vy = (__vector float*)y;
    __vector float *c0 = (__vector float*)ap[0];
    __vector float *c1 = (__vector float*)ap[1];

    for (j = 0; j < n / 4; j++)
        vy[j] += xv0 * c0[j] + xv1 * c1[j];
}
/* y[0:n] += alpha * x[0] * A(:,0) for the single column at ap.
 * n must be a multiple of 4; pointers must be 16-byte aligned. */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG j;
    FLOAT s = x[0] * *alpha;
    __vector float xv   = {s, s, s, s};
    __vector float *vy  = (__vector float*)y;
    __vector float *col = (__vector float*)ap;

    for (j = 0; j < n / 4; j++)
        vy[j] += xv * col[j];
}
/* Scatter-accumulate: add the n contiguous values in src onto dest,
 * where dest elements are inc_dest floats apart. */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k;
    for (k = 0; k < n; k++, dest += inc_dest)
        *dest += src[k];
}
/*
 * SGEMV, no-transpose:  y := y + alpha * A * x
 *
 *   m, n     dimensions of A (column-major, leading dimension lda)
 *   dummy1   unused; kept to match the shared GEMV kernel signature
 *   alpha    scalar multiplier
 *   x, inc_x input vector and its stride
 *   y, inc_y output vector and its stride (updated in place)
 *   buffer   caller-supplied workspace used to accumulate a contiguous
 *            slice of y when inc_y != 1
 *
 * Strategy: process rows in blocks of up to NBMAX so the active y slice
 * stays cache-resident; inside a block, feed the vector kernels 8 (or 4)
 * columns at a time.  The last m % 4 rows fall through to scalar code.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
/* ap[0..3]: pointers to four consecutive columns of the current block */
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* element offsets spanning 4 and 8 columns, respectively */
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
/* staging area for strided x (aligned for the vector kernels) */
FLOAT xbuffer[8] __attribute__((aligned(16)));
FLOAT *ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
/* contiguous x: 8 columns per pass; strided x: 4 per pass via xbuffer */
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
/* m3: trailing rows (m % 4) for the scalar tail; m1: rows rounded down to a
   multiple of 4; m2: size of the final partial row block */
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
/* one iteration per row block; the last (partial) block shrinks NB,
   which terminates the loop */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
/* strided y: accumulate into a zeroed contiguous buffer, scatter later.
   NB*4 bytes == NB floats; assumes sizeof(FLOAT) == 4 (single precision). */
if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
/* full 8-column passes */
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
/* leftover columns: 4, then 2, then 1 */
if ( n2 & 4 )
{
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
/* strided x: gather 4 values at a time into the aligned xbuffer */
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
/* remaining columns one at a time */
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
/* advance to the next row block; scatter buffered y if it was strided */
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
/* scalar tail: the last m3 = m % 4 rows, each a dot product with x */
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
/* packed case: lda == 3 lets us walk A linearly, 4 columns per step */
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
/* general strides */
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
/* packed case: lda == 2, walk A linearly, 4 columns per step */
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
/* general strides */
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
/* fully contiguous: plain 4-way unrolled dot product */
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
/* general strides */
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}
#endif | |||||
/* NOTE(review): stray diff hunk header ("@@ -1,484 +1,484 @@") left over from
   concatenating kernel files -- not valid C; remove when the files are split. */
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#if !defined(__VEC__) || !defined(__ALTIVEC__) | |||||
#include "../arm/gemv_t.c" | |||||
#else | |||||
#include "common.h" | |||||
#define NBMAX 2048 | |||||
#include <altivec.h> | |||||
/* Eight simultaneous dot products:  y[k] += alpha * dot(A(:,k), x),
 * k = 0..7, where the columns start at ap and are lda floats apart.
 * n must be a multiple of 4; pointers must be 16-byte aligned. */
static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    BLASLONG j, k;
    __vector float *col[8];
    __vector float acc[8];
    __vector float *vx = (__vector float*) x;
    FLOAT *p = ap;

    for (k = 0; k < 8; k++) {
        col[k] = (__vector float*) p;
        acc[k] = (__vector float){0, 0, 0, 0};
        p += lda;
    }
    for (j = 0; j < n / 4; j++) {
        register __vector float xv = vx[j];
        acc[0] += xv * col[0][j];
        acc[1] += xv * col[1][j];
        acc[2] += xv * col[2][j];
        acc[3] += xv * col[3][j];
        acc[4] += xv * col[4][j];
        acc[5] += xv * col[5][j];
        acc[6] += xv * col[6][j];
        acc[7] += xv * col[7][j];
    }
    /* horizontal add of each accumulator's four lanes, scaled by alpha */
    for (k = 0; k < 8; k++)
        y[k] += alpha * (acc[k][0] + acc[k][1] + acc[k][2] + acc[k][3]);
}
/* Four simultaneous dot products:  y[k] += alpha * dot(A(:,k), x),
 * k = 0..3, where the columns start at ap and are lda floats apart.
 * n must be a multiple of 4; pointers must be 16-byte aligned. */
static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    BLASLONG j, k;
    __vector float *col[4];
    __vector float acc[4];
    __vector float *vx = (__vector float*) x;
    FLOAT *p = ap;

    for (k = 0; k < 4; k++) {
        col[k] = (__vector float*) p;
        acc[k] = (__vector float){0, 0, 0, 0};
        p += lda;
    }
    for (j = 0; j < n / 4; j++) {
        register __vector float xv = vx[j];
        acc[0] += xv * col[0][j];
        acc[1] += xv * col[1][j];
        acc[2] += xv * col[2][j];
        acc[3] += xv * col[3][j];
    }
    /* horizontal add of each accumulator's four lanes, scaled by alpha */
    for (k = 0; k < 4; k++)
        y[k] += alpha * (acc[k][0] + acc[k][1] + acc[k][2] + acc[k][3]);
}
/* Two dot products with strided output:
 *   y[0]     += alpha * dot(A(:,0), x)
 *   y[inc_y] += alpha * dot(A(:,1), x)
 * Columns start at ap and are lda floats apart; n is a multiple of 4. */
static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
    BLASLONG j;
    __vector float *c0 = (__vector float*) ap;
    __vector float *c1 = (__vector float*) (ap + lda);
    __vector float *vx = (__vector float*) x;
    __vector float s0 = {0, 0, 0, 0};
    __vector float s1 = {0, 0, 0, 0};

    for (j = 0; j < n / 4; j++) {
        s0 += vx[j] * c0[j];
        s1 += vx[j] * c1[j];
    }
    /* reduce the four lanes of each sum, scale, and accumulate */
    y[0]     += alpha * (s0[0] + s0[1] + s0[2] + s0[3]);
    y[inc_y] += alpha * (s1[0] + s1[1] + s1[2] + s1[3]);
}
/* Single dot product:  y[0] += alpha * dot(A(:,0), x).
 * n must be a multiple of 4; pointers must be 16-byte aligned. */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    BLASLONG j;
    __vector float *col = (__vector float*) ap;
    __vector float *vx  = (__vector float*) x;
    __vector float acc  = {0, 0, 0, 0};

    for (j = 0; j < n / 4; j++)
        acc += vx[j] * col[j];
    /* horizontal add of the four lanes, then scale by alpha */
    y[0] += alpha * (acc[0] + acc[1] + acc[2] + acc[3]);
}
/* Gather n strided elements of src (stride inc_src) into contiguous dest. */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG k;
    for (k = 0; k < n; k++, src += inc_src)
        dest[k] = *src;
}
/*
 * SGEMV, transpose:  y := y + alpha * A^T * x
 * (each y[j] accumulates the dot product of column j of A with x)
 *
 *   m, n     dimensions of A (column-major, leading dimension lda)
 *   dummy1   unused; kept to match the shared GEMV kernel signature
 *   alpha    scalar multiplier
 *   x, inc_x input vector and its stride
 *   y, inc_y output vector and its stride (updated in place)
 *   buffer   caller-supplied workspace; holds a contiguous copy of the
 *            current x slice when inc_x != 1
 *
 * The dot-product length m is blocked in chunks of up to NBMAX so the x
 * slice stays cache-resident; each block adds its partial dot products
 * into y.  The last m % 4 rows are handled by scalar tail code.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* stack buffer collecting up to 8 partial y values when inc_y != 1 */
FLOAT ybuffer[8] __attribute__((aligned(16)));
FLOAT *xbuffer;
if (m < 1) return (0);
if (n < 1) return (0);
xbuffer = buffer;
/* n1 full 8-column passes, n2 leftover columns */
n1 = n >> 3;
n2 = n & 7;
/* m3: trailing rows (m % 4); m1: rows rounded down to a multiple of 4;
   m2: size of the final partial row block */
m3 = m & 3;
m1 = m - m3;
m2 = (m & (NBMAX - 1)) - m3;
BLASLONG NB = NBMAX;
/* one iteration per row block; the last (partial) block shrinks NB,
   which terminates the loop */
while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}
/* y is rescanned from the start each block: every block contributes
   its partial dot products to all n outputs */
y_ptr = y;
a_ptr = a;
x_ptr = x;
/* strided x: pack the current NB-long slice into contiguous storage */
if (inc_x != 1)
copy_x(NB, x_ptr, xbuffer, inc_x);
else
xbuffer = x_ptr;
BLASLONG lda8 = lda << 3;
if (inc_y == 1) {
/* contiguous y: kernels accumulate straight into y */
for (i = 0; i < n1; i++) {
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
y_ptr += 8;
a_ptr += lda8;
}
} else {
/* strided y: accumulate into ybuffer, then scatter with stride inc_y */
for (i = 0; i < n1; i++) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
ybuffer[4] = 0;
ybuffer[5] = 0;
ybuffer[6] = 0;
ybuffer[7] = 0;
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
*y_ptr += ybuffer[4];
y_ptr += inc_y;
*y_ptr += ybuffer[5];
y_ptr += inc_y;
*y_ptr += ybuffer[6];
y_ptr += inc_y;
*y_ptr += ybuffer[7];
y_ptr += inc_y;
a_ptr += lda8;
}
}
/* leftover columns: 4, then 2, then 1 */
if (n2 & 4) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda<<2;
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
}
if (n2 & 2) {
/* this kernel handles the y stride itself */
sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
a_ptr += lda << 1;
y_ptr += 2 * inc_y;
}
if (n2 & 1) {
sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
a_ptr += lda;
y_ptr += inc_y;
}
/* advance A and x past this row block */
a += NB;
x += NB * inc_x;
}
/* scalar tail: the last m3 = m % 4 rows contribute to every y[j] */
if (m3 == 0) return (0);
x_ptr = x;
a_ptr = a;
if (m3 == 3) {
/* pre-scale the three remaining x values by alpha */
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
/* packed case: lda == 3 lets us walk A linearly, 4 columns per step */
if (lda == 3 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
} else {
if (inc_y == 1) {
/* general lda, contiguous y: 4-way unrolled over columns */
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
aj += lda;
}
} else {
/* general strides */
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
if (m3 == 2) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
/* packed case: lda == 2, walk A linearly, 4 columns per step */
if (lda == 2 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
aj += 8;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
aj += 2;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
aj += lda;
}
} else {
/* general strides */
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
/* fallthrough: m3 == 1 (single remaining row) */
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 1 && inc_y == 1) {
/* fully contiguous: 4-way unrolled axpy */
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[j] * xtemp;
y_ptr[j + 1] += aj[j + 1] * xtemp;
y_ptr[j + 2] += aj[j + 2] * xtemp;
y_ptr[j + 3] += aj[j + 3] * xtemp;
}
for (; j < n; j++) {
y_ptr[j] += aj[j] * xtemp;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp;
y_ptr[j + 1] += *(aj + lda) * xtemp;
y_ptr[j + 2] += *(aj + lda2) * xtemp;
y_ptr[j + 3] += *(aj + lda3) * xtemp;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp;
aj += lda;
}
} else {
/* general strides */
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
#endif | |||||
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#if !defined(__VEC__) || !defined(__ALTIVEC__) | |||||
#include "../arm/gemv_t.c" | |||||
#else | |||||
#include "common.h" | |||||
#define NBMAX 2048 | |||||
#include <altivec.h> | |||||
static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    /*
     * y[0..7] += alpha * dot(column_c, x) for 8 consecutive columns of ap
     * (columns are lda floats apart).  Only the n/4 full 4-float vectors of
     * each column are processed; any n % 4 remainder is ignored here.
     */
    BLASLONG i, c;
    __vector float *col[8];
    __vector float acc[8];
    __vector float *vx = (__vector float*) x;
    FLOAT *a_col = ap;

    /* One vector pointer and one zeroed accumulator per column. */
    for (c = 0; c < 8; c++) {
        col[c] = (__vector float*) a_col;
        a_col += lda;
        acc[c] = (__vector float) {0, 0, 0, 0};
    }
    /* Multiply-accumulate 4 floats at a time down each column. */
    for (i = 0; i < n / 4; i++) {
        for (c = 0; c < 8; c++) {
            acc[c] += vx[i] * col[c][i];
        }
    }
    /* Horizontal sum of each accumulator, scaled by alpha, into y. */
    for (c = 0; c < 8; c++) {
        y[c] += alpha * (acc[c][0] + acc[c][1] + acc[c][2] + acc[c][3]);
    }
}
static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    /*
     * y[0..3] += alpha * dot(column_c, x) for 4 consecutive columns of ap.
     * Only the n/4 full 4-float vectors are processed; the remainder is
     * ignored here.
     */
    BLASLONG i, c;
    __vector float *col[4];
    __vector float acc[4];
    __vector float *vx = (__vector float*) x;

    for (c = 0; c < 4; c++) {
        col[c] = (__vector float*) (ap + c * lda);
        acc[c] = (__vector float) {0, 0, 0, 0};
    }
    for (i = 0; i < n / 4; i++) {
        for (c = 0; c < 4; c++) {
            acc[c] += vx[i] * col[c][i];
        }
    }
    /* Horizontal sum of each accumulator, scaled by alpha, into y. */
    for (c = 0; c < 4; c++) {
        y[c] += alpha * (acc[c][0] + acc[c][1] + acc[c][2] + acc[c][3]);
    }
}
static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
    /*
     * Two-column variant: y[0] and y[inc_y] each get alpha times the dot
     * product of one column of ap with x (n/4 full vectors only).
     */
    BLASLONG i;
    __vector float *col0 = (__vector float*) ap;
    __vector float *col1 = (__vector float*) (ap + lda);
    __vector float *vx = (__vector float*) x;
    __vector float acc0 = {0, 0, 0, 0};
    __vector float acc1 = {0, 0, 0, 0};

    for (i = 0; i < n / 4; i++) {
        acc0 += vx[i] * col0[i];
        acc1 += vx[i] * col1[i];
    }
    y[0]     += alpha * (acc0[0] + acc0[1] + acc0[2] + acc0[3]);
    y[inc_y] += alpha * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
}
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    /* Single-column variant: y[0] += alpha * dot(ap, x), n/4 full vectors. */
    BLASLONG i;
    __vector float *col = (__vector float*) ap;
    __vector float *vx = (__vector float*) x;
    __vector float acc = {0, 0, 0, 0};

    for (i = 0; i < n / 4; i++) {
        acc += vx[i] * col[i];
    }
    y[0] += alpha * (acc[0] + acc[1] + acc[2] + acc[3]);
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    /* Pack n elements of src (stride inc_src) into the contiguous dest. */
    BLASLONG k;
    for (k = 0; k < n; k++) {
        dest[k] = src[k * inc_src];
    }
}
/*
 * Transposed sgemv: y += alpha * A^T * x (this file replaces the scalar
 * ../arm/gemv_t.c fallback included above when AltiVec/VSX is unavailable).
 * A is m x n, column-major with leading dimension lda; x has stride inc_x,
 * y has stride inc_y.  dummy1 is unused.  The m dimension is walked in
 * blocks of at most NBMAX rows: each block's dot products against groups of
 * 8/4/2/1 columns go through the vector kernels above, and the final m % 4
 * rows are finished with scalar tail code.  buffer is workspace used to
 * pack x contiguously when inc_x != 1.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* Staging area for one 8-wide kernel call when y is strided. */
FLOAT ybuffer[8] __attribute__((aligned(16)));
FLOAT *xbuffer;
if (m < 1) return (0);
if (n < 1) return (0);
xbuffer = buffer;
/* n split into n1 groups of 8 columns plus an n % 8 remainder (n2). */
n1 = n >> 3;
n2 = n & 7;
/* m split into a multiple-of-4 main part (m1) and an m % 4 tail (m3);
   m2 is the size of the last, partial NBMAX block (also a multiple of 4). */
m3 = m & 3;
m1 = m - m3;
m2 = (m & (NBMAX - 1)) - m3;
BLASLONG NB = NBMAX;
while (NB == NBMAX) {
/* Consume full NBMAX blocks; then switch to the m2-sized remainder
   block for one final pass, or stop when there is none. */
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
/* Pack strided x so the vector kernels can read it contiguously. */
if (inc_x != 1)
copy_x(NB, x_ptr, xbuffer, inc_x);
else
xbuffer = x_ptr;
BLASLONG lda8 = lda << 3;
if (inc_y == 1) {
/* Contiguous y: the 4x8 kernel can update y in place. */
for (i = 0; i < n1; i++) {
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
y_ptr += 8;
a_ptr += lda8;
}
} else {
/* Strided y: accumulate into ybuffer, then scatter by inc_y. */
for (i = 0; i < n1; i++) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
ybuffer[4] = 0;
ybuffer[5] = 0;
ybuffer[6] = 0;
ybuffer[7] = 0;
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
*y_ptr += ybuffer[4];
y_ptr += inc_y;
*y_ptr += ybuffer[5];
y_ptr += inc_y;
*y_ptr += ybuffer[6];
y_ptr += inc_y;
*y_ptr += ybuffer[7];
y_ptr += inc_y;
a_ptr += lda8;
}
}
/* Column remainder: handle 4, then 2, then 1 leftover columns. */
if (n2 & 4) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda<<2;
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
}
if (n2 & 2) {
sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
a_ptr += lda << 1;
y_ptr += 2 * inc_y;
}
if (n2 & 1) {
sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
a_ptr += lda;
y_ptr += inc_y;
}
/* Advance to the next block of NB rows. */
a += NB;
x += NB * inc_x;
}
/* Scalar tail: the last m3 (= m % 4) rows contribute a 1/2/3-term dot
   product to every y[j].  alpha is folded into xtemp* up front. */
if (m3 == 0) return (0);
x_ptr = x;
a_ptr = a;
if (m3 == 3) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
/* Fast path: columns packed back to back (lda == 3), contiguous y. */
if (lda == 3 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
} else {
if (inc_y == 1) {
/* General lda, contiguous y: 4 columns per iteration. */
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
aj += lda;
}
} else {
/* Fully general: strided y, one column at a time. */
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
if (m3 == 2) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
/* Fast path: columns packed back to back (lda == 2), contiguous y. */
if (lda == 2 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
aj += 8;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
aj += 2;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
/* m3 == 1: single remaining row. */
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 1 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[j] * xtemp;
y_ptr[j + 1] += aj[j + 1] * xtemp;
y_ptr[j + 2] += aj[j + 2] * xtemp;
y_ptr[j + 3] += aj[j + 3] * xtemp;
}
for (; j < n; j++) {
y_ptr[j] += aj[j] * xtemp;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp;
y_ptr[j + 1] += *(aj + lda) * xtemp;
y_ptr[j + 2] += *(aj + lda2) * xtemp;
y_ptr[j + 3] += *(aj + lda3) * xtemp;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
#endif |
/* ---- next concatenated file: POWER9 zgemm kernel (assembly) ---- */
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "def_vsx.h" | |||||
#define LOAD ld | |||||
#define STACKSIZE 512 | |||||
#define FZERO 312+192(SP) | |||||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||||
#define M r3 | |||||
#define N r4 | |||||
#define K r5 | |||||
#define A r8 | |||||
#define B r9 | |||||
#define C r10 | |||||
#define LDC r6 | |||||
#define OFFSET r7 | |||||
#define o0 0 | |||||
#define alpha_r vs30 | |||||
#define alpha_i vs31 | |||||
#define VECSAVE r11 | |||||
#define FRAMEPOINTER r12 | |||||
#define T10 r14 | |||||
#define L r15 | |||||
#define T8 r16 | |||||
#define T5 r17 | |||||
#define T2 r19 | |||||
#define TEMP_REG r20 | |||||
#define T6 r21 | |||||
#define I r22 | |||||
#define J r23 | |||||
#define AO r24 | |||||
#define BO r25 | |||||
#define CO r26 | |||||
#define T7 r27 | |||||
#define T3 r28 | |||||
#define T4 r29 | |||||
#define PRE r30 | |||||
#define T1 r31 | |||||
#ifndef NEEDPARAM
/*
 * POWER9 zgemm kernel driver.  The compute loops live in the included
 * zgemm_logic_power9.S / zgemm_macros_power9.S; this wrapper builds a
 * STACKSIZE-byte frame, saves the nonvolatile FP / integer / vector-scalar
 * registers, splats alpha into vs30/vs31, runs the kernel, then restores
 * everything and returns at L999.
 */
PROLOGUE
PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
mflr r0
/* Save nonvolatile FPRs f14-f31. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Splat alpha's real and imaginary parts across alpha_r/alpha_i. */
xxspltd alpha_r,vs1,0 /*copy from register f1 */
xxspltd alpha_i,vs2,0 /*copy from register f2 */
/* Save nonvolatile GPRs r14-r31. */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
/* Save nonvolatile vector-scalar registers vs52-vs63. */
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
/* Save the link register (read into r0 by mflr above). */
std r0, FLINK_SAVE(SP)
#if defined(linux) || defined(__FreeBSD__)
/* ldc arrives on the caller's frame (stack slot 0). */
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#ifdef TRMMKERNEL
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#include "zgemm_macros_power9.S"
/* Scale LDC by the complex element size (ZBASE_SHIFT). */
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 512
li r0, 0
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
/*negate for this case as we will use addition -1*(a+b) */
xvnegdp alpha_r,alpha_r
xvnegdp alpha_i,alpha_i
#endif
.align 4
#include "zgemm_logic_power9.S"
L999:
/* Epilogue: restore FPRs, GPRs, LR and VSRs, pop the frame, return. */
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "def_vsx.h" | |||||
#define LOAD ld | |||||
#define STACKSIZE 512 | |||||
#define FZERO 312+192(SP) | |||||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||||
#define M r3 | |||||
#define N r4 | |||||
#define K r5 | |||||
#define A r8 | |||||
#define B r9 | |||||
#define C r10 | |||||
#define LDC r6 | |||||
#define OFFSET r7 | |||||
#define o0 0 | |||||
#define alpha_r vs30 | |||||
#define alpha_i vs31 | |||||
#define VECSAVE r11 | |||||
#define FRAMEPOINTER r12 | |||||
#define T10 r14 | |||||
#define L r15 | |||||
#define T8 r16 | |||||
#define T5 r17 | |||||
#define T2 r19 | |||||
#define TEMP_REG r20 | |||||
#define T6 r21 | |||||
#define I r22 | |||||
#define J r23 | |||||
#define AO r24 | |||||
#define BO r25 | |||||
#define CO r26 | |||||
#define T7 r27 | |||||
#define T3 r28 | |||||
#define T4 r29 | |||||
#define PRE r30 | |||||
#define T1 r31 | |||||
#ifndef NEEDPARAM
/*
 * POWER9 zgemm kernel driver (duplicate copy of the preceding file in this
 * concatenation).  The compute loops live in the included
 * zgemm_logic_power9.S / zgemm_macros_power9.S; this wrapper builds a
 * STACKSIZE-byte frame, saves the nonvolatile FP / integer / vector-scalar
 * registers, splats alpha into vs30/vs31, runs the kernel, then restores
 * everything and returns at L999.
 */
PROLOGUE
PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
mflr r0
/* Save nonvolatile FPRs f14-f31. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Splat alpha's real and imaginary parts across alpha_r/alpha_i. */
xxspltd alpha_r,vs1,0 /*copy from register f1 */
xxspltd alpha_i,vs2,0 /*copy from register f2 */
/* Save nonvolatile GPRs r14-r31. */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
/* Save nonvolatile vector-scalar registers vs52-vs63. */
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
/* Save the link register (read into r0 by mflr above). */
std r0, FLINK_SAVE(SP)
#if defined(linux) || defined(__FreeBSD__)
/* ldc arrives on the caller's frame (stack slot 0). */
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#ifdef TRMMKERNEL
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#include "zgemm_macros_power9.S"
/* Scale LDC by the complex element size (ZBASE_SHIFT). */
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 512
li r0, 0
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
/*negate for this case as we will use addition -1*(a+b) */
xvnegdp alpha_r,alpha_r
xvnegdp alpha_i,alpha_i
#endif
.align 4
#include "zgemm_logic_power9.S"
L999:
/* Epilogue: restore FPRs, GPRs, LR and VSRs, pop the frame, return. */
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
/* ---- next concatenated file: x86-64 AVX2 inline-asm kernel macros ---- */
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ | |||||
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ | |||||
/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||||
/* init_m8n4: emit asm text that zeroes the four ymm accumulators c1..c4. */
#define init_m8n4(c1,c2,c3,c4)\
"vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\
"vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";"
/* m=8 kernels accumulate in ymm4-ymm15; wider n zeroes more registers. */
#define INIT_m8n4 init_m8n4(4,5,6,7)
#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11)
#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15)
/* init_m4n4: same idea with xmm registers (4 floats per register). */
#define init_m4n4(c1,c2,c3,c4)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\
"vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";"
#define INIT_m4n4 init_m4n4(4,5,6,7)
#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11)
#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15)
/* init_m2n4 / init_m1n4: two- and one-accumulator variants for m=2 / m=1. */
#define init_m2n4(c1,c2)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"
#define INIT_m2n4 init_m2n4(4,5)
#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7)
#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9)
#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";"
#define INIT_m1n4 init_m1n4(4)
#define INIT_m1n8 INIT_m1n4 init_m1n4(5)
#define INIT_m1n12 INIT_m1n8 init_m1n4(6)
/*
 * One k-iteration of the negated GEMM update (vfnmadd231ps: acc -= a*b),
 * m=8 rows from %0 (a_ptr), broadcast pairs of B elements from %1 (b_ptr).
 * Per the register map above, r12 holds the size of k elements, so
 * (%1,%%r12,4) and (%1,%%r12,8) address the B panels for columns 4-7 and
 * 8-11 in the wider n variants.
 */
#define GEMM_KERNEL_k1m8n4 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\
"vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\
"vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;"
#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\
"vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\
"vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;"
#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\
"vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\
"vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;"
/* m=4 variants: xmm registers, B pairs loaded with vmovddup. */
#define GEMM_KERNEL_k1m4n4 \
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\
"vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
"vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\
"vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\
"vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;"
#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\
"vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;"
/* m=2 variants: broadcast the two A elements, load 4 B floats at once. */
#define GEMM_KERNEL_k1m2n4 \
"vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\
"vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"
#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\
"vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\
"vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"
/* m=1 variants: one broadcast A element against the B rows. */
#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;"
#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;"
#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;"
/*
 * Load 4 columns of 8 C floats via %3 (c_tmp, advanced by 2*ldc per pair),
 * interleave them with unpck so they match the accumulator layout, add into
 * c1..c4, and prefetch "prefpos" bytes ahead for the next tile.
 */
#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";"
/*
 * 4-row version: load/interleave 4 columns of 4 C floats, combine with the
 * xmm accumulators c1..c4, then merge the results into the two ymm outputs
 * co1/co2 with vperm2f128.
 */
#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\
"vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";"
/* Load four 2-float rows of C (stride ldc=%4) into xmm0/xmm1 halves,
   vpermilps $216 (0b11011000: element order 0,2,1,3) interleaves them,
   then unpcklpd/unpckhpd split the two C columns and add into c1/c2.
   BUGFIX: the last line of this macro previously ended with a stray `\`
   line continuation, which spliced the following `#define` into this
   macro's replacement list ("#" not followed by a macro parameter is a
   preprocessor error, and GEMM_SUM_REORDER_1x4 was never defined). */
#define GEMM_SUM_REORDER_2x4(c1,c2)\
"vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\
"vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\
"vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"
/* Gather one C element from each of 4 rows (vmovss + vinsertps into lanes
   1..3), then add the accumulator: c1 = c1 + loaded. Advances %3 by 4*ldc. */
#define GEMM_SUM_REORDER_1x4(c1)\
"vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";"
/* n=2 TRSM solve steps; ymm accumulators hold two interleaved solution
   columns. %8 is a vector of 1.0f and %9 a vector of 0.0f (operand map below).
   *_le_*  : vbroadcastsd brings in a B pair; blend mask $170 (0b10101010)
             keeps 1.0 in the odd lanes so vmulps scales only the even lanes;
             vmovsldup duplicates the even-lane results for the update step.
   *_ri_*  : same for the odd lanes (mask $85, vmovshdup).
   *_leri_* / *_rile_* : le/ri followed by the cross-column update — the
             other lane of the B pair is kept (blend with zeros, %9) and
             vfnmadd231ps applies acc -= b * solution. */
#define SOLVE_le_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovsldup %%ymm"#c1",%%ymm1;"
#define SOLVE_le_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;"
#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_ri_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovshdup %%ymm"#c1",%%ymm1;"
#define SOLVE_ri_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;"
#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
/* n=4 TRSM solve steps on xmm registers holding 4 solution columns per row.
   First half of each colK macro: load a 4-float row of B at b_off, blend
   1.0 (%8) into every lane except lane K-1 (masks $14/$13/$11/$7), so
   vmulps scales only column K; vpermilps then broadcasts lane K-1 of each
   accumulator into xmm1/xmm2 for the update. NOTE(review): the diagonal is
   multiplied, not divided — presumably stored pre-inverted by the pack
   routine; confirm against the packing code.
   The _ltor / _rtol second halves blend 0.0 (%9) over the already-solved
   lanes and vfnmadd231ps subtracts column K's contribution from the rest
   (mask choice differs for left-to-right vs right-to-left traversal). */
#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $0,%%xmm"#c1",%%xmm1;"
#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;"
#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $85,%%xmm"#c1",%%xmm1;"
#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;"
#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $170,%%xmm"#c1",%%xmm1;"
#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;"
#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $255,%%xmm"#c1",%%xmm1;"
#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;"
#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||||
#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||||
#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||||
#define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||||
/* Write a solved tile both to the packed A buffer (%0 at byte a_off) and to
   C (%3 with row stride ldc=%4, advanced as rows are written).
   m8n2: de-interleave the two columns (unpcklps/unpckhps + unpck*pd) back
         to row order before storing 2x32 bytes to A and two ymm rows to C.
   m4n2: vpermilps $216 + vpermpd $216 reorder lanes; full ymm goes to A,
         the two 128-bit halves go to consecutive C rows.
   m2n4: unpack pairs, 16 bytes each to A, 8-byte halves scattered to C rows.
   m1n4: xmm to A, individual floats scattered to 4 C rows via vextractps. */
#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\
"vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
"vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\
"vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m4n2(c1,a_off)\
"vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\
"vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m1n4(c1,a_off)\
"vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\
"vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ | |||||
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ | |||||
/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||||
/* Zero the accumulator registers for each tile size with vpxor.
   INIT_mXnY composes the helpers so registers 4..15 cover the whole tile. */
#define init_m8n4(c1,c2,c3,c4)\
"vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\
"vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";"
#define INIT_m8n4 init_m8n4(4,5,6,7)
#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11)
#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15)
#define init_m4n4(c1,c2,c3,c4)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\
"vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";"
#define INIT_m4n4 init_m4n4(4,5,6,7)
#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11)
#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15)
#define init_m2n4(c1,c2)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"
#define INIT_m2n4 init_m2n4(4,5)
#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7)
#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9)
#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";"
#define INIT_m1n4 init_m1n4(4)
#define INIT_m1n8 INIT_m1n4 init_m1n4(5)
#define INIT_m1n12 INIT_m1n8 init_m1n4(6)
/* One k-iteration of the accumulation phase: acc -= a*b via vfnmadd231ps.
   m8: ymm1/ymm2 hold the 8 A values with even/odd elements duplicated
       (vmovsldup/vmovshdup); each vbroadcastsd pulls in a pair of B values.
   m4: same scheme with xmm and vmovddup.
   m2: two broadcast A scalars multiplied against 4-float B rows.
   m1: single broadcast A scalar (see the k1m1n* defines elsewhere in file).
   n8/n12 variants additionally read the 2nd/3rd B panel at (%1,%%r12,4)
   and (%1,%%r12,8); r12 is documented in the register-map comment. */
#define GEMM_KERNEL_k1m8n4 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\
"vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\
"vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;"
#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\
"vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\
"vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;"
#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\
"vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\
"vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;"
#define GEMM_KERNEL_k1m4n4 \
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\
"vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
"vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\
"vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\
"vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;"
#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\
"vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;"
#define GEMM_KERNEL_k1m2n4 \
"vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\
"vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"
#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\
"vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\
"vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"
#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;"
#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;"
#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;"
/* (Duplicate copy.) Load four 8-float rows of C (c_tmp=%3, ldc=%4), prefetch
   at byte offset `prefpos`, transpose float pairs with unpck*, and add the
   reordered rows into accumulators c1..c4; %3 advances 2*ldc per row pair. */
#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";"
/* (Duplicate copy.) Load four 4-float rows of C, transpose with unpck*, add
   the accumulators, then vperm2f128 $2 packs pairs of 128-bit results into
   the 256-bit outputs co1/co2. */
#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\
"vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";"
/* (Duplicate copy.) Load four 2-float rows of C, interleave via
   vpermilps $216 and unpck*pd, and add into accumulators c1/c2.
   BUGFIX: the last line of this macro previously ended with a stray `\`
   line continuation, splicing the following `#define` into this macro's
   replacement list ("#" not followed by a macro parameter is a hard
   preprocessor error, and GEMM_SUM_REORDER_1x4 was never defined). */
#define GEMM_SUM_REORDER_2x4(c1,c2)\
"vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\
"vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\
"vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"
/* Gather one C element from each of 4 rows (vmovss + vinsertps into lanes
   1..3), then add the accumulator: c1 = c1 + loaded. Advances %3 by 4*ldc. */
#define GEMM_SUM_REORDER_1x4(c1)\
"vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";"
/* (Duplicate copy.) n=2 TRSM solve steps; ymm accumulators hold two
   interleaved solution columns; %8 = vector of 1.0f, %9 = vector of 0.0f.
   *_le_* scales the even lanes (blend mask $170 keeps 1.0 in odd lanes;
   vmovsldup duplicates the results); *_ri_* does the odd lanes (mask $85,
   vmovshdup); *_leri_* / *_rile_* additionally subtract the cross-column
   term via vfnmadd231ps with the other lane blended against zeros (%9). */
#define SOLVE_le_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovsldup %%ymm"#c1",%%ymm1;"
#define SOLVE_le_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;"
#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_ri_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovshdup %%ymm"#c1",%%ymm1;"
#define SOLVE_ri_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;"
#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
/* (Duplicate copy.) n=4 TRSM solve steps on xmm registers. Each colK macro
   loads a 4-float B row, blends 1.0 (%8) into every lane except lane K-1
   (masks $14/$13/$11/$7) so vmulps scales only column K, and vpermilps
   broadcasts lane K-1 of each accumulator. NOTE(review): the diagonal is
   multiplied, not divided — presumably stored pre-inverted; confirm against
   the packing code. The _ltor/_rtol halves blend 0.0 (%9) over already-solved
   lanes and vfnmadd231ps subtracts column K's contribution from the rest. */
#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $0,%%xmm"#c1",%%xmm1;"
#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;"
#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $85,%%xmm"#c1",%%xmm1;"
#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;"
#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $170,%%xmm"#c1",%%xmm1;"
#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;"
#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $255,%%xmm"#c1",%%xmm1;"
#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;"
#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||||
#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||||
#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||||
#define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||||
/* (Duplicate copy.) Write a solved tile to the packed A buffer (%0 at byte
   a_off) and to C (%3 with row stride ldc=%4, advanced as rows are stored);
   the unpck*/vpermilps/vpermpd shuffles restore row order before storing. */
#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\
"vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
"vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\
"vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m4n2(c1,a_off)\
"vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\
"vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m1n4(c1,a_off)\
"vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\
"vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
@@ -1,86 +1,86 @@ | |||||
# Build the ReLAPACK (recursive LAPACK) sources as an object library.
include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
include_directories(${PROJECT_SOURCE_DIR}/relapack)
# File names only (alphabetical); full paths are prepended below.
set(RELAFILES
  cgbtrf.c
  cgemmt.c
  cgetrf.c
  chegst.c
  chetrf.c
  chetrf_rec2.c
  chetrf_rook.c
  chetrf_rook_rec2.c
  clauum.c
  cpbtrf.c
  cpotrf.c
  csytrf.c
  csytrf_rec2.c
  csytrf_rook.c
  csytrf_rook_rec2.c
  ctgsyl.c
  ctrsyl.c
  ctrsyl_rec2.c
  ctrtri.c
  dgbtrf.c
  dgemmt.c
  dgetrf.c
  dlauum.c
  dpbtrf.c
  dpotrf.c
  dsygst.c
  dsytrf.c
  dsytrf_rec2.c
  dsytrf_rook.c
  dsytrf_rook_rec2.c
  dtgsyl.c
  dtrsyl.c
  dtrsyl_rec2.c
  dtrtri.c
  f2c.c
  lapack_wrappers.c
  sgbtrf.c
  sgemmt.c
  sgetrf.c
  slauum.c
  spbtrf.c
  spotrf.c
  ssygst.c
  ssytrf.c
  ssytrf_rec2.c
  ssytrf_rook.c
  ssytrf_rook_rec2.c
  stgsyl.c
  strsyl.c
  strsyl_rec2.c
  strtri.c
  zgbtrf.c
  zgemmt.c
  zgetrf.c
  zhegst.c
  zhetrf.c
  zhetrf_rec2.c
  zhetrf_rook.c
  zhetrf_rook_rec2.c
  zlauum.c
  zpbtrf.c
  zpotrf.c
  zsytrf.c
  zsytrf_rec2.c
  zsytrf_rook.c
  zsytrf_rook_rec2.c
  ztgsyl.c
  ztrsyl.c
  ztrsyl_rec2.c
  ztrtri.c
)
# Expand each bare file name to its location under relapack/src.
set(RELA_SOURCES "")
foreach(rela_file ${RELAFILES})
  list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${rela_file}")
endforeach()
# Object library: the parent scope links these objects into the main BLAS lib.
add_library(relapack_src OBJECT ${RELA_SOURCES})
# ReLAPACK is compiled with the LAPACK compile flags.
set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
# Build the ReLAPACK (recursive LAPACK) sources as an object library.
include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
include_directories(${PROJECT_SOURCE_DIR}/relapack)
# File names only (alphabetical); full paths are prepended below.
set(RELAFILES
  cgbtrf.c
  cgemmt.c
  cgetrf.c
  chegst.c
  chetrf.c
  chetrf_rec2.c
  chetrf_rook.c
  chetrf_rook_rec2.c
  clauum.c
  cpbtrf.c
  cpotrf.c
  csytrf.c
  csytrf_rec2.c
  csytrf_rook.c
  csytrf_rook_rec2.c
  ctgsyl.c
  ctrsyl.c
  ctrsyl_rec2.c
  ctrtri.c
  dgbtrf.c
  dgemmt.c
  dgetrf.c
  dlauum.c
  dpbtrf.c
  dpotrf.c
  dsygst.c
  dsytrf.c
  dsytrf_rec2.c
  dsytrf_rook.c
  dsytrf_rook_rec2.c
  dtgsyl.c
  dtrsyl.c
  dtrsyl_rec2.c
  dtrtri.c
  f2c.c
  lapack_wrappers.c
  sgbtrf.c
  sgemmt.c
  sgetrf.c
  slauum.c
  spbtrf.c
  spotrf.c
  ssygst.c
  ssytrf.c
  ssytrf_rec2.c
  ssytrf_rook.c
  ssytrf_rook_rec2.c
  stgsyl.c
  strsyl.c
  strsyl_rec2.c
  strtri.c
  zgbtrf.c
  zgemmt.c
  zgetrf.c
  zhegst.c
  zhetrf.c
  zhetrf_rec2.c
  zhetrf_rook.c
  zhetrf_rook_rec2.c
  zlauum.c
  zpbtrf.c
  zpotrf.c
  zsytrf.c
  zsytrf_rec2.c
  zsytrf_rook.c
  zsytrf_rook_rec2.c
  ztgsyl.c
  ztrsyl.c
  ztrsyl_rec2.c
  ztrtri.c
)
# Expand each bare file name to its location under relapack/src.
set(RELA_SOURCES "")
foreach(rela_file ${RELAFILES})
  list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${rela_file}")
endforeach()
# Object library: the parent scope links these objects into the main BLAS lib.
add_library(relapack_src OBJECT ${RELA_SOURCES})
# ReLAPACK is compiled with the LAPACK compile flags.
set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")