/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define M           x0
#define N           x1
#define A00         x2
#define LDA         x3
#define B00         x4

#define A01         x5
#define A02         x6
#define A03         x7
#define A04         x8

#define I           x9
#define J           x10

#define TEMP1       x11
#define TEMP2       x12

#define A_PREFETCH  2560
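/* Editorial note: this is an AArch64 GEMM "ncopy" packing kernel. It copies a
   four-column panel of the column-major matrix A (pointers A01..A04, leading
   dimension LDA) into the contiguous buffer B00, transposing 4x4 tiles on the
   way. The element size is 4 bytes (LDA is scaled by SIZE = 4 below), so the
   data type is single precision even though the labels keep a dgemm_ prefix. */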
/**************************************************************************************
* Macro definitions
**************************************************************************************/

.macro SAVE_REGS
    add  sp, sp, #-(11 * 16)
    stp  d8, d9, [sp, #(0 * 16)]
    stp  d10, d11, [sp, #(1 * 16)]
    stp  d12, d13, [sp, #(2 * 16)]
    stp  d14, d15, [sp, #(3 * 16)]
    stp  d16, d17, [sp, #(4 * 16)]
    stp  x18, x19, [sp, #(5 * 16)]
    stp  x20, x21, [sp, #(6 * 16)]
    stp  x22, x23, [sp, #(7 * 16)]
    stp  x24, x25, [sp, #(8 * 16)]
    stp  x26, x27, [sp, #(9 * 16)]
    str  x28, [sp, #(10 * 16)]
.endm

.macro RESTORE_REGS
    ldp  d8, d9, [sp, #(0 * 16)]
    ldp  d10, d11, [sp, #(1 * 16)]
    ldp  d12, d13, [sp, #(2 * 16)]
    ldp  d14, d15, [sp, #(3 * 16)]
    ldp  d16, d17, [sp, #(4 * 16)]
    ldp  x18, x19, [sp, #(5 * 16)]
    ldp  x20, x21, [sp, #(6 * 16)]
    ldp  x22, x23, [sp, #(7 * 16)]
    ldp  x24, x25, [sp, #(8 * 16)]
    ldp  x26, x27, [sp, #(9 * 16)]
    ldr  x28, [sp, #(10 * 16)]
    add  sp, sp, #(11 * 16)
.endm
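/* COPY4x4: load one four-element column chunk from each of A01..A04 and
   transpose the 4x4 tile into v8..v11 with lane inserts, so that B00 receives
   four four-element rows stored back to back (64 bytes per invocation). */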
.macro COPY4x4
    prfm PLDL1KEEP, [A01, #A_PREFETCH]
    prfm PLDL1KEEP, [A02, #A_PREFETCH]
    prfm PLDL1KEEP, [A03, #A_PREFETCH]
    prfm PLDL1KEEP, [A04, #A_PREFETCH]

    ldr  q0, [A01], #16
    ins  v8.s[0], v0.s[0]
    ins  v9.s[0], v0.s[1]
    ins  v10.s[0], v0.s[2]
    ins  v11.s[0], v0.s[3]

    ldr  q1, [A02], #16
    ins  v8.s[1], v1.s[0]
    ins  v9.s[1], v1.s[1]
    ins  v10.s[1], v1.s[2]
    ins  v11.s[1], v1.s[3]

    ldr  q2, [A03], #16
    ins  v8.s[2], v2.s[0]
    ins  v9.s[2], v2.s[1]
    ins  v10.s[2], v2.s[2]
    ins  v11.s[2], v2.s[3]

    ldr  q3, [A04], #16
    ins  v8.s[3], v3.s[0]
    ins  v9.s[3], v3.s[1]
    ins  v10.s[3], v3.s[2]
    ins  v11.s[3], v3.s[3]

    st1  {v8.4s, v9.4s, v10.4s, v11.4s}, [B00]
    add  B00, B00, #64
.endm

.macro COPY1x4
    prfm PLDL1KEEP, [A01, #A_PREFETCH]
    prfm PLDL1KEEP, [A02, #A_PREFETCH]
    prfm PLDL1KEEP, [A03, #A_PREFETCH]
    prfm PLDL1KEEP, [A04, #A_PREFETCH]

    ldr  s0, [A01], #4
    ldr  s1, [A02], #4
    ldr  s2, [A03], #4
    ldr  s3, [A04], #4

    stp  s0, s1, [B00]
    add  B00, B00, #8
    stp  s2, s3, [B00]
    add  B00, B00, #8
.endm

.macro COPY4x2
    prfm PLDL1KEEP, [A01, #A_PREFETCH]
    prfm PLDL1KEEP, [A02, #A_PREFETCH]

    ldr  q0, [A01], #16
    ins  v8.s[0], v0.s[0]
    ins  v9.s[0], v0.s[1]
    ins  v10.s[0], v0.s[2]
    ins  v11.s[0], v0.s[3]

    ldr  q1, [A02], #16
    ins  v8.s[1], v1.s[0]
    ins  v9.s[1], v1.s[1]
    ins  v10.s[1], v1.s[2]
    ins  v11.s[1], v1.s[3]

    st1  {v8.2s, v9.2s, v10.2s, v11.2s}, [B00]
    add  B00, B00, #32
.endm

.macro COPY1x2
    prfm PLDL1KEEP, [A01, #A_PREFETCH]
    prfm PLDL1KEEP, [A02, #A_PREFETCH]

    ldr  s0, [A01], #4
    ldr  s1, [A02], #4

    stp  s0, s1, [B00]
    add  B00, B00, #8
.endm

.macro COPY4x1
    prfm PLDL1KEEP, [A01, #A_PREFETCH]

    ldr  q0, [A01], #16
    str  q0, [B00], #16
.endm

.macro COPY1x1
    prfm PLDL1KEEP, [A01, #A_PREFETCH]

    ldr  s0, [A01], #4
    str  s0, [B00], #4
.endm

/**************************************************************************************
* End of macro definitions
**************************************************************************************/
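/* Driver: the outer loop walks N in blocks of four columns (J = N / 4), the
   inner loops walk M in blocks of four rows (I = M / 4) with scalar tails,
   and the L2/L1 sections below handle the remaining N % 4 columns. */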
PROLOGUE

.align 5
    SAVE_REGS

    lsl  LDA, LDA, #2               // LDA = LDA * SIZE

.Ldgemm_ncopy_L4_BEGIN:
    asr  J, N, #2                   // J = N / 4
    cmp  J, #0
    ble  .Ldgemm_ncopy_L2_BEGIN

.align 5
.Ldgemm_ncopy_L4_M4_BEGIN:
    mov  A01, A00
    add  A02, A01, LDA
    add  A03, A02, LDA
    add  A04, A03, LDA
    add  A00, A04, LDA

    asr  I, M, #2                   // I = M / 4
    cmp  I, #0
    ble  .Ldgemm_ncopy_L4_M4_40

.align 5
.Ldgemm_ncopy_L4_M4_20:
    COPY4x4
    subs I, I, #1
    bne  .Ldgemm_ncopy_L4_M4_20

.Ldgemm_ncopy_L4_M4_40:
    and  I, M, #3
    cmp  I, #0
    ble  .Ldgemm_ncopy_L4_M4_END

.align 5
.Ldgemm_ncopy_L4_M4_60:
    COPY1x4
    subs I, I, #1
    bne  .Ldgemm_ncopy_L4_M4_60

.Ldgemm_ncopy_L4_M4_END:
    subs J, J, #1                   // j--
    bne  .Ldgemm_ncopy_L4_M4_BEGIN

/*********************************************************************************************/

.Ldgemm_ncopy_L2_BEGIN:
    tst  N, #3
    ble  .Ldgemm_ncopy_L999
    tst  N, #2
    ble  .Ldgemm_ncopy_L1_BEGIN

.Ldgemm_ncopy_L2_M4_BEGIN:
    mov  A01, A00
    add  A02, A01, LDA
    add  A00, A02, LDA

    asr  I, M, #2                   // I = M / 4
    cmp  I, #0
    ble  .Ldgemm_ncopy_L2_M4_40

.align 5
.Ldgemm_ncopy_L2_M4_20:
    COPY4x2
    subs I, I, #1
    bne  .Ldgemm_ncopy_L2_M4_20

.Ldgemm_ncopy_L2_M4_40:
    and  I, M, #3
    cmp  I, #0
    ble  .Ldgemm_ncopy_L2_M4_END

.align 5
.Ldgemm_ncopy_L2_M4_60:
    COPY1x2
    subs I, I, #1
    bne  .Ldgemm_ncopy_L2_M4_60

.Ldgemm_ncopy_L2_M4_END:

/*********************************************************************************************/

.Ldgemm_ncopy_L1_BEGIN:
    tst  N, #1
    ble  .Ldgemm_ncopy_L999

.Ldgemm_ncopy_L1_M4_BEGIN:
    mov  A01, A00

    asr  I, M, #2                   // I = M / 4
    cmp  I, #0
    ble  .Ldgemm_ncopy_L1_M4_40

.align 5
.Ldgemm_ncopy_L1_M4_20:
    COPY4x1
    subs I, I, #1
    bne  .Ldgemm_ncopy_L1_M4_20

.Ldgemm_ncopy_L1_M4_40:
    and  I, M, #3
    cmp  I, #0
    ble  .Ldgemm_ncopy_L1_M4_END

.align 5
.Ldgemm_ncopy_L1_M4_60:
    COPY1x1
    subs I, I, #1
    bne  .Ldgemm_ncopy_L1_M4_60

.Ldgemm_ncopy_L1_M4_END:

.Ldgemm_ncopy_L999:
    mov  x0, #0
    RESTORE_REGS
    ret

EPILOGUE
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* Abdelrauf (quickwritereader@gmail.com)
* BLASTEST    : OK
* CTEST       : OK
* TEST        : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

#define LOAD        ld

#define STACKSIZE   (512)
#define FLINK_SAVE  (STACKSIZE+16) /* 16($r12) */

#define M       r3
#define N       r4
#define K       r5

#define A       r8
#define B       r9
#define C       r10
#define LDC     r6
#define OFFSET  r7

#define alpha_r         vs19
#define alpha_i         vs20
#define save_permute_1  vs21
#define permute_mask    vs22
#define o0      0

#define T1      r11
#define T2      r12
#define T3      r14
#define T4      r15
#define T5      r16
#define T6      r17
#define L       r18
#define T7      r19
#define T8      r20
#define TEMP_REG r21
#define I       r22
#define J       r23
#define AO      r24
#define BO      r25
#define CO      r26
#define T9      r27
#define T10     r28
#define PRE     r29
#define T12     r30
#define T13     r31

#include "cgemm_macros_power9.S"
.equ perm_const1,     0x0405060700010203
.equ perm_const2,     0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
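/* Editorial note: perm_const1/perm_const2 together form a byte-permute mask
   that swaps the two 32-bit words inside each 64-bit lane, i.e. exchanges the
   real and imaginary parts of each single-precision complex element. The
   save_permute constants appear to select and interleave 32-bit words from
   two result vectors when the C tile is written back by the cgemm macros. */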
#ifndef NEEDPARAM

PROLOGUE
PROFCODE

    addi SP, SP, -STACKSIZE
    mflr r0

    stfd f14, 0(SP)
    stfd f15, 8(SP)
    stfd f16, 16(SP)
    stfd f17, 24(SP)
    stfd f18, 32(SP)
    stfd f19, 40(SP)
    stfd f20, 48(SP)
    stfd f21, 56(SP)
    stfd f22, 64(SP)
    stfd f23, 72(SP)
    stfd f24, 80(SP)
    stfd f25, 88(SP)
    stfd f26, 96(SP)
    stfd f27, 104(SP)
    stfd f28, 112(SP)
    stfd f29, 120(SP)
    stfd f30, 128(SP)
    stfd f31, 136(SP)

    std r31, 144(SP)
    std r30, 152(SP)
    std r29, 160(SP)
    std r28, 168(SP)
    std r27, 176(SP)
    std r26, 184(SP)
    std r25, 192(SP)
    std r24, 200(SP)
    std r23, 208(SP)
    std r22, 216(SP)
    std r21, 224(SP)
    std r20, 232(SP)
    std r19, 240(SP)
    std r18, 248(SP)
    std r17, 256(SP)
    std r16, 264(SP)
    std r15, 272(SP)
    std r14, 280(SP)

    stxv vs52, 288(SP)
    stxv vs53, 304(SP)
    stxv vs54, 320(SP)
    stxv vs55, 336(SP)
    stxv vs56, 352(SP)
    stxv vs57, 368(SP)
    stxv vs58, 384(SP)
    stxv vs59, 400(SP)
    stxv vs60, 416(SP)
    stxv vs61, 432(SP)
    stxv vs62, 448(SP)
    stxv vs63, 464(SP)

    std r0, FLINK_SAVE(SP)

    ld LDC, FRAMESLOT(0) + STACKSIZE(SP)

#ifdef TRMMKERNEL
    ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif

    slwi LDC, LDC, ZBASE_SHIFT

    /* alpha is stored in f1. convert to single and splat */
    xscvdpspn alpha_r, vs1
    xscvdpspn alpha_i, vs2
    xxspltw alpha_r, alpha_r, 0
    xxspltw alpha_i, alpha_i, 0
    /* load reverse permute mask for big endian
       uint128 = 0x0c0d0e0f08090a0b0405060700010203
    */
    lis T2, perm_const2@highest
    lis T1, perm_const1@highest
    lis T3, save_permute_12@highest
    lis T4, save_permute_11@highest

    ori T2, T2, perm_const2@higher
    ori T1, T1, perm_const1@higher
    ori T3, T3, save_permute_12@higher
    ori T4, T4, save_permute_11@higher

    rldicr T2, T2, 32, 31
    rldicr T1, T1, 32, 31
    rldicr T3, T3, 32, 31
    rldicr T4, T4, 32, 31

    oris T2, T2, perm_const2@h
    oris T1, T1, perm_const1@h
    oris T3, T3, save_permute_12@h
    oris T4, T4, save_permute_11@h

    ori T2, T2, perm_const2@l
    ori T1, T1, perm_const1@l
    ori T3, T3, save_permute_12@l
    ori T4, T4, save_permute_11@l
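    /* Each 64-bit constant above is materialized 16 bits at a time:
       lis/ori set what will become the high 32 bits, rldicr rotates
       them into place, and oris/ori fill in the low 32 bits. */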
    li r0, 0
    li PRE, 512

#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
    /* negate for this case as we will use addition -1*(a+b) */
    xvnegsp alpha_r, alpha_r
    xvnegsp alpha_i, alpha_i
#endif

    mtvsrdd permute_mask, T2, T1
    mtvsrdd save_permute_1, T3, T4

    /* mask is reverse permute so we have to make it inner permute */
    xxpermdi permute_mask, permute_mask, permute_mask, 2

#include "cgemm_logic_power9.S"

.L999:
    lfd f14, 0(SP)
    lfd f15, 8(SP)
    lfd f16, 16(SP)
    lfd f17, 24(SP)
    lfd f18, 32(SP)
    lfd f19, 40(SP)
    lfd f20, 48(SP)
    lfd f21, 56(SP)
    lfd f22, 64(SP)
    lfd f23, 72(SP)
    lfd f24, 80(SP)
    lfd f25, 88(SP)
    lfd f26, 96(SP)
    lfd f27, 104(SP)
    lfd f28, 112(SP)
    lfd f29, 120(SP)
    lfd f30, 128(SP)
    lfd f31, 136(SP)

    ld r31, 144(SP)
    ld r30, 152(SP)
    ld r29, 160(SP)
    ld r28, 168(SP)
    ld r27, 176(SP)
    ld r26, 184(SP)
    ld r25, 192(SP)
    ld r24, 200(SP)
    ld r23, 208(SP)
    ld r22, 216(SP)
    ld r21, 224(SP)
    ld r20, 232(SP)
    ld r19, 240(SP)
    ld r18, 248(SP)
    ld r17, 256(SP)
    ld r16, 264(SP)
    ld r15, 272(SP)
    ld r14, 280(SP)

    ld r0, FLINK_SAVE(SP)

    lxv vs52, 288(SP)
    lxv vs53, 304(SP)
    lxv vs54, 320(SP)
    lxv vs55, 336(SP)
    lxv vs56, 352(SP)
    lxv vs57, 368(SP)
    lxv vs58, 384(SP)
    lxv vs59, 400(SP)
    mtlr r0
    lxv vs60, 416(SP)
    lxv vs61, 432(SP)
    lxv vs62, 448(SP)
    lxv vs63, 464(SP)

    addi SP, SP, STACKSIZE
    blr

EPILOGUE
#endif
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h" | |||
#if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
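/* crot_kernel_8: apply the plane rotation (c, s) to eight complex
   single-precision elements of x and y per iteration with VSX inline
   assembly: x <- c*x + s*y, y <- c*y - s*x. Loads for the next iteration
   are software-pipelined between the multiplies of the current one. */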
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
{
    __vector float t0;
    __vector float t1;
    __vector float t2;
    __vector float t3;
    __vector float t4;
    __vector float t5;
    __vector float t6;
    __vector float t7;

    __asm__
    (
        "xscvdpspn 36, %x[cos]              \n\t"  // load c to all words
        "xxspltw   36, 36, 0                \n\t"
        "xscvdpspn 37, %x[sin]              \n\t"  // load s to all words
        "xxspltw   37, 37, 0                \n\t"

        "lxvd2x    32, 0, %[x_ptr]          \n\t"  // load x
        "lxvd2x    33, %[i16], %[x_ptr]     \n\t"
        "lxvd2x    34, %[i32], %[x_ptr]     \n\t"
        "lxvd2x    35, %[i48], %[x_ptr]     \n\t"
        "lxvd2x    48, 0, %[y_ptr]          \n\t"  // load y
        "lxvd2x    49, %[i16], %[y_ptr]     \n\t"
        "lxvd2x    50, %[i32], %[y_ptr]     \n\t"
        "lxvd2x    51, %[i48], %[y_ptr]     \n\t"

        "addi      %[x_ptr], %[x_ptr], 64   \n\t"
        "addi      %[y_ptr], %[y_ptr], 64   \n\t"

        "addic.    %[temp_n], %[temp_n], -8 \n\t"
        "ble       two%=                    \n\t"

        ".align 5                           \n\t"
        "one%=:                             \n\t"

        "xvmulsp   40, 32, 36               \n\t"  // c * x
        "xvmulsp   41, 33, 36               \n\t"
        "xvmulsp   42, 34, 36               \n\t"
        "xvmulsp   43, 35, 36               \n\t"

        "xvmulsp   %x[x0], 48, 36           \n\t"  // c * y
        "xvmulsp   %x[x2], 49, 36           \n\t"
        "xvmulsp   %x[x1], 50, 36           \n\t"
        "xvmulsp   %x[x3], 51, 36           \n\t"

        "xvmulsp   44, 32, 37               \n\t"  // s * x
        "xvmulsp   45, 33, 37               \n\t"

        "lxvd2x    32, 0, %[x_ptr]          \n\t"  // load next x
        "lxvd2x    33, %[i16], %[x_ptr]     \n\t"

        "xvmulsp   46, 34, 37               \n\t"
        "xvmulsp   47, 35, 37               \n\t"

        "lxvd2x    34, %[i32], %[x_ptr]     \n\t"
        "lxvd2x    35, %[i48], %[x_ptr]     \n\t"

        "xvmulsp   %x[x4], 48, 37           \n\t"  // s * y
        "xvmulsp   %x[x5], 49, 37           \n\t"

        "lxvd2x    48, 0, %[y_ptr]          \n\t"  // load next y
        "lxvd2x    49, %[i16], %[y_ptr]     \n\t"

        "xvmulsp   %x[x6], 50, 37           \n\t"
        "xvmulsp   %x[x7], 51, 37           \n\t"

        "lxvd2x    50, %[i32], %[y_ptr]     \n\t"
        "lxvd2x    51, %[i48], %[y_ptr]     \n\t"

        "xvaddsp   40, 40, %x[x4]           \n\t"  // c * x + s * y
        "xvaddsp   41, 41, %x[x5]           \n\t"

        "addi      %[x_ptr], %[x_ptr], -64  \n\t"
        "addi      %[y_ptr], %[y_ptr], -64  \n\t"

        "xvaddsp   42, 42, %x[x6]           \n\t"
        "xvaddsp   43, 43, %x[x7]           \n\t"

        "xvsubsp   %x[x0], %x[x0], 44       \n\t"  // c * y - s * x
        "xvsubsp   %x[x2], %x[x2], 45       \n\t"
        "xvsubsp   %x[x1], %x[x1], 46       \n\t"
        "xvsubsp   %x[x3], %x[x3], 47       \n\t"

        "stxvd2x   40, 0, %[x_ptr]          \n\t"  // store x
        "stxvd2x   41, %[i16], %[x_ptr]     \n\t"
        "stxvd2x   42, %[i32], %[x_ptr]     \n\t"
        "stxvd2x   43, %[i48], %[x_ptr]     \n\t"

        "stxvd2x   %x[x0], 0, %[y_ptr]      \n\t"  // store y
        "stxvd2x   %x[x2], %[i16], %[y_ptr] \n\t"
        "stxvd2x   %x[x1], %[i32], %[y_ptr] \n\t"
        "stxvd2x   %x[x3], %[i48], %[y_ptr] \n\t"

        "addi      %[x_ptr], %[x_ptr], 128  \n\t"
        "addi      %[y_ptr], %[y_ptr], 128  \n\t"

        "addic.    %[temp_n], %[temp_n], -8 \n\t"
        "bgt       one%=                    \n\t"

        "two%=:                             \n\t"

        "xvmulsp   40, 32, 36               \n\t"  // c * x
        "xvmulsp   41, 33, 36               \n\t"
        "xvmulsp   42, 34, 36               \n\t"
        "xvmulsp   43, 35, 36               \n\t"

        "xvmulsp   %x[x0], 48, 36           \n\t"  // c * y
        "xvmulsp   %x[x2], 49, 36           \n\t"
        "xvmulsp   %x[x1], 50, 36           \n\t"
        "xvmulsp   %x[x3], 51, 36           \n\t"

        "xvmulsp   44, 32, 37               \n\t"  // s * x
        "xvmulsp   45, 33, 37               \n\t"
        "xvmulsp   46, 34, 37               \n\t"
        "xvmulsp   47, 35, 37               \n\t"

        "xvmulsp   %x[x4], 48, 37           \n\t"  // s * y
        "xvmulsp   %x[x5], 49, 37           \n\t"
        "xvmulsp   %x[x6], 50, 37           \n\t"
        "xvmulsp   %x[x7], 51, 37           \n\t"

        "addi      %[x_ptr], %[x_ptr], -64  \n\t"
        "addi      %[y_ptr], %[y_ptr], -64  \n\t"

        "xvaddsp   40, 40, %x[x4]           \n\t"  // c * x + s * y
        "xvaddsp   41, 41, %x[x5]           \n\t"
        "xvaddsp   42, 42, %x[x6]           \n\t"
        "xvaddsp   43, 43, %x[x7]           \n\t"

        "xvsubsp   %x[x0], %x[x0], 44       \n\t"  // c * y - s * x
        "xvsubsp   %x[x2], %x[x2], 45       \n\t"
        "xvsubsp   %x[x1], %x[x1], 46       \n\t"
        "xvsubsp   %x[x3], %x[x3], 47       \n\t"

        "stxvd2x   40, 0, %[x_ptr]          \n\t"  // store x
        "stxvd2x   41, %[i16], %[x_ptr]     \n\t"
        "stxvd2x   42, %[i32], %[x_ptr]     \n\t"
        "stxvd2x   43, %[i48], %[x_ptr]     \n\t"

        "stxvd2x   %x[x0], 0, %[y_ptr]      \n\t"  // store y
        "stxvd2x   %x[x2], %[i16], %[y_ptr] \n\t"
        "stxvd2x   %x[x1], %[i32], %[y_ptr] \n\t"
        "stxvd2x   %x[x3], %[i48], %[y_ptr] "
        :
        [mem_x] "+m" (*(float (*)[2*n])x),
        [mem_y] "+m" (*(float (*)[2*n])y),
        [temp_n] "+r" (n),
        [x_ptr] "+&b" (x),
        [y_ptr] "+&b" (y),
        [x0] "=wa" (t0),
        [x1] "=wa" (t2),
        [x2] "=wa" (t1),
        [x3] "=wa" (t3),
        [x4] "=wa" (t4),
        [x5] "=wa" (t5),
        [x6] "=wa" (t6),
        [x7] "=wa" (t7)
        :
        [cos] "f" (c),
        [sin] "f" (s),
        [i16] "b" (16),
        [i32] "b" (32),
        [i48] "b" (48)
        :
        "cr0",
        "vs32","vs33","vs34","vs35","vs36","vs37",
        "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
        "vs48","vs49","vs50","vs51"
    );
}
#endif
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT temp[2];
    BLASLONG inc_x2;
    BLASLONG inc_y2;

    if (n <= 0) return (0);

    if ((inc_x == 1) && (inc_y == 1))
    {
#if defined(__VEC__) || defined(__ALTIVEC__)
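        /* n1 is n rounded down to a multiple of 8: the vector kernel covers
           those elements, the scalar loop below handles the remainder. */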
        BLASLONG n1 = n & -8;
        if (n1 > 0)
        {
            crot_kernel_8(n1, x, y, c, s);
            i = n1;
            ix = 2 * n1;
        }
#endif
        while (i < n)
        {
            temp[0] = c * x[ix]     + s * y[ix];
            temp[1] = c * x[ix + 1] + s * y[ix + 1];
            y[ix]     = c * y[ix]     - s * x[ix];
            y[ix + 1] = c * y[ix + 1] - s * x[ix + 1];
            x[ix]     = temp[0];
            x[ix + 1] = temp[1];

            ix += 2;
            i++;
        }
    }
    else
    {
        inc_x2 = 2 * inc_x;
        inc_y2 = 2 * inc_y;
        while (i < n)
        {
            temp[0] = c * x[ix]     + s * y[iy];
            temp[1] = c * x[ix + 1] + s * y[iy + 1];
            y[iy]     = c * y[iy]     - s * x[ix];
            y[iy + 1] = c * y[iy + 1] - s * x[ix + 1];
            x[ix]     = temp[0];
            x[ix + 1] = temp[1];

            ix += inc_x2;
            iy += inc_y2;
            i++;
        }
    }
    return (0);
}
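/* Example (editorial sketch, unit strides): with x and y holding n
   interleaved (re, im) single-precision pairs,

       CNAME(n, x, 1, y, 1, c, s);

   leaves x = c*x + s*y and y = c*y - s*x, matching the scalar loops above. */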
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

#define LOAD        ld

#define STACKSIZE   (512)

#define ALPHA_SP    (296+192)(SP)
#define FZERO       (304+192)(SP)
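/* Editorial note: within the 512-byte frame, ALPHA_SP and FZERO resolve to
   offsets 488 and 496, just past the vector-register save area, which ends
   at 464 + 16 = 480. */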
#define M       r3
#define N       r4
#define K       r5

#define A       r7
#define B       r8
#define C       r9
#define LDC     r10
#define OFFSET  r6

#define alpha_r vs18

#define o0      0

#define T4      r12
#define T3      r11
#define C4      r14
#define o8      r15
#define o24     r16
#define C2      r17
#define L       r18
#define T1      r19
#define C3      r20
#define TEMP_REG r21
#define I       r22
#define J       r23
#define AO      r24
#define BO      r25
#define CO      r26
#define o16     r27
#define o32     r28
#define o48     r29
#define PRE     r30
#define T2      r31

#include "dgemm_macros_power9.S"

#ifndef NEEDPARAM

PROLOGUE
PROFCODE

    addi SP, SP, -STACKSIZE
    li r0, 0

    stfd f14, 0(SP)
    stfd f15, 8(SP)
    stfd f16, 16(SP)
    stfd f17, 24(SP)
    stfd f18, 32(SP)
    stfd f19, 40(SP)
    stfd f20, 48(SP)
    stfd f21, 56(SP)
    stfd f22, 64(SP)
    stfd f23, 72(SP)
    stfd f24, 80(SP)
    stfd f25, 88(SP)
    stfd f26, 96(SP)
    stfd f27, 104(SP)
    stfd f28, 112(SP)
    stfd f29, 120(SP)
    stfd f30, 128(SP)
    stfd f31, 136(SP)

    std r31, 144(SP)
    std r30, 152(SP)
    std r29, 160(SP)
    std r28, 168(SP)
    std r27, 176(SP)
    std r26, 184(SP)
    std r25, 192(SP)
    std r24, 200(SP)
    std r23, 208(SP)
    std r22, 216(SP)
    std r21, 224(SP)
    std r20, 232(SP)
    std r19, 240(SP)
    std r18, 248(SP)
    std r17, 256(SP)
    std r16, 264(SP)
    std r15, 272(SP)
    std r14, 280(SP)

    stxv vs52, 288(SP)
    stxv vs53, 304(SP)
    stxv vs54, 320(SP)
    stxv vs55, 336(SP)
    stxv vs56, 352(SP)
    stxv vs57, 368(SP)
    stxv vs58, 384(SP)
    stxv vs59, 400(SP)
    stxv vs60, 416(SP)
    stxv vs61, 432(SP)
    stxv vs62, 448(SP)
    stxv vs63, 464(SP)

    stfd f1, ALPHA_SP
    stw r0, FZERO

    slwi LDC, LDC, BASE_SHIFT

#if defined(TRMMKERNEL)
    ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

    cmpwi cr0, M, 0
    ble .L999_H1
    cmpwi cr0, N, 0
    ble .L999_H1
    cmpwi cr0, K, 0
    ble .L999_H1

    addi T1, SP, 296+192

    li PRE, 384
    li o8, 8
    li o16, 16
    li o24, 24
    li o32, 32
    li o48, 48

    lxvdsx alpha_r, 0, T1
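    /* lxvdsx loads the double-precision alpha saved at ALPHA_SP
       (T1 = SP + 488) and splats it into both lanes of alpha_r. */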
#include "dgemm_logic_power9.S" | |||
.L999: | |||
addi r3, 0, 0 | |||
lfd f14, 0(SP) | |||
lfd f15, 8(SP) | |||
lfd f16, 16(SP) | |||
lfd f17, 24(SP) | |||
lfd f18, 32(SP) | |||
lfd f19, 40(SP) | |||
lfd f20, 48(SP) | |||
lfd f21, 56(SP) | |||
lfd f22, 64(SP) | |||
lfd f23, 72(SP) | |||
lfd f24, 80(SP) | |||
lfd f25, 88(SP) | |||
lfd f26, 96(SP) | |||
lfd f27, 104(SP) | |||
lfd f28, 112(SP) | |||
lfd f29, 120(SP) | |||
lfd f30, 128(SP) | |||
lfd f31, 136(SP) | |||
ld r31, 144(SP) | |||
ld r30, 152(SP) | |||
ld r29, 160(SP) | |||
ld r28, 168(SP) | |||
ld r27, 176(SP) | |||
ld r26, 184(SP) | |||
ld r25, 192(SP) | |||
ld r24, 200(SP) | |||
ld r23, 208(SP) | |||
ld r22, 216(SP) | |||
ld r21, 224(SP) | |||
ld r20, 232(SP) | |||
ld r19, 240(SP) | |||
ld r18, 248(SP) | |||
ld r17, 256(SP) | |||
ld r16, 264(SP) | |||
ld r15, 272(SP) | |||
ld r14, 280(SP) | |||
lxv vs52, 288(SP) | |||
lxv vs53, 304(SP) | |||
lxv vs54, 320(SP) | |||
lxv vs55, 336(SP) | |||
lxv vs56, 352(SP) | |||
lxv vs57, 368(SP) | |||
lxv vs58, 384(SP) | |||
lxv vs59, 400(SP) | |||
lxv vs60, 416(SP) | |||
lxv vs61, 432(SP) | |||
lxv vs62, 448(SP) | |||
lxv vs63, 464(SP) | |||
addi SP, SP, STACKSIZE | |||
blr | |||
EPILOGUE | |||
#endif | |||
/*************************************************************************** | |||
Copyright (c) 2013-2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "def_vsx.h" | |||
#define LOAD ld | |||
#define STACKSIZE (512 ) | |||
#define ALPHA_SP (296+192)(SP) | |||
#define FZERO (304+192)(SP) | |||
#define M r3 | |||
#define N r4 | |||
#define K r5 | |||
#define A r7 | |||
#define B r8 | |||
#define C r9 | |||
#define LDC r10 | |||
#define OFFSET r6 | |||
#define alpha_r vs18 | |||
#define o0 0 | |||
#define T4 r12 | |||
#define T3 r11 | |||
#define C4 r14 | |||
#define o8 r15 | |||
#define o24 r16 | |||
#define C2 r17 | |||
#define L r18 | |||
#define T1 r19 | |||
#define C3 r20 | |||
#define TEMP_REG r21 | |||
#define I r22 | |||
#define J r23 | |||
#define AO r24 | |||
#define BO r25 | |||
#define CO r26 | |||
#define o16 r27 | |||
#define o32 r28 | |||
#define o48 r29 | |||
#define PRE r30 | |||
#define T2 r31 | |||
#include "dgemm_macros_power9.S" | |||
#ifndef NEEDPARAM | |||
PROLOGUE | |||
PROFCODE | |||
addi SP, SP, -STACKSIZE | |||
li r0, 0 | |||
stfd f14, 0(SP) | |||
stfd f15, 8(SP) | |||
stfd f16, 16(SP) | |||
stfd f17, 24(SP) | |||
stfd f18, 32(SP) | |||
stfd f19, 40(SP) | |||
stfd f20, 48(SP) | |||
stfd f21, 56(SP) | |||
stfd f22, 64(SP) | |||
stfd f23, 72(SP) | |||
stfd f24, 80(SP) | |||
stfd f25, 88(SP) | |||
stfd f26, 96(SP) | |||
stfd f27, 104(SP) | |||
stfd f28, 112(SP) | |||
stfd f29, 120(SP) | |||
stfd f30, 128(SP) | |||
stfd f31, 136(SP) | |||
std r31, 144(SP) | |||
std r30, 152(SP) | |||
std r29, 160(SP) | |||
std r28, 168(SP) | |||
std r27, 176(SP) | |||
std r26, 184(SP) | |||
std r25, 192(SP) | |||
std r24, 200(SP) | |||
std r23, 208(SP) | |||
std r22, 216(SP) | |||
std r21, 224(SP) | |||
std r20, 232(SP) | |||
std r19, 240(SP) | |||
std r18, 248(SP) | |||
std r17, 256(SP) | |||
std r16, 264(SP) | |||
std r15, 272(SP) | |||
std r14, 280(SP) | |||
stxv vs52, 288(SP) | |||
stxv vs53, 304(SP) | |||
stxv vs54, 320(SP) | |||
stxv vs55, 336(SP) | |||
stxv vs56, 352(SP) | |||
stxv vs57, 368(SP) | |||
stxv vs58, 384(SP) | |||
stxv vs59, 400(SP) | |||
stxv vs60, 416(SP) | |||
stxv vs61, 432(SP) | |||
stxv vs62, 448(SP) | |||
stxv vs63, 464(SP) | |||
stfd f1, ALPHA_SP | |||
stw r0, FZERO | |||
slwi LDC, LDC, BASE_SHIFT | |||
#if defined(TRMMKERNEL) | |||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
#endif | |||
cmpwi cr0, M, 0 | |||
ble .L999_H1 | |||
cmpwi cr0, N, 0 | |||
ble .L999_H1 | |||
cmpwi cr0, K, 0 | |||
ble .L999_H1 | |||
addi T1, SP, 296+192 | |||
li PRE, 384 | |||
li o8 , 8 | |||
li o16, 16 | |||
li o24, 24 | |||
li o32, 32 | |||
li o48, 48 | |||
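	/* T1 points at ALPHA_SP; splat alpha into both doubleword lanes of alpha_r */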
lxvdsx alpha_r, 0, T1 | |||
#include "dgemm_logic_power9.S" | |||
.L999: | |||
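	/* set the return value to zero and restore the callee-saved registers */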
addi r3, 0, 0 | |||
lfd f14, 0(SP) | |||
lfd f15, 8(SP) | |||
lfd f16, 16(SP) | |||
lfd f17, 24(SP) | |||
lfd f18, 32(SP) | |||
lfd f19, 40(SP) | |||
lfd f20, 48(SP) | |||
lfd f21, 56(SP) | |||
lfd f22, 64(SP) | |||
lfd f23, 72(SP) | |||
lfd f24, 80(SP) | |||
lfd f25, 88(SP) | |||
lfd f26, 96(SP) | |||
lfd f27, 104(SP) | |||
lfd f28, 112(SP) | |||
lfd f29, 120(SP) | |||
lfd f30, 128(SP) | |||
lfd f31, 136(SP) | |||
ld r31, 144(SP) | |||
ld r30, 152(SP) | |||
ld r29, 160(SP) | |||
ld r28, 168(SP) | |||
ld r27, 176(SP) | |||
ld r26, 184(SP) | |||
ld r25, 192(SP) | |||
ld r24, 200(SP) | |||
ld r23, 208(SP) | |||
ld r22, 216(SP) | |||
ld r21, 224(SP) | |||
ld r20, 232(SP) | |||
ld r19, 240(SP) | |||
ld r18, 248(SP) | |||
ld r17, 256(SP) | |||
ld r16, 264(SP) | |||
ld r15, 272(SP) | |||
ld r14, 280(SP) | |||
lxv vs52, 288(SP) | |||
lxv vs53, 304(SP) | |||
lxv vs54, 320(SP) | |||
lxv vs55, 336(SP) | |||
lxv vs56, 352(SP) | |||
lxv vs57, 368(SP) | |||
lxv vs58, 384(SP) | |||
lxv vs59, 400(SP) | |||
lxv vs60, 416(SP) | |||
lxv vs61, 432(SP) | |||
lxv vs62, 448(SP) | |||
lxv vs63, 464(SP) | |||
addi SP, SP, STACKSIZE | |||
blr | |||
EPILOGUE | |||
#endif |
@@ -1,328 +1,328 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
#include <altivec.h> | |||
#if defined(DOUBLE) | |||
#define ABS fabs | |||
#else | |||
#define ABS fabsf | |||
#endif | |||
#define CABS1(x,i) (ABS(x[i])+ABS(x[i+1])) //|re|+|im|, parenthesized so the macro composes safely in expressions
#define USE_MASK_PERMUTATIONS 1 //gcc emits slightly faster code with this style of permutation
#if !defined(USE_MASK_PERMUTATIONS) | |||
static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ | |||
__vector float result; | |||
__asm__ ( | |||
"vmrgew %0,%1,%2;\n" | |||
: "=v" (result) | |||
: "v" (a), | |||
"v" (b) | |||
: ); | |||
return result; | |||
} | |||
static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ | |||
__vector float result; | |||
__asm__ ( | |||
"vmrgow %0,%1,%2;\n" | |||
: "=v" (result) | |||
: "v" (a), | |||
"v" (b) | |||
: ); | |||
return result; | |||
} | |||
#endif | |||
/**
 * Find the index of the complex element with the largest absolute value.
 * Warning: requires n > 0 and n % 32 == 0.
 * @param n    number of complex elements
 * @param x    pointer to the vector
 * @param maxf (out) maximum absolute value
 * @return     zero-based index of the maximum
 */
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
BLASLONG index; | |||
BLASLONG i=0; | |||
#if defined(USE_MASK_PERMUTATIONS) | |||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||
#else | |||
register __vector unsigned int static_index0 = {2,0,3,1}; | |||
#endif | |||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
register __vector unsigned int static_index1=static_index0 +temp0; | |||
register __vector unsigned int static_index2=static_index0 +temp1; | |||
register __vector unsigned int static_index3=static_index1 +temp1; | |||
temp0=vec_xor(temp0,temp0); | |||
temp1=temp1 <<1 ; //{16,16,16,16} | |||
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
register __vector float quadruple_values={0,0,0,0}; | |||
register __vector float * v_ptrx=(__vector float *)x; | |||
#if defined(USE_MASK_PERMUTATIONS) | |||
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
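    //real_pack_mask gathers the even float lanes of each input pair (the real parts);
    //image_pack_mask gathers the odd lanes (the imaginary parts), lining |re| and |im| up for the sums below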
#endif | |||
for(; i<n; i+=32 ){ | |||
        //absolute values of the next eight input vectors
register __vector float v0=vec_abs(v_ptrx[0]); | |||
register __vector float v1=vec_abs(v_ptrx[1]); | |||
register __vector float v2=vec_abs(v_ptrx[2]); | |||
register __vector float v3=vec_abs(v_ptrx[3]); | |||
register __vector float v4=vec_abs(v_ptrx[4]); | |||
register __vector float v5=vec_abs(v_ptrx[5]); | |||
register __vector float v6=vec_abs(v_ptrx[6]); | |||
register __vector float v7=vec_abs(v_ptrx[7]); | |||
        //group real parts and imaginary parts so |re| and |im| can be summed per element
#if defined(USE_MASK_PERMUTATIONS) | |||
register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
        v0=t1+ti; //add the four real magnitudes to the four imaginary magnitudes
register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
v1=t2+ti2; | |||
t1=vec_perm(v4,v5,real_pack_mask); | |||
ti=vec_perm(v4,v5,image_pack_mask); | |||
v2=t1+ti; //sum | |||
t2=vec_perm(v6,v7,real_pack_mask); | |||
ti2=vec_perm(v6,v7,image_pack_mask); | |||
v3=t2+ti2; | |||
#else | |||
register __vector float t1=mvec_mergee(v0,v1); | |||
register __vector float ti=mvec_mergeo(v0,v1); | |||
        v0=t1+ti; //add the four real magnitudes to the four imaginary magnitudes
register __vector float t2= mvec_mergee(v2,v3); | |||
register __vector float ti2=mvec_mergeo(v2,v3); | |||
v1=t2+ti2; | |||
t1=mvec_mergee(v4,v5); | |||
ti=mvec_mergeo(v4,v5); | |||
v2=t1+ti; //sum | |||
t2=mvec_mergee(v6,v7); | |||
ti2=mvec_mergeo(v6,v7); | |||
v3=t2+ti2; | |||
#endif | |||
        // we now have 16 summed magnitudes; compare them
v_ptrx+=8; | |||
register __vector bool int r1=vec_cmpgt(v1,v0); | |||
register __vector bool int r2=vec_cmpgt(v3,v2); | |||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
v0=vec_sel(v0,v1,r1); | |||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
v1=vec_sel(v2,v3,r2); | |||
//final cmp and select index and value for first 16 values | |||
r1=vec_cmpgt(v1,v0); | |||
register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
register __vector float vf0= vec_sel(v0,v1,r1); | |||
        //absolute values of the next eight input vectors
v0=vec_abs(v_ptrx[0]); | |||
v1=vec_abs(v_ptrx[1]); | |||
v2=vec_abs(v_ptrx[2]); | |||
v3=vec_abs(v_ptrx[3]); | |||
v4=vec_abs(v_ptrx[4]); | |||
v5=vec_abs(v_ptrx[5]); | |||
v6=vec_abs(v_ptrx[6]); | |||
v7=vec_abs(v_ptrx[7]); | |||
        //group real parts and imaginary parts so |re| and |im| can be summed per element
#if defined(USE_MASK_PERMUTATIONS) | |||
t1=vec_perm(v0,v1,real_pack_mask); | |||
ti=vec_perm(v0,v1,image_pack_mask); | |||
        v0=t1+ti; //add the four real magnitudes to the four imaginary magnitudes
t2=vec_perm(v2,v3,real_pack_mask); | |||
ti2=vec_perm(v2,v3,image_pack_mask); | |||
v1=t2+ti2; | |||
t1=vec_perm(v4,v5,real_pack_mask); | |||
ti=vec_perm(v4,v5,image_pack_mask); | |||
v2=t1+ti; //sum | |||
t2=vec_perm(v6,v7,real_pack_mask); | |||
ti2=vec_perm(v6,v7,image_pack_mask); | |||
v3=t2+ti2; | |||
#else | |||
t1=mvec_mergee(v0,v1); | |||
ti=mvec_mergeo(v0,v1); | |||
        v0=t1+ti; //add the four real magnitudes to the four imaginary magnitudes
t2=mvec_mergee(v2,v3); | |||
ti2=mvec_mergeo(v2,v3); | |||
v1=t2+ti2; | |||
t1=mvec_mergee(v4,v5); | |||
ti=mvec_mergeo(v4,v5); | |||
v2=t1+ti; //sum | |||
t2=mvec_mergee(v6,v7); | |||
ti2=mvec_mergeo(v6,v7); | |||
v3=t2+ti2; | |||
#endif | |||
        // we now have the next 16 summed magnitudes (elements 16-31); compare them
v_ptrx+=8; | |||
r1=vec_cmpgt(v1,v0); | |||
r2=vec_cmpgt(v3,v2); | |||
ind2= vec_sel(static_index0,static_index1,r1); | |||
v0=vec_sel(v0,v1,r1); | |||
ind3= vec_sel(static_index2,static_index3,r2); | |||
v1=vec_sel(v2,v3,r2); | |||
//final cmp and select index and value for the second 16 values | |||
r1=vec_cmpgt(v1,v0); | |||
register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
register __vector float vv0= vec_sel(v0,v1,r1); | |||
        indv0+=temp1; //shift lane indices into the 16-31 range
//find final quadruple from 32 elements | |||
r2=vec_cmpgt(vv0,vf0); | |||
ind2 = vec_sel( indf0,indv0,r2); | |||
vv0= vec_sel(vf0,vv0,r2); | |||
        //convert block-relative indices to absolute indices
ind2+=temp0; | |||
//compare with old quadruple and update | |||
r1=vec_cmpgt(vv0,quadruple_values); | |||
quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
temp0+=temp_add; | |||
} | |||
    //choose among the 4 lane winners and their 4 indices:
    //when two values are exactly equal, take the smaller index (BLAS first-occurrence rule);
    //otherwise take the index of the larger value
float a1,a2,a3,a4; | |||
unsigned int i1,i2,i3,i4; | |||
a1=vec_extract(quadruple_values,0); | |||
a2=vec_extract(quadruple_values,1); | |||
a3=vec_extract(quadruple_values,2); | |||
a4=vec_extract(quadruple_values,3); | |||
i1=vec_extract(quadruple_indices,0); | |||
i2=vec_extract(quadruple_indices,1); | |||
i3=vec_extract(quadruple_indices,2); | |||
i4=vec_extract(quadruple_indices,3); | |||
if(a1==a2){ | |||
index=i1>i2?i2:i1; | |||
}else if(a2>a1){ | |||
index=i2; | |||
a1=a2; | |||
}else{ | |||
index= i1; | |||
} | |||
if(a4==a3){ | |||
i1=i3>i4?i4:i3; | |||
}else if(a4>a3){ | |||
i1=i4; | |||
a3=a4; | |||
}else{ | |||
i1= i3; | |||
} | |||
if(a1==a3){ | |||
index=i1>index?index:i1; | |||
*maxf=a1; | |||
}else if(a3>a1){ | |||
index=i1; | |||
*maxf=a3; | |||
}else{ | |||
*maxf=a1; | |||
} | |||
return index; | |||
} | |||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
{ | |||
BLASLONG i = 0; | |||
BLASLONG ix = 0; | |||
FLOAT maxf = 0; | |||
BLASLONG max = 0; | |||
BLASLONG inc_x2; | |||
if (n <= 0 || inc_x <= 0) return(max); | |||
if (inc_x == 1) { | |||
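        //n & -32 rounds n down to a multiple of 32 (the vector kernel's block size);
        //the scalar loop below handles the remaining tail elements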
BLASLONG n1 = n & -32; | |||
if (n1 > 0) { | |||
max = ciamax_kernel_32(n1, x, &maxf); | |||
i = n1; | |||
ix = n1 << 1; | |||
} | |||
while(i < n) | |||
{ | |||
if( CABS1(x,ix) > maxf ) | |||
{ | |||
max = i; | |||
maxf = CABS1(x,ix); | |||
} | |||
ix += 2; | |||
i++; | |||
} | |||
return (max + 1); | |||
} else { | |||
inc_x2 = 2 * inc_x; | |||
maxf = CABS1(x,0); | |||
ix += inc_x2; | |||
i++; | |||
while(i < n) | |||
{ | |||
if( CABS1(x,ix) > maxf ) | |||
{ | |||
max = i; | |||
maxf = CABS1(x,ix); | |||
} | |||
ix += inc_x2; | |||
i++; | |||
} | |||
return (max + 1); | |||
} | |||
} | |||
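/*
 * A minimal reference sketch (not part of the source): a plain scalar ICAMAX that the
 * vector kernel above can be sanity-checked against. It reuses this file's CABS1/ABS
 * macros and BLASLONG/FLOAT types; the function name and guard macro are hypothetical.
 */
#ifdef ICAMAX_REFERENCE_SKETCH
static BLASLONG icamax_reference(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    if (n <= 0 || inc_x <= 0) return 0;
    BLASLONG max = 0;
    BLASLONG ix = 0;
    FLOAT maxf = CABS1(x, 0);
    for (BLASLONG i = 1; i < n; i++) {
        ix += 2 * inc_x;                 /* complex stride: 2 floats per element */
        if (CABS1(x, ix) > maxf) {       /* strict '>' keeps the first occurrence on ties */
            max = i;
            maxf = CABS1(x, ix);
        }
    }
    return max + 1;                      /* 1-based result, like CNAME above */
}
#endif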
@@ -1,266 +1,266 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
#include <altivec.h> | |||
#if defined(DOUBLE) | |||
#define ABS fabs | |||
#else | |||
#define ABS fabsf | |||
#endif | |||
#define CABS1(x,i) (ABS(x[i])+ABS(x[i+1])) //|re|+|im|, parenthesized so the macro composes safely in expressions
/**
 * Find the index of the complex element with the smallest absolute value.
 * Warning: requires n > 0 and n % 32 == 0.
 * @param n    number of complex elements
 * @param x    pointer to the vector
 * @param minf (out) minimum absolute value
 * @return     zero-based index of the minimum
 */
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
BLASLONG index; | |||
BLASLONG i=0; | |||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
temp0=vec_xor(temp0,temp0); | |||
temp1=temp1 <<1 ; //{16,16,16,16} | |||
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
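    //unlike the max kernel, seed the running minima with a real element: a zero seed would never lose a minimum comparison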
float first_min=CABS1(x,0); | |||
register __vector float quadruple_values={first_min,first_min,first_min,first_min}; | |||
register __vector float * v_ptrx=(__vector float *)x; | |||
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
for(; i<n; i+=32){ | |||
        //absolute values of the next eight input vectors
register __vector float v0=vec_abs(v_ptrx[0]); | |||
register __vector float v1=vec_abs(v_ptrx[1]); | |||
register __vector float v2=vec_abs(v_ptrx[2]); | |||
register __vector float v3=vec_abs(v_ptrx[3]); | |||
register __vector float v4=vec_abs(v_ptrx[4]); | |||
register __vector float v5=vec_abs(v_ptrx[5]); | |||
register __vector float v6=vec_abs(v_ptrx[6]); | |||
register __vector float v7=vec_abs(v_ptrx[7]); | |||
        //group real parts and imaginary parts so |re| and |im| can be summed per element
register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
        v0=t1+ti; //add the four real magnitudes to the four imaginary magnitudes
register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
v1=t2+ti2; | |||
t1=vec_perm(v4,v5,real_pack_mask); | |||
ti=vec_perm(v4,v5,image_pack_mask); | |||
v2=t1+ti; //sum | |||
t2=vec_perm(v6,v7,real_pack_mask); | |||
ti2=vec_perm(v6,v7,image_pack_mask); | |||
v3=t2+ti2; | |||
        // we now have 16 summed magnitudes; compare them
v_ptrx+=8; | |||
register __vector bool int r1=vec_cmpgt(v0,v1); | |||
register __vector bool int r2=vec_cmpgt(v2,v3); | |||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
v0=vec_sel(v0,v1,r1); | |||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
v1=vec_sel(v2,v3,r2); | |||
//final cmp and select index and value for first 16 values | |||
r1=vec_cmpgt(v0,v1); | |||
register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
register __vector float vf0= vec_sel(v0,v1,r1); | |||
        //absolute values of the next eight input vectors
v0=vec_abs(v_ptrx[0]); | |||
v1=vec_abs(v_ptrx[1]); | |||
v2=vec_abs(v_ptrx[2]); | |||
v3=vec_abs(v_ptrx[3]); | |||
v4=vec_abs(v_ptrx[4]); | |||
v5=vec_abs(v_ptrx[5]); | |||
v6=vec_abs(v_ptrx[6]); | |||
v7=vec_abs(v_ptrx[7]); | |||
        //group real parts and imaginary parts so |re| and |im| can be summed per element
t1=vec_perm(v0,v1,real_pack_mask); | |||
ti=vec_perm(v0,v1,image_pack_mask); | |||
        v0=t1+ti; //add the four real magnitudes to the four imaginary magnitudes
t2=vec_perm(v2,v3,real_pack_mask); | |||
ti2=vec_perm(v2,v3,image_pack_mask); | |||
v1=t2+ti2; | |||
t1=vec_perm(v4,v5,real_pack_mask); | |||
ti=vec_perm(v4,v5,image_pack_mask); | |||
v2=t1+ti; //sum | |||
t2=vec_perm(v6,v7,real_pack_mask); | |||
ti2=vec_perm(v6,v7,image_pack_mask); | |||
v3=t2+ti2; | |||
        // we now have the next 16 summed magnitudes (elements 16-31); compare them
v_ptrx+=8; | |||
r1=vec_cmpgt(v0,v1); | |||
r2=vec_cmpgt(v2,v3); | |||
ind2= vec_sel(static_index0,static_index1,r1); | |||
v0=vec_sel(v0,v1,r1); | |||
ind3= vec_sel(static_index2,static_index3,r2); | |||
v1=vec_sel(v2,v3,r2); | |||
//final cmp and select index and value for the second 16 values | |||
r1=vec_cmpgt(v0,v1); | |||
register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
register __vector float vv0= vec_sel(v0,v1,r1); | |||
        indv0+=temp1; //shift lane indices into the 16-31 range
//find final quadruple from 32 elements | |||
r2=vec_cmpgt(vf0,vv0); | |||
ind2 = vec_sel( indf0,indv0,r2); | |||
vv0= vec_sel(vf0,vv0,r2); | |||
        //convert block-relative indices to absolute indices
ind2+=temp0; | |||
//compare with old quadruple and update | |||
r1=vec_cmpgt(quadruple_values,vv0); | |||
quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
temp0+=temp_add; | |||
} | |||
    //choose among the 4 lane winners and their 4 indices:
    //when two values are exactly equal, take the smaller index (BLAS first-occurrence rule);
    //otherwise take the index of the smaller value
float a1,a2,a3,a4; | |||
unsigned int i1,i2,i3,i4; | |||
a1=vec_extract(quadruple_values,0); | |||
a2=vec_extract(quadruple_values,1); | |||
a3=vec_extract(quadruple_values,2); | |||
a4=vec_extract(quadruple_values,3); | |||
i1=vec_extract(quadruple_indices,0); | |||
i2=vec_extract(quadruple_indices,1); | |||
i3=vec_extract(quadruple_indices,2); | |||
i4=vec_extract(quadruple_indices,3); | |||
if(a1==a2){ | |||
index=i1>i2?i2:i1; | |||
}else if(a2<a1){ | |||
index=i2; | |||
a1=a2; | |||
}else{ | |||
index= i1; | |||
} | |||
if(a4==a3){ | |||
i1=i3>i4?i4:i3; | |||
}else if(a4<a3){ | |||
i1=i4; | |||
a3=a4; | |||
}else{ | |||
i1= i3; | |||
} | |||
if(a1==a3){ | |||
index=i1>index?index:i1; | |||
*minf=a1; | |||
}else if(a3<a1){ | |||
index=i1; | |||
*minf=a3; | |||
}else{ | |||
*minf=a1; | |||
} | |||
return index; | |||
} | |||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
{ | |||
BLASLONG i=0; | |||
BLASLONG ix=0; | |||
FLOAT minf; | |||
BLASLONG min=0; | |||
BLASLONG inc_x2; | |||
if (n <= 0 || inc_x <= 0) return(min); | |||
if (inc_x == 1) { | |||
        minf = CABS1(x,0); //seed minf with the first element; min stays 0 until something smaller turns up
BLASLONG n1 = n & -32; | |||
if (n1 > 0) { | |||
min = ciamin_kernel_32(n1, x, &minf); | |||
i = n1; | |||
ix = n1 << 1; | |||
} | |||
while(i < n) | |||
{ | |||
if( CABS1(x,ix) < minf ) | |||
{ | |||
min = i; | |||
minf = CABS1(x,ix); | |||
} | |||
ix += 2; | |||
i++; | |||
} | |||
return (min + 1); | |||
} else { | |||
inc_x2 = 2 * inc_x; | |||
minf = CABS1(x,0); | |||
ix += inc_x2; | |||
i++; | |||
while(i < n) | |||
{ | |||
if( CABS1(x,ix) < minf ) | |||
{ | |||
min = i; | |||
minf = CABS1(x,ix); | |||
} | |||
ix += inc_x2; | |||
i++; | |||
} | |||
return (min + 1); | |||
} | |||
} | |||
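/*
 * A minimal sketch (not part of the source): the scalar tie-break reduction at the end
 * of ciamin_kernel_32 above, rewritten as a loop. On equal values the lower index wins,
 * matching the BLAS first-occurrence convention. The helper name and guard macro are
 * hypothetical, shown only to clarify the branchy version.
 */
#ifdef CIAMIN_REDUCTION_SKETCH
static BLASLONG reduce_min4(const float v[4], const unsigned int idx[4], FLOAT *minf) {
    float bestv = v[0];
    unsigned int best = idx[0];
    for (int k = 1; k < 4; k++) {
        /* a strictly smaller value wins; on an exact tie the smaller index wins */
        if (v[k] < bestv || (v[k] == bestv && idx[k] < best)) {
            bestv = v[k];
            best = idx[k];
        }
    }
    *minf = bestv;
    return (BLASLONG)best;
}
#endif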
@@ -1,288 +1,288 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
#include <altivec.h> | |||
#if defined(DOUBLE) | |||
#define ABS fabs | |||
#else | |||
#define ABS fabsf | |||
#endif | |||
/**
 * Find the index of the element with the largest absolute value.
 * Warning: requires n > 0 and n % 64 == 0.
 * @param n    number of elements
 * @param x    pointer to the vector
 * @param maxf (out) maximum absolute value
 * @return     zero-based index of the maximum
 */
static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
BLASLONG index; | |||
BLASLONG i=0; | |||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
temp0=vec_xor(temp0,temp0); | |||
temp1=temp1 <<1 ; //{16,16,16,16} | |||
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
register __vector float quadruple_values={0,0,0,0}; | |||
register __vector float * v_ptrx=(__vector float *)x; | |||
for(; i<n; i+=64){ | |||
        //absolute values of the next eight input vectors
register __vector float v0=vec_abs(v_ptrx[0]); | |||
register __vector float v1=vec_abs(v_ptrx[1]); | |||
register __vector float v2=vec_abs(v_ptrx[2]); | |||
register __vector float v3=vec_abs(v_ptrx[3]); | |||
register __vector float v4=vec_abs(v_ptrx[4]); | |||
register __vector float v5=vec_abs(v_ptrx[5]); | |||
register __vector float v6=vec_abs(v_ptrx[6]); | |||
register __vector float v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
register __vector bool int r1=vec_cmpgt(v1,v0); | |||
register __vector bool int r2=vec_cmpgt(v3,v2); | |||
register __vector bool int r3=vec_cmpgt(v5,v4); | |||
register __vector bool int r4=vec_cmpgt(v7,v6); | |||
//select | |||
register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vf0= vec_sel(v0,v1,r1); | |||
register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vf1= vec_sel(v2,v3,r2); | |||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vf1,vf0); | |||
r2=vec_cmpgt(v1,v0); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_first= vec_sel(ind0_first,ind1,r1); | |||
vf0= vec_sel(vf0,vf1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vf1= vec_sel(v0,v1,r2); | |||
        //the second group's indices belong in [16,31], so add 16 to ind2
ind2 +=temp1; | |||
//final cmp and select index and value for the first 32 values | |||
r1=vec_cmpgt(vf1,vf0); | |||
ind0_first = vec_sel(ind0_first,ind2,r1); | |||
vf0= vec_sel(vf0,vf1,r1); | |||
ind0_first+=temp0; //get absolute index | |||
temp0+=temp1; | |||
        temp0+=temp1; //two adds of 16: advance the base index by 32
        //second block of 32 elements
        //absolute values of the next eight input vectors
v0=vec_abs(v_ptrx[0]); | |||
v1=vec_abs(v_ptrx[1]); | |||
v2=vec_abs(v_ptrx[2]); | |||
v3=vec_abs(v_ptrx[3]); | |||
v4=vec_abs(v_ptrx[4]); | |||
v5=vec_abs(v_ptrx[5]); | |||
v6=vec_abs(v_ptrx[6]); | |||
v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
r1=vec_cmpgt(v1,v0); | |||
r2=vec_cmpgt(v3,v2); | |||
r3=vec_cmpgt(v5,v4); | |||
r4=vec_cmpgt(v7,v6); | |||
//select | |||
register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vv0= vec_sel(v0,v1,r1); | |||
ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vv1= vec_sel(v2,v3,r2); | |||
ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vv1,vv0); | |||
r2=vec_cmpgt(v1,v0); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_second= vec_sel(ind0_second,ind1,r1); | |||
vv0= vec_sel(vv0,vv1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vv1= vec_sel(v0,v1,r2) ; | |||
        //the second group's indices belong in [16,31], so add 16 to ind2
ind2 +=temp1; | |||
//final cmp and select index and value for the second 32 values | |||
r1=vec_cmpgt(vv1,vv0); | |||
ind0_second = vec_sel(ind0_second,ind2,r1); | |||
vv0= vec_sel(vv0,vv1,r1); | |||
ind0_second+=temp0; //get absolute index | |||
//find final quadruple from 64 elements | |||
r2=vec_cmpgt(vv0,vf0); | |||
ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
vv0= vec_sel(vf0,vv0,r2); | |||
//compare with old quadruple and update | |||
r3=vec_cmpgt(vv0,quadruple_values); | |||
quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
temp0+=temp1; | |||
        temp0+=temp1; //two adds of 16: advance the base index by 32
} | |||
    //choose among the 4 lane winners and their 4 indices:
    //when two values are exactly equal, take the smaller index (BLAS first-occurrence rule);
    //otherwise take the index of the larger value
float a1,a2,a3,a4; | |||
unsigned int i1,i2,i3,i4; | |||
a1=vec_extract(quadruple_values,0); | |||
a2=vec_extract(quadruple_values,1); | |||
a3=vec_extract(quadruple_values,2); | |||
a4=vec_extract(quadruple_values,3); | |||
i1=vec_extract(quadruple_indices,0); | |||
i2=vec_extract(quadruple_indices,1); | |||
i3=vec_extract(quadruple_indices,2); | |||
i4=vec_extract(quadruple_indices,3); | |||
if(a1==a2){ | |||
index=i1>i2?i2:i1; | |||
}else if(a2>a1){ | |||
index=i2; | |||
a1=a2; | |||
}else{ | |||
index= i1; | |||
} | |||
if(a4==a3){ | |||
i1=i3>i4?i4:i3; | |||
}else if(a4>a3){ | |||
i1=i4; | |||
a3=a4; | |||
}else{ | |||
i1= i3; | |||
} | |||
if(a1==a3){ | |||
index=i1>index?index:i1; | |||
*maxf=a1; | |||
}else if(a3>a1){ | |||
index=i1; | |||
*maxf=a3; | |||
}else{ | |||
*maxf=a1; | |||
} | |||
return index; | |||
} | |||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
BLASLONG i = 0; | |||
BLASLONG j = 0; | |||
FLOAT maxf = 0.0; | |||
BLASLONG max = 0; | |||
if (n <= 0 || inc_x <= 0) return (max); | |||
if (inc_x == 1) { | |||
BLASLONG n1 = n & -64; | |||
if (n1 > 0) { | |||
max = siamax_kernel_64(n1, x, &maxf); | |||
i = n1; | |||
} | |||
while (i < n) { | |||
if (ABS(x[i]) > maxf) { | |||
max = i; | |||
maxf = ABS(x[i]); | |||
} | |||
i++; | |||
} | |||
return (max + 1); | |||
} else { | |||
BLASLONG n1 = n & -4; | |||
while (j < n1) { | |||
if (ABS(x[i]) > maxf) { | |||
max = j; | |||
maxf = ABS(x[i]); | |||
} | |||
if (ABS(x[i + inc_x]) > maxf) { | |||
max = j + 1; | |||
maxf = ABS(x[i + inc_x]); | |||
} | |||
if (ABS(x[i + 2 * inc_x]) > maxf) { | |||
max = j + 2; | |||
maxf = ABS(x[i + 2 * inc_x]); | |||
} | |||
if (ABS(x[i + 3 * inc_x]) > maxf) { | |||
max = j + 3; | |||
maxf = ABS(x[i + 3 * inc_x]); | |||
} | |||
i += inc_x * 4; | |||
j += 4; | |||
} | |||
while (j < n) { | |||
if (ABS(x[i]) > maxf) { | |||
max = j; | |||
maxf = ABS(x[i]); | |||
} | |||
i += inc_x; | |||
j++; | |||
} | |||
return (max + 1); | |||
} | |||
} | |||
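/*
 * A minimal self-test sketch (assumption: built in single precision, so FLOAT is
 * float). It checks siamax_kernel_64 against a plain scalar scan on a buffer whose
 * length is a multiple of 64, as the kernel's contract requires. The guard macro and
 * helper name are hypothetical.
 */
#ifdef ISAMAX_SELFTEST
#include <stdio.h>
#include <stdlib.h>
static void isamax_selftest(void) {
    enum { N = 128 };                                 /* multiple of 64 */
    FLOAT *x = aligned_alloc(16, N * sizeof(FLOAT));  /* 16-byte alignment for the vector loads */
    for (int k = 0; k < N; k++)
        x[k] = (FLOAT)((k * 37 % 101) - 50);          /* deterministic mixed-sign data */
    FLOAT vmax = 0;
    BLASLONG vidx = siamax_kernel_64(N, x, &vmax);
    BLASLONG sidx = 0;
    FLOAT smax = ABS(x[0]);
    for (int k = 1; k < N; k++)
        if (ABS(x[k]) > smax) { smax = ABS(x[k]); sidx = k; }
    printf("vector: idx=%ld val=%g  scalar: idx=%ld val=%g\n",
           (long)vidx, (double)vmax, (long)sidx, (double)smax);
    free(x);
}
#endif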
/*************************************************************************** | |||
Copyright (c) 2013-2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
#include <altivec.h> | |||
#if defined(DOUBLE) | |||
#define ABS fabs | |||
#else | |||
#define ABS fabsf | |||
#endif | |||
/** | |||
* Find maximum index | |||
* Warning: requirements n>0 and n % 64 == 0 | |||
* @param n | |||
* @param x pointer to the vector | |||
* @param maxf (out) maximum absolute value .( only for output ) | |||
* @return index | |||
*/ | |||
static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
BLASLONG index; | |||
BLASLONG i=0; | |||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
temp0=vec_xor(temp0,temp0); | |||
temp1=temp1 <<1 ; //{16,16,16,16} | |||
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
register __vector float quadruple_values={0,0,0,0}; | |||
register __vector float * v_ptrx=(__vector float *)x; | |||
for(; i<n; i+=64){ | |||
//absolute temporary vectors | |||
register __vector float v0=vec_abs(v_ptrx[0]); | |||
register __vector float v1=vec_abs(v_ptrx[1]); | |||
register __vector float v2=vec_abs(v_ptrx[2]); | |||
register __vector float v3=vec_abs(v_ptrx[3]); | |||
register __vector float v4=vec_abs(v_ptrx[4]); | |||
register __vector float v5=vec_abs(v_ptrx[5]); | |||
register __vector float v6=vec_abs(v_ptrx[6]); | |||
register __vector float v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
register __vector bool int r1=vec_cmpgt(v1,v0); | |||
register __vector bool int r2=vec_cmpgt(v3,v2); | |||
register __vector bool int r3=vec_cmpgt(v5,v4); | |||
register __vector bool int r4=vec_cmpgt(v7,v6); | |||
//select | |||
register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vf0= vec_sel(v0,v1,r1); | |||
register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vf1= vec_sel(v2,v3,r2); | |||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vf1,vf0); | |||
r2=vec_cmpgt(v1,v0); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_first= vec_sel(ind0_first,ind1,r1); | |||
vf0= vec_sel(vf0,vf1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vf1= vec_sel(v0,v1,r2); | |||
//second indices actually should be within [16,31] so ind2+16 | |||
ind2 +=temp1; | |||
//final cmp and select index and value for the first 32 values | |||
r1=vec_cmpgt(vf1,vf0); | |||
ind0_first = vec_sel(ind0_first,ind2,r1); | |||
vf0= vec_sel(vf0,vf1,r1); | |||
ind0_first+=temp0; //get absolute index | |||
temp0+=temp1; | |||
temp0+=temp1; //temp0+32 | |||
//second part of 32 | |||
// absolute temporary vectors | |||
v0=vec_abs(v_ptrx[0]); | |||
v1=vec_abs(v_ptrx[1]); | |||
v2=vec_abs(v_ptrx[2]); | |||
v3=vec_abs(v_ptrx[3]); | |||
v4=vec_abs(v_ptrx[4]); | |||
v5=vec_abs(v_ptrx[5]); | |||
v6=vec_abs(v_ptrx[6]); | |||
v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
r1=vec_cmpgt(v1,v0); | |||
r2=vec_cmpgt(v3,v2); | |||
r3=vec_cmpgt(v5,v4); | |||
r4=vec_cmpgt(v7,v6); | |||
//select | |||
register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vv0= vec_sel(v0,v1,r1); | |||
ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vv1= vec_sel(v2,v3,r2); | |||
ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vv1,vv0); | |||
r2=vec_cmpgt(v1,v0); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_second= vec_sel(ind0_second,ind1,r1); | |||
vv0= vec_sel(vv0,vv1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vv1= vec_sel(v0,v1,r2) ; | |||
//second indices actually should be within [16,31] so ind2+16 | |||
ind2 +=temp1; | |||
//final cmp and select index and value for the second 32 values | |||
r1=vec_cmpgt(vv1,vv0); | |||
ind0_second = vec_sel(ind0_second,ind2,r1); | |||
vv0= vec_sel(vv0,vv1,r1); | |||
ind0_second+=temp0; //get absolute index | |||
//find final quadruple from 64 elements | |||
r2=vec_cmpgt(vv0,vf0); | |||
ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
vv0= vec_sel(vf0,vv0,r2); | |||
//compare with old quadruple and update | |||
r3=vec_cmpgt(vv0,quadruple_values); | |||
quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
temp0+=temp1; | |||
temp0+=temp1; //temp0+32 | |||
} | |||
//now we have to chose from 4 values and 4 different indices | |||
// we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
// otherwise we will assign index of the maximum value | |||
float a1,a2,a3,a4; | |||
unsigned int i1,i2,i3,i4; | |||
a1=vec_extract(quadruple_values,0); | |||
a2=vec_extract(quadruple_values,1); | |||
a3=vec_extract(quadruple_values,2); | |||
a4=vec_extract(quadruple_values,3); | |||
i1=vec_extract(quadruple_indices,0); | |||
i2=vec_extract(quadruple_indices,1); | |||
i3=vec_extract(quadruple_indices,2); | |||
i4=vec_extract(quadruple_indices,3); | |||
if(a1==a2){ | |||
index=i1>i2?i2:i1; | |||
}else if(a2>a1){ | |||
index=i2; | |||
a1=a2; | |||
}else{ | |||
index= i1; | |||
} | |||
if(a4==a3){ | |||
i1=i3>i4?i4:i3; | |||
}else if(a4>a3){ | |||
i1=i4; | |||
a3=a4; | |||
}else{ | |||
i1= i3; | |||
} | |||
if(a1==a3){ | |||
index=i1>index?index:i1; | |||
*maxf=a1; | |||
}else if(a3>a1){ | |||
index=i1; | |||
*maxf=a3; | |||
}else{ | |||
*maxf=a1; | |||
} | |||
return index; | |||
} | |||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
BLASLONG i = 0; | |||
BLASLONG j = 0; | |||
FLOAT maxf = 0.0; | |||
BLASLONG max = 0; | |||
if (n <= 0 || inc_x <= 0) return (max); | |||
if (inc_x == 1) { | |||
BLASLONG n1 = n & -64; | |||
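//n & -64 rounds n down to a multiple of 64, the granularity of the
//vector kernel; the scalar loop below picks up the remaining tail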
if (n1 > 0) { | |||
max = siamax_kernel_64(n1, x, &maxf); | |||
i = n1; | |||
} | |||
while (i < n) { | |||
if (ABS(x[i]) > maxf) { | |||
max = i; | |||
maxf = ABS(x[i]); | |||
} | |||
i++; | |||
} | |||
return (max + 1); | |||
} else { | |||
BLASLONG n1 = n & -4; | |||
while (j < n1) { | |||
if (ABS(x[i]) > maxf) { | |||
max = j; | |||
maxf = ABS(x[i]); | |||
} | |||
if (ABS(x[i + inc_x]) > maxf) { | |||
max = j + 1; | |||
maxf = ABS(x[i + inc_x]); | |||
} | |||
if (ABS(x[i + 2 * inc_x]) > maxf) { | |||
max = j + 2; | |||
maxf = ABS(x[i + 2 * inc_x]); | |||
} | |||
if (ABS(x[i + 3 * inc_x]) > maxf) { | |||
max = j + 3; | |||
maxf = ABS(x[i + 3 * inc_x]); | |||
} | |||
i += inc_x * 4; | |||
j += 4; | |||
} | |||
while (j < n) { | |||
if (ABS(x[i]) > maxf) { | |||
max = j; | |||
maxf = ABS(x[i]); | |||
} | |||
i += inc_x; | |||
j++; | |||
} | |||
return (max + 1); | |||
} | |||
} |
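/* A minimal reference sketch of the semantics implemented above, for
   clarity only (the helper name and plain C types are hypothetical, not
   part of OpenBLAS): BLAS i?amax returns the 1-based index of the first
   element with the largest absolute value, or 0 when n <= 0 or
   inc_x <= 0. */
#include <math.h>
static long isamax_ref(long n, const float *x, long inc_x) {
    if (n <= 0 || inc_x <= 0) return 0;       /* same early-out as CNAME */
    long best = 0;                            /* 0-based position of the max */
    float maxf = fabsf(x[0]);
    for (long j = 1; j < n; j++) {
        float v = fabsf(x[j * inc_x]);
        if (v > maxf) { maxf = v; best = j; } /* strict '>' keeps the first tie */
    }
    return best + 1;                          /* BLAS indices are 1-based */
}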
@@ -1,288 +1,288 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
#include <altivec.h> | |||
#if defined(DOUBLE) | |||
#define ABS fabs | |||
#else | |||
#define ABS fabsf | |||
#endif | |||
/**
 * Find the index of the minimum absolute value.
 * Warning: requires n > 0 and n % 64 == 0.
 * @param n number of elements
 * @param x pointer to the vector
 * @param minf (out) minimum absolute value
 * @return zero-based index of the minimum
 */
static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
BLASLONG index; | |||
BLASLONG i=0; | |||
register __vector unsigned int static_index0 = {0,1,2,3}; | |||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
temp0=vec_xor(temp0,temp0); | |||
temp1=temp1 <<1 ; //{16,16,16,16} | |||
register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; | |||
register __vector float * v_ptrx=(__vector float *)x; | |||
register __vector float quadruple_values=vec_abs(v_ptrx[0]); | |||
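//The loop below is a branch-free tournament over 64 floats per iteration:
//static_index0..3 label the 16 lanes of each 16-float group, vec_sel keeps
//the winning lane's index alongside its value, and temp0 (bumped by 32
//twice per iteration) turns group-local indices into absolute positions in x.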
for(; i<n; i+=64){ | |||
//absolute temporary vectors | |||
register __vector float v0=vec_abs(v_ptrx[0]); | |||
register __vector float v1=vec_abs(v_ptrx[1]); | |||
register __vector float v2=vec_abs(v_ptrx[2]); | |||
register __vector float v3=vec_abs(v_ptrx[3]); | |||
register __vector float v4=vec_abs(v_ptrx[4]); | |||
register __vector float v5=vec_abs(v_ptrx[5]); | |||
register __vector float v6=vec_abs(v_ptrx[6]); | |||
register __vector float v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
register __vector bool int r1=vec_cmpgt(v0,v1); | |||
register __vector bool int r2=vec_cmpgt(v2,v3); | |||
register __vector bool int r3=vec_cmpgt(v4,v5); | |||
register __vector bool int r4=vec_cmpgt(v6,v7); | |||
//select | |||
register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vf0= vec_sel(v0,v1,r1); | |||
register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vf1= vec_sel(v2,v3,r2); | |||
register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vf0,vf1); | |||
r2=vec_cmpgt(v0,v1); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_first= vec_sel(ind0_first,ind1,r1); | |||
vf0= vec_sel(vf0,vf1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vf1= vec_sel(v0,v1,r2); | |||
//the second group of 16 indices belongs in [16,31], so add 16 (temp1) to ind2
ind2 +=temp1; | |||
//final cmp and select index and value for the first 32 values | |||
r1=vec_cmpgt(vf0,vf1); | |||
ind0_first = vec_sel(ind0_first,ind2,r1); | |||
vf0= vec_sel(vf0,vf1,r1); | |||
ind0_first+=temp0; //get absolute index | |||
temp0+=temp1; | |||
temp0+=temp1; //temp0+32 | |||
//second part of 32 | |||
// absolute temporary vectors | |||
v0=vec_abs(v_ptrx[0]); | |||
v1=vec_abs(v_ptrx[1]); | |||
v2=vec_abs(v_ptrx[2]); | |||
v3=vec_abs(v_ptrx[3]); | |||
v4=vec_abs(v_ptrx[4]); | |||
v5=vec_abs(v_ptrx[5]); | |||
v6=vec_abs(v_ptrx[6]); | |||
v7=vec_abs(v_ptrx[7]); | |||
//cmp quadruple pairs | |||
r1=vec_cmpgt(v0,v1); | |||
r2=vec_cmpgt(v2,v3); | |||
r3=vec_cmpgt(v4,v5); | |||
r4=vec_cmpgt(v6,v7); | |||
//select | |||
register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
register __vector float vv0= vec_sel(v0,v1,r1); | |||
ind1= vec_sel(static_index2,static_index3,r2); | |||
register __vector float vv1= vec_sel(v2,v3,r2); | |||
ind2= vec_sel(static_index0,static_index1,r3); | |||
v0=vec_sel(v4,v5,r3); | |||
ind3= vec_sel(static_index2,static_index3,r4); | |||
v1=vec_sel(v6,v7,r4); | |||
// cmp selected | |||
r1=vec_cmpgt(vv0,vv1); | |||
r2=vec_cmpgt(v0,v1); | |||
v_ptrx+=8; | |||
//select from above | |||
ind0_second= vec_sel(ind0_second,ind1,r1); | |||
vv0= vec_sel(vv0,vv1,r1) ; | |||
ind2= vec_sel(ind2,ind3,r2); | |||
vv1= vec_sel(v0,v1,r2) ; | |||
//the second group of 16 indices belongs in [16,31], so add 16 (temp1) to ind2
ind2 +=temp1; | |||
//final cmp and select index and value for the second 32 values | |||
r1=vec_cmpgt(vv0,vv1); | |||
ind0_second = vec_sel(ind0_second,ind2,r1); | |||
vv0= vec_sel(vv0,vv1,r1); | |||
ind0_second+=temp0; //get absolute index | |||
//find final quadruple from 64 elements | |||
r2=vec_cmpgt(vf0,vv0); | |||
ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
vv0= vec_sel(vf0,vv0,r2); | |||
//compare with old quadruple and update | |||
r3=vec_cmpgt( quadruple_values,vv0); | |||
quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
temp0+=temp1; | |||
temp0+=temp1; //temp0+32 | |||
} | |||
//now we have to choose from 4 values and 4 different indices
//we compare pairwise: if a pair is exactly equal we pick the minimum of the two indices,
//otherwise we take the index of the minimum value
float a1,a2,a3,a4; | |||
unsigned int i1,i2,i3,i4; | |||
a1=vec_extract(quadruple_values,0); | |||
a2=vec_extract(quadruple_values,1); | |||
a3=vec_extract(quadruple_values,2); | |||
a4=vec_extract(quadruple_values,3); | |||
i1=vec_extract(quadruple_indices,0); | |||
i2=vec_extract(quadruple_indices,1); | |||
i3=vec_extract(quadruple_indices,2); | |||
i4=vec_extract(quadruple_indices,3); | |||
if(a1==a2){ | |||
index=i1>i2?i2:i1; | |||
}else if(a2<a1){ | |||
index=i2; | |||
a1=a2; | |||
}else{ | |||
index= i1; | |||
} | |||
if(a4==a3){ | |||
i1=i3>i4?i4:i3; | |||
}else if(a4<a3){ | |||
i1=i4; | |||
a3=a4; | |||
}else{ | |||
i1= i3; | |||
} | |||
if(a1==a3){ | |||
index=i1>index?index:i1; | |||
*minf=a1; | |||
}else if(a3<a1){ | |||
index=i1; | |||
*minf=a3; | |||
}else{ | |||
*minf=a1; | |||
} | |||
return index; | |||
} | |||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
BLASLONG i = 0; | |||
BLASLONG j = 0; | |||
BLASLONG min = 0; | |||
FLOAT minf = 0.0; | |||
if (n <= 0 || inc_x <= 0) return (min); | |||
minf = ABS(x[0]); //seed the search with the first element
if (inc_x == 1) { | |||
BLASLONG n1 = n & -64; | |||
if (n1 > 0) { | |||
min = siamin_kernel_64(n1, x, &minf); | |||
i = n1; | |||
} | |||
while (i < n) { | |||
if (ABS(x[i]) < minf) { | |||
min = i; | |||
minf = ABS(x[i]); | |||
} | |||
i++; | |||
} | |||
return (min + 1); | |||
} else { | |||
BLASLONG n1 = n & -4; | |||
while (j < n1) { | |||
if (ABS(x[i]) < minf) { | |||
min = j; | |||
minf = ABS(x[i]); | |||
} | |||
if (ABS(x[i + inc_x]) < minf) { | |||
min = j + 1; | |||
minf = ABS(x[i + inc_x]); | |||
} | |||
if (ABS(x[i + 2 * inc_x]) < minf) { | |||
min = j + 2; | |||
minf = ABS(x[i + 2 * inc_x]); | |||
} | |||
if (ABS(x[i + 3 * inc_x]) < minf) { | |||
min = j + 3; | |||
minf = ABS(x[i + 3 * inc_x]); | |||
} | |||
i += inc_x * 4; | |||
j += 4; | |||
} | |||
while (j < n) { | |||
if (ABS(x[i]) < minf) { | |||
min = j; | |||
minf = ABS(x[i]); | |||
} | |||
i += inc_x; | |||
j++; | |||
} | |||
return (min + 1); | |||
} | |||
} | |||
@@ -1,272 +1,272 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "def_vsx.h" | |||
#define LOAD ld | |||
#define STACKSIZE (512 ) | |||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
#define M r3 | |||
#define N r4 | |||
#define K r5 | |||
#define A r7 | |||
#define B r8 | |||
#define C r9 | |||
#define LDC r10 | |||
#define OFFSET r6 | |||
#define alpha_r vs20 | |||
#define save_permute_1 vs21 | |||
#define save_permute_2 vs22 | |||
#define permute_mask vs23 | |||
#define o0 0 | |||
#define T1 r11 | |||
#define T2 r12 | |||
#define T3 r14 | |||
#define T4 r15 | |||
#define T5 r16 | |||
#define T6 r17 | |||
#define L r18 | |||
#define T7 r19 | |||
#define T8 r20 | |||
#define TEMP_REG r21 | |||
#define I r22 | |||
#define J r23 | |||
#define AO r24 | |||
#define BO r25 | |||
#define CO r26 | |||
#define T9 r27 | |||
#define T10 r28 | |||
#define T11 r29 | |||
#define T12 r30 | |||
#define T13 r31 | |||
#include "sgemm_macros_power9.S" | |||
.equ perm_const1, 0x0405060700010203 | |||
.equ perm_const2, 0x0c0d0e0f08090a0b | |||
.equ save_permute_11, 0x1415161718191a1b | |||
.equ save_permute_12, 0x0405060708090a0b | |||
.equ save_permute_21, 0x101112131c1d1e1f | |||
.equ save_permute_22, 0x000102030c0d0e0f | |||
#ifndef NEEDPARAM | |||
PROLOGUE | |||
PROFCODE | |||
addi SP, SP, -STACKSIZE | |||
mflr r0 | |||
stfd f14, 0(SP) | |||
stfd f15, 8(SP) | |||
stfd f16, 16(SP) | |||
stfd f17, 24(SP) | |||
stfd f18, 32(SP) | |||
stfd f19, 40(SP) | |||
stfd f20, 48(SP) | |||
stfd f21, 56(SP) | |||
stfd f22, 64(SP) | |||
stfd f23, 72(SP) | |||
stfd f24, 80(SP) | |||
stfd f25, 88(SP) | |||
stfd f26, 96(SP) | |||
stfd f27, 104(SP) | |||
stfd f28, 112(SP) | |||
stfd f29, 120(SP) | |||
stfd f30, 128(SP) | |||
stfd f31, 136(SP) | |||
std r31, 144(SP) | |||
std r30, 152(SP) | |||
std r29, 160(SP) | |||
std r28, 168(SP) | |||
std r27, 176(SP) | |||
std r26, 184(SP) | |||
std r25, 192(SP) | |||
std r24, 200(SP) | |||
std r23, 208(SP) | |||
std r22, 216(SP) | |||
std r21, 224(SP) | |||
std r20, 232(SP) | |||
std r19, 240(SP) | |||
std r18, 248(SP) | |||
std r17, 256(SP) | |||
std r16, 264(SP) | |||
std r15, 272(SP) | |||
std r14, 280(SP) | |||
stxv vs52, 288(SP) | |||
stxv vs53, 304(SP) | |||
stxv vs54, 320(SP) | |||
stxv vs55, 336(SP) | |||
stxv vs56, 352(SP) | |||
stxv vs57, 368(SP) | |||
stxv vs58, 384(SP) | |||
stxv vs59, 400(SP) | |||
stxv vs60, 416(SP) | |||
stxv vs61, 432(SP) | |||
stxv vs62, 448(SP) | |||
stxv vs63, 464(SP) | |||
std r0, FLINK_SAVE(SP) | |||
#if defined(TRMMKERNEL) | |||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
#endif | |||
slwi LDC, LDC, 2 | |||
/*alpha is stored in f1. convert to single and splat*/ | |||
xscvdpspn alpha_r,vs1 | |||
xxspltw alpha_r,alpha_r,0 | |||
/*load reverse permute mask for big endian
uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
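/* Each 64-bit constant below is built with the standard PPC64
   five-instruction sequence: lis sets bits 48-63, ori fills bits 32-47,
   rldicr moves those 32 bits into the upper half, and oris/ori fill bits
   16-31 and 0-15; mtvsrdd then combines two GPRs into one VSX register. */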
lis T2, perm_const2@highest | |||
lis T1, perm_const1@highest | |||
lis T3, save_permute_12@highest | |||
lis T4, save_permute_11@highest | |||
lis T5, save_permute_22@highest | |||
lis T6, save_permute_21@highest | |||
ori T2, T2, perm_const2@higher | |||
ori T1, T1, perm_const1@higher | |||
ori T3, T3, save_permute_12@higher | |||
ori T4, T4, save_permute_11@higher | |||
ori T5, T5, save_permute_22@higher | |||
ori T6, T6, save_permute_21@higher | |||
rldicr T2, T2, 32, 31 | |||
rldicr T1, T1, 32, 31 | |||
rldicr T3, T3, 32, 31 | |||
rldicr T4, T4, 32, 31 | |||
rldicr T5, T5, 32, 31 | |||
rldicr T6, T6, 32, 31 | |||
oris T2, T2, perm_const2@h | |||
oris T1, T1, perm_const1@h | |||
oris T3, T3, save_permute_12@h | |||
oris T4, T4, save_permute_11@h | |||
oris T5, T5, save_permute_22@h | |||
oris T6, T6, save_permute_21@h | |||
ori T2, T2, perm_const2@l | |||
ori T1, T1, perm_const1@l | |||
ori T3, T3, save_permute_12@l | |||
ori T4, T4, save_permute_11@l | |||
ori T5, T5, save_permute_22@l | |||
ori T6, T6, save_permute_21@l | |||
li r0,0 | |||
mtvsrdd permute_mask,T2,T1 | |||
mtvsrdd save_permute_1,T3,T4 | |||
mtvsrdd save_permute_2,T5,T6 | |||
#include "sgemm_logic_power9.S" | |||
.L999: | |||
lfd f14, 0(SP) | |||
lfd f15, 8(SP) | |||
lfd f16, 16(SP) | |||
lfd f17, 24(SP) | |||
lfd f18, 32(SP) | |||
lfd f19, 40(SP) | |||
lfd f20, 48(SP) | |||
lfd f21, 56(SP) | |||
lfd f22, 64(SP) | |||
lfd f23, 72(SP) | |||
lfd f24, 80(SP) | |||
lfd f25, 88(SP) | |||
lfd f26, 96(SP) | |||
lfd f27, 104(SP) | |||
lfd f28, 112(SP) | |||
lfd f29, 120(SP) | |||
lfd f30, 128(SP) | |||
lfd f31, 136(SP) | |||
ld r31, 144(SP) | |||
ld r30, 152(SP) | |||
ld r29, 160(SP) | |||
ld r28, 168(SP) | |||
ld r27, 176(SP) | |||
ld r26, 184(SP) | |||
ld r25, 192(SP) | |||
ld r24, 200(SP) | |||
ld r23, 208(SP) | |||
ld r22, 216(SP) | |||
ld r21, 224(SP) | |||
ld r20, 232(SP) | |||
ld r19, 240(SP) | |||
ld r18, 248(SP) | |||
ld r17, 256(SP) | |||
ld r16, 264(SP) | |||
ld r15, 272(SP) | |||
ld r14, 280(SP) | |||
ld r0, FLINK_SAVE(SP) | |||
lxv vs52, 288(SP) | |||
lxv vs53, 304(SP) | |||
lxv vs54, 320(SP) | |||
lxv vs55, 336(SP) | |||
lxv vs56, 352(SP) | |||
lxv vs57, 368(SP) | |||
lxv vs58, 384(SP) | |||
lxv vs59, 400(SP) | |||
mtlr r0 | |||
lxv vs60, 416(SP) | |||
lxv vs61, 432(SP) | |||
lxv vs62, 448(SP) | |||
lxv vs63, 464(SP) | |||
addi SP, SP, STACKSIZE | |||
blr | |||
EPILOGUE | |||
#endif | |||
@@ -1,470 +1,470 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
#include "../arm/gemv_n.c" | |||
#else | |||
#include "common.h" | |||
#define NBMAX 4096 | |||
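/* Overview: each sgemv_kernel_4xK below adds alpha * (K columns of A)
   * (K elements of x) into an NB-element slice of y, four floats per
   vector operation; CNAME blocks the m dimension in chunks of at most
   NBMAX rows, presumably so the active slice of y stays cache-resident. */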
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
{ | |||
BLASLONG i; | |||
FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; | |||
FLOAT x0,x1,x2,x3,x4,x5,x6,x7; | |||
a0 = ap[0]; | |||
a1 = ap[1]; | |||
a2 = ap[2]; | |||
a3 = ap[3]; | |||
b0 = a0 + lda4 ; | |||
b1 = a1 + lda4 ; | |||
b2 = a2 + lda4 ; | |||
b3 = a3 + lda4 ; | |||
x0 = xo[0] * *alpha; | |||
x1 = xo[1] * *alpha; | |||
x2 = xo[2] * *alpha; | |||
x3 = xo[3] * *alpha; | |||
x4 = xo[4] * *alpha; | |||
x5 = xo[5] * *alpha; | |||
x6 = xo[6] * *alpha; | |||
x7 = xo[7] * *alpha; | |||
__vector float* va0 = (__vector float*)a0; | |||
__vector float* va1 = (__vector float*)a1; | |||
__vector float* va2 = (__vector float*)a2; | |||
__vector float* va3 = (__vector float*)a3; | |||
__vector float* vb0 = (__vector float*)b0; | |||
__vector float* vb1 = (__vector float*)b1; | |||
__vector float* vb2 = (__vector float*)b2; | |||
__vector float* vb3 = (__vector float*)b3; | |||
__vector float v_x0 = {x0,x0,x0,x0}; | |||
__vector float v_x1 = {x1,x1,x1,x1}; | |||
__vector float v_x2 = {x2,x2,x2,x2}; | |||
__vector float v_x3 = {x3,x3,x3,x3}; | |||
__vector float v_x4 = {x4,x4,x4,x4}; | |||
__vector float v_x5 = {x5,x5,x5,x5}; | |||
__vector float v_x6 = {x6,x6,x6,x6}; | |||
__vector float v_x7 = {x7,x7,x7,x7}; | |||
__vector float* v_y =(__vector float*)y; | |||
for ( i=0; i< n/4; i++) | |||
{ | |||
register __vector float vy=v_y[i]; | |||
vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; | |||
v_y[i] =vy; | |||
} | |||
} | |||
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
{ | |||
BLASLONG i; | |||
FLOAT x0,x1,x2,x3; | |||
x0 = xo[0] * *alpha; | |||
x1 = xo[1] * *alpha; | |||
x2 = xo[2] * *alpha; | |||
x3 = xo[3] * *alpha; | |||
__vector float v_x0 = {x0,x0,x0,x0}; | |||
__vector float v_x1 = {x1,x1,x1,x1}; | |||
__vector float v_x2 = {x2,x2,x2,x2}; | |||
__vector float v_x3 = {x3,x3,x3,x3}; | |||
__vector float* v_y =(__vector float*)y; | |||
__vector float* va0 = (__vector float*)ap[0]; | |||
__vector float* va1 = (__vector float*)ap[1]; | |||
__vector float* va2 = (__vector float*)ap[2]; | |||
__vector float* va3 = (__vector float*)ap[3]; | |||
for ( i=0; i< n/4; i++ ) | |||
{ | |||
register __vector float vy=v_y[i]; | |||
vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
v_y[i] =vy; | |||
} | |||
} | |||
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
{ | |||
BLASLONG i; | |||
FLOAT x0,x1; | |||
x0 = x[0] * *alpha; | |||
x1 = x[1] * *alpha; | |||
__vector float v_x0 = {x0,x0,x0,x0}; | |||
__vector float v_x1 = {x1,x1,x1,x1}; | |||
__vector float* v_y =(__vector float*)y; | |||
__vector float* va0 = (__vector float*)ap[0]; | |||
__vector float* va1 = (__vector float*)ap[1]; | |||
for ( i=0; i< n/4; i++ ) | |||
{ | |||
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; | |||
} | |||
} | |||
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
{ | |||
BLASLONG i; | |||
FLOAT x0 ; | |||
x0 = x[0] * *alpha; | |||
__vector float v_x0 = {x0,x0,x0,x0}; | |||
__vector float* v_y =(__vector float*)y; | |||
__vector float* va0 = (__vector float*)ap; | |||
for ( i=0; i< n/4; i++ ) | |||
{ | |||
v_y[i] += v_x0 * va0[i] ; | |||
} | |||
} | |||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
{ | |||
BLASLONG i; | |||
for ( i=0; i<n; i++ ){ | |||
*dest += *src; | |||
src++; | |||
dest += inc_dest; | |||
} | |||
return; | |||
} | |||
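/* Reference semantics sketch (hypothetical helper, for clarity only):
   the kernels above jointly compute y += alpha * A * x with A stored
   column-major; the whole routine is equivalent to this scalar nest. */
static void sgemv_n_ref(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *a,
                        BLASLONG lda, FLOAT *x, BLASLONG inc_x,
                        FLOAT *y, BLASLONG inc_y) {
    BLASLONG i, j;
    for (j = 0; j < n; j++) {
        FLOAT xj = alpha * x[j * inc_x];          /* scaled element of x */
        for (i = 0; i < m; i++)
            y[i * inc_y] += xj * a[i + j * lda];  /* column j of A */
    }
}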
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
{ | |||
BLASLONG i; | |||
FLOAT *a_ptr; | |||
FLOAT *x_ptr; | |||
FLOAT *y_ptr; | |||
FLOAT *ap[4]; | |||
BLASLONG n1; | |||
BLASLONG m1; | |||
BLASLONG m2; | |||
BLASLONG m3; | |||
BLASLONG n2; | |||
BLASLONG lda4 = lda << 2; | |||
BLASLONG lda8 = lda << 3; | |||
FLOAT xbuffer[8] __attribute__((aligned(16))); | |||
FLOAT *ybuffer; | |||
if ( m < 1 ) return(0); | |||
if ( n < 1 ) return(0); | |||
ybuffer = buffer; | |||
if ( inc_x == 1 ) | |||
{ | |||
n1 = n >> 3 ; | |||
n2 = n & 7 ; | |||
} | |||
else | |||
{ | |||
n1 = n >> 2 ; | |||
n2 = n & 3 ; | |||
} | |||
m3 = m & 3 ; | |||
m1 = m & -4 ; | |||
m2 = (m & (NBMAX-1)) - m3 ; | |||
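/* m3 = m mod 4: leftover rows handled by the scalar cases at the end;
   m1: rows in full groups of 4; m2: size of the final partial block
   (m mod NBMAX, minus m3). */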
y_ptr = y; | |||
BLASLONG NB = NBMAX; | |||
while ( NB == NBMAX ) | |||
{ | |||
m1 -= NB; | |||
if ( m1 < 0) | |||
{ | |||
if ( m2 == 0 ) break; | |||
NB = m2; | |||
} | |||
a_ptr = a; | |||
x_ptr = x; | |||
ap[0] = a_ptr; | |||
ap[1] = a_ptr + lda; | |||
ap[2] = ap[1] + lda; | |||
ap[3] = ap[2] + lda; | |||
if ( inc_y != 1 ) | |||
memset(ybuffer,0,NB*4); | |||
else | |||
ybuffer = y_ptr; | |||
if ( inc_x == 1 ) | |||
{ | |||
for( i = 0; i < n1 ; i++) | |||
{ | |||
sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
ap[0] += lda8; | |||
ap[1] += lda8; | |||
ap[2] += lda8; | |||
ap[3] += lda8; | |||
a_ptr += lda8; | |||
x_ptr += 8; | |||
} | |||
if ( n2 & 4 ) | |||
{ | |||
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
ap[0] += lda4; | |||
ap[1] += lda4; | |||
ap[2] += lda4; | |||
ap[3] += lda4; | |||
a_ptr += lda4; | |||
x_ptr += 4; | |||
} | |||
if ( n2 & 2 ) | |||
{ | |||
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
a_ptr += lda*2; | |||
x_ptr += 2; | |||
} | |||
if ( n2 & 1 ) | |||
{ | |||
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
a_ptr += lda; | |||
x_ptr += 1; | |||
} | |||
} | |||
else | |||
{ | |||
for( i = 0; i < n1 ; i++) | |||
{ | |||
xbuffer[0] = x_ptr[0]; | |||
x_ptr += inc_x; | |||
xbuffer[1] = x_ptr[0]; | |||
x_ptr += inc_x; | |||
xbuffer[2] = x_ptr[0]; | |||
x_ptr += inc_x; | |||
xbuffer[3] = x_ptr[0]; | |||
x_ptr += inc_x; | |||
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
ap[0] += lda4; | |||
ap[1] += lda4; | |||
ap[2] += lda4; | |||
ap[3] += lda4; | |||
a_ptr += lda4; | |||
} | |||
for( i = 0; i < n2 ; i++) | |||
{ | |||
xbuffer[0] = x_ptr[0]; | |||
x_ptr += inc_x; | |||
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
a_ptr += lda; | |||
} | |||
} | |||
a += NB; | |||
if ( inc_y != 1 ) | |||
{ | |||
add_y(NB,ybuffer,y_ptr,inc_y); | |||
y_ptr += NB * inc_y; | |||
} | |||
else | |||
y_ptr += NB ; | |||
} | |||
if ( m3 == 0 ) return(0); | |||
if ( m3 == 3 ) | |||
{ | |||
a_ptr = a; | |||
x_ptr = x; | |||
FLOAT temp0 = 0.0; | |||
FLOAT temp1 = 0.0; | |||
FLOAT temp2 = 0.0; | |||
if ( lda == 3 && inc_x ==1 ) | |||
{ | |||
for( i = 0; i < ( n & -4 ); i+=4 ) | |||
{ | |||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
a_ptr += 12; | |||
x_ptr += 4; | |||
} | |||
for( ; i < n; i++ ) | |||
{ | |||
temp0 += a_ptr[0] * x_ptr[0]; | |||
temp1 += a_ptr[1] * x_ptr[0]; | |||
temp2 += a_ptr[2] * x_ptr[0]; | |||
a_ptr += 3; | |||
x_ptr ++; | |||
} | |||
} | |||
else | |||
{ | |||
for( i = 0; i < n; i++ ) | |||
{ | |||
temp0 += a_ptr[0] * x_ptr[0]; | |||
temp1 += a_ptr[1] * x_ptr[0]; | |||
temp2 += a_ptr[2] * x_ptr[0]; | |||
a_ptr += lda; | |||
x_ptr += inc_x; | |||
} | |||
} | |||
y_ptr[0] += alpha * temp0; | |||
y_ptr += inc_y; | |||
y_ptr[0] += alpha * temp1; | |||
y_ptr += inc_y; | |||
y_ptr[0] += alpha * temp2; | |||
return(0); | |||
} | |||
if ( m3 == 2 ) | |||
{ | |||
a_ptr = a; | |||
x_ptr = x; | |||
FLOAT temp0 = 0.0; | |||
FLOAT temp1 = 0.0; | |||
if ( lda == 2 && inc_x ==1 ) | |||
{ | |||
for( i = 0; i < (n & -4) ; i+=4 ) | |||
{ | |||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
a_ptr += 8; | |||
x_ptr += 4; | |||
} | |||
for( ; i < n; i++ ) | |||
{ | |||
temp0 += a_ptr[0] * x_ptr[0]; | |||
temp1 += a_ptr[1] * x_ptr[0]; | |||
a_ptr += 2; | |||
x_ptr ++; | |||
} | |||
} | |||
else | |||
{ | |||
for( i = 0; i < n; i++ ) | |||
{ | |||
temp0 += a_ptr[0] * x_ptr[0]; | |||
temp1 += a_ptr[1] * x_ptr[0]; | |||
a_ptr += lda; | |||
x_ptr += inc_x; | |||
} | |||
} | |||
y_ptr[0] += alpha * temp0; | |||
y_ptr += inc_y; | |||
y_ptr[0] += alpha * temp1; | |||
return(0); | |||
} | |||
if ( m3 == 1 ) | |||
{ | |||
a_ptr = a; | |||
x_ptr = x; | |||
FLOAT temp = 0.0; | |||
if ( lda == 1 && inc_x ==1 ) | |||
{ | |||
for( i = 0; i < (n & -4); i+=4 ) | |||
{ | |||
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
} | |||
for( ; i < n; i++ ) | |||
{ | |||
temp += a_ptr[i] * x_ptr[i]; | |||
} | |||
} | |||
else | |||
{ | |||
for( i = 0; i < n; i++ ) | |||
{ | |||
temp += a_ptr[0] * x_ptr[0]; | |||
a_ptr += lda; | |||
x_ptr += inc_x; | |||
} | |||
} | |||
y_ptr[0] += alpha * temp; | |||
return(0); | |||
} | |||
return(0); | |||
} | |||
#endif | |||
@@ -1,484 +1,484 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
#include "../arm/gemv_t.c" | |||
#else | |||
#include "common.h" | |||
#define NBMAX 2048 | |||
#include <altivec.h> | |||
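/* The kernels below accumulate N dot products of length n at a time in
   128-bit AltiVec vectors and reduce the four lanes into y at the end.
   A rough sketch of one accumulator (n a multiple of 4, as the driver
   guarantees):

     temp = {0,0,0,0};
     for (i = 0; i < n/4; i++) temp += x_vec[i] * a_vec[i];
     y[0] += alpha * (temp[0] + temp[1] + temp[2] + temp[3]);
*/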
static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
BLASLONG i; | |||
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
__vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||
register __vector float temp0 = {0,0,0,0}; | |||
register __vector float temp1 = {0,0,0,0}; | |||
register __vector float temp2 = {0,0,0,0}; | |||
register __vector float temp3 = {0,0,0,0}; | |||
register __vector float temp4 = {0,0,0,0}; | |||
register __vector float temp5 = {0,0,0,0}; | |||
register __vector float temp6 = {0,0,0,0}; | |||
register __vector float temp7 = {0,0,0,0}; | |||
a0 = ap; | |||
a1 = ap + lda; | |||
a2 = a1 + lda; | |||
a3 = a2 + lda; | |||
a4 = a3 + lda; | |||
a5 = a4 + lda; | |||
a6 = a5 + lda; | |||
a7 = a6 + lda; | |||
va0 = (__vector float*) a0; | |||
va1 = (__vector float*) a1; | |||
va2 = (__vector float*) a2; | |||
va3 = (__vector float*) a3; | |||
va4 = (__vector float*) a4; | |||
va5 = (__vector float*) a5; | |||
va6 = (__vector float*) a6; | |||
va7 = (__vector float*) a7; | |||
v_x = (__vector float*) x; | |||
for (i = 0; i < n / 4; i++) {
temp0 += v_x[i] * va0[i]; | |||
temp1 += v_x[i] * va1[i]; | |||
temp2 += v_x[i] * va2[i]; | |||
temp3 += v_x[i] * va3[i]; | |||
temp4 += v_x[i] * va4[i]; | |||
temp5 += v_x[i] * va5[i]; | |||
temp6 += v_x[i] * va6[i]; | |||
temp7 += v_x[i] * va7[i]; | |||
} | |||
y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]);
y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]);
y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]);
y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]);
y[4] += alpha * (temp4[0] + temp4[1] + temp4[2] + temp4[3]);
y[5] += alpha * (temp5[0] + temp5[1] + temp5[2] + temp5[3]);
y[6] += alpha * (temp6[0] + temp6[1] + temp6[2] + temp6[3]);
y[7] += alpha * (temp7[0] + temp7[1] + temp7[2] + temp7[3]);
} | |||
static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
BLASLONG i = 0; | |||
FLOAT *a0, *a1, *a2, *a3; | |||
a0 = ap; | |||
a1 = ap + lda; | |||
a2 = a1 + lda; | |||
a3 = a2 + lda; | |||
__vector float* va0 = (__vector float*) a0; | |||
__vector float* va1 = (__vector float*) a1; | |||
__vector float* va2 = (__vector float*) a2; | |||
__vector float* va3 = (__vector float*) a3; | |||
__vector float* v_x = (__vector float*) x; | |||
register __vector float temp0 = {0,0,0,0}; | |||
register __vector float temp1 = {0,0,0,0}; | |||
register __vector float temp2 = {0,0,0,0}; | |||
register __vector float temp3 = {0,0,0,0}; | |||
for (i = 0; i < n / 4; i++) {
temp0 += v_x[i] * va0[i]; | |||
temp1 += v_x[i] * va1[i]; | |||
temp2 += v_x[i] * va2[i]; | |||
temp3 += v_x[i] * va3[i]; | |||
} | |||
y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]);
y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]);
y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]);
y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]);
} | |||
static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
BLASLONG i; | |||
FLOAT *a0, *a1; | |||
a0 = ap; | |||
a1 = ap + lda; | |||
__vector float* va0 = (__vector float*) a0; | |||
__vector float* va1 = (__vector float*) a1; | |||
__vector float* v_x = (__vector float*) x; | |||
__vector float temp0 = {0,0,0,0}; | |||
__vector float temp1 = {0,0,0,0}; | |||
for (i = 0; i < n / 4; i++) {
temp0 += v_x[i] * va0[i]; | |||
temp1 += v_x[i] * va1[i]; | |||
} | |||
y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]);
y[inc_y] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]);
} | |||
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
BLASLONG i; | |||
FLOAT *a0; | |||
a0 = ap; | |||
__vector float* va0 = (__vector float*) a0; | |||
__vector float* v_x = (__vector float*) x; | |||
__vector float temp0 = {0,0,0,0}; | |||
for (i = 0; i < n / 4; i++) {
temp0 += v_x[i] * va0[i];
}
y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]);
} | |||
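/* Note: the kernels above run n/4 vector iterations and ignore any n % 4
   remainder; the driver below only passes block lengths NB that are
   multiples of 4 (NBMAX is a power of two and m2 excludes the m & 3 tail). */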
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
BLASLONG i; | |||
for (i = 0; i < n; i++) { | |||
*dest++ = *src; | |||
src += inc_src; | |||
} | |||
} | |||
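/* Driver for the transposed case: x (length m) is consumed in blocks of at
   most NBMAX rows. For each block the n outputs in y are updated eight
   columns at a time (n1 = n >> 3) with 4/2/1-column cleanup (n2 = n & 7),
   staging through the aligned ybuffer when y is strided; the last m & 3
   rows are folded into y by the scalar tails after the block loop. */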
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||
BLASLONG i; | |||
BLASLONG j; | |||
FLOAT *a_ptr; | |||
FLOAT *x_ptr; | |||
FLOAT *y_ptr; | |||
BLASLONG n1; | |||
BLASLONG m1; | |||
BLASLONG m2; | |||
BLASLONG m3; | |||
BLASLONG n2; | |||
FLOAT ybuffer[8] __attribute__((aligned(16))); | |||
FLOAT *xbuffer; | |||
if (m < 1) return (0); | |||
if (n < 1) return (0); | |||
xbuffer = buffer; | |||
n1 = n >> 3; | |||
n2 = n & 7; | |||
m3 = m & 3; | |||
m1 = m - m3; | |||
m2 = (m & (NBMAX - 1)) - m3; | |||
BLASLONG NB = NBMAX; | |||
while (NB == NBMAX) { | |||
m1 -= NB; | |||
if (m1 < 0) { | |||
if (m2 == 0) break; | |||
NB = m2; | |||
} | |||
y_ptr = y; | |||
a_ptr = a; | |||
x_ptr = x; | |||
if (inc_x != 1) | |||
copy_x(NB, x_ptr, xbuffer, inc_x); | |||
else | |||
xbuffer = x_ptr; | |||
BLASLONG lda8 = lda << 3; | |||
if (inc_y == 1) { | |||
for (i = 0; i < n1; i++) { | |||
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); | |||
y_ptr += 8; | |||
a_ptr += lda8; | |||
} | |||
} else { | |||
for (i = 0; i < n1; i++) { | |||
ybuffer[0] = 0; | |||
ybuffer[1] = 0; | |||
ybuffer[2] = 0; | |||
ybuffer[3] = 0; | |||
ybuffer[4] = 0; | |||
ybuffer[5] = 0; | |||
ybuffer[6] = 0; | |||
ybuffer[7] = 0; | |||
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
*y_ptr += ybuffer[0]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[1]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[2]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[3]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[4]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[5]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[6]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[7]; | |||
y_ptr += inc_y; | |||
a_ptr += lda8; | |||
} | |||
} | |||
if (n2 & 4) { | |||
ybuffer[0] = 0; | |||
ybuffer[1] = 0; | |||
ybuffer[2] = 0; | |||
ybuffer[3] = 0; | |||
sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
a_ptr += lda << 2;
*y_ptr += ybuffer[0]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[1]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[2]; | |||
y_ptr += inc_y; | |||
*y_ptr += ybuffer[3]; | |||
y_ptr += inc_y; | |||
} | |||
if (n2 & 2) { | |||
sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); | |||
a_ptr += lda << 1; | |||
y_ptr += 2 * inc_y; | |||
} | |||
if (n2 & 1) { | |||
sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); | |||
a_ptr += lda; | |||
y_ptr += inc_y; | |||
} | |||
a += NB; | |||
x += NB * inc_x; | |||
} | |||
if (m3 == 0) return (0); | |||
x_ptr = x; | |||
a_ptr = a; | |||
if (m3 == 3) { | |||
FLOAT xtemp0 = *x_ptr * alpha; | |||
x_ptr += inc_x; | |||
FLOAT xtemp1 = *x_ptr * alpha; | |||
x_ptr += inc_x; | |||
FLOAT xtemp2 = *x_ptr * alpha; | |||
FLOAT *aj = a_ptr; | |||
y_ptr = y; | |||
if (lda == 3 && inc_y == 1) { | |||
for (j = 0; j < (n & -4); j += 4) { | |||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
aj += 12; | |||
} | |||
for (; j < n; j++) { | |||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
aj += 3; | |||
} | |||
} else { | |||
if (inc_y == 1) { | |||
register BLASLONG lda2 = lda << 1;
register BLASLONG lda3 = lda2 + lda;
register BLASLONG lda4 = lda << 2;
for (j = 0; j < (n & -4); j += 4) { | |||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; | |||
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; | |||
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; | |||
aj += lda4; | |||
} | |||
for (; j < n; j++) { | |||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
aj += lda; | |||
} | |||
} else { | |||
for (j = 0; j < n; j++) { | |||
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
y_ptr += inc_y; | |||
aj += lda; | |||
} | |||
} | |||
} | |||
return (0); | |||
} | |||
if (m3 == 2) { | |||
FLOAT xtemp0 = *x_ptr * alpha; | |||
x_ptr += inc_x; | |||
FLOAT xtemp1 = *x_ptr * alpha; | |||
FLOAT *aj = a_ptr; | |||
y_ptr = y; | |||
if (lda == 2 && inc_y == 1) { | |||
for (j = 0; j < (n & -4); j += 4) { | |||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; | |||
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; | |||
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; | |||
aj += 8; | |||
} | |||
for (; j < n; j++) { | |||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
aj += 2; | |||
} | |||
} else { | |||
if (inc_y == 1) { | |||
register BLASLONG lda2 = lda << 1;
register BLASLONG lda3 = lda2 + lda;
register BLASLONG lda4 = lda << 2;
for (j = 0; j < (n & -4); j += 4) { | |||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
aj += lda4; | |||
} | |||
for (; j < n; j++) { | |||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
aj += lda; | |||
} | |||
} else { | |||
for (j = 0; j < n; j++) { | |||
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
y_ptr += inc_y; | |||
aj += lda; | |||
} | |||
} | |||
} | |||
return (0); | |||
} | |||
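/* m3 == 1: fold the single remaining row into y */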
FLOAT xtemp = *x_ptr * alpha; | |||
FLOAT *aj = a_ptr; | |||
y_ptr = y; | |||
if (lda == 1 && inc_y == 1) { | |||
for (j = 0; j < (n & -4); j += 4) { | |||
y_ptr[j] += aj[j] * xtemp; | |||
y_ptr[j + 1] += aj[j + 1] * xtemp; | |||
y_ptr[j + 2] += aj[j + 2] * xtemp; | |||
y_ptr[j + 3] += aj[j + 3] * xtemp; | |||
} | |||
for (; j < n; j++) { | |||
y_ptr[j] += aj[j] * xtemp; | |||
} | |||
} else { | |||
if (inc_y == 1) { | |||
register BLASLONG lda2 = lda << 1;
register BLASLONG lda3 = lda2 + lda;
register BLASLONG lda4 = lda << 2;
for (j = 0; j < (n & -4); j += 4) { | |||
y_ptr[j] += *aj * xtemp; | |||
y_ptr[j + 1] += *(aj + lda) * xtemp; | |||
y_ptr[j + 2] += *(aj + lda2) * xtemp; | |||
y_ptr[j + 3] += *(aj + lda3) * xtemp; | |||
aj += lda4; | |||
} | |||
for (; j < n; j++) { | |||
y_ptr[j] += *aj * xtemp; | |||
aj += lda; | |||
} | |||
} else { | |||
for (j = 0; j < n; j++) { | |||
*y_ptr += *aj * xtemp; | |||
y_ptr += inc_y; | |||
aj += lda; | |||
} | |||
} | |||
} | |||
return (0); | |||
} | |||
#endif | |||
@@ -1,245 +1,245 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "def_vsx.h" | |||
#define LOAD ld | |||
#define STACKSIZE 512 | |||
#define FZERO 312+192(SP) | |||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
#define M r3 | |||
#define N r4 | |||
#define K r5 | |||
#define A r8 | |||
#define B r9 | |||
#define C r10 | |||
#define LDC r6 | |||
#define OFFSET r7 | |||
#define o0 0 | |||
#define alpha_r vs30 | |||
#define alpha_i vs31 | |||
#define VECSAVE r11 | |||
#define FRAMEPOINTER r12 | |||
#define T10 r14 | |||
#define L r15 | |||
#define T8 r16 | |||
#define T5 r17 | |||
#define T2 r19 | |||
#define TEMP_REG r20 | |||
#define T6 r21 | |||
#define I r22 | |||
#define J r23 | |||
#define AO r24 | |||
#define BO r25 | |||
#define CO r26 | |||
#define T7 r27 | |||
#define T3 r28 | |||
#define T4 r29 | |||
#define PRE r30 | |||
#define T1 r31 | |||
#ifndef NEEDPARAM | |||
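/* A 512-byte frame holds the full non-volatile state: f14-f31 at SP+0..136,
   r14-r31 at SP+144..280 and vs52-vs63 at SP+288..464; the link register is
   spilled to FLINK_SAVE, 16 bytes into the caller's frame. */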
PROLOGUE | |||
PROFCODE | |||
mr FRAMEPOINTER, SP | |||
addi SP, SP, -STACKSIZE | |||
mflr r0 | |||
stfd f14, 0(SP) | |||
stfd f15, 8(SP) | |||
stfd f16, 16(SP) | |||
stfd f17, 24(SP) | |||
stfd f18, 32(SP) | |||
stfd f19, 40(SP) | |||
stfd f20, 48(SP) | |||
stfd f21, 56(SP) | |||
stfd f22, 64(SP) | |||
stfd f23, 72(SP) | |||
stfd f24, 80(SP) | |||
stfd f25, 88(SP) | |||
stfd f26, 96(SP) | |||
stfd f27, 104(SP) | |||
stfd f28, 112(SP) | |||
stfd f29, 120(SP) | |||
stfd f30, 128(SP) | |||
stfd f31, 136(SP) | |||
xxspltd alpha_r,vs1,0 /* copy from register f1 */
xxspltd alpha_i,vs2,0 /* copy from register f2 */
std r31, 144(SP) | |||
std r30, 152(SP) | |||
std r29, 160(SP) | |||
std r28, 168(SP) | |||
std r27, 176(SP) | |||
std r26, 184(SP) | |||
std r25, 192(SP) | |||
std r24, 200(SP) | |||
std r23, 208(SP) | |||
std r22, 216(SP) | |||
std r21, 224(SP) | |||
std r20, 232(SP) | |||
std r19, 240(SP) | |||
std r18, 248(SP) | |||
std r17, 256(SP) | |||
std r16, 264(SP) | |||
std r15, 272(SP) | |||
std r14, 280(SP) | |||
stxv vs52, 288(SP) | |||
stxv vs53, 304(SP) | |||
stxv vs54, 320(SP) | |||
stxv vs55, 336(SP) | |||
stxv vs56, 352(SP) | |||
stxv vs57, 368(SP) | |||
stxv vs58, 384(SP) | |||
stxv vs59, 400(SP) | |||
stxv vs60, 416(SP) | |||
stxv vs61, 432(SP) | |||
stxv vs62, 448(SP) | |||
stxv vs63, 464(SP) | |||
std r0, FLINK_SAVE(SP) | |||
#if defined(linux) || defined(__FreeBSD__) | |||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
#endif | |||
#ifdef TRMMKERNEL | |||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
#endif | |||
#endif | |||
#include "zgemm_macros_power9.S" | |||
slwi LDC, LDC, ZBASE_SHIFT | |||
li PRE, 512 | |||
li r0, 0 | |||
#if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
/* negate for these cases, as we will use addition of -1*(a+b) */
xvnegdp alpha_r,alpha_r | |||
xvnegdp alpha_i,alpha_i | |||
#endif | |||
.align 4 | |||
#include "zgemm_logic_power9.S" | |||
L999: | |||
lfd f14, 0(SP) | |||
lfd f15, 8(SP) | |||
lfd f16, 16(SP) | |||
lfd f17, 24(SP) | |||
lfd f18, 32(SP) | |||
lfd f19, 40(SP) | |||
lfd f20, 48(SP) | |||
lfd f21, 56(SP) | |||
lfd f22, 64(SP) | |||
lfd f23, 72(SP) | |||
lfd f24, 80(SP) | |||
lfd f25, 88(SP) | |||
lfd f26, 96(SP) | |||
lfd f27, 104(SP) | |||
lfd f28, 112(SP) | |||
lfd f29, 120(SP) | |||
lfd f30, 128(SP) | |||
lfd f31, 136(SP) | |||
ld r31, 144(SP) | |||
ld r30, 152(SP) | |||
ld r29, 160(SP) | |||
ld r28, 168(SP) | |||
ld r27, 176(SP) | |||
ld r26, 184(SP) | |||
ld r25, 192(SP) | |||
ld r24, 200(SP) | |||
ld r23, 208(SP) | |||
ld r22, 216(SP) | |||
ld r21, 224(SP) | |||
ld r20, 232(SP) | |||
ld r19, 240(SP) | |||
ld r18, 248(SP) | |||
ld r17, 256(SP) | |||
ld r16, 264(SP) | |||
ld r15, 272(SP) | |||
ld r14, 280(SP) | |||
ld r0, FLINK_SAVE(SP) | |||
lxv vs52, 288(SP) | |||
lxv vs53, 304(SP) | |||
lxv vs54, 320(SP) | |||
lxv vs55, 336(SP) | |||
lxv vs56, 352(SP) | |||
lxv vs57, 368(SP) | |||
lxv vs58, 384(SP) | |||
lxv vs59, 400(SP) | |||
mtlr r0 | |||
lxv vs60, 416(SP) | |||
lxv vs61, 432(SP) | |||
lxv vs62, 448(SP) | |||
lxv vs63, 464(SP) | |||
addi SP, SP, STACKSIZE | |||
blr | |||
EPILOGUE | |||
#endif
@@ -1,226 +1,226 @@ | |||
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ | |||
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ | |||
/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||
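/* The macros below paste register numbers into instruction templates to
   build one inline-asm string per (m,n) tile: INIT_m8n4, for example, is
   init_m8n4(4,5,6,7), which zeroes ymm4-ymm7 with vpxor. All accumulation
   uses vfnmadd231ps (c -= a*b), so once GEMM_SUM_REORDER_* adds the
   existing C tile the accumulators hold C - A*B, the right-hand side the
   solve steps operate on. */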
#define init_m8n4(c1,c2,c3,c4)\ | |||
"vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ | |||
"vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" | |||
#define INIT_m8n4 init_m8n4(4,5,6,7) | |||
#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) | |||
#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) | |||
#define init_m4n4(c1,c2,c3,c4)\ | |||
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ | |||
"vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" | |||
#define INIT_m4n4 init_m4n4(4,5,6,7) | |||
#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) | |||
#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) | |||
#define init_m2n4(c1,c2)\ | |||
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" | |||
#define INIT_m2n4 init_m2n4(4,5) | |||
#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) | |||
#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) | |||
#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" | |||
#define INIT_m1n4 init_m1n4(4) | |||
#define INIT_m1n8 INIT_m1n4 init_m1n4(5) | |||
#define INIT_m1n12 INIT_m1n8 init_m1n4(6) | |||
#define GEMM_KERNEL_k1m8n4 \ | |||
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ | |||
"vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ | |||
"vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" | |||
#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ | |||
"vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ | |||
"vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" | |||
#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ | |||
"vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ | |||
"vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" | |||
#define GEMM_KERNEL_k1m4n4 \ | |||
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ | |||
"vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ | |||
"vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ | |||
"vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ | |||
"vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" | |||
#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ | |||
"vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ | |||
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" | |||
#define GEMM_KERNEL_k1m2n4 \ | |||
"vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ | |||
"vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" | |||
#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ | |||
"vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ | |||
"vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" | |||
#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" | |||
#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" | |||
#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" | |||
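/* Each GEMM_KERNEL_k1mXnY advances one k iteration of an X-by-Y tile:
   A elements are dup/broadcast-loaded from (%0), and the extra packed B
   panels for columns 4-7 and 8-11 are reached at (%1,%%r12,4) and
   (%1,%%r12,8), i.e. at fixed multiples of r12, the k-dimension size. */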
#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ | |||
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
"vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ | |||
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
"vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" | |||
#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ | |||
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
"vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ | |||
"vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ | |||
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
"vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ | |||
"vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ | |||
"vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" | |||
#define GEMM_SUM_REORDER_2x4(c1,c2)\ | |||
"vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ | |||
"vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ | |||
"vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
"vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ | |||
#define GEMM_SUM_REORDER_1x4(c1)\ | |||
"vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
"vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
"vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" | |||
#define SOLVE_le_m4n2(b_off,c1,...)\ | |||
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
"vmovsldup %%ymm"#c1",%%ymm1;" | |||
#define SOLVE_le_m8n2(b_off,c1,c2,...)\ | |||
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" | |||
#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ | |||
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
#define SOLVE_ri_m4n2(b_off,c1,...)\ | |||
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
"vmovshdup %%ymm"#c1",%%ymm1;" | |||
#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ | |||
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" | |||
#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ | |||
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ | |||
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
"vpermilps $0,%%xmm"#c1",%%xmm1;" | |||
#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ | |||
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
"vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" | |||
#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ | |||
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ | |||
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
"vpermilps $85,%%xmm"#c1",%%xmm1;" | |||
#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ | |||
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
"vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" | |||
#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ | |||
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
"vpermilps $170,%%xmm"#c1",%%xmm1;" | |||
#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ | |||
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
"vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" | |||
#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ | |||
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
"vpermilps $255,%%xmm"#c1",%%xmm1;" | |||
#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ | |||
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
"vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" | |||
#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ | |||
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
#define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
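/* The SOLVE_* steps above scale one row/column of the solution and then
   eliminate it from the remaining unknowns: blending with %8 (all 1.0)
   puts 1.0 in the lanes that must pass through the vmulps unscaled, and
   blending with %9 (all 0.0) zeroes the lanes that must not contribute to
   the vfnmadd231ps elimination update. */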
#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ | |||
"vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ | |||
"vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ | |||
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ | |||
"vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
#define SAVE_SOLUTION_m4n2(c1,a_off)\ | |||
"vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ | |||
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ | |||
"vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
"vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" | |||
#define SAVE_SOLUTION_m1n4(c1,a_off)\ | |||
"vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
"vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ | |||
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ | |||
/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||
#define init_m8n4(c1,c2,c3,c4)\ | |||
"vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ | |||
"vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" | |||
#define INIT_m8n4 init_m8n4(4,5,6,7) | |||
#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) | |||
#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) | |||
#define init_m4n4(c1,c2,c3,c4)\ | |||
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ | |||
"vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" | |||
#define INIT_m4n4 init_m4n4(4,5,6,7) | |||
#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) | |||
#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) | |||
#define init_m2n4(c1,c2)\ | |||
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" | |||
#define INIT_m2n4 init_m2n4(4,5) | |||
#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) | |||
#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) | |||
#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" | |||
#define INIT_m1n4 init_m1n4(4) | |||
#define INIT_m1n8 INIT_m1n4 init_m1n4(5) | |||
#define INIT_m1n12 INIT_m1n8 init_m1n4(6) | |||
#define GEMM_KERNEL_k1m8n4 \ | |||
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ | |||
"vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ | |||
"vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" | |||
#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ | |||
"vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ | |||
"vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" | |||
#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ | |||
"vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ | |||
"vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" | |||
#define GEMM_KERNEL_k1m4n4 \ | |||
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ | |||
"vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ | |||
"vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ | |||
"vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ | |||
"vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" | |||
#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ | |||
"vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ | |||
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" | |||
#define GEMM_KERNEL_k1m2n4 \ | |||
"vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ | |||
"vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" | |||
#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ | |||
"vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ | |||
"vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" | |||
#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" | |||
#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" | |||
#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" | |||
#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";"
#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\
"vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";"
#define GEMM_SUM_REORDER_2x4(c1,c2)\
"vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\
"vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\
"vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"
#define GEMM_SUM_REORDER_1x4(c1)\
"vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";"
#define SOLVE_le_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovsldup %%ymm"#c1",%%ymm1;"
#define SOLVE_le_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;"
#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_ri_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovshdup %%ymm"#c1",%%ymm1;"
#define SOLVE_ri_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;"
#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $0,%%xmm"#c1",%%xmm1;"
#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;"
#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $85,%%xmm"#c1",%%xmm1;"
#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;"
#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $170,%%xmm"#c1",%%xmm1;"
#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;"
#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $255,%%xmm"#c1",%%xmm1;"
#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;"
#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
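/* SUBTRACT_m*n* are the plain off-diagonal updates applied between diagonal
 * blocks: no scaling, just C -= b * solved, reusing the solution values the
 * SOLVE macros splatted into ymm1/ymm2 (xmm1/xmm2 for the narrow cases). */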
#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\
"vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
"vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\
"vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m4n2(c1,a_off)\
"vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\
"vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m1n4(c1,a_off)\
"vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\
"vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
@@ -1,86 +1,86 @@
include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
include_directories(${PROJECT_SOURCE_DIR}/relapack)
set(RELAFILES
  clauum.c
  ctrsyl_rec2.c
  dsytrf.c
  spbtrf.c
  strsyl_rec2.c
  zhetrf_rook_rec2.c
  ztrsyl.c
  cgbtrf.c
  cpbtrf.c
  ctrtri.c
  dsytrf_rec2.c
  spotrf.c
  strtri.c
  zlauum.c
  ztrsyl_rec2.c
  cgemmt.c
  cpotrf.c
  dgbtrf.c
  dsytrf_rook.c
  lapack_wrappers.c
  ssygst.c
  zgbtrf.c
  zpbtrf.c
  ztrtri.c
  cgetrf.c
  csytrf.c
  dgemmt.c
  dsytrf_rook_rec2.c
  ssytrf.c
  zgemmt.c
  zpotrf.c
  chegst.c
  csytrf_rec2.c
  dgetrf.c
  dtgsyl.c
  ssytrf_rec2.c
  zgetrf.c
  zsytrf.c
  chetrf.c
  csytrf_rook.c
  dlauum.c
  dtrsyl.c
  sgbtrf.c
  ssytrf_rook.c
  zhegst.c
  zsytrf_rec2.c
  chetrf_rec2.c
  csytrf_rook_rec2.c
  dpbtrf.c
  dtrsyl_rec2.c
  sgemmt.c
  ssytrf_rook_rec2.c
  zhetrf.c
  zsytrf_rook.c
  chetrf_rook.c
  ctgsyl.c
  dpotrf.c
  dtrtri.c
  sgetrf.c
  stgsyl.c
  zhetrf_rec2.c
  zsytrf_rook_rec2.c
  chetrf_rook_rec2.c
  ctrsyl.c
  dsygst.c
  f2c.c
  slauum.c
  strsyl.c
  zhetrf_rook.c
  ztgsyl.c
)
# expand the file names into full paths under the relapack/src folder
set(RELA_SOURCES "")
foreach (RELA_FILE ${RELAFILES})
  list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}")
endforeach ()
add_library(relapack_src OBJECT ${RELA_SOURCES})
set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
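# Note: as an OBJECT library, relapack_src produces no archive of its own;
# the parent CMakeLists is expected to fold the objects into the main
# OpenBLAS target, along these lines (target name illustrative):
#   target_sources(openblas PRIVATE $<TARGET_OBJECTS:relapack_src>)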