possible cgemv,caxpy,cdot fix tags/v0.3.8^2
@@ -24,12 +24,21 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
*****************************************************************************/ | *****************************************************************************/ | ||||
#include "common.h" | #include "common.h" | ||||
#ifndef HAVE_ASM_KERNEL | #ifndef HAVE_ASM_KERNEL | ||||
#include <altivec.h> | #include <altivec.h> | ||||
#define offset_0 0 | |||||
#define offset_1 16 | |||||
#define offset_2 32 | |||||
#define offset_3 48 | |||||
#define offset_4 64 | |||||
#define offset_5 80 | |||||
#define offset_6 96 | |||||
#define offset_7 112 | |||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||||
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) | static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) | ||||
{ | { | ||||
@@ -43,28 +52,29 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT | |||||
register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; | register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; | ||||
#endif | #endif | ||||
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||||
register __vector float *vy = (__vector float *) y; | |||||
register __vector float *vx = (__vector float *) x; | |||||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); | |||||
register __vector float *vptr_y = (__vector float *) y; | |||||
register __vector float *vptr_x = (__vector float *) x; | |||||
BLASLONG i=0; | BLASLONG i=0; | ||||
for (; i < n/2; i += 8) { | |||||
for(;i<n/2;i+=8){ | |||||
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ; | |||||
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ; | |||||
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ; | |||||
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ; | |||||
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ; | |||||
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ; | |||||
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ; | |||||
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ; | |||||
register __vector float vy_0 = vy[i]; | |||||
register __vector float vy_1 = vy[i + 1]; | |||||
register __vector float vy_2 = vy[i + 2]; | |||||
register __vector float vy_3 = vy[i + 3]; | |||||
register __vector float vy_4 = vy[i + 4]; | |||||
register __vector float vy_5 = vy[i + 5]; | |||||
register __vector float vy_6 = vy[i + 6]; | |||||
register __vector float vy_7 = vy[i + 7]; | |||||
register __vector float vx_0 = vx[i]; | |||||
register __vector float vx_1 = vx[i + 1]; | |||||
register __vector float vx_2 = vx[i + 2]; | |||||
register __vector float vx_3 = vx[i + 3]; | |||||
register __vector float vx_4 = vx[i + 4]; | |||||
register __vector float vx_5 = vx[i + 5]; | |||||
register __vector float vx_6 = vx[i + 6]; | |||||
register __vector float vx_7 = vx[i + 7]; | |||||
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ; | |||||
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ; | |||||
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ; | |||||
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ; | |||||
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ; | |||||
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ; | |||||
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ; | |||||
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ; | |||||
vy_0 += vx_0*valpha_r; | vy_0 += vx_0*valpha_r; | ||||
vy_1 += vx_1*valpha_r; | vy_1 += vx_1*valpha_r; | ||||
vy_2 += vx_2*valpha_r; | vy_2 += vx_2*valpha_r; | ||||
@@ -89,15 +99,17 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT | |||||
vy_5 += vx_5*valpha_i; | vy_5 += vx_5*valpha_i; | ||||
vy_6 += vx_6*valpha_i; | vy_6 += vx_6*valpha_i; | ||||
vy_7 += vx_7*valpha_i; | vy_7 += vx_7*valpha_i; | ||||
vy[i] = vy_0; | |||||
vy[i + 1] = vy_1; | |||||
vy[i + 2] = vy_2; | |||||
vy[i + 3] = vy_3; | |||||
vy[i + 4] = vy_4; | |||||
vy[i + 5] = vy_5 ; | |||||
vy[i + 6] = vy_6 ; | |||||
vy[i + 7] = vy_7 ; | |||||
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ; | |||||
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ; | |||||
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ; | |||||
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ; | |||||
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ; | |||||
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ; | |||||
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ; | |||||
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ; | |||||
vptr_x+=8; | |||||
vptr_y+=8; | |||||
} | } | ||||
} | } | ||||
#endif | #endif | ||||
@@ -25,15 +25,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | *****************************************************************************/ | ||||
#include "common.h" | #include "common.h" | ||||
#ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
#include <altivec.h> | #include <altivec.h> | ||||
#define offset_0 0 | |||||
#define offset_1 16 | |||||
#define offset_2 32 | |||||
#define offset_3 48 | |||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||||
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | ||||
{ | { | ||||
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||||
register __vector float *vy = (__vector float *) y; | |||||
register __vector float *vx = (__vector float *) x; | |||||
BLASLONG i = 0; | |||||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); | |||||
register __vector float *vptr_y = (__vector float *) y; | |||||
register __vector float *vptr_x = (__vector float *) x; | |||||
register __vector float vd_0 = { 0 }; | register __vector float vd_0 = { 0 }; | ||||
register __vector float vd_1 = { 0 }; | register __vector float vd_1 = { 0 }; | ||||
register __vector float vd_2 = { 0 }; | register __vector float vd_2 = { 0 }; | ||||
@@ -41,26 +48,23 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | |||||
register __vector float vdd_0 = { 0 }; | register __vector float vdd_0 = { 0 }; | ||||
register __vector float vdd_1 = { 0 }; | register __vector float vdd_1 = { 0 }; | ||||
register __vector float vdd_2 = { 0 }; | register __vector float vdd_2 = { 0 }; | ||||
register __vector float vdd_3 = { 0 }; | |||||
for (; i < n/2; i += 4) { | |||||
register __vector float vyy_0 ; | |||||
register __vector float vyy_1 ; | |||||
register __vector float vyy_2 ; | |||||
register __vector float vyy_3 ; | |||||
register __vector float vy_0 = vy[i]; | |||||
register __vector float vy_1 = vy[i + 1]; | |||||
register __vector float vy_2 = vy[i + 2]; | |||||
register __vector float vy_3 = vy[i + 3]; | |||||
register __vector float vx_0= vx[i]; | |||||
register __vector float vx_1 = vx[i + 1]; | |||||
register __vector float vx_2 = vx[i + 2]; | |||||
register __vector float vx_3 = vx[i + 3]; | |||||
vyy_0 = vec_perm(vy_0, vy_0, swap_mask); | |||||
vyy_1 = vec_perm(vy_1, vy_1, swap_mask); | |||||
vyy_2 = vec_perm(vy_2, vy_2, swap_mask); | |||||
vyy_3 = vec_perm(vy_3, vy_3, swap_mask); | |||||
register __vector float vdd_3 = { 0 }; | |||||
BLASLONG i=0; | |||||
for(;i<n/2;i+=4){ | |||||
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ; | |||||
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ; | |||||
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ; | |||||
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ; | |||||
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ; | |||||
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ; | |||||
register __vector float vyy_0 = vec_perm(vy_0, vy_0, swap_mask); | |||||
register __vector float vyy_1 = vec_perm(vy_1, vy_1, swap_mask); | |||||
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ; | |||||
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ; | |||||
register __vector float vyy_2 = vec_perm(vy_2, vy_2, swap_mask); | |||||
register __vector float vyy_3 = vec_perm(vy_3, vy_3, swap_mask); | |||||
vd_0 += vx_0 * vy_0; | vd_0 += vx_0 * vy_0; | ||||
vd_1 += vx_1 * vy_1; | vd_1 += vx_1 * vy_1; | ||||
@@ -72,6 +76,8 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | |||||
vdd_2 += vx_2 * vyy_2; | vdd_2 += vx_2 * vyy_2; | ||||
vdd_3 += vx_3 * vyy_3; | vdd_3 += vx_3 * vyy_3; | ||||
vptr_x+=4; | |||||
vptr_y+=4; | |||||
} | } | ||||
//aggregate | //aggregate | ||||
@@ -96,7 +102,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
BLASLONG i = 0; | BLASLONG i = 0; | ||||
BLASLONG ix=0, iy=0; | BLASLONG ix=0, iy=0; | ||||
OPENBLAS_COMPLEX_FLOAT result; | OPENBLAS_COMPLEX_FLOAT result; | ||||
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; | |||||
FLOAT dot[4] __attribute__((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; | |||||
if (n <= 0) { | if (n <= 0) { | ||||
CREAL(result) = 0.0; | CREAL(result) = 0.0; | ||||
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define NBMAX 1024 | #define NBMAX 1024 | ||||
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||||
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { | static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { | ||||
@@ -62,23 +62,24 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||||
register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; | register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; | ||||
register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; | register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; | ||||
#endif | #endif | ||||
register __vector float *vy = (__vector float *) y; | |||||
register __vector float *vptr_y = (__vector float *) y; | |||||
register __vector float *vptr_a0 = (__vector float *) a0; | register __vector float *vptr_a0 = (__vector float *) a0; | ||||
register __vector float *vptr_a1 = (__vector float *) a1; | register __vector float *vptr_a1 = (__vector float *) a1; | ||||
register __vector float *vptr_a2 = (__vector float *) a2; | register __vector float *vptr_a2 = (__vector float *) a2; | ||||
register __vector float *vptr_a3 = (__vector float *) a3; | register __vector float *vptr_a3 = (__vector float *) a3; | ||||
BLASLONG i = 0; | BLASLONG i = 0; | ||||
for (;i< n / 2; i+=2) { | |||||
register __vector float vy_0 = vy[i]; | |||||
register __vector float vy_1 = vy[i + 1]; | |||||
register __vector float va0 = vptr_a0[i]; | |||||
register __vector float va1 = vptr_a1[i]; | |||||
register __vector float va2 = vptr_a2[i]; | |||||
register __vector float va3 = vptr_a3[i]; | |||||
register __vector float va0_1 = vptr_a0[i + 1]; | |||||
register __vector float va1_1 = vptr_a1[i + 1]; | |||||
register __vector float va2_1 = vptr_a2[i + 1]; | |||||
register __vector float va3_1 = vptr_a3[i + 1]; | |||||
BLASLONG i2=16; | |||||
for (;i< n * 8; i+=32,i2+=32) { | |||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y); | |||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); | |||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0); | |||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1); | |||||
register __vector float va2 = vec_vsx_ld(i ,vptr_a2); | |||||
register __vector float va3 = vec_vsx_ld(i ,vptr_a3); | |||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); | |||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); | |||||
register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); | |||||
register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); | |||||
vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; | vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; | ||||
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; | vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; | ||||
@@ -93,8 +94,8 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||||
vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; | vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; | ||||
vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; | vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; | ||||
vy[i] = vy_0; | |||||
vy[i + 1] = vy_1; | |||||
vec_vsx_st(vy_0 ,i, vptr_y); | |||||
vec_vsx_st(vy_1,i2,vptr_y); | |||||
} | } | ||||
} | } | ||||
@@ -118,17 +119,19 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||||
register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; | register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; | ||||
register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; | register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; | ||||
#endif | #endif | ||||
register __vector float *vy = (__vector float *) y; | |||||
register __vector float *vptr_y = (__vector float *) y; | |||||
register __vector float *vptr_a0 = (__vector float *) a0; | register __vector float *vptr_a0 = (__vector float *) a0; | ||||
register __vector float *vptr_a1 = (__vector float *) a1; | register __vector float *vptr_a1 = (__vector float *) a1; | ||||
BLASLONG i = 0; | |||||
for (;i< n / 2; i+=2) { | |||||
register __vector float vy_0 = vy[i]; | |||||
register __vector float vy_1 = vy[i + 1]; | |||||
register __vector float va0 = vptr_a0[i]; | |||||
register __vector float va1 = vptr_a1[i]; | |||||
register __vector float va0_1 = vptr_a0[i + 1]; | |||||
register __vector float va1_1 = vptr_a1[i + 1]; | |||||
BLASLONG i = 0; | |||||
BLASLONG i2 = 16; | |||||
for (;i< n * 8; i+=32, i2+=32) { | |||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y); | |||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); | |||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0); | |||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1); | |||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); | |||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); | |||||
register __vector float va0x = vec_perm(va0, va0,swap_mask); | register __vector float va0x = vec_perm(va0, va0,swap_mask); | ||||
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); | register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); | ||||
register __vector float va1x = vec_perm(va1, va1,swap_mask); | register __vector float va1x = vec_perm(va1, va1,swap_mask); | ||||
@@ -136,8 +139,8 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||||
vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; | vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; | ||||
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; | vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; | ||||
vy[i] = vy_0; | |||||
vy[i + 1] = vy_1; | |||||
vec_vsx_st(vy_0 ,i, vptr_y); | |||||
vec_vsx_st(vy_1,i2,vptr_y); | |||||
} | } | ||||
} | } | ||||
@@ -154,21 +157,23 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||||
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; | register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; | ||||
register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; | register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; | ||||
#endif | #endif | ||||
register __vector float *vy = (__vector float *) y; | |||||
register __vector float *vptr_y = (__vector float *) y; | |||||
register __vector float *vptr_a0 = (__vector float *) ap; | register __vector float *vptr_a0 = (__vector float *) ap; | ||||
BLASLONG i = 0; | BLASLONG i = 0; | ||||
for (;i< n / 2; i+=2) { | |||||
register __vector float vy_0 = vy[i]; | |||||
register __vector float vy_1 = vy[i + 1]; | |||||
register __vector float va0 = vptr_a0[i]; | |||||
register __vector float va0_1 = vptr_a0[i + 1]; | |||||
BLASLONG i2 = 16; | |||||
for (;i< n * 8; i+=32, i2+=32) { | |||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y); | |||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); | |||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0); | |||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); | |||||
register __vector float va0x = vec_perm(va0, va0,swap_mask); | register __vector float va0x = vec_perm(va0, va0,swap_mask); | ||||
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); | register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); | ||||
vy_0 += va0*vx0_r + va0x*vx0_i; | vy_0 += va0*vx0_r + va0x*vx0_i; | ||||
vy_1 += va0_1*vx0_r + va0x_1*vx0_i; | vy_1 += va0_1*vx0_r + va0x_1*vx0_i; | ||||
vy[i] = vy_0; | |||||
vy[i + 1] = vy_1; | |||||
vec_vsx_st(vy_0 ,i, vptr_y); | |||||
vec_vsx_st(vy_1,i2,vptr_y); | |||||
} | } | ||||
} | } | ||||
@@ -176,7 +181,7 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { | static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { | ||||
BLASLONG i; | |||||
BLASLONG i=0; | |||||
if (inc_dest != 2) { | if (inc_dest != 2) { | ||||
@@ -213,20 +218,24 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT | |||||
register __vector float *vptr_src = (__vector float *) src; | register __vector float *vptr_src = (__vector float *) src; | ||||
register __vector float *vptr_y = (__vector float *) dest; | register __vector float *vptr_y = (__vector float *) dest; | ||||
for (i = 0; i < n/2; i += 2 ){ | |||||
register __vector float vy_0 = vptr_y[i]; | |||||
register __vector float vy_1 = vptr_y[i +1]; | |||||
BLASLONG i2 = 16; | |||||
for (;i< n * 8; i+=32, i2+=32) { | |||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y); | |||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); | |||||
register __vector float vsrc = vptr_src[i]; | |||||
register __vector float vsrc_1 = vptr_src[i + 1]; | |||||
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask); | |||||
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask); | |||||
register __vector float vsrc = vec_vsx_ld(i,vptr_src); | |||||
register __vector float vsrc_1 = vec_vsx_ld(i2,vptr_src); | |||||
vy_0 += vsrc*valpha_r + vsrcx*valpha_i; | |||||
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i; | |||||
vptr_y[i] = vy_0; | |||||
vptr_y[i+1 ] = vy_1; | |||||
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask); | |||||
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask); | |||||
vy_0 += vsrc*valpha_r + vsrcx*valpha_i; | |||||
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i; | |||||
vec_vsx_st(vy_0 ,i, vptr_y); | |||||
vec_vsx_st(vy_1,i2,vptr_y); | |||||
} | } | ||||
@@ -237,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT | |||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { | int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { | ||||
BLASLONG i; | |||||
BLASLONG i=0; | |||||
FLOAT *a_ptr; | FLOAT *a_ptr; | ||||
FLOAT *x_ptr; | FLOAT *x_ptr; | ||||
FLOAT *y_ptr; | FLOAT *y_ptr; | ||||
@@ -247,8 +256,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
BLASLONG m2; | BLASLONG m2; | ||||
BLASLONG m3; | BLASLONG m3; | ||||
BLASLONG n2; | BLASLONG n2; | ||||
FLOAT xbuffer[8], *ybuffer; | |||||
FLOAT xbuffer[8] __attribute__((aligned(16))); | |||||
FLOAT *ybuffer; | |||||
if (m < 1) return (0); | if (m < 1) return (0); | ||||
if (n < 1) return (0); | if (n < 1) return (0); | ||||
@@ -29,10 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define NBMAX 1024 | #define NBMAX 1024 | ||||
#include <altivec.h> | #include <altivec.h> | ||||
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||||
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | ||||
BLASLONG i; | |||||
FLOAT *a0, *a1, *a2, *a3; | FLOAT *a0, *a1, *a2, *a3; | ||||
a0 = ap; | a0 = ap; | ||||
a1 = ap + lda; | a1 = ap + lda; | ||||
@@ -48,26 +48,39 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||||
register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; | register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; | ||||
register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; | register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; | ||||
register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; | register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; | ||||
__vector float* va0 = (__vector float*) a0; | |||||
__vector float* va1 = (__vector float*) a1; | |||||
__vector float* va2 = (__vector float*) a2; | |||||
__vector float* va3 = (__vector float*) a3; | |||||
__vector float* vptr_a0 = (__vector float*) a0; | |||||
__vector float* vptr_a1 = (__vector float*) a1; | |||||
__vector float* vptr_a2 = (__vector float*) a2; | |||||
__vector float* vptr_a3 = (__vector float*) a3; | |||||
__vector float* v_x = (__vector float*) x; | __vector float* v_x = (__vector float*) x; | ||||
for (i = 0; i < n / 2; i+=2) { | |||||
register __vector float vx_0 = v_x[i]; | |||||
register __vector float vx_1 = v_x[i+1]; | |||||
BLASLONG i = 0; | |||||
BLASLONG i2 = 16; | |||||
for (;i< n * 8; i+=32, i2+=32) { | |||||
register __vector float vx_0 = vec_vsx_ld( i,v_x) ; | |||||
register __vector float vx_1 = vec_vsx_ld(i2, v_x); | |||||
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); | register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); | ||||
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); | register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); | ||||
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; | |||||
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; | |||||
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; | |||||
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; | |||||
vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1]; | |||||
vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1]; | |||||
vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1]; | |||||
vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1]; | |||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0); | |||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1); | |||||
register __vector float va2 = vec_vsx_ld(i ,vptr_a2); | |||||
register __vector float va3 = vec_vsx_ld(i ,vptr_a3); | |||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); | |||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); | |||||
register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); | |||||
register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); | |||||
vtemp0_p += vx_0*va0 + vx_1*va0_1 ; | |||||
vtemp0_r += vxr_0*va0 + vxr_1*va0_1; | |||||
vtemp1_p += vx_0*va1 + vx_1*va1_1; | |||||
vtemp1_r += vxr_0*va1 + vxr_1*va1_1; | |||||
vtemp2_p += vx_0*va2 + vx_1*va2_1; | |||||
vtemp2_r += vxr_0*va2 + vxr_1*va2_1; | |||||
vtemp3_p += vx_0*va3 + vx_1*va3_1; | |||||
vtemp3_r += vxr_0*va3 + vxr_1*va3_1; | |||||
} | } | ||||
@@ -128,7 +141,7 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||||
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | ||||
BLASLONG i; | |||||
FLOAT *a0, *a1; | FLOAT *a0, *a1; | ||||
a0 = ap; | a0 = ap; | ||||
a1 = ap + lda; | a1 = ap + lda; | ||||
@@ -138,23 +151,33 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||||
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; | register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; | ||||
register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; | register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; | ||||
register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; | register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; | ||||
__vector float* va0 = (__vector float*) a0; | |||||
__vector float* va1 = (__vector float*) a1; | |||||
__vector float* vptr_a0 = (__vector float*) a0; | |||||
__vector float* vptr_a1 = (__vector float*) a1; | |||||
__vector float* v_x = (__vector float*) x; | __vector float* v_x = (__vector float*) x; | ||||
for (i = 0; i < n / 2; i+=2) { | |||||
register __vector float vx_0 = v_x[i]; | |||||
register __vector float vx_1 = v_x[i+1]; | |||||
BLASLONG i = 0; | |||||
BLASLONG i2 = 16; | |||||
for (;i< n * 8; i+=32, i2+=32) { | |||||
register __vector float vx_0 = vec_vsx_ld( i,v_x) ; | |||||
register __vector float vx_1 = vec_vsx_ld(i2, v_x); | |||||
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); | register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); | ||||
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); | register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); | ||||
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; | |||||
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; | |||||
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; | |||||
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; | |||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0); | |||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1); | |||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); | |||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); | |||||
} | |||||
vtemp0_p += vx_0*va0 + vx_1*va0_1 ; | |||||
vtemp0_r += vxr_0*va0 + vxr_1*va0_1; | |||||
vtemp1_p += vx_0*va1 + vx_1*va1_1; | |||||
vtemp1_r += vxr_0*va1 + vxr_1*va1_1; | |||||
} | |||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; | register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; | ||||
@@ -193,23 +216,27 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||||
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | ||||
BLASLONG i; | |||||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); | __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); | ||||
//p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) | //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) | ||||
register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; | register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; | ||||
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; | register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; | ||||
__vector float* va0 = (__vector float*) ap; | |||||
__vector float* vptr_a0 = (__vector float*) ap; | |||||
__vector float* v_x = (__vector float*) x; | __vector float* v_x = (__vector float*) x; | ||||
for (i = 0; i < n / 2; i+=2) { | |||||
register __vector float vx_0 = v_x[i]; | |||||
register __vector float vx_1 = v_x[i+1]; | |||||
BLASLONG i = 0; | |||||
BLASLONG i2 = 16; | |||||
for (;i< n * 8; i+=32, i2+=32) { | |||||
register __vector float vx_0 = vec_vsx_ld( i,v_x) ; | |||||
register __vector float vx_1 = vec_vsx_ld(i2, v_x); | |||||
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); | register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); | ||||
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); | register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); | ||||
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; | |||||
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; | |||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0); | |||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); | |||||
vtemp0_p += vx_0*va0 + vx_1*va0_1 ; | |||||
vtemp0_r += vxr_0*va0 + vxr_1*va0_1; | |||||
} | } | ||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
@@ -249,8 +276,8 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||||
} | } | ||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | ||||
BLASLONG i; | |||||
BLASLONG j; | |||||
BLASLONG i=0; | |||||
BLASLONG j=0; | |||||
FLOAT *a_ptr; | FLOAT *a_ptr; | ||||
FLOAT *x_ptr; | FLOAT *x_ptr; | ||||
FLOAT *y_ptr; | FLOAT *y_ptr; | ||||
@@ -260,8 +287,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
BLASLONG m2; | BLASLONG m2; | ||||
BLASLONG m3; | BLASLONG m3; | ||||
BLASLONG n2; | BLASLONG n2; | ||||
FLOAT ybuffer[8], *xbuffer; | |||||
FLOAT ybuffer[8] __attribute__((aligned(16))); | |||||
FLOAT *xbuffer; | |||||
if (m < 1) return (0); | if (m < 1) return (0); | ||||
if (n < 1) return (0); | if (n < 1) return (0); | ||||
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
BLASLONG m3; | BLASLONG m3; | ||||
BLASLONG n2; | BLASLONG n2; | ||||
BLASLONG lda4 = lda << 2; | BLASLONG lda4 = lda << 2; | ||||
FLOAT xbuffer[8] __attribute__ ((aligned (16)));; | |||||
FLOAT xbuffer[8] __attribute__ ((aligned (16))); | |||||
FLOAT *ybuffer; | FLOAT *ybuffer; | ||||
if ( m < 1 ) return(0); | if ( m < 1 ) return(0); | ||||
@@ -581,9 +581,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
BLASLONG m1; | BLASLONG m1; | ||||
BLASLONG m2; | BLASLONG m2; | ||||
BLASLONG m3; | BLASLONG m3; | ||||
BLASLONG n2; | |||||
FLOAT ybuffer[8], *xbuffer; | |||||
BLASLONG n2; | |||||
FLOAT ybuffer[8] __attribute__((aligned(16))); | |||||
FLOAT *xbuffer; | |||||
if (m < 1) return (0); | if (m < 1) return (0); | ||||
if (n < 1) return (0); | if (n < 1) return (0); | ||||
@@ -174,7 +174,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
BLASLONG n2; | BLASLONG n2; | ||||
BLASLONG lda4 = lda << 2; | BLASLONG lda4 = lda << 2; | ||||
BLASLONG lda8 = lda << 3; | BLASLONG lda8 = lda << 3; | ||||
FLOAT xbuffer[8],*ybuffer; | |||||
FLOAT xbuffer[8] __attribute__((aligned(16))); | |||||
FLOAT *ybuffer; | |||||
if ( m < 1 ) return(0); | if ( m < 1 ) return(0); | ||||
if ( n < 1 ) return(0); | if ( n < 1 ) return(0); | ||||
@@ -213,7 +213,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
BLASLONG n2; | BLASLONG n2; | ||||
BLASLONG lda4 = lda << 2; | BLASLONG lda4 = lda << 2; | ||||
BLASLONG lda8 = lda << 3; | BLASLONG lda8 = lda << 3; | ||||
FLOAT xbuffer[8],*ybuffer; | |||||
FLOAT xbuffer[8] __attribute__((aligned(16))); | |||||
FLOAT *ybuffer; | |||||
if ( m < 1 ) return(0); | if ( m < 1 ) return(0); | ||||
if ( n < 1 ) return(0); | if ( n < 1 ) return(0); | ||||
@@ -177,10 +177,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
BLASLONG m1; | BLASLONG m1; | ||||
BLASLONG m2; | BLASLONG m2; | ||||
BLASLONG m3; | BLASLONG m3; | ||||
BLASLONG n2; | |||||
FLOAT ybuffer[8], *xbuffer; | |||||
BLASLONG n2; | |||||
FLOAT ybuffer[8] __attribute__((aligned(16))); | |||||
FLOAT *xbuffer; | |||||
if (m < 1) return (0); | if (m < 1) return (0); | ||||
if (n < 1) return (0); | if (n < 1) return (0); | ||||
@@ -204,8 +204,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
BLASLONG m3; | BLASLONG m3; | ||||
BLASLONG n2; | BLASLONG n2; | ||||
FLOAT ybuffer[8], *xbuffer; | |||||
FLOAT ybuffer[8] __attribute__((aligned(16))); | |||||
FLOAT *xbuffer; | |||||
if (m < 1) return (0); | if (m < 1) return (0); | ||||
if (n < 1) return (0); | if (n < 1) return (0); | ||||
@@ -614,8 +614,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
BLASLONG m2; | BLASLONG m2; | ||||
BLASLONG m3; | BLASLONG m3; | ||||
BLASLONG n2; | BLASLONG n2; | ||||
FLOAT xbuffer[8], *ybuffer; | |||||
FLOAT xbuffer[8] __attribute__((aligned(16))); | |||||
FLOAT *ybuffer; | |||||
if (m < 1) return (0); | if (m < 1) return (0); | ||||
if (n < 1) return (0); | if (n < 1) return (0); | ||||
@@ -532,8 +532,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
BLASLONG m2; | BLASLONG m2; | ||||
BLASLONG m3; | BLASLONG m3; | ||||
BLASLONG n2; | BLASLONG n2; | ||||
FLOAT ybuffer[8], *xbuffer; | |||||
FLOAT ybuffer[8] __attribute__((aligned(16))); | |||||
FLOAT *xbuffer; | |||||
if (m < 1) return (0); | if (m < 1) return (0); | ||||
if (n < 1) return (0); | if (n < 1) return (0); | ||||