Optimized dtrsm_kernel_LT for POWER8 (tags/v0.2.19^2)
@@ -262,7 +262,8 @@ endif | |||||
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ | essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ | ||||
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ | cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ | ||||
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ | slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ | ||||
scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl | |||||
scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ | |||||
strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl | |||||
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ | veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ | ||||
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ | scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ | ||||
@@ -696,6 +697,9 @@ strsm.mkl : strsm.$(SUFFIX) | |||||
strsm.veclib : strsm.$(SUFFIX) | strsm.veclib : strsm.$(SUFFIX) | ||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
# Links strsm.$(SUFFIX) with $(LIBESSL) into strsm.essl. The leading '-' makes make ignore a failed link and carry on.
strsm.essl : strsm.$(SUFFIX) | |||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
##################################### Dtrsm #################################################### | ##################################### Dtrsm #################################################### | ||||
dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) | dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) | ||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | ||||
@@ -712,6 +716,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX) | |||||
dtrsm.veclib : dtrsm.$(SUFFIX) | dtrsm.veclib : dtrsm.$(SUFFIX) | ||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
# Links dtrsm.$(SUFFIX) with $(LIBESSL) into dtrsm.essl. The leading '-' makes make ignore a failed link and carry on.
dtrsm.essl : dtrsm.$(SUFFIX) | |||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
##################################### Ctrsm #################################################### | ##################################### Ctrsm #################################################### | ||||
ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) | ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) | ||||
@@ -729,6 +736,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX) | |||||
ctrsm.veclib : ctrsm.$(SUFFIX) | ctrsm.veclib : ctrsm.$(SUFFIX) | ||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
# Links ctrsm.$(SUFFIX) with $(LIBESSL) into ctrsm.essl. The leading '-' makes make ignore a failed link and carry on.
ctrsm.essl : ctrsm.$(SUFFIX) | |||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
##################################### Ztrsm #################################################### | ##################################### Ztrsm #################################################### | ||||
ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) | ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) | ||||
@@ -746,6 +756,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX) | |||||
ztrsm.veclib : ztrsm.$(SUFFIX) | ztrsm.veclib : ztrsm.$(SUFFIX) | ||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
# Links ztrsm.$(SUFFIX) with $(LIBESSL) into ztrsm.essl. The leading '-' makes make ignore a failed link and carry on.
ztrsm.essl : ztrsm.$(SUFFIX) | |||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
##################################### Ssyrk #################################################### | ##################################### Ssyrk #################################################### | ||||
ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) | ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) | ||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | ||||
@@ -54,7 +54,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
@@ -0,0 +1,294 @@ | |||||
/*********************************************************************/ | |||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
/* All rights reserved. */ | |||||
/* */ | |||||
/* Redistribution and use in source and binary forms, with or */ | |||||
/* without modification, are permitted provided that the following */ | |||||
/* conditions are met: */ | |||||
/* */ | |||||
/* 1. Redistributions of source code must retain the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer. */ | |||||
/* */ | |||||
/* 2. Redistributions in binary form must reproduce the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer in the documentation and/or other materials */ | |||||
/* provided with the distribution. */ | |||||
/* */ | |||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||||
/* */ | |||||
/* The views and conclusions contained in the software and */ | |||||
/* documentation are those of the authors and should not be */ | |||||
/* interpreted as representing official policies, either expressed */ | |||||
/* or implied, of The University of Texas at Austin. */ | |||||
/*********************************************************************/ | |||||
/* ASSEMBLER is defined ahead of the shared headers; presumably common.h keys its assembly-only definitions off it (confirm in common.h). */
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "def_vsx.h" | |||||
/* LOAD is the word load (lwz) without __64BIT__ and the doubleword load (ld) with it. It is not used in the code visible in this file. */
#ifndef __64BIT__ | |||||
#define LOAD lwz | |||||
#else | |||||
#define LOAD ld | |||||
#endif | |||||
/* STACKSIZE is the number of bytes the prologue takes off SP. ALPHA and FZERO name two slots inside that frame; neither is referenced in the code visible in this file. */
#ifdef __64BIT__ | |||||
#define STACKSIZE 320 | |||||
#define ALPHA 296(SP) | |||||
#define FZERO 304(SP) | |||||
#else | |||||
#define STACKSIZE 240 | |||||
#define ALPHA 224(SP) | |||||
#define FZERO 232(SP) | |||||
#endif | |||||
/* Incoming sizes: the prologue tests M, N and K straight from r3, r4 and r5 without loading them. */
#define M r3 | |||||
#define N r4 | |||||
#define K r5 | |||||
/* Linux register assignment. In the 64-bit case OFFSET is only an alias for r6: the prologue fills it from the caller's frame. */
#ifdef linux | |||||
#ifndef __64BIT__ | |||||
#define A r6 | |||||
#define B r7 | |||||
#define C r8 | |||||
#define LDC r9 | |||||
#define OFFSET r10 | |||||
#else | |||||
#define A r7 | |||||
#define B r8 | |||||
#define C r9 | |||||
#define LDC r10 | |||||
#define OFFSET r6 | |||||
#endif | |||||
#endif | |||||
/* AIX / Apple register assignment. For 32-bit DOUBLE builds the prologue loads both LDC and OFFSET from the caller's frame; otherwise only OFFSET is loaded from there. */
#if defined(_AIX) || defined(__APPLE__) | |||||
#if !defined(__64BIT__) && defined(DOUBLE) | |||||
#define A r8 | |||||
#define B r9 | |||||
#define C r10 | |||||
#define LDC r7 | |||||
#define OFFSET r6 | |||||
#else | |||||
#define A r7 | |||||
#define B r8 | |||||
#define C r9 | |||||
#define LDC r10 | |||||
#define OFFSET r6 | |||||
#endif | |||||
#endif | |||||
/* o0 is the literal 0, not a register. */
#define o0 0 | |||||
/* Working registers r15-r31. PRE is loaded with 384 in the prologue and used as the dcbt index in the 16x4 loop. */
#define PRE r15 | |||||
/* T1-T4 are scratch registers; L is the inner (k) loop counter. */
#define T4 r16 | |||||
#define L r17 | |||||
#define T3 r18 | |||||
#define T2 r19 | |||||
/* KK starts each pass at OFFSET and grows by the tile height after every tile; I and J count tiles and passes. */
#define KK r20 | |||||
#define I r21 | |||||
#define J r22 | |||||
/* AO, BO, CO are the moving pointers derived from A, B and C. */
#define AO r23 | |||||
#define BO r24 | |||||
#define CO r25 | |||||
/* o8..o48 are loaded with the constants 8, 16, 24, 32 and 48 in the prologue; presumably they serve as index registers in the macros file (not shown here). */
#define o8 r26 | |||||
#define o16 r27 | |||||
#define o24 r28 | |||||
#define o32 r29 | |||||
#define o48 r30 | |||||
#define T1 r31 | |||||
/* Macro definitions for this kernel; INIT_*, KERNEL_* and SOLVE_LT_* used by the logic file presumably come from here (file not shown). */
#include "dtrsm_macros_LT_16x4_power8.S" | |||||
#ifndef NEEDPARAM

/*
 * Kernel entry and exit.
 *
 * Flow: reserve STACKSIZE bytes, save every non-volatile register the
 * kernel touches (f14-f31 and r15-r31), fetch the arguments that are
 * passed on the caller's stack, return early when M, N or K is not
 * positive, set up constants, run the included loop logic, then restore
 * everything and return 0 in r3.
 *
 * The loop logic branches to L999 to leave the kernel.
 */

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	/* r0 = 0: the loop logic uses r0 as the index register of dcbt. */
	li	r0, 0

	/* Save FPRs f14-f31 at frame offsets 0..136. */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)
	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)
	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)
	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)
	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* Save GPRs r15-r31; all of them are aliased as working registers. */
#ifdef __64BIT__
	std	r31,  144(SP)
	std	r30,  152(SP)
	std	r29,  160(SP)
	std	r28,  168(SP)
	std	r27,  176(SP)
	std	r26,  184(SP)
	std	r25,  192(SP)
	std	r24,  200(SP)
	std	r23,  208(SP)
	std	r22,  216(SP)
	std	r21,  224(SP)
	std	r20,  232(SP)
	std	r19,  240(SP)
	std	r18,  248(SP)
	std	r17,  256(SP)
	std	r16,  264(SP)
	std	r15,  272(SP)
#else
	stw	r31,  144(SP)
	stw	r30,  148(SP)
	stw	r29,  152(SP)
	stw	r28,  156(SP)
	stw	r27,  160(SP)
	stw	r26,  164(SP)
	stw	r25,  168(SP)
	stw	r24,  172(SP)
	stw	r23,  176(SP)
	stw	r22,  180(SP)
	stw	r21,  184(SP)
	stw	r20,  188(SP)
	stw	r19,  192(SP)
	stw	r18,  196(SP)
	/*
	 * r15 (PRE), r16 (T4) and r17 (L) are non-volatile and are
	 * overwritten by this kernel, so they must be preserved in the
	 * 32-bit build as well.  Offsets 200..211 are free: the slots
	 * above end at 199 and ALPHA starts at 224.
	 */
	stw	r17,  200(SP)
	stw	r16,  204(SP)
	stw	r15,  208(SP)
#endif

	/* Fetch the arguments that live in the caller's frame. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
	lwz	LDC,    FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif

#if defined(linux) && defined(__64BIT__)
	ld	OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
	ld	OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
	lwz	OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
	lwz	OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif

	/* Nothing to do unless M, N and K are all positive. */
	cmpwi	cr0, M, 0
	ble	L999
	cmpwi	cr0, N, 0
	ble	L999
	cmpwi	cr0, K, 0
	ble	L999

	/* Scale LDC by the element size (BASE_SHIFT is a log2 shift count). */
	slwi	LDC, LDC, BASE_SHIFT

	/* Constant index registers and the prefetch distance. */
	li	o8,   8
	li	o16, 16
	li	o24, 24
	li	o32, 32
	li	o48, 48
	li	PRE, 384

	mr	KK, OFFSET

#include "dtrsm_logic_LT_16x4_power8.S"

L999:
	/* Return value 0. */
	addi	r3, 0, 0

	/* Restore FPRs f14-f31. */
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)
	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)
	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)
	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)
	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

	/* Restore GPRs r15-r31 from the same slots used on entry. */
#ifdef __64BIT__
	ld	r31,  144(SP)
	ld	r30,  152(SP)
	ld	r29,  160(SP)
	ld	r28,  168(SP)
	ld	r27,  176(SP)
	ld	r26,  184(SP)
	ld	r25,  192(SP)
	ld	r24,  200(SP)
	ld	r23,  208(SP)
	ld	r22,  216(SP)
	ld	r21,  224(SP)
	ld	r20,  232(SP)
	ld	r19,  240(SP)
	ld	r18,  248(SP)
	ld	r17,  256(SP)
	ld	r16,  264(SP)
	ld	r15,  272(SP)
#else
	lwz	r31,  144(SP)
	lwz	r30,  148(SP)
	lwz	r29,  152(SP)
	lwz	r28,  156(SP)
	lwz	r27,  160(SP)
	lwz	r26,  164(SP)
	lwz	r25,  168(SP)
	lwz	r24,  172(SP)
	lwz	r23,  176(SP)
	lwz	r22,  180(SP)
	lwz	r21,  184(SP)
	lwz	r20,  188(SP)
	lwz	r19,  192(SP)
	lwz	r18,  196(SP)
	lwz	r17,  200(SP)
	lwz	r16,  204(SP)
	lwz	r15,  208(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif
@@ -0,0 +1,758 @@ | |||||
/* ---- N handled 4 at a time: J = N >> 2 passes; skip the section when J <= 0. ---- */
srawi. J, N, 2 | |||||
ble DSTRM_LT_L4_END | |||||
DSTRM_LT_L4_BEGIN: | |||||
/* Start of one pass: CO and AO restart from C and A, C is stepped by 4*LDC for the next pass, KK restarts from OFFSET. */
mr CO, C | |||||
mr AO, A | |||||
slwi T1, LDC , 2 | |||||
add C, C, T1 | |||||
mr KK, OFFSET | |||||
/* I = M >> 4: number of 16-row tiles; none means go straight to the remainder tiles. */
srawi. I, M, 4 | |||||
ble DSTRM_LT_L4x16_END | |||||
DSTRM_LT_L4x16_BEGIN: | |||||
/* Every tile reads B from its start. */
mr BO, B | |||||
/* Prefetch the C tile: T1..T4 = CO, CO+LDC, CO+2*LDC, CO+3*LDC, each rounded down to a 128-byte boundary with the mask -128. */
li L, -128 | |||||
mr T1, CO | |||||
add T2, T1, LDC | |||||
add T3, T2, LDC | |||||
add T4, T3, LDC | |||||
and T1, T1, L | |||||
and T2, T2, L | |||||
and T3, T3, L | |||||
and T4, T4, L | |||||
/* dcbt touches the addressed cache block; r0 is the index register and was loaded with 0 in the kernel prologue. */
dcbt T1, r0 | |||||
dcbt T2, r0 | |||||
dcbt T3, r0 | |||||
dcbt T4, r0 | |||||
/* Touch the following 128 bytes of each of the four lines as well. */
addi T1, T1, 128 | |||||
addi T2, T2, 128 | |||||
addi T3, T3, 128 | |||||
addi T4, T4, 128 | |||||
dcbt T1, r0 | |||||
dcbt T2, r0 | |||||
dcbt T3, r0 | |||||
dcbt T4, r0 | |||||
DSTRM_LT_L4x16_LOOP_START: | |||||
/* INIT_16x4 / KERNEL_16x4 / SOLVE_LT_16x4 are macros defined outside this file; presumably they set up, accumulate and solve a 16x4 tile (confirm in the macros file). */
INIT_16x4 | |||||
/* L = KK (addic. with 0 copies KK and sets cr0); with no k-steps to run go directly to the solve. */
addic. L, KK, 0 | |||||
ble- DSTRM_LT_L4x16_SAVE | |||||
DSTRM_LT_L4x16_LOOP: | |||||
/* k loop unrolled four times; L is decremented and tested after every KERNEL_16x4 so any KK count exits correctly. */
dcbt AO, PRE | |||||
dcbt BO, PRE | |||||
KERNEL_16x4 | |||||
addic. L, L, -1 | |||||
ble- DSTRM_LT_L4x16_SAVE | |||||
dcbt AO, PRE | |||||
KERNEL_16x4 | |||||
addic. L, L, -1 | |||||
ble- DSTRM_LT_L4x16_SAVE | |||||
dcbt AO, PRE | |||||
KERNEL_16x4 | |||||
addic. L, L, -1 | |||||
ble- DSTRM_LT_L4x16_SAVE | |||||
dcbt AO, PRE | |||||
KERNEL_16x4 | |||||
addic. L, L, -1 | |||||
bgt+ DSTRM_LT_L4x16_LOOP | |||||
DSTRM_LT_L4x16_SAVE: | |||||
SOLVE_LT_16x4 | |||||
/* Next tile: CO moves on by 16 elements. */
addi CO, CO, 16*SIZE | |||||
/* AO += (K-KK) << (4+BASE_SHIFT), BO += (K-KK) << (2+BASE_SHIFT). Presumably the macros already advanced AO over the first KK steps - confirm in the macros file. */
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 4+BASE_SHIFT | |||||
slwi T4, T4, 2+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
/* NOTE(review): BO is reloaded from B at DSTRM_LT_L4x16_BEGIN on the next iteration, so this BO update is overwritten there. */
add BO, BO, T4 | |||||
/* KK grows by the tile height; loop while tiles remain. */
addi KK, KK, 16 | |||||
addic. I, I, -1 | |||||
bgt DSTRM_LT_L4x16_BEGIN | |||||
DSTRM_LT_L4x16_END: | |||||
/* ---- Remainder rows of the 4-wide pass: tiles of 8, 4, 2 and 1 rows chosen by the low bits of M. ---- */
DSTRM_LT_L4x8_BEGIN: | |||||
/* M & 15 == 0 means no leftover rows: jump to the end of the pass. andi. never yields a negative result, so ble acts as branch-if-zero here and below. */
andi. T2, M, 15 | |||||
ble DSTRM_LT_L4x1_END | |||||
/* 8-row tile, taken when bit 3 of M is set. */
andi. T1, M, 8 | |||||
ble DSTRM_LT_L4x8_END | |||||
mr BO, B | |||||
DSTRM_LT_L4x8_LOOP_START: | |||||
/* INIT_* / KERNEL_* / SOLVE_LT_* are macros defined outside this file (not shown). */
INIT_8x4 | |||||
/* L = KK; skip the k loop when it is not positive. */
addic. L, KK, 0 | |||||
ble DSTRM_LT_L4x8_SAVE | |||||
DSTRM_LT_L4x8_LOOP: | |||||
KERNEL_8x4 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L4x8_LOOP | |||||
DSTRM_LT_L4x8_SAVE: | |||||
SOLVE_LT_8x4 | |||||
/* CO += 8 elements; AO += (K-KK) << (3+BASE_SHIFT); KK += 8. The BO update is overwritten by the next "mr BO, B". */
addi CO, CO, 8*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 3+BASE_SHIFT | |||||
slwi T4, T4, 2+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 8 | |||||
DSTRM_LT_L4x8_END: | |||||
DSTRM_LT_L4x4_BEGIN: | |||||
/* 4-row tile, taken when bit 2 of M is set. */
andi. T1, M, 4 | |||||
ble DSTRM_LT_L4x4_END | |||||
mr BO, B | |||||
DSTRM_LT_L4x4_LOOP_START: | |||||
INIT_4x4 | |||||
addic. L, KK, 0 | |||||
ble DSTRM_LT_L4x4_SAVE | |||||
DSTRM_LT_L4x4_LOOP: | |||||
KERNEL_4x4 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L4x4_LOOP | |||||
DSTRM_LT_L4x4_SAVE: | |||||
SOLVE_LT_4x4 | |||||
/* CO += 4 elements; AO += (K-KK) << (2+BASE_SHIFT); KK += 4. */
addi CO, CO, 4*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 2+BASE_SHIFT | |||||
slwi T4, T4, 2+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 4 | |||||
DSTRM_LT_L4x4_END: | |||||
DSTRM_LT_L4x2_BEGIN: | |||||
/* 2-row tile, taken when bit 1 of M is set. */
andi. T1, M, 2 | |||||
ble DSTRM_LT_L4x2_END | |||||
mr BO, B | |||||
DSTRM_LT_L4x2_LOOP_START: | |||||
INIT_2x4 | |||||
addic. L, KK, 0 | |||||
ble DSTRM_LT_L4x2_SAVE | |||||
DSTRM_LT_L4x2_LOOP: | |||||
KERNEL_2x4 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L4x2_LOOP | |||||
DSTRM_LT_L4x2_SAVE: | |||||
SOLVE_LT_2x4 | |||||
/* CO += 2 elements; AO += (K-KK) << (1+BASE_SHIFT); KK += 2. */
addi CO, CO, 2*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 1+BASE_SHIFT | |||||
slwi T4, T4, 2+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 2 | |||||
DSTRM_LT_L4x2_END: | |||||
DSTRM_LT_L4x1_BEGIN: | |||||
/* Single-row tile, taken when bit 0 of M is set. */
andi. T1, M, 1 | |||||
ble DSTRM_LT_L4x1_END | |||||
mr BO, B | |||||
DSTRM_LT_L4x1_LOOP_START: | |||||
INIT_1x4 | |||||
addic. L, KK, 0 | |||||
ble DSTRM_LT_L4x1_SAVE | |||||
DSTRM_LT_L4x1_LOOP: | |||||
KERNEL_1x4 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L4x1_LOOP | |||||
DSTRM_LT_L4x1_SAVE: | |||||
SOLVE_LT_1x4 | |||||
/* CO += 1 element; AO += (K-KK) << BASE_SHIFT; KK += 1. */
addi CO, CO, 1*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 0+BASE_SHIFT | |||||
slwi T4, T4, 2+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 1 | |||||
DSTRM_LT_L4x1_END: | |||||
/* End of one 4-wide pass: B += K << (2+BASE_SHIFT), then repeat while J > 0. */
slwi T1, K, 2+BASE_SHIFT | |||||
add B, B, T1 | |||||
addic. J, J, -1 | |||||
bgt DSTRM_LT_L4_BEGIN | |||||
/* All 4-wide passes done: leave through L999 when N & 3 == 0, otherwise continue with the 2-wide and 1-wide sections. */
andi. T2, N, 3 | |||||
ble L999 | |||||
DSTRM_LT_L4_END: | |||||
b DSTRM_LT_L2_BEGIN | |||||
/* NOTE(review): no branch to L999_H1 is visible in this file; it only forwards to L999. */
L999_H1: | |||||
b L999 | |||||
/* ---- 2-wide section, executed once when bit 1 of N is set. ---- */
DSTRM_LT_L2_BEGIN: | |||||
/* andi. never yields a negative result, so ble acts as branch-if-zero. */
andi. T1, N, 2 | |||||
ble DSTRM_LT_L2_END | |||||
/* CO and AO restart from C and A, C is stepped by 2*LDC, KK restarts from OFFSET. */
mr CO, C | |||||
mr AO, A | |||||
slwi T1, LDC , 1 | |||||
add C, C, T1 | |||||
mr KK, OFFSET | |||||
/* I = M >> 4: number of 16-row tiles. */
srawi. I, M, 4 | |||||
ble DSTRM_LT_L2x16_END | |||||
DSTRM_LT_L2x16_BEGIN: | |||||
/* Every tile reads B from its start. */
mr BO, B | |||||
DSTRM_LT_L2x16_LOOP_START: | |||||
/* INIT_16x2 / KERNEL_16x2 / SOLVE_LT_16x2 are macros defined outside this file (not shown). */
INIT_16x2 | |||||
/* L = KK; skip the k loop when it is not positive. */
addic. L, KK, 0 | |||||
ble DSTRM_LT_L2x16_SAVE | |||||
DSTRM_LT_L2x16_LOOP: | |||||
KERNEL_16x2 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L2x16_LOOP | |||||
DSTRM_LT_L2x16_SAVE: | |||||
SOLVE_LT_16x2 | |||||
/* CO += 16 elements; AO += (K-KK) << (4+BASE_SHIFT); BO += (K-KK) << (1+BASE_SHIFT); KK += 16. */
addi CO, CO, 16*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 4+BASE_SHIFT | |||||
slwi T4, T4, 1+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
/* NOTE(review): BO is reloaded from B at DSTRM_LT_L2x16_BEGIN on the next iteration, so this BO update is overwritten there. */
add BO, BO, T4 | |||||
addi KK, KK, 16 | |||||
addic. I, I, -1 | |||||
bgt DSTRM_LT_L2x16_BEGIN | |||||
DSTRM_LT_L2x16_END: | |||||
/* ---- Remainder rows of the 2-wide section: tiles of 8, 4, 2 and 1 rows chosen by the low bits of M. ---- */
DSTRM_LT_L2x8_BEGIN: | |||||
/* M & 15 == 0 means no leftover rows: jump to the end of the section. andi. never yields a negative result, so ble acts as branch-if-zero here and below. */
andi. T2, M, 15 | |||||
ble DSTRM_LT_L2x1_END | |||||
/* 8-row tile, taken when bit 3 of M is set. */
andi. T1, M, 8 | |||||
ble DSTRM_LT_L2x8_END | |||||
mr BO, B | |||||
DSTRM_LT_L2x8_LOOP_START: | |||||
/* INIT_* / KERNEL_* / SOLVE_LT_* are macros defined outside this file (not shown). */
INIT_8x2 | |||||
/* L = KK; skip the k loop when it is not positive. */
addic. L, KK, 0 | |||||
ble DSTRM_LT_L2x8_SAVE | |||||
DSTRM_LT_L2x8_LOOP: | |||||
KERNEL_8x2 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L2x8_LOOP | |||||
DSTRM_LT_L2x8_SAVE: | |||||
SOLVE_LT_8x2 | |||||
/* CO += 8 elements; AO += (K-KK) << (3+BASE_SHIFT); KK += 8. The BO update is overwritten by the next "mr BO, B". */
addi CO, CO, 8*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 3+BASE_SHIFT | |||||
slwi T4, T4, 1+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 8 | |||||
DSTRM_LT_L2x8_END: | |||||
DSTRM_LT_L2x4_BEGIN: | |||||
/* 4-row tile, taken when bit 2 of M is set. */
andi. T1, M, 4 | |||||
ble DSTRM_LT_L2x4_END | |||||
mr BO, B | |||||
DSTRM_LT_L2x4_LOOP_START: | |||||
INIT_4x2 | |||||
addic. L, KK, 0 | |||||
ble DSTRM_LT_L2x4_SAVE | |||||
DSTRM_LT_L2x4_LOOP: | |||||
KERNEL_4x2 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L2x4_LOOP | |||||
DSTRM_LT_L2x4_SAVE: | |||||
SOLVE_LT_4x2 | |||||
/* CO += 4 elements; AO += (K-KK) << (2+BASE_SHIFT); KK += 4. */
addi CO, CO, 4*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 2+BASE_SHIFT | |||||
slwi T4, T4, 1+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 4 | |||||
DSTRM_LT_L2x4_END: | |||||
DSTRM_LT_L2x2_BEGIN: | |||||
/* 2-row tile, taken when bit 1 of M is set. */
andi. T1, M, 2 | |||||
ble DSTRM_LT_L2x2_END | |||||
mr BO, B | |||||
DSTRM_LT_L2x2_LOOP_START: | |||||
INIT_2x2 | |||||
addic. L, KK, 0 | |||||
ble DSTRM_LT_L2x2_SAVE | |||||
DSTRM_LT_L2x2_LOOP: | |||||
KERNEL_2x2 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L2x2_LOOP | |||||
DSTRM_LT_L2x2_SAVE: | |||||
SOLVE_LT_2x2 | |||||
/* CO += 2 elements; AO += (K-KK) << (1+BASE_SHIFT); KK += 2. */
addi CO, CO, 2*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 1+BASE_SHIFT | |||||
slwi T4, T4, 1+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 2 | |||||
DSTRM_LT_L2x2_END: | |||||
DSTRM_LT_L2x1_BEGIN: | |||||
/* Single-row tile, taken when bit 0 of M is set. */
andi. T1, M, 1 | |||||
ble DSTRM_LT_L2x1_END | |||||
mr BO, B | |||||
DSTRM_LT_L2x1_LOOP_START: | |||||
INIT_1x2 | |||||
addic. L, KK, 0 | |||||
ble DSTRM_LT_L2x1_SAVE | |||||
DSTRM_LT_L2x1_LOOP: | |||||
KERNEL_1x2 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L2x1_LOOP | |||||
DSTRM_LT_L2x1_SAVE: | |||||
SOLVE_LT_1x2 | |||||
/* CO += 1 element; AO += (K-KK) << BASE_SHIFT; KK += 1. */
addi CO, CO, 1*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 0+BASE_SHIFT | |||||
slwi T4, T4, 1+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 1 | |||||
DSTRM_LT_L2x1_END: | |||||
/* End of the 2-wide section: B += K << (1+BASE_SHIFT). */
slwi T1, K, 1+BASE_SHIFT | |||||
add B, B, T1 | |||||
DSTRM_LT_L2_END: | |||||
/* ---- 1-wide section, executed once when bit 0 of N is set. ---- */
DSTRM_LT_L1_BEGIN: | |||||
/* andi. never yields a negative result, so ble acts as branch-if-zero. */
andi. T1, N, 1 | |||||
ble DSTRM_LT_L1_END | |||||
/* CO and AO restart from C and A, KK restarts from OFFSET. C itself is not stepped in this section. */
mr CO, C | |||||
mr AO, A | |||||
mr KK, OFFSET | |||||
/* I = M >> 4: number of 16-row tiles. */
srawi. I, M, 4 | |||||
ble DSTRM_LT_L1x16_END | |||||
DSTRM_LT_L1x16_BEGIN: | |||||
/* Every tile reads B from its start. */
mr BO, B | |||||
DSTRM_LT_L1x16_LOOP_START: | |||||
/* INIT_16x1 / KERNEL_16x1 / SOLVE_LT_16x1 are macros defined outside this file (not shown). */
INIT_16x1 | |||||
/* L = KK; skip the k loop when it is not positive. */
addic. L, KK, 0 | |||||
ble DSTRM_LT_L1x16_SAVE | |||||
DSTRM_LT_L1x16_LOOP: | |||||
KERNEL_16x1 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L1x16_LOOP | |||||
DSTRM_LT_L1x16_SAVE: | |||||
SOLVE_LT_16x1 | |||||
/* CO += 16 elements; AO += (K-KK) << (4+BASE_SHIFT); BO += (K-KK) << BASE_SHIFT; KK += 16. */
addi CO, CO, 16*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 4+BASE_SHIFT | |||||
slwi T4, T4, 0+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
/* NOTE(review): BO is reloaded from B at DSTRM_LT_L1x16_BEGIN on the next iteration, so this BO update is overwritten there. */
add BO, BO, T4 | |||||
addi KK, KK, 16 | |||||
addic. I, I, -1 | |||||
bgt DSTRM_LT_L1x16_BEGIN | |||||
DSTRM_LT_L1x16_END: | |||||
/* ---- Remainder rows of the 1-wide section: tiles of 8, 4, 2 and 1 rows chosen by the low bits of M. ---- */
/* Unlike the 4-wide and 2-wide sections there is no up-front "M & 15" shortcut here; each bit of M is simply tested in turn. */
DSTRM_LT_L1x8_BEGIN: | |||||
/* 8-row tile, taken when bit 3 of M is set. andi. never yields a negative result, so ble acts as branch-if-zero here and below. */
andi. T1, M, 8 | |||||
ble DSTRM_LT_L1x8_END | |||||
mr BO, B | |||||
DSTRM_LT_L1x8_LOOP_START: | |||||
/* INIT_* / KERNEL_* / SOLVE_LT_* are macros defined outside this file (not shown). */
INIT_8x1 | |||||
/* L = KK; skip the k loop when it is not positive. */
addic. L, KK, 0 | |||||
ble DSTRM_LT_L1x8_SAVE | |||||
DSTRM_LT_L1x8_LOOP: | |||||
KERNEL_8x1 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L1x8_LOOP | |||||
DSTRM_LT_L1x8_SAVE: | |||||
SOLVE_LT_8x1 | |||||
/* CO += 8 elements; AO += (K-KK) << (3+BASE_SHIFT); KK += 8. The BO update is overwritten by the next "mr BO, B". */
addi CO, CO, 8*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 3+BASE_SHIFT | |||||
slwi T4, T4, 0+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 8 | |||||
DSTRM_LT_L1x8_END: | |||||
DSTRM_LT_L1x4_BEGIN: | |||||
/* 4-row tile, taken when bit 2 of M is set. */
andi. T1, M, 4 | |||||
ble DSTRM_LT_L1x4_END | |||||
mr BO, B | |||||
DSTRM_LT_L1x4_LOOP_START: | |||||
INIT_4x1 | |||||
addic. L, KK, 0 | |||||
ble DSTRM_LT_L1x4_SAVE | |||||
DSTRM_LT_L1x4_LOOP: | |||||
KERNEL_4x1 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L1x4_LOOP | |||||
DSTRM_LT_L1x4_SAVE: | |||||
SOLVE_LT_4x1 | |||||
/* CO += 4 elements; AO += (K-KK) << (2+BASE_SHIFT); KK += 4. */
addi CO, CO, 4*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 2+BASE_SHIFT | |||||
slwi T4, T4, 0+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 4 | |||||
DSTRM_LT_L1x4_END: | |||||
DSTRM_LT_L1x2_BEGIN: | |||||
/* 2-row tile, taken when bit 1 of M is set. */
andi. T1, M, 2 | |||||
ble DSTRM_LT_L1x2_END | |||||
mr BO, B | |||||
DSTRM_LT_L1x2_LOOP_START: | |||||
INIT_2x1 | |||||
addic. L, KK, 0 | |||||
ble DSTRM_LT_L1x2_SAVE | |||||
DSTRM_LT_L1x2_LOOP: | |||||
KERNEL_2x1 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L1x2_LOOP | |||||
DSTRM_LT_L1x2_SAVE: | |||||
SOLVE_LT_2x1 | |||||
/* CO += 2 elements; AO += (K-KK) << (1+BASE_SHIFT); KK += 2. */
addi CO, CO, 2*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 1+BASE_SHIFT | |||||
slwi T4, T4, 0+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 2 | |||||
DSTRM_LT_L1x2_END: | |||||
DSTRM_LT_L1x1_BEGIN: | |||||
/* Single-row tile, taken when bit 0 of M is set. */
andi. T1, M, 1 | |||||
ble DSTRM_LT_L1x1_END | |||||
mr BO, B | |||||
DSTRM_LT_L1x1_LOOP_START: | |||||
INIT_1x1 | |||||
addic. L, KK, 0 | |||||
ble DSTRM_LT_L1x1_SAVE | |||||
DSTRM_LT_L1x1_LOOP: | |||||
KERNEL_1x1 | |||||
addic. L, L, -1 | |||||
bgt DSTRM_LT_L1x1_LOOP | |||||
DSTRM_LT_L1x1_SAVE: | |||||
SOLVE_LT_1x1 | |||||
/* CO += 1 element; AO += (K-KK) << BASE_SHIFT; KK += 1. */
addi CO, CO, 1*SIZE | |||||
sub T3, K, KK | |||||
sub T4, K, KK | |||||
slwi T3, T3, 0+BASE_SHIFT | |||||
slwi T4, T4, 0+BASE_SHIFT | |||||
add AO, AO, T3 | |||||
add BO, BO, T4 | |||||
addi KK, KK, 1 | |||||
DSTRM_LT_L1x1_END: | |||||
/* End of the logic file; in the kernel file the include is followed directly by the L999 exit label. */
DSTRM_LT_L1_END: |