diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S index 30450cc7d..c819ee6fb 100644 --- a/kernel/arm64/sgemm_ncopy_4.S +++ b/kernel/arm64/sgemm_ncopy_4.S @@ -1,333 +1,333 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A00 x2 -#define LDA x3 -#define B00 x4 - -#define A01 x5 -#define A02 x6 -#define A03 x7 -#define A04 x8 - -#define I x9 -#define J x10 - -#define TEMP1 x11 -#define TEMP2 x12 - -#define A_PREFETCH 2560 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -.macro COPY4x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - ldr q2, [A03], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - - ldr q3, [A04], #16 - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] - add B00, B00, #64 - -.endm - -.macro COPY1x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - ldr s2, [A03], #4 - ldr s3, [A04], #4 - - stp s0, s1, [B00] - add B00, B00, #8 - stp s2, s3, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] - add B00, B00, #32 -.endm - - -.macro COPY1x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - - stp s0, s1, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr q0, [A01], #16 - str q0, [B00], #16 -.endm - - -.macro COPY1x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01], #4 - str s0, [B00], #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - -.Ldgemm_ncopy_L4_BEGIN: - - asr J, N, #2 // J = N / 4 - cmp 
J, #0 - ble .Ldgemm_ncopy_L2_BEGIN - - .align 5 -.Ldgemm_ncopy_L4_M4_BEGIN: - - mov A01, A00 - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A00, A04, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_40 - - .align 5 -.Ldgemm_ncopy_L4_M4_20: - - COPY4x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_20 - -.Ldgemm_ncopy_L4_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_END - - .align 5 -.Ldgemm_ncopy_L4_M4_60: - - COPY1x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_60 - -.Ldgemm_ncopy_L4_M4_END: - - subs J , J, #1 // j-- - bne .Ldgemm_ncopy_L4_M4_BEGIN - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L2_BEGIN: - - tst N, #3 - ble .Ldgemm_ncopy_L999 - - tst N, #2 - ble .Ldgemm_ncopy_L1_BEGIN - -.Ldgemm_ncopy_L2_M4_BEGIN: - mov A01, A00 - add A02, A01, LDA - add A00, A02, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_40 - - .align 5 -.Ldgemm_ncopy_L2_M4_20: - - COPY4x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_20 - -.Ldgemm_ncopy_L2_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_END - - .align 5 -.Ldgemm_ncopy_L2_M4_60: - - COPY1x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_60 - -.Ldgemm_ncopy_L2_M4_END: - - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L1_BEGIN: - - tst N, #1 - ble .Ldgemm_ncopy_L999 - -.Ldgemm_ncopy_L1_M4_BEGIN: - - mov A01, A00 - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_40 - - .align 5 -.Ldgemm_ncopy_L1_M4_20: - - COPY4x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_20 - - -.Ldgemm_ncopy_L1_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_END - - .align 5 -.Ldgemm_ncopy_L1_M4_60: - - COPY1x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_60 - - -.Ldgemm_ncopy_L1_M4_END: - -.Ldgemm_ncopy_L999: - - mov x0, #0 - RESTORE_REGS - ret - - EPILOGUE - +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 + +#define I x9 +#define J x10 + +#define TEMP1 x11 +#define TEMP2 x12 + +#define A_PREFETCH 2560 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + + ldr q3, [A04], #16 + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] + add B00, B00, #64 + +.endm + +.macro COPY1x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ldr s2, [A03], #4 + ldr s3, [A04], #4 + + stp s0, s1, [B00] + add B00, B00, #8 + stp s2, s3, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] + add B00, B00, #32 +.endm + + +.macro COPY1x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + + stp s0, s1, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x1 + prfm PLDL1KEEP, [A01, 
#A_PREFETCH] + + ldr q0, [A01], #16 + str q0, [B00], #16 +.endm + + +.macro COPY1x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Ldgemm_ncopy_L4_BEGIN: + + asr J, N, #2 // J = N / 4 + cmp J, #0 + ble .Ldgemm_ncopy_L2_BEGIN + + .align 5 +.Ldgemm_ncopy_L4_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_40 + + .align 5 +.Ldgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_20 + +.Ldgemm_ncopy_L4_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_END + + .align 5 +.Ldgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_60 + +.Ldgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne .Ldgemm_ncopy_L4_M4_BEGIN + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble .Ldgemm_ncopy_L999 + + tst N, #2 + ble .Ldgemm_ncopy_L1_BEGIN + +.Ldgemm_ncopy_L2_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_40 + + .align 5 +.Ldgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_20 + +.Ldgemm_ncopy_L2_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_END + + .align 5 +.Ldgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_60 + +.Ldgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Ldgemm_ncopy_L999 + +.Ldgemm_ncopy_L1_M4_BEGIN: + + mov A01, A00 + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_40 + + .align 5 +.Ldgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_20 + + +.Ldgemm_ncopy_L1_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_END + + .align 5 +.Ldgemm_ncopy_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_60 + + +.Ldgemm_ncopy_L1_M4_END: + +.Ldgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 431f1ae2a..3066421bb 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -1,814 +1,814 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A x2 -#define LDA x3 -#define B x4 - -#define M8 x5 - -#define A01 x6 -#define A02 x7 -#define A03 x8 -#define A04 x9 -#define A05 x10 -#define A06 x11 -#define A07 x12 -#define A08 x13 - -#define B01 x14 -#define B02 x15 -#define B03 x16 -#define B04 x17 -#define B00 x22 - - -#define I x21 -#define J x19 - -#define TEMP1 x20 - -#define A_PREFETCH 256 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - //prfm PSTL1KEEP, [B00, M8] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] - add A05, A05, #64 - - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] - add A06, A06, #64 - - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] - add A07, A07, #64 - - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] - add A08, A08, #64 - - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - add B00, B00, M8 - -.endm - -.macro COPY8x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, 
v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 - - ldp q8, q9, [A05] - ldp q10, q11, [A06] - add A05, A05, #32 - add A06, A06, #32 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] - add B01, B01, #64 - - ldp q12, q13, [A07] - ldp q14, q15, [A08] - add A07, A07, #32 - add A08, A08, #32 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - add B02, B02, #64 - - ldr q4, [A05] - ldr q5, [A06] - ldr q6, [A07] - ldr q7, [A08] - - add A05, A05, #16 - add A06, A06, #16 - add A07, A07, #16 - add A08, A08, #16 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] - add B02, B02, #64 -.endm - -.macro COPY2x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - add B03, B03, #16 - - ldr d4, [A05] - ldr d5, [A06] - ldr d6, [A07] - ldr d7, [A08] - - add A05, A05, #8 - add A06, A06, #8 - add A07, A07, #8 - add A08, A08, #8 - - stp d4, d5, [B03] - add B03, B03, #16 - stp d6, d7, [B03] - add B03, B03, #16 - -.endm - -.macro COPY1x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - - ldr s4, [A05] - ldr s5, [A06] - ldr s6, [A07] - ldr s7, [A08] - - stp s4, s5, [B04] - add B04, B04, #8 - stp s6, s7, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ -.macro COPY16x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - - add B00, B00, M8 -.endm - -.macro COPY8x4 - prfm PLDL1KEEP, [A01, 
#A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - - add B02, B02, #64 -.endm - -.macro COPY2x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - - add B03, B03, #16 -.endm - -.macro COPY1x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add B00, B00, M8 -.endm - -.macro COPY8x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s}, [A01] - ld1 {v2.4s, v3.4s}, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - add A01, A01, #16 - add A02, A02, #16 - - stp q0, q1, [B02] - add B02, B02, #32 -.endm - -.macro COPY2x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - - add A01, A01, #8 - add A02, A02, #8 - - stp d0, d1, [B03] - add B03, B03, #16 -.endm - -.macro COPY1x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - - add A01, A01, #4 - add A02, A02, #4 - - stp s0, s1, [B04] - - add B04, B04, #8 -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add B00, B00, M8 -.endm - -.macro COPY8x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldp q0, q1, [A01] - add A01, A01, #32 - stp q0, q1, [B01] - - add B01, B01, #32 -.endm - -.macro COPY4x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] 
- - ldr q0, [A01] - add A01, A01, #16 - str q0, [B02] - - add B02, B02, #16 -.endm - -.macro COPY2x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr d0, [A01] - add A01, A01, #8 - str d0, [B03] - - add B03, B03, #8 -.endm - -.macro COPY1x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01] - add A01, A01, #4 - str s0, [B04] - - add B04, B04, #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - - lsl TEMP1, M, #2 // TEMP1 = M * SIZE - - and B01 , N , #-16 - and B02 , N , #-8 - and B03 , N , #-4 - and B04 , N , #-2 - - mul B01, B01, TEMP1 - mul B02, B02, TEMP1 - mul B03, B03, TEMP1 - mul B04, B04, TEMP1 - - add B01 , B01, B - add B02 , B02, B - add B03 , B03, B - add B04 , B04, B - - lsl M8, M, #6 // M8 = M * 16 * SIZE - -.Lsgemm_tcopy_L8_BEGIN: - asr J, M, #3 // J = M / 8 - cmp J, #0 - ble .Lsgemm_tcopy_L4_BEGIN - - .align 5 -.Lsgemm_tcopy_L8_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A05, A04, LDA - add A06, A05, LDA - add A07, A06, LDA - add A08, A07, LDA - add A, A08, LDA - - mov B00, B - add B, B00, #512 // B = B + 8 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L8_M16_40 - - .align 5 -.Lsgemm_tcopy_L8_M16_20: - - COPY16x8 - - subs I , I , #1 - bne .Lsgemm_tcopy_L8_M16_20 - -.Lsgemm_tcopy_L8_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L8_M16_60 - - COPY8x8 - -.Lsgemm_tcopy_L8_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L8_M16_80 - - COPY4x8 - -.Lsgemm_tcopy_L8_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L8_M16_100 - - COPY2x8 - -.Lsgemm_tcopy_L8_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L8_M16_END - - COPY1x8 - -.Lsgemm_tcopy_L8_M16_END: - - subs J , J, #1 // j-- - bne .Lsgemm_tcopy_L8_M16_BEGIN - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L4_BEGIN: - tst M, #7 - ble .Lsgemm_tcopy_L999 - - tst M, #4 - ble .Lsgemm_tcopy_L2_BEGIN - -.Lsgemm_tcopy_L4_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A, A04, LDA - - mov B00, B - add B, B00, #256 // B = B + 4 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L4_M16_40 - - .align 5 -.Lsgemm_tcopy_L4_M16_20: - - COPY16x4 - - subs I , I , #1 - bne .Lsgemm_tcopy_L4_M16_20 - -.Lsgemm_tcopy_L4_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L4_M16_60 - - COPY8x4 - -.Lsgemm_tcopy_L4_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L4_M16_80 - - COPY4x4 - -.Lsgemm_tcopy_L4_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L4_M16_100 - - COPY2x4 - - -.Lsgemm_tcopy_L4_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L4_M16_END - - COPY1x4 - - -.Lsgemm_tcopy_L4_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L2_BEGIN: - - tst M, #3 - ble .Lsgemm_tcopy_L999 - - tst M, #2 - ble .Lsgemm_tcopy_L1_BEGIN - -.Lsgemm_tcopy_L2_M16_BEGIN: - mov A01, A - add A02, A01, LDA - add A, A02, LDA - - mov B00, B - add B, B00, #128 // B = B + 2 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L2_M16_40 - - .align 5 -.Lsgemm_tcopy_L2_M16_20: - - COPY16x2 - - subs I , I , #1 - bne .Lsgemm_tcopy_L2_M16_20 - -.Lsgemm_tcopy_L2_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L2_M16_60 - - COPY8x2 - -.Lsgemm_tcopy_L2_M16_60: - tst N , #4 - ble 
.Lsgemm_tcopy_L2_M16_80 - - COPY4x2 - -.Lsgemm_tcopy_L2_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L2_M16_100 - - COPY2x2 - -.Lsgemm_tcopy_L2_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L2_M16_END - - COPY1x2 - -.Lsgemm_tcopy_L2_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L1_BEGIN: - - tst M, #1 - ble .Lsgemm_tcopy_L999 - - -.Lsgemm_tcopy_L1_M16_BEGIN: - - mov A01, A // A01 = A - mov B00, B - - asr I, N, #4 // I = M / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L1_M16_40 - - .align 5 -.Lsgemm_tcopy_L1_M16_20: - - COPY16x1 - - subs I , I , #1 - bne .Lsgemm_tcopy_L1_M16_20 - -.Lsgemm_tcopy_L1_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L1_M16_60 - - COPY8x1 - -.Lsgemm_tcopy_L1_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L1_M16_80 - - COPY4x1 - -.Lsgemm_tcopy_L1_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L1_M16_100 - - COPY2x1 - -.Lsgemm_tcopy_L1_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L1_M16_END - - COPY1x1 - - -.Lsgemm_tcopy_L1_M16_END: - -.Lsgemm_tcopy_L999: - mov x0, #0 // set return value - RESTORE_REGS - ret - - EPILOGUE - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x21 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + //prfm PSTL1KEEP, [B00, M8] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + 
add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] + add A05, A05, #64 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] + add A06, A06, #64 + + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] + add A07, A07, #64 + + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] + add A08, A08, #64 + + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 + +.endm + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] + add B01, B01, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + add B02, B02, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] + add B02, B02, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + add B03, B03, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B03] + add B03, B03, #16 + stp d6, d7, [B03] + add B03, B03, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + 
//prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + stp s4, s5, [B04] + add B04, B04, #8 + stp s6, s7, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ +.macro COPY16x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + + add B00, B00, M8 +.endm + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + + add B02, B02, #64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + + add B03, B03, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add B00, B00, M8 +.endm + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add 
A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B02] + add B02, B02, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B03] + add B03, B03, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B04] + + add B04, B04, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B01] + + add B01, B01, #32 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B02] + + add B02, B02, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B03] + + add B03, B03, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B04] + + add B04, B04, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-16 + and B02 , N , #-8 + and B03 , N , #-4 + and B04 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + mul B04, B04, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + add B04 , B04, B + + lsl M8, M, #6 // M8 = M * 16 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #512 // B = B + 8 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L8_M16_40 + + .align 5 +.Lsgemm_tcopy_L8_M16_20: + + COPY16x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M16_20 + +.Lsgemm_tcopy_L8_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L8_M16_60 + + COPY8x8 + +.Lsgemm_tcopy_L8_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L8_M16_80 + + COPY4x8 + +.Lsgemm_tcopy_L8_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M16_100 + + COPY2x8 + +.Lsgemm_tcopy_L8_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M16_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M16_END: + + subs J , J, #1 // j-- + bne .Lsgemm_tcopy_L8_M16_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov 
B00, B + add B, B00, #256 // B = B + 4 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M16_40 + + .align 5 +.Lsgemm_tcopy_L4_M16_20: + + COPY16x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M16_20 + +.Lsgemm_tcopy_L4_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L4_M16_60 + + COPY8x4 + +.Lsgemm_tcopy_L4_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L4_M16_80 + + COPY4x4 + +.Lsgemm_tcopy_L4_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M16_100 + + COPY2x4 + + +.Lsgemm_tcopy_L4_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L4_M16_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #128 // B = B + 2 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M16_40 + + .align 5 +.Lsgemm_tcopy_L2_M16_20: + + COPY16x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M16_20 + +.Lsgemm_tcopy_L2_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L2_M16_60 + + COPY8x2 + +.Lsgemm_tcopy_L2_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L2_M16_80 + + COPY4x2 + +.Lsgemm_tcopy_L2_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M16_100 + + COPY2x2 + +.Lsgemm_tcopy_L2_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M16_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #4 // I = M / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M16_40 + + .align 5 +.Lsgemm_tcopy_L1_M16_20: + + COPY16x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M16_20 + +.Lsgemm_tcopy_L1_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L1_M16_60 + + COPY8x1 + +.Lsgemm_tcopy_L1_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L1_M16_80 + + COPY4x1 + +.Lsgemm_tcopy_L1_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M16_100 + + COPY2x1 + +.Lsgemm_tcopy_L1_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M16_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M16_END: + +.Lsgemm_tcopy_L999: + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE + + diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S index 4b5c2fa31..dfe17f3ef 100644 --- a/kernel/power/cgemm_kernel_power9.S +++ b/kernel/power/cgemm_kernel_power9.S @@ -1,293 +1,293 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - -#define alpha_r vs19 -#define alpha_i vs20 -#define save_permute_1 vs21 -#define permute_mask vs22 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define PRE r29 - -#define T12 r30 -#define T13 r31 - -#include "cgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_12, 0x0c0d0e0f1c1d1e1f -.equ save_permute_11, 0x0405060714151617 - - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - - - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) - - - -#ifdef TRMMKERNEL - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) -#endif - slwi LDC, LDC, ZBASE_SHIFT - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xscvdpspn alpha_i,vs2 - xxspltw alpha_r,alpha_r,0 - xxspltw alpha_i,alpha_i,0 -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - - - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - - - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - - - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - - - li r0,0 - li PRE,512 - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegsp alpha_r,alpha_r - xvnegsp alpha_i,alpha_i -#endif - - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - - /*mask is reverse permute so we have to make it inner permute */ - xxpermdi permute_mask, permute_mask, permute_mask,2 - -#include "cgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs19 +#define alpha_i vs20 +#define save_permute_1 vs21 +#define permute_mask vs22 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S index b4f937e90..a191219fa 100644 --- a/kernel/power/cgemm_logic_power9.S +++ b/kernel/power/cgemm_logic_power9.S @@ -1,2816 +1,2816 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define MY_ALIGN .align 3 -b CGEMM_L4 -/* MINI SUBROUTINES */ -/* 4x8 MAIN 128x+2 LOOP */ - - -CGEMM_L4x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x8_2 - MY_ALIGN -CGEMM_L4x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 -CGEMM_L4x8_K128: -/*----------------------------------------*/ - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_L2 128,64,31,0 - KERNEL4x8_L2 128,64,32,0 - KERNEL4x8_L2 128,64,33,0 - KERNEL4x8_L2 128,64,34,0 - KERNEL4x8_L2 128,64,35,0 - KERNEL4x8_L2 128,64,36,0 - KERNEL4x8_L2 128,64,37,0 - KERNEL4x8_L2 128,64,38,0 - KERNEL4x8_L2 128,64,39,0 - KERNEL4x8_L2 128,64,40,0 - KERNEL4x8_L2 128,64,41,0 - KERNEL4x8_L2 128,64,42,0 - KERNEL4x8_L2 128,64,43,0 - KERNEL4x8_L2 128,64,44,0 - KERNEL4x8_L2 128,64,45,0 - KERNEL4x8_L2 128,64,46,0 - KERNEL4x8_L2 128,64,47,0 - KERNEL4x8_L2 128,64,48,0 - KERNEL4x8_L2 128,64,49,0 - KERNEL4x8_L2 128,64,50,0 - KERNEL4x8_L2 128,64,51,0 - KERNEL4x8_L2 128,64,52,0 - KERNEL4x8_L2 128,64,53,0 - KERNEL4x8_L2 128,64,54,0 - KERNEL4x8_L2 128,64,55,0 - KERNEL4x8_L2 128,64,56,0 - KERNEL4x8_L2 128,64,57,0 - KERNEL4x8_L2 128,64,58,0 - KERNEL4x8_L2 128,64,59,0 - KERNEL4x8_L2 128,64,60,0 - KERNEL4x8_L2 128,64,61,0 - KERNEL4x8_L2 128,64,62,0 - KERNEL4x8_L2 128,64,63,1 - bdnz CGEMM_L4x8_LOOP - MY_ALIGN -CGEMM_L4x8_LOOP_END: -/*----------------------------------------*/ - END4x8_2 - blr - MY_ALIGN - - -CGEMM_4x8_L64_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - 
KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_E2 128,64,31,1 - blr - MY_ALIGN - - -CGEMM_4x8_L32_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_E2 128,64,15,1 - blr - MY_ALIGN - - -CGEMM_4x8_L16_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_E2 128,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x4_2 - MY_ALIGN -CGEMM_L4x4_LOOP: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,0,0 -CGEMM_L4x4_K32: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_L2 64,64,7,0 - KERNEL4x4_L2 64,64,8,0 - KERNEL4x4_L2 64,64,9,0 - KERNEL4x4_L2 64,64,10,0 - KERNEL4x4_L2 64,64,11,0 - KERNEL4x4_L2 64,64,12,0 - KERNEL4x4_L2 64,64,13,0 - KERNEL4x4_L2 64,64,14,0 - KERNEL4x4_L2 64,64,15,1 - bdnz CGEMM_L4x4_LOOP - MY_ALIGN -CGEMM_L4x4_LOOP_END: -/*----------------------------------------*/ - END4x4_2 - blr - MY_ALIGN - - -CGEMM_4x4_L16_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_E2 64,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_L8_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_E2 64,64,3,1 - blr - - -CGEMM_4x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x2_2 - MY_ALIGN -CGEMM_L4x2_LOOP: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,0,0 -CGEMM_L4x2_K32: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,1,0 - 
KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_L2 32,64,7,0 - KERNEL4x2_L2 32,64,8,0 - KERNEL4x2_L2 32,64,9,0 - KERNEL4x2_L2 32,64,10,0 - KERNEL4x2_L2 32,64,11,0 - KERNEL4x2_L2 32,64,12,0 - KERNEL4x2_L2 32,64,13,0 - KERNEL4x2_L2 32,64,14,0 - KERNEL4x2_L2 32,64,15,1 - bdnz CGEMM_L4x2_LOOP - MY_ALIGN - - -CGEMM_L4x2_LOOP_END: -/*----------------------------------------*/ - END4x2_2 - blr - MY_ALIGN -CGEMM_4x2_L16_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_E2 32,64,7,1 - blr - MY_ALIGN -CGEMM_4x2_L8_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_E2 32,64,3,1 - blr - - -CGEMM_4x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x1_2 - MY_ALIGN -CGEMM_L4x1_LOOP: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,0,0 -CGEMM_L4x1_K32: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_L2 16,64,7,0 - KERNEL4x1_L2 16,64,8,0 - KERNEL4x1_L2 16,64,9,0 - KERNEL4x1_L2 16,64,10,0 - KERNEL4x1_L2 16,64,11,0 - KERNEL4x1_L2 16,64,12,0 - KERNEL4x1_L2 16,64,13,0 - KERNEL4x1_L2 16,64,14,0 - KERNEL4x1_L2 16,64,15,1 - bdnz CGEMM_L4x1_LOOP - MY_ALIGN -CGEMM_L4x1_LOOP_END: -/*----------------------------------------*/ - END4x1_2 - blr - - MY_ALIGN -CGEMM_4x1_L16_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_E2 16,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x1_L8_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_E2 16,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L4: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 2 - ble CGEMM_L4_END - - -CGEMM_L4_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 2 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L4x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L4x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO4x8 - ble CGEMM_L4x8_SUB0 - bl CGEMM_L4x8_LMAIN_SUB - andi. 
L, T1, 127 - ble CGEMM_L4x8_SAVE - b CGEMM_L4x8_SUB2 - - -CGEMM_L4x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP4x8_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD4x8O 64,32 - END4x8_WITHOUT_ADD - LOAD4x8_2O 128, 64 - mtctr T8 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - CMP4x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L4x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD4x8_2O 128,64 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - MY_ALIGN - - -CGEMM_L4x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L4x8_SUB2_32 - bl CGEMM_4x8_L64_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L4x8_SUB2_16 - bl CGEMM_4x8_L32_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x8_SUB2_8 - bl CGEMM_4x8_L16_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x8_SUB2_4 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_L2 128,64, 1,0 - KERNEL4x8_L2 128,64, 2,0 - KERNEL4x8_E2 128,64, 3,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x8_SUB2_2 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_E2 128,64, 1,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x8_SUB2_1 - LOAD4x8_2 - KERNEL4x8_E2 128,64, 0,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x8_SAVE - KERNEL4x8 - - MY_ALIGN -CGEMM_L4x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 -#endif - bgt CGEMM_L4x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END - b CGEMM_L4x4_BEGIN - MY_ALIGN - - -CGEMM_L4x8_END: -/*----------------------------------------*/ - - -CGEMM_L4x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x4 - ble CGEMM_L4x4_SUB0 - bl CGEMM_4x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x4_SAVE - b CGEMM_L4x4_SUB2 - - -CGEMM_L4x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x4_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD4x4O 32,32 - END4x4_WITHOUT_ADD - LOAD4x4_2O 64, 64 - mtctr T8 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - CMP4x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD4x4_2O 64,64 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x4_SUB2: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble CGEMM_L4x4_SUB2_8 - bl CGEMM_4x4_L16_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x4_SUB2_4 - bl CGEMM_4x4_L8_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x4_SUB2_2 - LOAD4x4_2 - KERNEL4x4_L2 64,64, 0,0 - KERNEL4x4_E2 64,64, 1,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x4_SUB2_1 - LOAD4x4_2 - KERNEL4x4_E2 64,64, 0,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x4_SAVE - KERNEL4x4 - - -CGEMM_L4x4_SAVE: -/*----------------------------------------*/ - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 -#endif - - -CGEMM_L4x4_END: -/*----------------------------------------*/ - - -CGEMM_L4x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x2 - ble CGEMM_L4x2_SUB0 - bl CGEMM_4x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x2_SAVE - b CGEMM_L4x2_SUB2 - - -CGEMM_L4x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x2_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD4x2O 16,32 - END4x2_WITHOUT_ADD - LOAD4x2_2O 32, 64 - mtctr T8 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - CMP4x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD4x2_2O 32,64 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x2_SUB2_8 - bl CGEMM_4x2_L16_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x2_SUB2_4 - bl CGEMM_4x2_L8_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x2_SUB2_2 - LOAD4x2_2 - KERNEL4x2_L2 32,64, 0,0 - KERNEL4x2_E2 32,64, 1,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x2_SUB2_1 - LOAD4x2_2 - KERNEL4x2_E2 32,64, 0,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -CGEMM_L4x2_SAVE: -/*----------------------------------------*/ - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 -#endif - - -CGEMM_L4x2_END: -/*----------------------------------------*/ - - -CGEMM_L4x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x1 - ble CGEMM_L4x1_SUB0 - bl CGEMM_4x1_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L4x1_SAVE - b CGEMM_L4x1_SUB2 - - -CGEMM_L4x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x1_32K - addi BO,BO,-32 - addi AO,AO,-8 - LOAD4x1O 8,32 - END4x1_WITHOUT_ADD - LOAD4x1_2O 16, 64 - mtctr T8 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - CMP4x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-16 - LOAD4x1_2O 16,64 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x1_SUB2_8 - bl CGEMM_4x1_L16_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x1_SUB2_4 - bl CGEMM_4x1_L8_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x1_SUB2_2 - LOAD4x1_2 - KERNEL4x1_L2 16,64, 0,0 - KERNEL4x1_E2 16,64, 1,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x1_SUB2_1 - LOAD4x1_2 - KERNEL4x1_E2 16,64, 0,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -CGEMM_L4x1_SAVE: -/*----------------------------------------*/ - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 -#endif - - -CGEMM_L4x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - bgt CGEMM_L4_BEGIN - - -CGEMM_L4_END: - -b CGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -CGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -CGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 -CGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_L2 128,32,31,0 - KERNEL2x8_L2 128,32,32,0 - KERNEL2x8_L2 128,32,33,0 - KERNEL2x8_L2 128,32,34,0 - KERNEL2x8_L2 128,32,35,0 - KERNEL2x8_L2 128,32,36,0 - KERNEL2x8_L2 128,32,37,0 - KERNEL2x8_L2 128,32,38,0 - KERNEL2x8_L2 128,32,39,0 - KERNEL2x8_L2 128,32,40,0 - KERNEL2x8_L2 128,32,41,0 - KERNEL2x8_L2 128,32,42,0 - KERNEL2x8_L2 128,32,43,0 - KERNEL2x8_L2 128,32,44,0 - KERNEL2x8_L2 128,32,45,0 - KERNEL2x8_L2 128,32,46,0 - KERNEL2x8_L2 128,32,47,0 - 
KERNEL2x8_L2 128,32,48,0 - KERNEL2x8_L2 128,32,49,0 - KERNEL2x8_L2 128,32,50,0 - KERNEL2x8_L2 128,32,51,0 - KERNEL2x8_L2 128,32,52,0 - KERNEL2x8_L2 128,32,53,0 - KERNEL2x8_L2 128,32,54,0 - KERNEL2x8_L2 128,32,55,0 - KERNEL2x8_L2 128,32,56,0 - KERNEL2x8_L2 128,32,57,0 - KERNEL2x8_L2 128,32,58,0 - KERNEL2x8_L2 128,32,59,0 - KERNEL2x8_L2 128,32,60,0 - KERNEL2x8_L2 128,32,61,0 - KERNEL2x8_L2 128,32,62,0 - KERNEL2x8_L2 128,32,63,1 - bdnz CGEMM_L2x8_LOOP - MY_ALIGN -CGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -CGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_E2 128,32,31,1 - blr - MY_ALIGN - - -CGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_E2 128,32,15,1 - blr - MY_ALIGN - - -CGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_E2 128,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -CGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,0,0 -CGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_L2 64,32,7,0 - KERNEL2x4_L2 64,32,8,0 - KERNEL2x4_L2 64,32,9,0 - KERNEL2x4_L2 64,32,10,0 - KERNEL2x4_L2 64,32,11,0 - KERNEL2x4_L2 64,32,12,0 - KERNEL2x4_L2 64,32,13,0 - KERNEL2x4_L2 64,32,14,0 - KERNEL2x4_L2 64,32,15,1 - bdnz CGEMM_L2x4_LOOP - MY_ALIGN -CGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -CGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 
64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_E2 64,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_E2 64,32,3,1 - blr - - -CGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -CGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,0,0 -CGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_L2 32,32,7,0 - KERNEL2x2_L2 32,32,8,0 - KERNEL2x2_L2 32,32,9,0 - KERNEL2x2_L2 32,32,10,0 - KERNEL2x2_L2 32,32,11,0 - KERNEL2x2_L2 32,32,12,0 - KERNEL2x2_L2 32,32,13,0 - KERNEL2x2_L2 32,32,14,0 - KERNEL2x2_L2 32,32,15,1 - bdnz CGEMM_L2x2_LOOP - MY_ALIGN - - -CGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -CGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_E2 32,32,7,1 - blr - MY_ALIGN -CGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_E2 32,32,3,1 - blr - - -CGEMM_2x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -CGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,0,0 -CGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_L2 16,32,7,0 - KERNEL2x1_L2 16,32,8,0 - KERNEL2x1_L2 16,32,9,0 - KERNEL2x1_L2 16,32,10,0 - KERNEL2x1_L2 16,32,11,0 - KERNEL2x1_L2 16,32,12,0 - KERNEL2x1_L2 16,32,13,0 - KERNEL2x1_L2 16,32,14,0 - KERNEL2x1_L2 16,32,15,1 - bdnz CGEMM_L2x1_LOOP - MY_ALIGN -CGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -CGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_E2 16,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_E2 16,32,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L2: -/*----------------------------------------*/ - - andi. J, N, 2 - ble CGEMM_L2_END - - -CGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. 
I, M, 3 - ble CGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble CGEMM_L2x8_SUB0 - bl CGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L2x8_SAVE - b CGEMM_L2x8_SUB2 - - -CGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD2x8O 64,16 - END2x8_WITHOUT_ADD - LOAD2x8_2O 128, 32 - mtctr T8 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8_2O 128,32 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - MY_ALIGN - - -CGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L2x8_SUB2_32 - bl CGEMM_2x8_L64_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L2x8_SUB2_16 - bl CGEMM_2x8_L32_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x8_SUB2_8 - bl CGEMM_2x8_L16_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_L2 128,32, 1,0 - KERNEL2x8_L2 128,32, 2,0 - KERNEL2x8_E2 128,32, 3,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_E2 128,32, 1,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 128,32, 0,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -CGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt CGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END - b CGEMM_L2x4_BEGIN - MY_ALIGN - - -CGEMM_L2x8_END: -/*----------------------------------------*/ - - -CGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble CGEMM_L2x4_SUB0 - bl CGEMM_2x4_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L2x4_SAVE - b CGEMM_L2x4_SUB2 - - -CGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD2x4O 32,16 - END2x4_WITHOUT_ADD - LOAD2x4_2O 64, 32 - mtctr T8 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4_2O 64,32 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x4_SUB2_8 - bl CGEMM_2x4_L16_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x4_SUB2_4 - bl CGEMM_2x4_L8_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 64,32, 0,0 - KERNEL2x4_E2 64,32, 1,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 64,32, 0,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x4_SAVE - KERNEL2x4 - - -CGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -CGEMM_L2x4_END: -/*----------------------------------------*/ - - -CGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble CGEMM_L2x2_SUB0 - bl CGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x2_SAVE - b CGEMM_L2x2_SUB2 - - -CGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD2x2O 16,16 - END2x2_WITHOUT_ADD - LOAD2x2_2O 32, 32 - mtctr T8 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2_2O 32,32 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x2_SUB2_8 - bl CGEMM_2x2_L16_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x2_SUB2_4 - bl CGEMM_2x2_L8_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 32,32, 0,0 - KERNEL2x2_E2 32,32, 1,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 32,32, 0,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -CGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -CGEMM_L2x2_END: -/*----------------------------------------*/ - - -CGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble CGEMM_L2x1_SUB0 - bl CGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x1_SAVE - b CGEMM_L2x1_SUB2 - - -CGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-16 - addi AO,AO,-8 - LOAD2x1O 8,16 - END2x1_WITHOUT_ADD - LOAD2x1_2O 16, 32 - mtctr T8 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1_2O 16,32 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x1_SUB2_8 - bl CGEMM_2x1_L16_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x1_SUB2_4 - bl CGEMM_2x1_L8_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 16,32, 0,0 - KERNEL2x1_E2 16,32, 1,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 16,32, 0,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -CGEMM_L2x1_SAVE: -/*----------------------------------------*/ - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -CGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 4 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - -CGEMM_L2_END: - - -b CGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -CGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -CGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 -CGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_L2 128,16,31,0 - KERNEL1x8_L2 128,16,32,0 - KERNEL1x8_L2 128,16,33,0 - KERNEL1x8_L2 128,16,34,0 - KERNEL1x8_L2 128,16,35,0 - KERNEL1x8_L2 128,16,36,0 - KERNEL1x8_L2 128,16,37,0 - KERNEL1x8_L2 128,16,38,0 - KERNEL1x8_L2 128,16,39,0 - KERNEL1x8_L2 128,16,40,0 - KERNEL1x8_L2 128,16,41,0 - KERNEL1x8_L2 128,16,42,0 - KERNEL1x8_L2 128,16,43,0 - KERNEL1x8_L2 128,16,44,0 - KERNEL1x8_L2 128,16,45,0 - KERNEL1x8_L2 128,16,46,0 - KERNEL1x8_L2 128,16,47,0 - KERNEL1x8_L2 128,16,48,0 - KERNEL1x8_L2 128,16,49,0 - KERNEL1x8_L2 128,16,50,0 - KERNEL1x8_L2 128,16,51,0 - KERNEL1x8_L2 128,16,52,0 - KERNEL1x8_L2 128,16,53,0 - KERNEL1x8_L2 128,16,54,0 - KERNEL1x8_L2 128,16,55,0 - KERNEL1x8_L2 128,16,56,0 - KERNEL1x8_L2 128,16,57,0 - KERNEL1x8_L2 128,16,58,0 - KERNEL1x8_L2 128,16,59,0 - KERNEL1x8_L2 128,16,60,0 - KERNEL1x8_L2 128,16,61,0 - KERNEL1x8_L2 128,16,62,0 - KERNEL1x8_L2 128,16,63,1 - bdnz CGEMM_L1x8_LOOP - MY_ALIGN -CGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -CGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 
128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_E2 128,16,31,1 - blr - MY_ALIGN - - -CGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_E2 128,16,15,1 - blr - MY_ALIGN - - -CGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_E2 128,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN -CGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,0,0 -CGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_L2 64,16,7,0 - KERNEL1x4_L2 64,16,8,0 - KERNEL1x4_L2 64,16,9,0 - KERNEL1x4_L2 64,16,10,0 - KERNEL1x4_L2 64,16,11,0 - KERNEL1x4_L2 64,16,12,0 - KERNEL1x4_L2 64,16,13,0 - KERNEL1x4_L2 64,16,14,0 - KERNEL1x4_L2 64,16,15,1 - bdnz CGEMM_L1x4_LOOP - MY_ALIGN -CGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -CGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_E2 64,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_E2 64,16,3,1 - blr - - -CGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN -CGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,0,0 -CGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - KERNEL1x2_L2 32,16,7,0 - KERNEL1x2_L2 32,16,8,0 - KERNEL1x2_L2 32,16,9,0 - KERNEL1x2_L2 32,16,10,0 - KERNEL1x2_L2 32,16,11,0 - KERNEL1x2_L2 32,16,12,0 - KERNEL1x2_L2 32,16,13,0 - KERNEL1x2_L2 32,16,14,0 - KERNEL1x2_L2 32,16,15,1 - bdnz CGEMM_L1x2_LOOP - MY_ALIGN - - -CGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN -CGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - 
KERNEL1x2_E2 32,16,7,1 - blr - MY_ALIGN -CGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_E2 32,16,3,1 - blr - - -CGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN -CGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,0,0 -CGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_L2 16,16,7,0 - KERNEL1x1_L2 16,16,8,0 - KERNEL1x1_L2 16,16,9,0 - KERNEL1x1_L2 16,16,10,0 - KERNEL1x1_L2 16,16,11,0 - KERNEL1x1_L2 16,16,12,0 - KERNEL1x1_L2 16,16,13,0 - KERNEL1x1_L2 16,16,14,0 - KERNEL1x1_L2 16,16,15,1 - bdnz CGEMM_L1x1_LOOP - MY_ALIGN -CGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - - MY_ALIGN -CGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_E2 16,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_E2 16,16,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L1: -/*----------------------------------------*/ - - andi. J, N, 1 - ble CGEMM_L1_END - -CGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble CGEMM_L1x8_SUB0 - bl CGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L1x8_SAVE - b CGEMM_L1x8_SUB2 - - -CGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-8 - addi AO,AO,-64 - LOAD1x8O 64,8 - END1x8_WITHOUT_ADD - LOAD1x8_2O 128, 16 - mtctr T8 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8_2O 128,16 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - MY_ALIGN - - -CGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L1x8_SUB2_32 - bl CGEMM_1x8_L64_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. 
T1,L, 32 - ble CGEMM_L1x8_SUB2_16 - bl CGEMM_1x8_L32_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x8_SUB2_8 - bl CGEMM_1x8_L16_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_L2 128,16, 1,0 - KERNEL1x8_L2 128,16, 2,0 - KERNEL1x8_E2 128,16, 3,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_E2 128,16, 1,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 128,16, 0,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x8_SAVE - KERNEL1x8 - - MY_ALIGN -CGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt CGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END - b CGEMM_L1x4_BEGIN - MY_ALIGN - - -CGEMM_L1x8_END: -/*----------------------------------------*/ - - -CGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x4 - ble CGEMM_L1x4_SUB0 - bl CGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x4_SAVE - b CGEMM_L1x4_SUB2 - - -CGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-8 - addi AO,AO,-32 - LOAD1x4O 32,8 - END1x4_WITHOUT_ADD - LOAD1x4_2O 64, 16 - mtctr T8 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4_2O 64,16 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x4_SUB2_8 - bl CGEMM_1x4_L16_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x4_SUB2_4 - bl CGEMM_1x4_L8_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 64,16, 0,0 - KERNEL1x4_E2 64,16, 1,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 64,16, 0,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x4_SAVE - KERNEL1x4 - - -CGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -CGEMM_L1x4_END: -/*----------------------------------------*/ - - -CGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble CGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x2 - ble CGEMM_L1x2_SUB0 - bl CGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x2_SAVE - b CGEMM_L1x2_SUB2 - - -CGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-8 - addi AO,AO,-16 - LOAD1x2O 16,8 - END1x2_WITHOUT_ADD - LOAD1x2_2O 32, 16 - mtctr T8 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2_2O 32,16 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x2_SUB2_8 - bl CGEMM_1x2_L16_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x2_SUB2_4 - bl CGEMM_1x2_L8_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 32,16, 0,0 - KERNEL1x2_E2 32,16, 1,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 32,16, 0,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x2_SAVE - KERNEL1x2 - - MY_ALIGN -CGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -CGEMM_L1x2_END: -/*----------------------------------------*/ - - -CGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x1 - ble CGEMM_L1x1_SUB0 - bl CGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x1_SAVE - b CGEMM_L1x1_SUB2 - - -CGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-8 - addi AO,AO,-8 - LOAD1x1O 8,8 - END1x1_WITHOUT_ADD - LOAD1x1_2O 16, 16 - mtctr T8 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1_2O 16,16 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x1_SUB2_8 - bl CGEMM_1x1_L16_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble CGEMM_L1x1_SUB2_4 - bl CGEMM_1x1_L8_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 16,16, 0,0 - KERNEL1x1_E2 16,16, 1,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 16,16, 0,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x1_SAVE - KERNEL1x1 - - MY_ALIGN -CGEMM_L1x1_SAVE: -/*----------------------------------------*/ - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -CGEMM_L1x1_END: -/*----------------------------------------*/ - slwi T1, K, 3 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - -CGEMM_L1_END: - - - - +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + 
KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + 
blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + 
KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + 
MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S index a256e1a01..be2b74f01 100644 --- a/kernel/power/cgemm_macros_power9.S +++ b/kernel/power/cgemm_macros_power9.S @@ -1,3019 +1,3019 @@ - -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - - -.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - -/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmulsp \VSOUT1,\VSINII, alpha_i - xvmulsp \VSOUT2,\VSINRR, alpha_i -.endm - -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubasp \VSOUT1,\VSINRR, alpha_r - xvmaddasp \VSOUT2,\VSINII, alpha_r -.endm - -/* macros for N=4 and M=8 -**********************************************************************************************/ - -.macro Zero4x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, 
vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD4x8 - LOAD4x8O 0,0 -.endm - - -.macro LOAD4x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_NORMAL - END4x8 AO,BO,64,32 -.endm - - -.macro END4x8_WITHOUT_ADD - END4x8 AO,BO,0,0 -.endm - - -.macro END4x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.endm - - -.macro LOAD4x8_2 - LOAD4x8_2O 0,0 -.endm - - -.macro LOAD4x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_2 - /*for load2 offset will be 128 and 64*/ - KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 -.endm - - -.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL4x8 - LOAD4x8 - END4x8 AO, BO, 64,32 -.endm - - -.macro SAVE4x8 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm 
vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - xxperm vs2,vs50,permute_mask - xxperm vs6,vs58,permute_mask - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - xxperm vs3,vs51,permute_mask - xxperm vs7,vs59,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 - xxperm vs10,vs54,permute_mask - xxperm vs14,vs62,permute_mask - AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 - xxperm vs11,vs55,permute_mask - xxperm vs15,vs63,permute_mask - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - #ifndef TRMMKERNEL - lxv vs32 , 0(T2) - lxv vs40 , 16(T2) -#endif - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs33 , 32(T2) - lxv vs41 , 48(T2) -#endif - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 -#ifndef TRMMKERNEL - lxv vs34 , 0(T3) - lxv vs42 , 16(T3) -#endif - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs35 , 32(T3) - lxv vs43 , 48(T3) -#endif - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - MULT_APLHA_PART1 vs48,vs56,vs0,vs1 - MULT_APLHA_PART1 vs49,vs57,vs2,vs3 - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - MULT_APLHA_PART1 vs50,vs58,vs4,vs5 - MULT_APLHA_PART1 vs51,vs59,vs6,vs7 - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - MULT_APLHA_PART2 vs48,vs56,vs0,vs1 - MULT_APLHA_PART2 vs49,vs57,vs2,vs3 - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 - MULT_APLHA_PART2 vs51,vs59,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs8,vs9 - MULT_APLHA_PART1 vs53,vs61,vs10,vs11 - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - MULT_APLHA_PART1 vs54,vs62,vs12,vs13 - MULT_APLHA_PART1 vs55,vs63,vs14,vs15 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - MULT_APLHA_PART2 vs52,vs60,vs8,vs9 - MULT_APLHA_PART2 vs53,vs61,vs10,vs11 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - MULT_APLHA_PART2 vs54,vs62,vs12,vs13 - MULT_APLHA_PART2 vs55,vs63,vs14,vs15 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs32,vs32,vs1 - xvaddsp vs40,vs40,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs33,vs33,vs5 - xvaddsp vs41,vs41,vs7 - xvaddsp vs34,vs34,vs9 - xvaddsp vs42,vs42,vs11 - xvaddsp vs35,vs35,vs13 - xvaddsp vs43,vs43,vs15 -#else - xxpermdi vs32,vs8,vs0,2 - xxpermdi vs40,vs10,vs2,2 - xxpermdi vs33,vs12,vs4,2 - xxpermdi vs41,vs14,vs6,2 - xxpermdi vs34,vs0,vs8,2 - xxpermdi vs42,vs2,vs10,2 - xxpermdi vs35,vs4,vs12,2 - xxpermdi vs43,vs6,vs14,2 -#endif - stxv vs32 , 0(T2) - stxv vs40 , 16(T2) - stxv vs33 , 32(T2) - stxv vs41 , 48(T2) - stxv vs34 , 0(T3) - stxv vs42 , 16(T3) - stxv vs35 , 32(T3) - stxv vs43 , 48(T3) - addi CO, CO, 64 -.endm - -/* macros for N=4 and M=4 -**********************************************************************************************/ - -.macro Zero4x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endm - - -.macro LOAD4x4 - LOAD4x4O 0,0 -.endm - - -.macro LOAD4x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_NORMAL - END4x4 AO,BO,32,32 -.endm - - -.macro END4x4_WITHOUT_ADD - END4x4 AO,BO,0,0 -.endm - - -.macro END4x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.endm - - -.macro LOAD4x4_2 - LOAD4x4_2O 0,0 -.endm - - -.macro LOAD4x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv 
vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_2 - /*for load2 offset will be 64 and 64*/ - KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 -.endm - - -.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x4 - LOAD4x4 - END4x4 AO, BO, 32,32 -.endm - - -.macro SAVE4x4 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - #ifndef TRMMKERNEL - lxv vs28 , 0(T2) - lxv vs29 , 16(T2) -#endif -#ifndef TRMMKERNEL - lxv vs30 , 0(T3) - lxv vs31 , 16(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs48,vs56,vs4,vs5 - MULT_APLHA_PART1 vs49,vs57,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs12,vs13 - MULT_APLHA_PART1 vs53,vs61,vs14,vs15 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs48,vs56,vs4,vs5 - MULT_APLHA_PART2 vs49,vs57,vs6,vs7 - MULT_APLHA_PART2 vs52,vs60,vs12,vs13 - MULT_APLHA_PART2 vs53,vs61,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 - xvaddsp vs28,vs28,vs5 - xvaddsp vs29,vs29,vs7 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 - xxpermdi vs28,vs12,vs4,2 - xxpermdi vs29,vs14,vs6,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - stxv vs28 , 0(T2) - stxv vs29 , 16(T2) - stxv vs30 , 0(T3) - stxv vs31 , 16(T3) - addi CO, CO, 32 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD4x2 - LOAD4x2O 0,0 -.endm - - -.macro LOAD4x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_NORMAL - END4x2 AO,BO,16,32 -.endm - - -.macro END4x2_WITHOUT_ADD - END4x2 AO,BO,0,0 -.endm - - -.macro END4x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD4x2_2 - LOAD4x2_2O 0,0 -.endm - - -.macro LOAD4x2_2O OffsetA,OffsetB - lxv vs8, 
(\OffsetA)(AO) - lxv vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_2 - /*for load2 offset will be 32 and 64*/ - KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 -.endm - - -.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x2 - LOAD4x2 - END4x2 AO, BO, 16,32 -.endm - - -.macro SAVE4x2 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs25 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxv vs27 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs10,vs2,0 - xxpermdi vs3,vs0,vs8,3 - xxpermdi vs11,vs2,vs10,3 - xvaddsp vs24,vs24,vs1 - 
xvaddsp vs26,vs26,vs9 - xvaddsp vs25,vs25,vs3 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs10,vs2,0 - xxpermdi vs25,vs0,vs8,3 - xxpermdi vs27,vs2,vs10,3 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 0(T1) - stxv vs26 , 0(T2) - stxv vs27 , 0(T3) - addi CO, CO, 16 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD4x1 - LOAD4x1O 0,0 -.endm - - -.macro LOAD4x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END4x1_NORMAL - END4x1 AO,BO,8,32 -.endm - - -.macro END4x1_WITHOUT_ADD - END4x1 AO,BO,0,0 -.endm - - -.macro END4x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD4x1_2 - LOAD4x1_2O 0,0 -.endm - - -.macro LOAD4x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) -.endm - - -.macro END4x1_2 - /*for load2 offset will be 16 and 64*/ - KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 -.endm - - -.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x1 - LOAD4x1 - END4x1 AO, BO, 8,32 -.endm - - -.macro SAVE4x1 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxsd v6 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxsd v7 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - 
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - xxspltd vs9,vs2,0 - xxspltd vs11,vs2,1 - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 - xvaddsp vs38,vs38,vs9 - xvaddsp vs39,vs39,vs11 -#else - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 - xxspltd vs38,vs2,0 - xxspltd vs39,vs2,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - stxsd v6 , 0(T2) - stxsd v7 , 0(T3) - addi CO, CO, 8 -.endm - -/* macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,64,16 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_2 - /*for load2 offset will be 128 and 32*/ - KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 -.endm - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
-.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 64,16 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - addi CO, CO, 64 -.endm - -/* macros for N=2 and M=4 -**********************************************************************************************/ - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,32,16 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_2 - /*for load2 offset will be 64 and 32*/ - KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 -.endm - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 32,16 -.endm - - -.macro SAVE2x4 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - addi CO, CO, 32 -.endm - -/* macros for N=2 and M=2 -**********************************************************************************************/ - -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs36, vs36, vs36 - xxlxor vs40, vs40, vs40 - xxlxor vs44, vs44, vs44 -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,16,16 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs8, (\OffsetA)(AO) - lxv 
vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs0, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_2 - /*for load2 offset will be 32 and 32*/ - KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 -.endm - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs44, vs0,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x2 - LOAD2x2 - END2x2 AO, BO, 16,16 -.endm - - -.macro SAVE2x2 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs8,vs9, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs0,vs8,3 - xvaddsp vs24,vs24,vs1 - xvaddsp vs26,vs26,vs9 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs0,vs8,3 -#endif - stxv vs24 , 0(CO) - stxv vs26 , 0(T1) - addi CO, CO, 16 -.endm - -/* macros for N=2 and M=1 -**********************************************************************************************/ - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,8,16 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv 
vs0, (16+\OffsetB)(BO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_2 - /*for load2 offset will be 16 and 32*/ - KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 -.endm - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 8,16 -.endm - - -.macro SAVE2x1 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - /*--v4==vs36 v5==vs37---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 -#else - /*--v4==vs36 v5==vs37---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - addi CO, CO, 8 -.endm - -/* macros for N=1 and M=8 -**********************************************************************************************/ - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,64,8 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, 
(64+16+\OffsetA)(AO) - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_2 - /*for load2 offset will be 128 and 16*/ - KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 -.endm - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 64,8 -.endm - - -.macro SAVE1x8 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 - xxperm vs4,vs5, vs28 - xxperm vs6,vs7, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - xvaddsp vs26,vs26,vs4 - xvaddsp vs27,vs27,vs6 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) - stxv vs4 , 32(CO) - stxv vs6 , 48(CO) -#endif - addi CO, CO, 64 -.endm - -/* macros for N=1 and M=4 
-**********************************************************************************************/ - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,32,8 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_2 - /*for load2 offset will be 64 and 16*/ - KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 -.endm - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 32,8 -.endm - - -.macro SAVE1x4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) -#endif - addi CO, CO, 32 -.endm - -/* macros for N=1 and M=2 
-**********************************************************************************************/ - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,16,8 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs0, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_2 - /*for load2 offset will be 32 and 16*/ - KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 -.endm - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP4(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 16,8 -.endm - - -.macro SAVE1x2 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - stxv vs24 , 0(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) -#endif - addi CO, CO, 16 -.endm - -/* macros for N=1 and M=1 -**********************************************************************************************/ -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxsd v4, (\OffsetB+0)(BO) - lxsd v5, (\OffsetA+0)(AO) - xxperm vs38, vs36, permute_mask -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,8,8 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs37,vs36 - xvmaddasp vs40, vs37,vs38 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro 
LOAD1x1_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask -.endm - - -.macro END1x1_2 - /*for load2 offset will be 16 and 16*/ - KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 -.endm - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP2(\Index,\OffsetB)(\BREG) - lxv vs4, DISP2(\Index,\OffsetB)(\AREG) - xxperm vs10, vs8, permute_mask -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP2(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP2(\Index,16) -.endif - -.endif -.endm - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 8,8 -.endm - - -.macro SAVE1x1 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - /*aggregate x2*/ - xxpermdi vs33,vs32,vs32,2 - xxpermdi vs41,vs40,vs40,2 - xvaddsp vs32,vs32,vs33 - xvaddsp vs40,vs40,vs41 - - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs37,vs1 - MULT_APLHA_PART2 vs32,vs40,vs37,vs1 - -/* reconstruct r,i pairs*/ - xxperm vs37,vs1, vs28 - -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs36,vs36,vs37 - stxsd v4 , 0(CO) -#else - -/* vs37 is v5 */ - stxsd v5 , 0(CO) -#endif - addi CO, CO, 8 -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*8; -// ptrbb = bb + off*4; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+8; // number of values in A -// #else -// temp = off+4; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 8; // number of values in A -// #else -// temp -= 4; // number of values in B -// #endif -// ptrba += temp*8; -// ptrbb += temp*4; -// #endif - -// #ifdef LEFT -// off += 8; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif + +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm + + +.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm + +/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmulsp \VSOUT1,\VSINII, alpha_i + xvmulsp \VSOUT2,\VSINRR, alpha_i +.endm + +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubasp \VSOUT1,\VSINRR, alpha_r + xvmaddasp \VSOUT2,\VSINII, alpha_r +.endm + +/* macros for N=4 and M=8 +**********************************************************************************************/ + +.macro Zero4x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, 
vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD4x8 + LOAD4x8O 0,0 +.endm + + +.macro LOAD4x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_NORMAL + END4x8 AO,BO,64,32 +.endm + + +.macro END4x8_WITHOUT_ADD + END4x8 AO,BO,0,0 +.endm + + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.endm + + +.macro LOAD4x8_2 + LOAD4x8_2O 0,0 +.endm + + +.macro LOAD4x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 +.endm + + +.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64,32 +.endm + + +.macro SAVE4x8 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm 
vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + xxperm vs2,vs50,permute_mask + xxperm vs6,vs58,permute_mask + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + xxperm vs3,vs51,permute_mask + xxperm vs7,vs59,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 + xxperm vs10,vs54,permute_mask + xxperm vs14,vs62,permute_mask + AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 + xxperm vs11,vs55,permute_mask + xxperm vs15,vs63,permute_mask + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + #ifndef TRMMKERNEL + lxv vs32 , 0(T2) + lxv vs40 , 16(T2) +#endif + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs33 , 32(T2) + lxv vs41 , 48(T2) +#endif + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 +#ifndef TRMMKERNEL + lxv vs34 , 0(T3) + lxv vs42 , 16(T3) +#endif + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs35 , 32(T3) + lxv vs43 , 48(T3) +#endif + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + MULT_APLHA_PART1 vs48,vs56,vs0,vs1 + MULT_APLHA_PART1 vs49,vs57,vs2,vs3 + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + MULT_APLHA_PART1 vs50,vs58,vs4,vs5 + MULT_APLHA_PART1 vs51,vs59,vs6,vs7 + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + MULT_APLHA_PART2 vs48,vs56,vs0,vs1 + MULT_APLHA_PART2 vs49,vs57,vs2,vs3 + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 + MULT_APLHA_PART2 vs51,vs59,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs8,vs9 + MULT_APLHA_PART1 vs53,vs61,vs10,vs11 + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + MULT_APLHA_PART1 vs54,vs62,vs12,vs13 + MULT_APLHA_PART1 vs55,vs63,vs14,vs15 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + MULT_APLHA_PART2 vs52,vs60,vs8,vs9 + MULT_APLHA_PART2 vs53,vs61,vs10,vs11 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + MULT_APLHA_PART2 vs54,vs62,vs12,vs13 + MULT_APLHA_PART2 vs55,vs63,vs14,vs15 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs32,vs32,vs1 + xvaddsp vs40,vs40,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs33,vs33,vs5 + xvaddsp vs41,vs41,vs7 + xvaddsp vs34,vs34,vs9 + xvaddsp vs42,vs42,vs11 + xvaddsp vs35,vs35,vs13 + xvaddsp vs43,vs43,vs15 +#else + xxpermdi vs32,vs8,vs0,2 + xxpermdi vs40,vs10,vs2,2 + xxpermdi vs33,vs12,vs4,2 + xxpermdi vs41,vs14,vs6,2 + xxpermdi vs34,vs0,vs8,2 + xxpermdi vs42,vs2,vs10,2 + xxpermdi vs35,vs4,vs12,2 + xxpermdi vs43,vs6,vs14,2 +#endif + stxv vs32 , 0(T2) + stxv vs40 , 16(T2) + stxv vs33 , 32(T2) + stxv vs41 , 48(T2) + stxv vs34 , 0(T3) + stxv vs42 , 16(T3) + stxv vs35 , 32(T3) + stxv vs43 , 48(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro Zero4x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endm + + +.macro LOAD4x4 + LOAD4x4O 0,0 +.endm + + +.macro LOAD4x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_NORMAL + END4x4 AO,BO,32,32 +.endm + + +.macro END4x4_WITHOUT_ADD + END4x4 AO,BO,0,0 +.endm + + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.endm + + +.macro LOAD4x4_2 + LOAD4x4_2O 0,0 +.endm + + +.macro LOAD4x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv 
vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 +.endm + + +.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32,32 +.endm + + +.macro SAVE4x4 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + #ifndef TRMMKERNEL + lxv vs28 , 0(T2) + lxv vs29 , 16(T2) +#endif +#ifndef TRMMKERNEL + lxv vs30 , 0(T3) + lxv vs31 , 16(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs48,vs56,vs4,vs5 + MULT_APLHA_PART1 vs49,vs57,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs12,vs13 + MULT_APLHA_PART1 vs53,vs61,vs14,vs15 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs48,vs56,vs4,vs5 + MULT_APLHA_PART2 vs49,vs57,vs6,vs7 + MULT_APLHA_PART2 vs52,vs60,vs12,vs13 + MULT_APLHA_PART2 vs53,vs61,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 + xvaddsp vs28,vs28,vs5 + xvaddsp vs29,vs29,vs7 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 + xxpermdi vs28,vs12,vs4,2 + xxpermdi vs29,vs14,vs6,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + stxv vs28 , 0(T2) + stxv vs29 , 16(T2) + stxv vs30 , 0(T3) + stxv vs31 , 16(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD4x2 + LOAD4x2O 0,0 +.endm + + +.macro LOAD4x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_NORMAL + END4x2 AO,BO,16,32 +.endm + + +.macro END4x2_WITHOUT_ADD + END4x2 AO,BO,0,0 +.endm + + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD4x2_2 + LOAD4x2_2O 0,0 +.endm + + +.macro LOAD4x2_2O OffsetA,OffsetB + lxv vs8, 
(\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16,32 +.endm + + +.macro SAVE4x2 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs10,vs2,0 + xxpermdi vs3,vs0,vs8,3 + xxpermdi vs11,vs2,vs10,3 + xvaddsp vs24,vs24,vs1 + 
xvaddsp vs26,vs26,vs9
+ xvaddsp vs25,vs25,vs3
+ xvaddsp vs27,vs27,vs11
+#else
+ xxpermdi vs24,vs8,vs0,0
+ xxpermdi vs26,vs10,vs2,0
+ xxpermdi vs25,vs0,vs8,3
+ xxpermdi vs27,vs2,vs10,3
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 0(T1)
+ stxv vs26 , 0(T2)
+ stxv vs27 , 0(T3)
+ addi CO, CO, 16
+.endm
+
+/* macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro Zero4x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+.endm
+
+
+.macro LOAD4x1
+ LOAD4x1O 0,0
+.endm
+
+
+.macro LOAD4x1O OffsetA,OffsetB
+ lxsd v4, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ lxv vs1, (\OffsetB+16)(BO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END4x1_NORMAL
+ END4x1 AO,BO,8,32
+.endm
+
+
+.macro END4x1_WITHOUT_ADD
+ END4x1 AO,BO,0,0
+.endm
+
+
+.macro END4x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.endm
+
+
+.macro LOAD4x1_2
+ LOAD4x1_2O 0,0
+.endm
+
+
+.macro LOAD4x1_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs5, (16+\OffsetB)(BO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+ lxv vs0, (32+\OffsetB)(BO)
+ lxv vs1, (32+16+\OffsetB)(BO)
+.endm
+
+
+.macro END4x1_2
+ /*for load2 offset will be 16 and 64*/
+ KERNEL4x1_2 AO,BO, 16,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetA)(\AREG)
+ xxspltd vs8,vs27,1
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG)
+ lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,16)
+ addi \BREG, \BREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL4x1
+ LOAD4x1
+ END4x1 AO, BO, 8,32
+.endm
+
+
+.macro SAVE4x1
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+ add T2,CO,T4
+ add T3,T1,T4
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v5 , 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v6 , 0(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v7 , 0(T3)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + xxspltd vs9,vs2,0 + xxspltd vs11,vs2,1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 + xvaddsp vs38,vs38,vs9 + xvaddsp vs39,vs39,vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 + xxspltd vs38,vs2,0 + xxspltd vs39,vs2,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + stxsd v6 , 0(T2) + stxsd v7 , 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,64,16 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 +.endm + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
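+ /* note: the vs4/vs5 just reloaded hold the A data consumed only by the next invocation of this macro; the FMAs below still read vs6/vs7 and vs0..vs3, so these loads overlap with arithmetic */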
+.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64,16 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,32,16 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 +.endm + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32,16 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs36, vs36, vs36 + xxlxor vs40, vs40, vs40 + xxlxor vs44, vs44, vs44 +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,16,16 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv 
vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 +.endm + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs44, vs0,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16,16 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs8,vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs0,vs8,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs0,vs8,3 +#endif + stxv vs24 , 0(CO) + stxv vs26 , 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,8,16 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv 
vs0, (16+\OffsetB)(BO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8,16 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,64,8 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, 
(64+16+\OffsetA)(AO) + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 +.endm + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + + +.macro SAVE1x8 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 + xxperm vs4,vs5, vs28 + xxperm vs6,vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + xvaddsp vs26,vs26,vs4 + xvaddsp vs27,vs27,vs6 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) + stxv vs4 , 32(CO) + stxv vs6 , 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,32,8 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 +.endm + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + + +.macro SAVE1x4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 
+**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,16,8 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs0, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 +.endm + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + stxv vs24 , 0(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,8,8 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs37,vs36 + xvmaddasp vs40, vs37,vs38 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro 
LOAD1x1_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 +.endm + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index,\OffsetB)(\BREG) + lxv vs4, DISP2(\Index,\OffsetB)(\AREG) + xxperm vs10, vs8, permute_mask +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP2(\Index,16) +.endif + +.endif +.endm + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33,vs32,vs32,2 + xxpermdi vs41,vs40,vs40,2 + xvaddsp vs32,vs32,vs33 + xvaddsp vs40,vs40,vs41 + + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs37,vs1 + MULT_APLHA_PART2 vs32,vs40,vs37,vs1 + +/* reconstruct r,i pairs*/ + xxperm vs37,vs1, vs28 + +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36,vs36,vs37 + stxsd v4 , 0(CO) +#else + +/* vs37 is v5 */ + stxsd v5 , 0(CO) +#endif + addi CO, CO, 8 +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index 8663039c5..575847da2 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -1,597 +1,597 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/zgemv_n.c" -#else - -#include -#include -#include "common.h" -#include -#define NBMAX 1024 - - -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; - register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; - register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; - register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; - register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; - register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; - register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; - register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; - register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - register __vector float *vptr_a2 = (__vector float *) a2; - register __vector float *vptr_a3 = (__vector float *) a3; - BLASLONG i = 0; - BLASLONG i2=16; - for (;i< n * 8; i+=32,i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; - va0 = vec_perm(va0, va0,swap_mask); - va0_1 = vec_perm(va0_1, va0_1,swap_mask); - va1 = vec_perm(va1, va1,swap_mask); - va1_1 = vec_perm(va1_1, va1_1,swap_mask); - va2 = vec_perm(va2, va2,swap_mask); - va2_1 = vec_perm(va2_1, va2_1,swap_mask); - va3 = vec_perm(va3, va3,swap_mask); - va3_1 = vec_perm(va3_1, va3_1,swap_mask); - vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; - vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - register __vector float va1x = vec_perm(va1, va1,swap_mask); - register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); - vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) ap; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - vy_0 += va0*vx0_r + va0x*vx0_i; - vy_1 += va0_1*vx0_r + va0x_1*vx0_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } -} - - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i=0; - - - if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i +#include +#include "common.h" +#include +#define NBMAX 1024 + + +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + 
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; + register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; + register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; + register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; + register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; + register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; + register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; + register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; + register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + register __vector float *vptr_a2 = (__vector float *) a2; + register __vector float *vptr_a3 = (__vector float *) a3; + BLASLONG i = 0; + BLASLONG i2=16; + for (;i< n * 8; i+=32,i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; + va0 = vec_perm(va0, va0,swap_mask); + va0_1 = vec_perm(va0_1, va0_1,swap_mask); + va1 = vec_perm(va1, va1,swap_mask); + va1_1 = vec_perm(va1_1, va1_1,swap_mask); + va2 = vec_perm(va2, va2,swap_mask); + va2_1 = vec_perm(va2_1, va2_1,swap_mask); + va3 = vec_perm(va3, va3,swap_mask); + va3_1 = vec_perm(va3_1, va3_1,swap_mask); + vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; + vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + 
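/* Editor's note -- annotation only, not part of the patch. The vector loop below
   computes, four complex elements at a time, the scalar recurrence sketched here
   (the helper and its names are illustrative, assuming interleaved (re,im) storage):

       static void cgemv_n_ref(BLASLONG n, const FLOAT *a, FLOAT xr, FLOAT xi, FLOAT *y) {
           for (BLASLONG k = 0; k < n; k++) {             // one column entry, one x element
               y[2*k]   += a[2*k]   * xr - a[2*k+1] * xi; // real part
               y[2*k+1] += a[2*k+1] * xr + a[2*k]   * xi; // imaginary part
           }
       }

   vx*_r broadcasts the real part of x; vx*_i carries the imaginary part with
   alternating signs ({-xi, xi, ...} in the non-conjugated branch); vec_perm with
   swap_mask swaps each (re,im) pair of the matrix data, so va*vx_r + swap(va)*vx_i
   reproduces the two update lines above. */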
register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + register __vector float va1x = vec_perm(va1, va1,swap_mask); + register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); + vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) ap; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } +} + + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i=0; + + + if (inc_dest != 2) { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* vptr_a2 = (__vector float*) a2; - __vector float* vptr_a3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( 
i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - vtemp2_p += vx_0*va2 + vx_1*va2_1; - vtemp2_r += vxr_0*va2 + vxr_1*va2_1; - vtemp3_p += vx_0*va3 + vx_1*va3_1; - vtemp3_r += vxr_0*va3 + vxr_1*va3_1; - - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - -#endif - -} - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) 
r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - - - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - - } -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - -#endif - -} - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) ap; - __vector float* v_x = (__vector float*) x; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - 
vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - -#endif - - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i=0; - BLASLONG j=0; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += 
a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return (0); - } - - if (m3 == 1) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += 
inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); - -} -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else + +#include "common.h" + +#define NBMAX 1024 +#include +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* vptr_a2 = (__vector float*) a2; + __vector float* vptr_a3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + vtemp2_p += vx_0*va2 + vx_1*va2_1; + vtemp2_r += vxr_0*va2 + vxr_1*va2_1; + vtemp3_p += vx_0*va3 + vx_1*va3_1; + vtemp3_r += vxr_0*va3 + vxr_1*va3_1; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + 
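/* Editor's note -- annotation only, not part of the patch. With the lane numbering
   used by this code, vtempN_p accumulates a_re*x_re and a_im*x_im in alternating
   lanes, while vtempN_r (built from the swap_mask permute of x) accumulates
   a_re*x_im and a_im*x_re. The horizontal sums select the sign pattern for the
   requested conjugation; for the non-conjugated dot product (first branch above):

       // re = sum_k (a_re*x_re - a_im*x_im)  ->  p[0] - p[1] + p[2] - p[3]
       // im = sum_k (a_re*x_im + a_im*x_re)  ->  r[0] + r[1] + r[2] + r[3]

   whereas this branch sums the p[] lanes without alternating signs and alternates
   the signs of the r[] lanes instead, which effectively conjugates one of the two
   factors. */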
register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif + +} + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + + + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * 
temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif + +} + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) ap; + __vector float* v_x = (__vector float*) x; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 1); + dest += 2; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i=0; + BLASLONG j=0; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + 
y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + 
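
For reference (this note and the sketch are illustrative additions, not part of the patch): the reductions in the cgemv kernels above rely on the lane layout of the interleaved complex data. Each __vector float holds two complex elements (r0,i0,r1,i1), so the element-wise product vx*va leaves r*r and i*i terms in alternating lanes ("p" in the kernel comments), while the product with the swap_mask-permuted x (swap_mask presumably exchanges the two lanes of each complex pair; swap_mask_arr is defined earlier in this file) leaves r*i and i*r terms ("r"). The alternating-sign horizontal sums then recover the real and imaginary parts, and the sign pattern flips when exactly one of CONJ/XCONJ is defined, which is the same rule the scalar m3 tail code in this routine spells out per element. A minimal scalar sketch of that rule, using a made-up helper name cmac_ref:

/* Illustrative only -- a scalar view of the complex multiply-accumulate
 * that both the VSX kernels and the scalar m3 tail code implement.
 * conj_a selects the "exactly one of CONJ/XCONJ defined" variant.       */
static void cmac_ref(float a_r, float a_i, float x_r, float x_i,
                     float *acc_r, float *acc_i, int conj_a)
{
    if (!conj_a) {                       /* (a_r + i*a_i) * (x_r + i*x_i) */
        *acc_r += a_r * x_r - a_i * x_i;
        *acc_i += a_r * x_i + a_i * x_r;
    } else {                             /* (a_r - i*a_i) * (x_r + i*x_i) */
        *acc_r += a_r * x_r + a_i * x_i;
        *acc_i += a_r * x_i - a_i * x_r;
    }
}

Accumulating cmac_ref down one column and then applying alpha exactly as in the y[0]/y[1] updates above reproduces what cgemv_kernel_4x1 computes for that column.
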
y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 84ba5d913..dbd7e3482 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -1,233 +1,233 @@ -/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if defined(POWER8) || defined(POWER9) || defined(POWER10) -#if defined(__VEC__) || defined(__ALTIVEC__) - -static void crot_kernel_8 (long n, float *x, float *y, float c, float s) -{ - __vector float t0; - __vector float t1; - __vector float t2; - __vector float t3; - __vector float t4; - __vector float t5; - __vector float t6; - __vector float t7; - __asm__ - ( - "xscvdpspn 36, %x[cos] \n\t" // load c to all words - "xxspltw 36, 36, 0 \n\t" - "xscvdpspn 37, %x[sin] \n\t" // load s to all words - "xxspltw 37, 37, 0 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 64 \n\t" - "addi %[y_ptr], %[y_ptr], 64 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "ble two%= \n\t" - ".align 5 \n\t" - "one%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 128 \n\t" - "addi %[y_ptr], %[y_ptr], 128 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "bgt one%= \n\t" - "two%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] " - : - [mem_x] "+m" (*(float (*)[2*n])x), - [mem_y] "+m" (*(float (*)[2*n])y), - [temp_n] "+r" (n), - [x_ptr] "+&b" (x), - [y_ptr] "+&b" (y), - [x0] "=wa" (t0), - [x1] "=wa" (t2), - [x2] "=wa" (t1), - [x3] "=wa" (t3), - [x4] "=wa" (t4), - [x5] "=wa" (t5), - [x6] "=wa" (t6), - [x7] "=wa" (t7) - : - [cos] "f" (c), - [sin] "f" (s), - [i16] "b" (16), - [i32] "b" (32), - [i48] "b" (48) - : - "cr0", - "vs32","vs33","vs34","vs35","vs36","vs37", - "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51" - ); -} - -#endif -#endif - - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { -#if defined(__VEC__) || defined(__ALTIVEC__) - BLASLONG n1 = n & -8; - if ( n1 > 0 ) - { - crot_kernel_8(n1, x, y, c, s); - i=n1; - ix=2*n1; - } -#endif - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; - - } - - } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - } - return(0); -} - +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
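
For reference (illustrative addition, not part of the patch): the crot_kernel_8 inline assembly above first broadcasts the scalar c and s into all four lanes of a VSX register (xscvdpspn followed by xxspltw) and then applies the plane rotation to 8 complex elements, i.e. 16 consecutive floats, per iteration, overlapping the loads for the next block with the arithmetic of the current one. Because the data is stored as interleaved (re,im) pairs and the same c and s are applied to every lane, one unrolled block is equivalent to the following plain-C sketch (crot_block8_ref is a made-up name):

/* Illustrative only: what one 8-complex block of crot_kernel_8 computes,
 * ignoring the software pipelining of the loads.                        */
static void crot_block8_ref(float *x, float *y, float c, float s)
{
    for (int k = 0; k < 16; k++) {   /* 8 complex elements = 16 floats   */
        float tx = c * x[k] + s * y[k];
        y[k]     = c * y[k] - s * x[k];
        x[k]     = tx;
    }
}

The surrounding C driver hands the kernel only the multiple-of-8 prefix (n1 = n & -8) and finishes the remainder with its scalar loop.
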
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "ble two%= \n\t" + ".align 5 \n\t" + "one%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "bgt one%= \n\t" + "two%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x, y, c, s); + i=n1; + ix=2*n1; + } +#endif + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + } + return(0); +} + diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S index 2fb1b27ef..86108f20c 100644 --- a/kernel/power/dgemm_kernel_power9.S +++ b/kernel/power/dgemm_kernel_power9.S @@ -1,249 +1,249 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld - - - - -#define STACKSIZE (512 ) -#define ALPHA_SP (296+192)(SP) -#define FZERO (304+192)(SP) - - - -#define M r3 -#define N r4 -#define K r5 - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs18 - -#define o0 0 - - -#define T4 r12 -#define T3 r11 -#define C4 r14 -#define o8 r15 -#define o24 r16 -#define C2 r17 -#define L r18 -#define T1 r19 -#define C3 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define o16 r27 -#define o32 r28 -#define o48 r29 - -#define PRE r30 -#define T2 r31 - -#include "dgemm_macros_power9.S" - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - li r0, 0 - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - - stfd f1, ALPHA_SP - stw r0, FZERO - - slwi LDC, LDC, BASE_SHIFT - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - - - cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 - - - - addi T1, SP, 296+192 - - - li PRE, 384 - li o8 , 8 - li o16, 16 - li o24, 24 - li o32, 32 - li o48, 48 - - - lxvdsx alpha_r, 0, T1 - -#include 
"dgemm_logic_power9.S" - -.L999: - addi r3, 0, 0 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld + + + + +#define STACKSIZE (512 ) +#define ALPHA_SP (296+192)(SP) +#define FZERO (304+192)(SP) + + + +#define M r3 +#define N r4 +#define K r5 + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs18 + +#define o0 0 + + +#define T4 r12 +#define T3 r11 +#define C4 r14 +#define o8 r15 +#define o24 r16 +#define C2 r17 +#define L r18 +#define T1 r19 +#define C3 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_power9.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + + + stfd f1, ALPHA_SP + stw r0, FZERO + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + + + addi T1, SP, 296+192 + + + li PRE, 384 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + + lxvdsx alpha_r, 0, T1 + +#include "dgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S index 251839d19..a48bc685a 100644 --- a/kernel/power/dgemm_logic_power9.S +++ b/kernel/power/dgemm_logic_power9.S @@ -1,1981 +1,1981 @@ -/*************************************************************************** 
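
For reference (illustrative addition, not part of the patch): the dgemm_kernel_power9.S prologue above saves the non-volatile FPRs f14-f31, GPRs r14-r31 and VSRs vs52-vs63 to the stack frame, spills alpha to ALPHA_SP and reloads it with lxvdsx so the same double sits in both lanes of vs18, and scales LDC from elements to bytes with slwi LDC, LDC, BASE_SHIFT. In C terms, and assuming BASE_SHIFT is 3 here (double precision, 8-byte elements), the two scalar set-up steps amount to:

/* Illustrative only: the prologue's alpha broadcast and LDC scaling,
 * assuming BASE_SHIFT == 3 for double precision.                        */
#include <altivec.h>

typedef struct { __vector double valpha; long ldc_bytes; } dgemm_setup_ref;

static dgemm_setup_ref dgemm_prologue_ref(double alpha, long ldc_elems)
{
    dgemm_setup_ref s;
    s.valpha    = vec_splats(alpha);   /* lxvdsx: alpha in both doubleword lanes */
    s.ldc_bytes = ldc_elems << 3;      /* slwi LDC, LDC, BASE_SHIFT              */
    return s;
}
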
-Copyright (c) 2013-2019 The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#define MY_ALIGN .align 3 - -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 2 - ble LDGEMM_L4_END - -LDGEMM_L4_BEGIN: - - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LDGEMM_L4x16_END - - MY_ALIGN -LDGEMM_L4x16_BEGIN: - - li L, -128 - - - SAVE4x16_REGS - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - - and T1, CO, L - and T2, C2, L - and T3, C3, L - and T4, C4, L - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - - - addi T1, T1, 128 - addi T2, T2, 128 - addi T3, T3, 128 - addi T4, T4, 128 - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 - srawi. L, T3, 5 -#else - srawi. L, K, 5 -#endif - - ble LDGEMM_L4x16_SUB0 - - - MY_ALIGN -LDGEMM_L4x16_LOOP_START: - - li T2, 512 - - - LOAD4x16_1 - ##OffsetA=128 OffsetB=32 - addi AO,AO,2176 - # addi BO,BO,32 - addic. 
L, L, -1 - - ble LDGEMM_L4x16_LOOP_END - - - mtctr L - - MY_ALIGN - -LDGEMM_L4x16_LOOP: - - #dcbt AO, PRE - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_2 -2048,32, 15,1 - - - bdnz LDGEMM_L4x16_LOOP - - MY_ALIGN - MY_ALIGN -LDGEMM_L4x16_LOOP_END: - - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_3 -2048,32, 15,1 - b LDGEMM_L4x16_SUB1 - - - MY_ALIGN -LDGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - KERNEL4x16 1 - - addic. L, L, -1 - ble LDGEMM_L4x16_SAVE - b LDGEMM_L4x16_SUB2 - MY_ALIGN -LDGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - ble LDGEMM_L4x16_SAVE - MY_ALIGN -LDGEMM_L4x16_SUB2: - - andi. T1,L, 16 - ble LDGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_2 128,32, 3,0 - KERNEL4x16_I1_L2_2 128,32, 4,0 - KERNEL4x16_I1_L2_2 128,32, 5,0 - KERNEL4x16_I1_L2_2 128,32, 6,0 - KERNEL4x16_I1_L2_3 128,32, 7,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_8: - andi. T1,L, 8 - ble LDGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_3 128,32, 3,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_3 128,32, 1,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 128,32, 0,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LDGEMM_L4x16_SUB2 - - MY_ALIGN -LDGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LDGEMM_L4x16_BEGIN - -LDGEMM_L4x16_END: - -LDGEMM_L4x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L4x1_END - - andi. T1, M, 8 - ble LDGEMM_L4x8_END - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 - srawi. L, T3, 4 -#else - mr BO, B - srawi. L, K, 4 -#endif - - - ble LDGEMM_L4x8_SUB0 - -LDGEMM_L4x8_LOOP_START: - - - LOAD4x8_1 - ##OffsetA=64 OffsetB=32 - - - addic. 
L, L, -1 - - ble LDGEMM_L4x8_LOOP_END - - mtctr L - MY_ALIGN - -LDGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_2 64,32, 7,1 - - bdnz LDGEMM_L4x8_LOOP - MY_ALIGN -LDGEMM_L4x8_LOOP_END: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_3 64,32, 7,1 - - b LDGEMM_L4x8_SUB1 - MY_ALIGN -LDGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - KERNEL4x8 1 - - addic. L, L, -1 - ble LDGEMM_L4x8_SAVE - b LDGEMM_L4x8_SUB2 - MY_ALIGN -LDGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - ble LDGEMM_L4x8_SAVE - MY_ALIGN -LDGEMM_L4x8_SUB2: - - andi. T1,L, 8 - ble LDGEMM_L4x8_SUB2_4 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_3 64,32, 3,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_3 64,32, 1,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 64,32, 0,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x8_SAVE - KERNEL4x8 0 - - MY_ALIGN -LDGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 -#endif -LDGEMM_L4x8_END: - -LDGEMM_L4x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x4_SUB4 - -LDGEMM_L4x4_LOOP_START: - - #dcbt AO, PRE - LOAD4x4_1 - KERNEL4x4_I1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -2 - ble LDGEMM_L4x4_LOOP_END - - MY_ALIGN - -LDGEMM_L4x4_LOOP: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -1 - bgt LDGEMM_L4x4_LOOP - -LDGEMM_L4x4_LOOP_END: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_E2 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB4: - - KERNEL4x4_SUBI1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x4_SAVE - b LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x4_SAVE - -LDGEMM_L4x4_SUB2: - - KERNEL4x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SAVE: - - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 -#endif -LDGEMM_L4x4_END: - -LDGEMM_L4x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L4x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x2_SUB4 - -LDGEMM_L4x2_LOOP_START: - - LOAD4x2_1 - KERNEL4x2_I1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -2 - ble LDGEMM_L4x2_LOOP_END - - MY_ALIGN - -LDGEMM_L4x2_LOOP: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -1 - bgt LDGEMM_L4x2_LOOP - -LDGEMM_L4x2_LOOP_END: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_E2 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB4: - - KERNEL4x2_SUBI1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x2_SAVE - b LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x2_SAVE - -LDGEMM_L4x2_SUB2: - - KERNEL4x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SAVE: - - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 -#endif -LDGEMM_L4x2_END: - -LDGEMM_L4x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x1_SUB4 - -LDGEMM_L4x1_LOOP_START: - - LOAD4x1_1 - KERNEL4x1_I1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -2 - ble LDGEMM_L4x1_LOOP_END - - MY_ALIGN - -LDGEMM_L4x1_LOOP: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -1 - bgt LDGEMM_L4x1_LOOP - -LDGEMM_L4x1_LOOP_END: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_E2 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB4: - - KERNEL4x1_SUBI1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x1_SAVE - b LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x1_SAVE - -LDGEMM_L4x1_SUB2: - - KERNEL4x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SAVE: - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 -#endif -LDGEMM_L4x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - addic. J, J, -1 - bgt LDGEMM_L4_BEGIN - - andi. T2, N, 3 - ble .L999 - -LDGEMM_L4_END: - - b LDGEMM_L2_BEGIN - -.L999_H1: - - b .L999 - -LDGEMM_L2_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 2 - ble LDGEMM_L2_END - mr CO, C - mr AO, A - slwi T1, LDC , 1 - add C, C, T1 - srawi. I, M, 4 - ble LDGEMM_L2x16_END - -LDGEMM_L2x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x16_SUB4 - -LDGEMM_L2x16_LOOP_START: - - #dcbt AO, PRE - LOAD2x16_1 - #dcbt AO, PRE - KERNEL2x16_I1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -2 - ble LDGEMM_L2x16_LOOP_END - - MY_ALIGN - -LDGEMM_L2x16_LOOP: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -1 - bgt LDGEMM_L2x16_LOOP - -LDGEMM_L2x16_LOOP_END: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - KERNEL2x16_E2 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB4: - - #dcbt AO, PRE - KERNEL2x16_SUBI1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x16_SAVE - b LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x16_SAVE - -LDGEMM_L2x16_SUB2: - - KERNEL2x16_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SAVE: - - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt LDGEMM_L2x16_BEGIN - -LDGEMM_L2x16_END: - -LDGEMM_L2x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L2x1_END - - andi. T1, M, 8 - ble LDGEMM_L2x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x8_SUB4 - -LDGEMM_L2x8_LOOP_START: - - #dcbt AO, PRE - LOAD2x8_1 - KERNEL2x8_I1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -2 - ble LDGEMM_L2x8_LOOP_END - - MY_ALIGN - -LDGEMM_L2x8_LOOP: - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -1 - bgt LDGEMM_L2x8_LOOP - -LDGEMM_L2x8_LOOP_END: - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_2 - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_E2 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB4: - - KERNEL2x8_SUBI1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x8_SAVE - b LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x8_SAVE - -LDGEMM_L2x8_SUB2: - - KERNEL2x8_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SAVE: - - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 -#endif -LDGEMM_L2x8_END: - -LDGEMM_L2x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x4_SUB4 - -LDGEMM_L2x4_LOOP_START: - - LOAD2x4_1 - KERNEL2x4_I1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -2 - ble LDGEMM_L2x4_LOOP_END - - MY_ALIGN - -LDGEMM_L2x4_LOOP: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -1 - bgt LDGEMM_L2x4_LOOP - -LDGEMM_L2x4_LOOP_END: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_E2 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB4: - - KERNEL2x4_SUBI1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x4_SAVE - b LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x4_SAVE - -LDGEMM_L2x4_SUB2: - - KERNEL2x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SAVE: - - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 -#endif -LDGEMM_L2x4_END: - -LDGEMM_L2x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x2_SUB4 - -LDGEMM_L2x2_LOOP_START: - - LOAD2x2_1 - KERNEL2x2_I1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -2 - ble LDGEMM_L2x2_LOOP_END - - MY_ALIGN - -LDGEMM_L2x2_LOOP: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -1 - bgt LDGEMM_L2x2_LOOP - -LDGEMM_L2x2_LOOP_END: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_E2 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB4: - - KERNEL2x2_SUBI1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x2_SAVE - b LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x2_SAVE - -LDGEMM_L2x2_SUB2: - - KERNEL2x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SAVE: - - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 -#endif -LDGEMM_L2x2_END: - -LDGEMM_L2x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x1_SUB4 - -LDGEMM_L2x1_LOOP_START: - - LOAD2x1_1 - KERNEL2x1_I1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -2 - ble LDGEMM_L2x1_LOOP_END - - MY_ALIGN - -LDGEMM_L2x1_LOOP: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -1 - bgt LDGEMM_L2x1_LOOP - -LDGEMM_L2x1_LOOP_END: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_E2 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB4: - - KERNEL2x1_SUBI1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x1_SAVE - b LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x1_SAVE - -LDGEMM_L2x1_SUB2: - - KERNEL2x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SAVE: - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 -#endif -LDGEMM_L2x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LDGEMM_L2_END: -LDGEMM_L1_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 1 - ble LDGEMM_L1_END - mr CO, C - mr AO, A - srawi. I, M, 4 - ble LDGEMM_L1x16_END - -LDGEMM_L1x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x16_SUB4 - -LDGEMM_L1x16_LOOP_START: - - #dcbt AO, PRE - LOAD1x16_1 - #dcbt AO, PRE - KERNEL1x16_I1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -2 - ble LDGEMM_L1x16_LOOP_END - - MY_ALIGN - -LDGEMM_L1x16_LOOP: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -1 - bgt LDGEMM_L1x16_LOOP - -LDGEMM_L1x16_LOOP_END: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - KERNEL1x16_E2 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB4: - - #dcbt AO, PRE - KERNEL1x16_SUBI1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x16_SAVE - b LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x16_SAVE - -LDGEMM_L1x16_SUB2: - - KERNEL1x16_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SAVE: - - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt LDGEMM_L1x16_BEGIN - -LDGEMM_L1x16_END: - -LDGEMM_L1x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L1x1_END - - andi. T1, M, 8 - ble LDGEMM_L1x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x8_SUB4 - -LDGEMM_L1x8_LOOP_START: - - #dcbt AO, PRE - LOAD1x8_1 - KERNEL1x8_I1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -2 - ble LDGEMM_L1x8_LOOP_END - - MY_ALIGN - -LDGEMM_L1x8_LOOP: - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -1 - bgt LDGEMM_L1x8_LOOP - -LDGEMM_L1x8_LOOP_END: - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_2 - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_E2 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB4: - - KERNEL1x8_SUBI1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x8_SAVE - b LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x8_SAVE - -LDGEMM_L1x8_SUB2: - - KERNEL1x8_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SAVE: - - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 -#endif -LDGEMM_L1x8_END: - -LDGEMM_L1x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x4_SUB4 - -LDGEMM_L1x4_LOOP_START: - - LOAD1x4_1 - KERNEL1x4_I1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -2 - ble LDGEMM_L1x4_LOOP_END - - MY_ALIGN - -LDGEMM_L1x4_LOOP: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -1 - bgt LDGEMM_L1x4_LOOP - -LDGEMM_L1x4_LOOP_END: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_E2 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB4: - - KERNEL1x4_SUBI1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x4_SAVE - b LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x4_SAVE - -LDGEMM_L1x4_SUB2: - - KERNEL1x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SAVE: - - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 -#endif -LDGEMM_L1x4_END: - -LDGEMM_L1x2_BEGIN: - - - andi. 
T1, M, 2 - ble LDGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x2_SUB4 - -LDGEMM_L1x2_LOOP_START: - - LOAD1x2_1 - KERNEL1x2_I1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -2 - ble LDGEMM_L1x2_LOOP_END - - MY_ALIGN - -LDGEMM_L1x2_LOOP: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -1 - bgt LDGEMM_L1x2_LOOP - -LDGEMM_L1x2_LOOP_END: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_E2 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB4: - - KERNEL1x2_SUBI1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x2_SAVE - b LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x2_SAVE - -LDGEMM_L1x2_SUB2: - - KERNEL1x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SAVE: - - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 -#endif -LDGEMM_L1x2_END: - -LDGEMM_L1x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x1_SUB4 - -LDGEMM_L1x1_LOOP_START: - - LOAD1x1_1 - KERNEL1x1_I1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -2 - ble LDGEMM_L1x1_LOOP_END - - MY_ALIGN - -LDGEMM_L1x1_LOOP: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -1 - bgt LDGEMM_L1x1_LOOP - -LDGEMM_L1x1_LOOP_END: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_E2 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB4: - - KERNEL1x1_SUBI1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x1_SAVE - b LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x1_SAVE - -LDGEMM_L1x1_SUB2: - - KERNEL1x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SAVE: - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 -#endif -LDGEMM_L1x1_END: -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif -LDGEMM_L1_END: +/*************************************************************************** +Copyright (c) 2013-2019 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 2 + ble LDGEMM_L4_END + +LDGEMM_L4_BEGIN: + + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN: + + li L, -128 + + + SAVE4x16_REGS + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + + and T1, CO, L + and T2, C2, L + and T3, C3, L + and T4, C4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 + srawi. L, T3, 5 +#else + srawi. L, K, 5 +#endif + + ble LDGEMM_L4x16_SUB0 + + + MY_ALIGN +LDGEMM_L4x16_LOOP_START: + + li T2, 512 + + + LOAD4x16_1 + ##OffsetA=128 OffsetB=32 + addi AO,AO,2176 + # addi BO,BO,32 + addic. 
L, L, -1 + + ble LDGEMM_L4x16_LOOP_END + + + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + #dcbt AO, PRE + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_2 -2048,32, 15,1 + + + bdnz LDGEMM_L4x16_LOOP + + MY_ALIGN + MY_ALIGN +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_3 -2048,32, 15,1 + b LDGEMM_L4x16_SUB1 + + + MY_ALIGN +LDGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + KERNEL4x16 1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 + MY_ALIGN +LDGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + ble LDGEMM_L4x16_SAVE + MY_ALIGN +LDGEMM_L4x16_SUB2: + + andi. T1,L, 16 + ble LDGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_2 128,32, 3,0 + KERNEL4x16_I1_L2_2 128,32, 4,0 + KERNEL4x16_I1_L2_2 128,32, 5,0 + KERNEL4x16_I1_L2_2 128,32, 6,0 + KERNEL4x16_I1_L2_3 128,32, 7,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_8: + andi. T1,L, 8 + ble LDGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_3 128,32, 3,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_3 128,32, 1,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 128,32, 0,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN +LDGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LDGEMM_L4x16_BEGIN + +LDGEMM_L4x16_END: + +LDGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L4x1_END + + andi. T1, M, 8 + ble LDGEMM_L4x8_END + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 + srawi. L, T3, 4 +#else + mr BO, B + srawi. L, K, 4 +#endif + + + ble LDGEMM_L4x8_SUB0 + +LDGEMM_L4x8_LOOP_START: + + + LOAD4x8_1 + ##OffsetA=64 OffsetB=32 + + + addic. 
L, L, -1 + + ble LDGEMM_L4x8_LOOP_END + + mtctr L + MY_ALIGN + +LDGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_2 64,32, 7,1 + + bdnz LDGEMM_L4x8_LOOP + MY_ALIGN +LDGEMM_L4x8_LOOP_END: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_3 64,32, 7,1 + + b LDGEMM_L4x8_SUB1 + MY_ALIGN +LDGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + KERNEL4x8 1 + + addic. L, L, -1 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 + MY_ALIGN +LDGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + ble LDGEMM_L4x8_SAVE + MY_ALIGN +LDGEMM_L4x8_SUB2: + + andi. T1,L, 8 + ble LDGEMM_L4x8_SUB2_4 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_3 64,32, 3,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_3 64,32, 1,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 64,32, 0,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x8_SAVE + KERNEL4x8 0 + + MY_ALIGN +LDGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 +#endif +LDGEMM_L4x8_END: + +LDGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x4_SUB4 + +LDGEMM_L4x4_LOOP_START: + + #dcbt AO, PRE + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -2 + ble LDGEMM_L4x4_LOOP_END + + MY_ALIGN + +LDGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -1 + bgt LDGEMM_L4x4_LOOP + +LDGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x4_SAVE + +LDGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SAVE: + + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 +#endif +LDGEMM_L4x4_END: + +LDGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x2_SUB4 + +LDGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble LDGEMM_L4x2_LOOP_END + + MY_ALIGN + +LDGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt LDGEMM_L4x2_LOOP + +LDGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x2_SAVE + +LDGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SAVE: + + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 +#endif +LDGEMM_L4x2_END: + +LDGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x1_SUB4 + +LDGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble LDGEMM_L4x1_LOOP_END + + MY_ALIGN + +LDGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt LDGEMM_L4x1_LOOP + +LDGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x1_SAVE + +LDGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SAVE: + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 +#endif +LDGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + addic. J, J, -1 + bgt LDGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +LDGEMM_L4_END: + + b LDGEMM_L2_BEGIN + +.L999_H1: + + b .L999 + +LDGEMM_L2_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 2 + ble LDGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble LDGEMM_L2x16_END + +LDGEMM_L2x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x16_SUB4 + +LDGEMM_L2x16_LOOP_START: + + #dcbt AO, PRE + LOAD2x16_1 + #dcbt AO, PRE + KERNEL2x16_I1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble LDGEMM_L2x16_LOOP_END + + MY_ALIGN + +LDGEMM_L2x16_LOOP: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt LDGEMM_L2x16_LOOP + +LDGEMM_L2x16_LOOP_END: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB4: + + #dcbt AO, PRE + KERNEL2x16_SUBI1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x16_SAVE + +LDGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SAVE: + + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt LDGEMM_L2x16_BEGIN + +LDGEMM_L2x16_END: + +LDGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L2x1_END + + andi. T1, M, 8 + ble LDGEMM_L2x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x8_SUB4 + +LDGEMM_L2x8_LOOP_START: + + #dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble LDGEMM_L2x8_LOOP_END + + MY_ALIGN + +LDGEMM_L2x8_LOOP: + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt LDGEMM_L2x8_LOOP + +LDGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x8_SAVE + +LDGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SAVE: + + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 +#endif +LDGEMM_L2x8_END: + +LDGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x4_SUB4 + +LDGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble LDGEMM_L2x4_LOOP_END + + MY_ALIGN + +LDGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt LDGEMM_L2x4_LOOP + +LDGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x4_SAVE + +LDGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SAVE: + + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 +#endif +LDGEMM_L2x4_END: + +LDGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x2_SUB4 + +LDGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble LDGEMM_L2x2_LOOP_END + + MY_ALIGN + +LDGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt LDGEMM_L2x2_LOOP + +LDGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x2_SAVE + +LDGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SAVE: + + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 +#endif +LDGEMM_L2x2_END: + +LDGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x1_SUB4 + +LDGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble LDGEMM_L2x1_LOOP_END + + MY_ALIGN + +LDGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt LDGEMM_L2x1_LOOP + +LDGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x1_SAVE + +LDGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SAVE: + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 +#endif +LDGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 1 + ble LDGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble LDGEMM_L1x16_END + +LDGEMM_L1x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x16_SUB4 + +LDGEMM_L1x16_LOOP_START: + + #dcbt AO, PRE + LOAD1x16_1 + #dcbt AO, PRE + KERNEL1x16_I1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble LDGEMM_L1x16_LOOP_END + + MY_ALIGN + +LDGEMM_L1x16_LOOP: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt LDGEMM_L1x16_LOOP + +LDGEMM_L1x16_LOOP_END: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB4: + + #dcbt AO, PRE + KERNEL1x16_SUBI1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x16_SAVE + +LDGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SAVE: + + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt LDGEMM_L1x16_BEGIN + +LDGEMM_L1x16_END: + +LDGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L1x1_END + + andi. T1, M, 8 + ble LDGEMM_L1x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x8_SUB4 + +LDGEMM_L1x8_LOOP_START: + + #dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble LDGEMM_L1x8_LOOP_END + + MY_ALIGN + +LDGEMM_L1x8_LOOP: + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt LDGEMM_L1x8_LOOP + +LDGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x8_SAVE + +LDGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SAVE: + + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 +#endif +LDGEMM_L1x8_END: + +LDGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x4_SUB4 + +LDGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble LDGEMM_L1x4_LOOP_END + + MY_ALIGN + +LDGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt LDGEMM_L1x4_LOOP + +LDGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x4_SAVE + +LDGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SAVE: + + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 +#endif +LDGEMM_L1x4_END: + +LDGEMM_L1x2_BEGIN: + + + andi. 
T1, M, 2 + ble LDGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x2_SUB4 + +LDGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble LDGEMM_L1x2_LOOP_END + + MY_ALIGN + +LDGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt LDGEMM_L1x2_LOOP + +LDGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x2_SAVE + +LDGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SAVE: + + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 +#endif +LDGEMM_L1x2_END: + +LDGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x1_SUB4 + +LDGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble LDGEMM_L1x1_LOOP_END + + MY_ALIGN + +LDGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt LDGEMM_L1x1_LOOP + +LDGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x1_SAVE + +LDGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SAVE: + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 +#endif +LDGEMM_L1x1_END: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S index c4b8270b8..4eddab24f 100644 --- a/kernel/power/dgemm_macros_power9.S +++ b/kernel/power/dgemm_macros_power9.S @@ -1,3623 +1,3623 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - -/********************************************************************* -* Macros for N=4, M=16 * -*********************************************************************/ -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - xxlxor vs36,vs36,vs36 - xxlxor vs37,vs37,vs37 - xxlxor vs38,vs38,vs38 - xxlxor vs39,vs39,vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif -.endm - - -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -.macro KERNEL4x16_L1_L2 Index,IsLast - 
KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete - -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 -.else - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 -.endif - lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) - lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - -.else - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, vs26 -.endif - lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - - - - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - - - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - 
lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) -.endif - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) - lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 -.if \Complete==0 - lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) -.endif - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) -.endif - xvmaddadp vs52, vs12, vs30 - xvmaddadp vs53, vs13, vs30 - xvmaddadp vs54, vs14, vs30 - xvmaddadp vs55, vs15, vs30 -.if \Complete==0 - lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) -.endif - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 - - - xvmaddadp vs60, vs12, vs31 - - xvmaddadp vs61, vs13, vs31 - xvmaddadp vs62, vs14, vs31 - - xvmaddadp vs63, vs15, vs31 - .if \IsLast==1 - .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) - .else - addi \AREG, \AREG, DISP32(\Index,256) - addi \BREG, \BREG, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x16 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) - - - - addi BO, BO, 32 - addi AO, AO, 128 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, 
vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 - -.endif -.endm - -.macro SAVE4x16_REGS - add C2, CO, LDC - add C3, C2, LDC - add C4, C3, LDC -.endm - -.macro SAVE4x16 -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs24, 64(CO) - lxv vs26, 80(CO) - lxv vs28, 96(CO) - lxv vs30, 112(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C2) - lxv vs3, 16(C2) - lxv vs5, 32(C2) - lxv vs7, 48(C2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs25, 64(C2) - lxv vs27, 80(C2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 -#ifndef TRMMKERNEL - lxv vs29, 96(C2) - lxv vs31, 112(C2) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - -#endif - xxpermdi vs8, vs44,vs36,1 - xxpermdi vs9 ,vs36,vs44,1 - xxpermdi vs10, vs45,vs37,1 - xxpermdi vs11 ,vs37,vs45,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - xxpermdi vs12, vs46,vs38,1 - xxpermdi vs13 ,vs38,vs46,1 - xxpermdi vs14, vs47,vs39,1 - xxpermdi vs15 ,vs39,vs47,1 - -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r - -#endif - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - stxv vs24, 64(CO) - stxv vs26, 80(CO) - stxv vs28, 96(CO) - stxv vs30, 112(CO) - - stxv vs1, 0(C2) - stxv vs3, 16(C2) - stxv vs5, 32(C2) - stxv vs7, 48(C2) - - stxv vs25, 64(C2) - stxv vs27, 80(C2) - stxv vs29, 96(C2) - stxv vs31, 112(C2) -#ifndef TRMMKERNEL - lxv vs0, 0(C3) - lxv vs2, 16(C3) - lxv vs4, 32(C3) - lxv vs6, 48(C3) -#endif - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs24, 64(C3) - lxv vs26, 80(C3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs28, 96(C3) - lxv vs30, 112(C3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C4) - lxv vs3, 16(C4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(C4) - lxv vs7, 48(C4) - - lxv vs25, 64(C4) - lxv vs27, 80(C4) - lxv vs29, 96(C4) - lxv vs31, 112(C4) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r 
- xvmuldp vs3, vs11, alpha_r - -#endif - - xxpermdi vs8, vs60,vs52,1 - xxpermdi vs9 ,vs52,vs60,1 - xxpermdi vs10, vs61,vs53,1 - xxpermdi vs11 ,vs53,vs61,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - - - xxpermdi vs12, vs62,vs54,1 - xxpermdi vs13 ,vs54,vs62,1 - xxpermdi vs14, vs63,vs55,1 - xxpermdi vs15 ,vs55,vs63,1 -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r -#endif - stxv vs0, 0(C3) - stxv vs2, 16(C3) - stxv vs4, 32(C3) - stxv vs6, 48(C3) - - stxv vs24, 64(C3) - stxv vs26, 80(C3) - stxv vs28, 96(C3) - stxv vs30, 112(C3) - - stxv vs1, 0(C4) - stxv vs3, 16(C4) - stxv vs5, 32(C4) - stxv vs7, 48(C4) - - stxv vs25, 64(C4) - stxv vs27, 80(C4) - stxv vs29, 96(C4) - stxv vs31, 112(C4) - - addi CO, CO, 128 -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - -.endif -.endm - - - -.macro KERNEL4x8_L1_L2 Index,IsLast - KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index,0+\OffsetA)(AO) - lxv vs9, DISP16(\Index,16+\OffsetA)(AO) -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - - lxv vs10, DISP16(\Index,32+\OffsetA)(AO) - lxv vs11, DISP16(\Index,48+\OffsetA)(AO) - - - -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - - lxv vs28, DISP8(\Index,0 
+\OffsetB)(BO) - lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(AO) - lxv vs1, DISP16(\Index,80+\OffsetA)(AO) -.endif - - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.if \Complete==0 - lxv vs2, DISP16(\Index,96+\OffsetA)(AO) - lxv vs3, DISP16(\Index,112+\OffsetA)(AO) -.endif - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) - lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) -.endif - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - - .if \IsLast==1 - .if \Complete==1 - addi AO, AO, DISP16(\Index,64+\OffsetA) - addi BO, BO, DISP8(\Index,32+\OffsetB) - .else - addi AO, AO, DISP16(\Index,128) - addi BO, BO, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x8 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - - - addi BO, BO, 32 - addi AO, AO, 64 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - -.endif -.endm - - - -.macro SAVE4x8 - add T2, CO, LDC - add T3, T2, LDC - add T4, T3, LDC -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T2) - lxv vs3, 16(T2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T2) - lxv vs7, 48(T2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 - - - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - 
xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - - stxv vs1, 0(T2) - stxv vs3, 16(T2) - stxv vs5, 32(T2) - stxv vs7, 48(T2) - - - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs0, 0(T3) - lxv vs2, 16(T3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs4, 32(T3) - lxv vs6, 48(T3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T4) - lxv vs3, 16(T4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T4) - lxv vs7, 48(T4) - - - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(T3) - stxv vs2, 16(T3) - stxv vs4, 32(T3) - stxv vs6, 48(T3) - - - stxv vs1, 0(T4) - stxv vs3, 16(T4) - stxv vs5, 32(T4) - stxv vs7, 48(T4) - - - - addi CO, CO, 64 -.endm - - -/********************************************************************* -* Macros for N=4, M=4 * -*********************************************************************/ - -.macro LOAD4x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, 
vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro SAVE4x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=4, M=2 * -*********************************************************************/ - -.macro LOAD4x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_E2 - - - xvmaddadp vs32, vs8, 
vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r -#else - xvmuldp vs0, vs48, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r -#else - xvmuldp vs8, vs56, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=4, M=1 * -*********************************************************************/ - -.macro LOAD4x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx 
vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs48, alpha_r -#else - xsmuldp vs0, vs48, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs56, alpha_r -#else - xsmuldp vs8, vs56, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=2, M=16 * -*********************************************************************/ - -.macro LOAD2x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL2x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_E2 - - - xvmaddadp vs32, vs8, vs28 
- xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro SAVE2x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - 
xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD2x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro SAVE2x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - 
- stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=2, M=4 * -*********************************************************************/ - -.macro LOAD2x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=2, M=2 * -*********************************************************************/ - -.macro LOAD2x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - 
xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_E2 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=2, M=1 * -*********************************************************************/ - -.macro LOAD2x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=1, M=16 * -*********************************************************************/ - -.macro LOAD1x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, 
o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL1x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro SAVE1x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - 
xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD1x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro SAVE1x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=1, M=4 * -*********************************************************************/ - -.macro LOAD1x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro 
KERNEL1x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=1, M=2 * -*********************************************************************/ - -.macro LOAD1x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_E2 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=1, M=1 * -*********************************************************************/ - -.macro LOAD1x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_E2 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - 
stxsdx vs0, 0, T1 - - addi CO, CO, 8 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + xxlxor vs36,vs36,vs36 + xxlxor vs37,vs37,vs37 + xxlxor vs38,vs38,vs38 + xxlxor vs39,vs39,vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + + +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro KERNEL4x16_L1_L2 Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L2 
AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete + +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 +.else + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 +.endif + lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) + lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + +.else + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 +.endif + lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) +.endif + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) + lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + 
xvmaddadp vs43, vs11, vs29 +.if \Complete==0 + lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) +.endif + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) +.endif + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 +.if \Complete==0 + lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) +.endif + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + + xvmaddadp vs60, vs12, vs31 + + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + + xvmaddadp vs63, vs15, vs31 + .if \IsLast==1 + .if \Complete==1 + addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) + .else + addi \AREG, \AREG, DISP32(\Index,256) + addi \BREG, \BREG, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x16 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) + + + + addi BO, BO, 32 + addi AO, AO, 128 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endif +.endm + +.macro SAVE4x16_REGS + add C2, CO, LDC + add C3, C2, LDC + add C4, C3, LDC +.endm + +.macro SAVE4x16 +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + 
xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs24, 64(CO) + lxv vs26, 80(CO) + lxv vs28, 96(CO) + lxv vs30, 112(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C2) + lxv vs3, 16(C2) + lxv vs5, 32(C2) + lxv vs7, 48(C2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs25, 64(C2) + lxv vs27, 80(C2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 +#ifndef TRMMKERNEL + lxv vs29, 96(C2) + lxv vs31, 112(C2) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + xxpermdi vs8, vs44,vs36,1 + xxpermdi vs9 ,vs36,vs44,1 + xxpermdi vs10, vs45,vs37,1 + xxpermdi vs11 ,vs37,vs45,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + xxpermdi vs12, vs46,vs38,1 + xxpermdi vs13 ,vs38,vs46,1 + xxpermdi vs14, vs47,vs39,1 + xxpermdi vs15 ,vs39,vs47,1 + +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r + +#endif + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + stxv vs24, 64(CO) + stxv vs26, 80(CO) + stxv vs28, 96(CO) + stxv vs30, 112(CO) + + stxv vs1, 0(C2) + stxv vs3, 16(C2) + stxv vs5, 32(C2) + stxv vs7, 48(C2) + + stxv vs25, 64(C2) + stxv vs27, 80(C2) + stxv vs29, 96(C2) + stxv vs31, 112(C2) +#ifndef TRMMKERNEL + lxv vs0, 0(C3) + lxv vs2, 16(C3) + lxv vs4, 32(C3) + lxv vs6, 48(C3) +#endif + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs24, 64(C3) + lxv vs26, 80(C3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs28, 96(C3) + lxv vs30, 112(C3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C4) + lxv vs3, 16(C4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(C4) + lxv vs7, 48(C4) + + lxv vs25, 64(C4) + lxv vs27, 80(C4) + lxv vs29, 96(C4) + lxv vs31, 112(C4) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + + xxpermdi vs8, vs60,vs52,1 + xxpermdi vs9 ,vs52,vs60,1 + xxpermdi vs10, vs61,vs53,1 + xxpermdi vs11 ,vs53,vs61,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + + + xxpermdi vs12, vs62,vs54,1 + xxpermdi vs13 
,vs54,vs62,1 + xxpermdi vs14, vs63,vs55,1 + xxpermdi vs15 ,vs55,vs63,1 +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r +#endif + stxv vs0, 0(C3) + stxv vs2, 16(C3) + stxv vs4, 32(C3) + stxv vs6, 48(C3) + + stxv vs24, 64(C3) + stxv vs26, 80(C3) + stxv vs28, 96(C3) + stxv vs30, 112(C3) + + stxv vs1, 0(C4) + stxv vs3, 16(C4) + stxv vs5, 32(C4) + stxv vs7, 48(C4) + + stxv vs25, 64(C4) + stxv vs27, 80(C4) + stxv vs29, 96(C4) + stxv vs31, 112(C4) + + addi CO, CO, 128 +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + +.endif +.endm + + + +.macro KERNEL4x8_L1_L2 Index,IsLast + KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index,0+\OffsetA)(AO) + lxv vs9, DISP16(\Index,16+\OffsetA)(AO) +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + + lxv vs10, DISP16(\Index,32+\OffsetA)(AO) + lxv vs11, DISP16(\Index,48+\OffsetA)(AO) + + + +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + + lxv vs28, DISP8(\Index,0 +\OffsetB)(BO) + lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + 
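+/* First==1: the xvmuldp block above seeds the vs56-vs59 accumulators on the first K iteration; the .else path below accumulates into them with xvmaddadp */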
+.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(AO) + lxv vs1, DISP16(\Index,80+\OffsetA)(AO) +.endif + + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.if \Complete==0 + lxv vs2, DISP16(\Index,96+\OffsetA)(AO) + lxv vs3, DISP16(\Index,112+\OffsetA)(AO) +.endif + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) + lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) +.endif + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + + .if \IsLast==1 + .if \Complete==1 + addi AO, AO, DISP16(\Index,64+\OffsetA) + addi BO, BO, DISP8(\Index,32+\OffsetB) + .else + addi AO, AO, DISP16(\Index,128) + addi BO, BO, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x8 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + + + addi BO, BO, 32 + addi AO, AO, 64 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + +.endif +.endm + + + +.macro SAVE4x8 + add T2, CO, LDC + add T3, T2, LDC + add T4, T3, LDC +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T2) + lxv vs3, 16(T2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T2) + lxv vs7, 48(T2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 + + + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(CO) + stxv vs2, 16(CO) + 
stxv vs4, 32(CO) + stxv vs6, 48(CO) + + + stxv vs1, 0(T2) + stxv vs3, 16(T2) + stxv vs5, 32(T2) + stxv vs7, 48(T2) + + + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs0, 0(T3) + lxv vs2, 16(T3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs4, 32(T3) + lxv vs6, 48(T3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T4) + lxv vs3, 16(T4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T4) + lxv vs7, 48(T4) + + + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(T3) + stxv vs2, 16(T3) + stxv vs4, 32(T3) + stxv vs6, 48(T3) + + + stxv vs1, 0(T4) + stxv vs3, 16(T4) + stxv vs5, 32(T4) + stxv vs7, 48(T4) + + + + addi CO, CO, 64 +.endm + + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp 
vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + 
lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, 
alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + 
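+	/* KERNEL2x16_SUBI1: load 16 doubles of A (vs0-vs7) and two splatted B values
+	   (vs24, vs25), then start the accumulators vs32-vs47 with plain multiplies
+	   (xvmuldp); the matching KERNEL2x16_SUB1 accumulates with xvmaddadp instead. */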
lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
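+* (the LOAD2x8_1 / KERNEL2x8_* / SAVE2x8 macros that follow handle the N=2, M=8 tile) *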
+*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + 
xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, 
vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, 
vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
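+* (the LOAD1x8_1 / KERNEL1x8_* / SAVE1x8 macros that follow handle the N=1, M=8 tile) *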
+*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, 
vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* 
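+// (SHIFT_REG REG1,REG2,C above turns an element count into a byte offset for 8-byte
+//  doubles, i.e. REG1 = REG2 * C * 8, implemented as a left shift: C==16 -> slwi by 7,
+//  C==8 -> by 6, ... C==1 -> by 3.)
+// The pointer refresh below is roughly the following C, with 'off' the current TRMM offset: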
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index bd74d20e5..58dcdec5a 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -1,328 +1,328 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - -#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code - -#if !defined(USE_MASK_PERMUTATIONS) - -static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgew %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgow %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { - - BLASLONG index; - BLASLONG i=0; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned int static_index0 = {0,1,2,3}; -#else - register __vector unsigned int static_index0 = {2,0,3,1}; -#endif - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0; - register __vector unsigned int static_index2=static_index0 +temp1; - register __vector unsigned int static_index3=static_index1 +temp1; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - - register __vector float * v_ptrx=(__vector float *)x; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; -#endif - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vv0,vf0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(vv0,quadruple_values); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs 
are exactly the same we will choose minimum between index - // otherwise we will assign index of the maximum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - - - - - - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - max = ciamax_kernel_32(n1, x, &maxf); - i = n1; - ix = n1 << 1; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); - - } else { - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (max + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code + +#if !defined(USE_MASK_PERMUTATIONS) + +static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgew %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgow %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i=0; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned int static_index0 = {0,1,2,3}; +#else + register __vector unsigned int static_index0 = {2,0,3,1}; +#endif + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0; + register __vector unsigned int static_index2=static_index0 +temp1; + register __vector unsigned int static_index3=static_index1 +temp1; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; +#endif + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + 
}else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 336766245..843370c6c 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -1,266 +1,266 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - - - -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - float first_min=CABS1(x,0); - register __vector float quadruple_values={first_min,first_min,first_min,first_min}; - - register __vector float * v_ptrx=(__vector float *)x; - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vf0,vv0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(quadruple_values,vv0); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs are exactly the same we will choose minimum between index - // otherwise we will assign index of the minimum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = ciamin_kernel_32(n1, x, &minf); - i = n1; - ix = n1 << 1; - } - - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); - - } else { - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (min + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + 
unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c index bf1af78d6..fb2dafec0 100644 --- a/kernel/power/isamax.c +++ b/kernel/power/isamax.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
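The scalar epilogue above (shared by the icamin, isamax and isamin kernels in this patch) reduces the four per-lane (value, index) candidates left by the vector loop pairwise: when two values tie, the smaller index wins; otherwise the index follows the winning value. A small C sketch of that reduction for the minimum case, with hypothetical helper names and assuming the same tie rule:

    /* Reduce two (value, index) candidates for a minimum search:
       equal values keep the smaller index, otherwise the smaller value wins. */
    static void min_pair(float *val, unsigned int *idx,
                         float v2, unsigned int i2) {
        if (*val == v2) {
            if (i2 < *idx) *idx = i2;
        } else if (v2 < *val) {
            *val = v2;
            *idx = i2;
        }
    }

    /* Collapse the four SIMD lanes (a1..a4, i1..i4) to one result,
       mirroring the pairwise if/else chain in the kernel epilogue. */
    static unsigned int reduce4_min(float a1, float a2, float a3, float a4,
                                    unsigned int i1, unsigned int i2,
                                    unsigned int i3, unsigned int i4,
                                    float *minf) {
        min_pair(&a1, &i1, a2, i2);   /* lanes 0 and 1 */
        min_pair(&a3, &i3, a4, i4);   /* lanes 2 and 3 */
        min_pair(&a1, &i1, a3, i3);   /* winners of each pair */
        *minf = a1;
        return i1;
    }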
- *****************************************************************************/ -#include "common.h" -#include -#include - - -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - register __vector float * v_ptrx=(__vector float *)x; - for(; ii2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - max = siamax_kernel_64(n1, x, &maxf); - - i = n1; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); - } -} +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c index 1c1f0ad78..60c843f58 100644 --- a/kernel/power/isamin.c +++ b/kernel/power/isamin.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The 
OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; - register __vector float * v_ptrx=(__vector float *)x; - register __vector float quadruple_values=vec_abs(v_ptrx[0]); - for(; ii2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = siamin_kernel_64(n1, x, &minf); - i = n1; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); - } -} 
+/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < 
minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index 7a0f3143e..5cdc83d87 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -1,272 +1,272 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
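For inc_x != 1 these isamin/isamax kernels fall back to a plain strided scan, unrolled by four: the main loop covers n & -4 logical elements with four compares per iteration, and a short tail loop finishes the rest. A compact C reference sketch of that pattern for the minimum case (hypothetical name isamin_strided_ref, same 1-based return convention; the initial-value choice here is only illustrative):

    #include <math.h>
    #include <stdint.h>

    typedef int64_t BLASLONG;

    /* Strided isamin reference: scan n elements spaced inc_x apart,
       4 compares per main-loop iteration, scalar tail for the remainder. */
    static BLASLONG isamin_strided_ref(BLASLONG n, const float *x, BLASLONG inc_x) {
        if (n <= 0 || inc_x <= 0) return 0;
        BLASLONG i = 0, j = 0, min = 0;
        float minf = fabsf(x[0]);
        BLASLONG n1 = n & -4;                 /* largest multiple of 4 <= n */
        while (j < n1) {
            for (int k = 0; k < 4; k++) {     /* the kernel writes this out by hand */
                float v = fabsf(x[i + k * inc_x]);
                if (v < minf) { minf = v; min = j + k; }
            }
            i += 4 * inc_x;
            j += 4;
        }
        while (j < n) {                       /* remainder */
            float v = fabsf(x[i]);
            if (v < minf) { minf = v; min = j; }
            i += inc_x;
            j++;
        }
        return min + 1;                       /* 1-based BLAS index */
    }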
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs20 -#define save_permute_1 vs21 -#define save_permute_2 vs22 -#define permute_mask vs23 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define T11 r29 - -#define T12 r30 -#define T13 r31 - -#include "sgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_11, 0x1415161718191a1b -.equ save_permute_12, 0x0405060708090a0b -.equ save_permute_21, 0x101112131c1d1e1f -.equ save_permute_22, 0x000102030c0d0e0f - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - slwi LDC, LDC, 2 - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - lis T5, save_permute_22@highest - lis T6, save_permute_21@highest - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - ori T5, T5, save_permute_22@higher - ori T6, T6, save_permute_21@higher - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - rldicr T5, T5, 32, 31 - rldicr T6, T6, 32, 31 - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - oris T5, T5, save_permute_22@h - oris T6, T6, save_permute_21@h - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - ori T5, T5, save_permute_22@l - ori T6, T6, save_permute_21@l - li r0,0 - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - mtvsrdd save_permute_2,T5,T6 - -#include "sgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
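The prologue above materializes each 64-bit permute constant from four 16-bit pieces: lis/ori build the upper 32 bits, rldicr shifts them into place, and oris/ori fill in the lower 32 bits before mtvsrdd moves the pair into a VSX register. A C sketch of what that lis/ori/rldicr/oris/ori chain computes (hypothetical function name, for illustration only):

    #include <stdint.h>

    /* Compose a 64-bit constant from its four 16-bit pieces the way the
       lis/ori/rldicr/oris/ori sequence in the prologue does. */
    static uint64_t build64(uint16_t highest,  /* bits 63..48  (@highest) */
                            uint16_t higher,   /* bits 47..32  (@higher)  */
                            uint16_t high,     /* bits 31..16  (@h)       */
                            uint16_t low) {    /* bits 15..0   (@l)       */
        uint64_t r = (uint64_t)highest << 16;  /* lis    r, c@highest      */
        r |= higher;                           /* ori    r, r, c@higher    */
        r <<= 32;                              /* rldicr r, r, 32, 31      */
        r |= (uint64_t)high << 16;             /* oris   r, r, c@h         */
        r |= low;                              /* ori    r, r, c@l         */
        return r;
    }

    /* Example with the .equ above:
       build64(0x0405, 0x0607, 0x0001, 0x0203) == 0x0405060700010203 (perm_const1) */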
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 + +#include "sgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index a34ed32b8..4022959e2 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,2192 +1,2192 @@ -#define MY_ALIGN .align 3 -b L8 - - MY_ALIGN -LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_2 - MY_ALIGN - -LSGEMM_L8x16_LOOP: - KERNEL8x16_L2 128,64,0,0 -LSGEMM_L8x16_K128: - KERNEL8x16_L2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64, 3,0 - KERNEL8x16_I1_L4_2 128,64, 4,0 - KERNEL8x16_I1_L4_2 128,64, 5,0 - KERNEL8x16_I1_L4_2 128,64, 6,0 - KERNEL8x16_I1_L4_2 128,64, 7,0 - KERNEL8x16_I1_L4_2 128,64, 8,0 - KERNEL8x16_I1_L4_2 128,64, 9,0 - KERNEL8x16_I1_L4_2 128,64, 10,0 - KERNEL8x16_I1_L4_2 128,64, 11,0 - KERNEL8x16_I1_L4_2 128,64, 12,0 - KERNEL8x16_I1_L4_2 128,64, 13,0 - KERNEL8x16_I1_L4_2 128,64, 14,0 - KERNEL8x16_I1_L4_2 128,64, 15,0 - KERNEL8x16_I1_L4_2 128,64, 16,0 - KERNEL8x16_I1_L4_2 128,64, 17,0 - KERNEL8x16_I1_L4_2 128,64, 18,0 - KERNEL8x16_I1_L4_2 128,64, 19,0 - KERNEL8x16_I1_L4_2 128,64, 20,0 - KERNEL8x16_I1_L4_2 128,64, 21,0 - KERNEL8x16_I1_L4_2 128,64, 22,0 - KERNEL8x16_I1_L4_2 128,64, 23,0 - KERNEL8x16_I1_L4_2 128,64, 24,0 - KERNEL8x16_I1_L4_2 128,64, 25,0 - KERNEL8x16_I1_L4_2 128,64, 26,0 - KERNEL8x16_I1_L4_2 
128,64, 27,0 - KERNEL8x16_I1_L4_2 128,64, 28,0 - KERNEL8x16_I1_L4_2 128,64, 29,0 - KERNEL8x16_I1_L4_2 128,64, 30,0 - KERNEL8x16_I1_L4_2 128,64, 31,1 - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - END8x16_2 - blr - - MY_ALIGN -LSGEMM_L8x16_L64_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_2 128,64,7,0 - KERNEL8x16_I1_L4_2 128,64,8,0 - KERNEL8x16_I1_L4_2 128,64,9,0 - KERNEL8x16_I1_L4_2 128,64,10,0 - KERNEL8x16_I1_L4_2 128,64,11,0 - KERNEL8x16_I1_L4_2 128,64,12,0 - KERNEL8x16_I1_L4_2 128,64,13,0 - KERNEL8x16_I1_L4_2 128,64,14,0 - KERNEL8x16_I1_L4_3 128,64,15,1 - blr -LSGEMM_L8x16_L32_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_3 128,64,7,1 - blr - -LSGEMM_L8x16_L16_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_3 128,64,3,1 - blr - -L8: -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 3 - - ble LSGEMM_L8_END - -LSGEMM_L8_BEGIN: - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 3 - add C, C, T3 - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L8x16_END - - MY_ALIGN -LSGEMM_L8x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 - mr T12, T11 - addi T12,T12, -2 - srawi. L, T12, 7 /**(T11-2) % 128x */ -#else - mr T12, K - addi T12,T12, -2 - srawi. L, T12, 7 /**(K-2) % 128x */ -#endif - - ZERO8x16 - ble LSGEMM_L8x16_SUB0 - mtctr L - bl LSGEMM_L8x16_LMAIN_SUB - andi. L, T12, 127 - ble LSGEMM_L8x16_SAVE - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 255 - cmpwi T11,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T10,1 - bne CMP8x16_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD8x16 64,32 - END8x16_WITHOUT_ADD - LOAD8x16_2O AO,BO, 128, 64 - mtctr T10 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE -CMP8x16_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T11,128 -#else - cmpwi K,128 -#endif - bne LSGEMM_L8x16_SUB2 - MY_ALIGN - mtctr T10 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD8x16_2O AO,BO, 128,64 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE - MY_ALIGN -LSGEMM_L8x16_SUB2: - andi. T10,L,64 - ble LSGEMM_L8x16_SUB2_32 - bl LSGEMM_L8x16_L64_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_32: - andi. T10,L, 32 - ble LSGEMM_L8x16_SUB2_16 - bl LSGEMM_L8x16_L32_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L8x16_SUB2_8 - bl LSGEMM_L8x16_L16_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_3 128,64, 1,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_2 - KERNEL8x16_I1_L4_3 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_2 - KERNEL8x16_E2 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_1: - andi. 
T10,L, 1 - ble LSGEMM_L8x16_SAVE - KERNEL8x16 0 - - - MY_ALIGN -LSGEMM_L8x16_SAVE: - SAVE8x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L8x16_BEGIN - MY_ALIGN -LSGEMM_L8x16_END: -LSGEMM_L8x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 8 - ble LSGEMM_L8x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x8 - ble LSGEMM_L8x8_SUB0 - - MY_ALIGN -LSGEMM_L8x8_LOOP_START: - - LOAD8x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x8_LOOP: - - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_2 32,32, 1,0 - KERNEL8x8_I1_L4_2 32,32, 2,0 - KERNEL8x8_I1_L4_2 32,32, 3,1 - - bdnz LSGEMM_L8x8_LOOP - - MY_ALIGN -LSGEMM_L8x8_LOOP_END: - - END8x8 0, AO, BO, 32, 32 - - b LSGEMM_L8x8_SUB1 - MY_ALIGN -LSGEMM_L8x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x8_SUB2 - MY_ALIGN -LSGEMM_L8x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x8_SAVE - MY_ALIGN -LSGEMM_L8x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x8_SUB2_LOOP: - LOAD8x8_0 - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_3 32,32, 1,1 - bdnz LSGEMM_L8x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x8_SUB2_2 - LOAD8x8_0 - KERNEL8x8_I1_L4_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x8_SUB2_1 - LOAD8x8_0 - KERNEL8x8_I1_L2_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x8_SAVE - KERNEL8x8 0 - - - MY_ALIGN -LSGEMM_L8x8_SAVE: - SAVE8x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 -#endif - MY_ALIGN -LSGEMM_L8x8_END: -LSGEMM_L8x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 4 - ble LSGEMM_L8x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x4 - ble LSGEMM_L8x4_SUB0 - - MY_ALIGN -LSGEMM_L8x4_LOOP_START: - - LOAD8x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x4_LOOP: - - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_2 16,32, 1,0 - KERNEL8x4_I1_L4_2 16,32, 2,0 - KERNEL8x4_I1_L4_2 16,32, 3,1 - - bdnz LSGEMM_L8x4_LOOP - - MY_ALIGN -LSGEMM_L8x4_LOOP_END: - - END8x4 0, AO, BO, 16, 32 - - b LSGEMM_L8x4_SUB1 - MY_ALIGN -LSGEMM_L8x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x4_SUB2 - MY_ALIGN -LSGEMM_L8x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x4_SAVE - MY_ALIGN -LSGEMM_L8x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x4_SUB2_LOOP: - LOAD8x4_0 - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_3 16,32, 1,1 - bdnz LSGEMM_L8x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x4_SUB2_2 - LOAD8x4_0 - KERNEL8x4_I1_L4_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L8x4_SUB2_1 - LOAD8x4_0 - KERNEL8x4_I1_L2_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x4_SAVE - KERNEL8x4 0 - - - MY_ALIGN -LSGEMM_L8x4_SAVE: - SAVE8x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 -#endif - MY_ALIGN -LSGEMM_L8x4_END: -LSGEMM_L8x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L8x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x2 - ble LSGEMM_L8x2_SUB0 - - MY_ALIGN -LSGEMM_L8x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x2_LOOP: - - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,0 - KERNEL8x2_2 0,0, 2,0 - KERNEL8x2_2 0,0, 3,1 - - bdnz LSGEMM_L8x2_LOOP - - MY_ALIGN -LSGEMM_L8x2_LOOP_END: - -LSGEMM_L8x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x2_SAVE - MY_ALIGN -LSGEMM_L8x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x2_SUB2_2 - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x2_SUB2_1 - KERNEL8x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x2_SAVE - KERNEL8x2 - - MY_ALIGN -LSGEMM_L8x2_SAVE: - SAVE8x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 -#endif - MY_ALIGN -LSGEMM_L8x2_END: -LSGEMM_L8x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L8x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x1 - ble LSGEMM_L8x1_SUB0 - - MY_ALIGN -LSGEMM_L8x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x1_LOOP: - - KERNEL8x1_4 0,0, 0,0 - KERNEL8x1_4 0,0, 1,1 - - bdnz LSGEMM_L8x1_LOOP - - MY_ALIGN -LSGEMM_L8x1_LOOP_END: - -LSGEMM_L8x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x1_SAVE - MY_ALIGN -LSGEMM_L8x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x1_SUB2_2 - KERNEL8x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x1_SUB2_1 - KERNEL8x1_2 - MY_ALIGN -LSGEMM_L8x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x1_SAVE - KERNEL8x1 - - MY_ALIGN -LSGEMM_L8x1_SAVE: - SAVE8x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 -#endif - MY_ALIGN -LSGEMM_L8x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 8 -#endif - addic. J, J, -1 - bgt LSGEMM_L8_BEGIN - - -LSGEMM_L8_END: - -/* b LSGEMM_L4_BEGIN*/ - andi. T1, N, 4 - ble LSGEMM_L4_END -LSGEMM_L4_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L4x16_END - - MY_ALIGN -LSGEMM_L4x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. 
L, T12, 6 /**(K-1) % 64x */ -#endif - - ZERO4x16 - ble LSGEMM_L4x16_SUB0 - - MY_ALIGN -LSGEMM_L4x16_LOOP_START: - - LOAD4x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=16 - addi AO,AO,2112 - addi BO,BO,16 - - mtctr L - - MY_ALIGN - -LSGEMM_L4x16_LOOP: - - KERNEL4x16_I1_L4_2 -2048,0, 0,0 - KERNEL4x16_I1_L4_2 -2048,0, 1,0 - KERNEL4x16_I1_L4_2 -2048,0, 2,0 - KERNEL4x16_I1_L4_2 -2048,0, 3,0 - KERNEL4x16_I1_L4_2 -2048,0, 4,0 - KERNEL4x16_I1_L4_2 -2048,0, 5,0 - KERNEL4x16_I1_L4_2 -2048,0, 6,0 - KERNEL4x16_I1_L4_2 -2048,0, 7,0 - KERNEL4x16_I1_L4_2 -2048,0, 8,0 - KERNEL4x16_I1_L4_2 -2048,0, 9,0 - KERNEL4x16_I1_L4_2 -2048,0, 10,0 - KERNEL4x16_I1_L4_2 -2048,0, 11,0 - KERNEL4x16_I1_L4_2 -2048,0, 12,0 - KERNEL4x16_I1_L4_2 -2048,0, 13,0 - KERNEL4x16_I1_L4_2 -2048,0, 14,0 - KERNEL4x16_I1_L4_2 -2048,0, 15,1 - - bdnz LSGEMM_L4x16_LOOP - - MY_ALIGN -LSGEMM_L4x16_LOOP_END: - - END4x16 0, AO, BO, -2048, 0 - - b LSGEMM_L4x16_SUB1 - MY_ALIGN -LSGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 127 -#else - andi. L, K, 127 -#endif - b LSGEMM_L4x16_SUB2 - MY_ALIGN -LSGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L4x16_SAVE - MY_ALIGN -LSGEMM_L4x16_SUB2: - - srawi. T10,L, 5 - ble LSGEMM_L4x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L4x16_SUB2_LOOP: - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_2 64,16, 3,0 - KERNEL4x16_I1_L4_2 64,16, 4,0 - KERNEL4x16_I1_L4_2 64,16, 5,0 - KERNEL4x16_I1_L4_2 64,16, 6,0 - KERNEL4x16_I1_L4_3 64,16, 7,1 - bdnz LSGEMM_L4x16_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_3 64,16, 3,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_3 64,16, 1,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L4_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L4x16_SUB2 - - MY_ALIGN -LSGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L4x16_BEGIN - MY_ALIGN -LSGEMM_L4x16_END: -LSGEMM_L4x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 8 - ble LSGEMM_L4x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x8 - ble LSGEMM_L4x8_SUB0 - - MY_ALIGN -LSGEMM_L4x8_LOOP_START: - - LOAD4x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_2 32,16, 1,0 - KERNEL4x8_I1_L4_2 32,16, 2,0 - KERNEL4x8_I1_L4_2 32,16, 3,1 - - bdnz LSGEMM_L4x8_LOOP - - MY_ALIGN -LSGEMM_L4x8_LOOP_END: - - END4x8 0, AO, BO, 32, 16 - - b LSGEMM_L4x8_SUB1 - MY_ALIGN -LSGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. 
L, K, 31 -#endif - b LSGEMM_L4x8_SUB2 - MY_ALIGN -LSGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x8_SAVE - MY_ALIGN -LSGEMM_L4x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x8_SUB2_LOOP: - LOAD4x8_0 - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_3 32,16, 1,1 - bdnz LSGEMM_L4x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L4_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x8_SAVE - KERNEL4x8 0 - - - MY_ALIGN -LSGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 -#endif - MY_ALIGN -LSGEMM_L4x8_END: -LSGEMM_L4x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 4 - ble LSGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x4 - ble LSGEMM_L4x4_SUB0 - - MY_ALIGN -LSGEMM_L4x4_LOOP_START: - - LOAD4x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x4_LOOP: - - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_2 16,16, 1,0 - KERNEL4x4_I1_L4_2 16,16, 2,0 - KERNEL4x4_I1_L4_2 16,16, 3,1 - - bdnz LSGEMM_L4x4_LOOP - - MY_ALIGN -LSGEMM_L4x4_LOOP_END: - - END4x4 0, AO, BO, 16, 16 - - b LSGEMM_L4x4_SUB1 - MY_ALIGN -LSGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L4x4_SUB2 - MY_ALIGN -LSGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x4_SAVE - MY_ALIGN -LSGEMM_L4x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x4_SUB2_LOOP: - LOAD4x4_0 - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_3 16,16, 1,1 - bdnz LSGEMM_L4x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x4_SUB2_2 - LOAD4x4_0 - KERNEL4x4_I1_L4_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x4_SUB2_1 - LOAD4x4_0 - KERNEL4x4_I1_L2_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x4_SAVE - KERNEL4x4 0 - - - MY_ALIGN -LSGEMM_L4x4_SAVE: - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 -#endif - MY_ALIGN -LSGEMM_L4x4_END: -LSGEMM_L4x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L4x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x2 - ble LSGEMM_L4x2_SUB0 - - MY_ALIGN -LSGEMM_L4x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x2_LOOP: - - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,0 - KERNEL4x2_2 0,0, 2,0 - KERNEL4x2_2 0,0, 3,1 - - bdnz LSGEMM_L4x2_LOOP - - MY_ALIGN -LSGEMM_L4x2_LOOP_END: - -LSGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x2_SAVE - MY_ALIGN -LSGEMM_L4x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x2_SUB2_2 - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L4x2_SUB2_1 - KERNEL4x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -LSGEMM_L4x2_SAVE: - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 -#endif - MY_ALIGN -LSGEMM_L4x2_END: -LSGEMM_L4x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x1 - ble LSGEMM_L4x1_SUB0 - - MY_ALIGN -LSGEMM_L4x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x1_LOOP: - - KERNEL4x1_4 0,0, 0,0 - KERNEL4x1_4 0,0, 1,1 - - bdnz LSGEMM_L4x1_LOOP - - MY_ALIGN -LSGEMM_L4x1_LOOP_END: - -LSGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x1_SAVE - MY_ALIGN -LSGEMM_L4x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x1_SUB2_2 - KERNEL4x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x1_SUB2_1 - KERNEL4x1_2 - MY_ALIGN -LSGEMM_L4x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -LSGEMM_L4x1_SAVE: - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 -#endif - MY_ALIGN -LSGEMM_L4x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - - andi. T2, N, 3 - ble .L999 - -LSGEMM_L4_END: - andi. T1, N, 2 - ble LSGEMM_L2_END -LSGEMM_L2_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 1 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L2x16_END - - MY_ALIGN -LSGEMM_L2x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x16 - ble LSGEMM_L2x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x16_LOOP: - - KERNEL2x16_4 -2048,0, 0,0 - KERNEL2x16_4 -2048,0, 1,0 - KERNEL2x16_4 -2048,0, 2,0 - KERNEL2x16_4 -2048,0, 3,0 - KERNEL2x16_4 -2048,0, 4,0 - KERNEL2x16_4 -2048,0, 5,0 - KERNEL2x16_4 -2048,0, 6,0 - KERNEL2x16_4 -2048,0, 7,0 - KERNEL2x16_4 -2048,0, 8,0 - KERNEL2x16_4 -2048,0, 9,0 - KERNEL2x16_4 -2048,0, 10,0 - KERNEL2x16_4 -2048,0, 11,0 - KERNEL2x16_4 -2048,0, 12,0 - KERNEL2x16_4 -2048,0, 13,0 - KERNEL2x16_4 -2048,0, 14,0 - KERNEL2x16_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x16_SAVE - MY_ALIGN -LSGEMM_L2x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x16_SUB2_16 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,0 - KERNEL2x16_4 0,0, 4,0 - KERNEL2x16_4 0,0, 5,0 - KERNEL2x16_4 0,0, 6,0 - KERNEL2x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x16_SUB2_8 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x16_SUB2_4 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x16_SUB2_2 - KERNEL2x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_2: - andi. 
T10,L, 2 - ble LSGEMM_L2x16_SUB2_1 - KERNEL2x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x16_SAVE - KERNEL2x16 - - MY_ALIGN -LSGEMM_L2x16_SAVE: - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L2x16_BEGIN - MY_ALIGN -LSGEMM_L2x16_END: - andi. I, M, 8 - ble LSGEMM_L2x8_END - - MY_ALIGN -LSGEMM_L2x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x8 - ble LSGEMM_L2x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x8_LOOP: - - KERNEL2x8_4 -2048,0, 0,0 - KERNEL2x8_4 -2048,0, 1,0 - KERNEL2x8_4 -2048,0, 2,0 - KERNEL2x8_4 -2048,0, 3,0 - KERNEL2x8_4 -2048,0, 4,0 - KERNEL2x8_4 -2048,0, 5,0 - KERNEL2x8_4 -2048,0, 6,0 - KERNEL2x8_4 -2048,0, 7,0 - KERNEL2x8_4 -2048,0, 8,0 - KERNEL2x8_4 -2048,0, 9,0 - KERNEL2x8_4 -2048,0, 10,0 - KERNEL2x8_4 -2048,0, 11,0 - KERNEL2x8_4 -2048,0, 12,0 - KERNEL2x8_4 -2048,0, 13,0 - KERNEL2x8_4 -2048,0, 14,0 - KERNEL2x8_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x8_SAVE - MY_ALIGN -LSGEMM_L2x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x8_SUB2_16 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,0 - KERNEL2x8_4 0,0, 4,0 - KERNEL2x8_4 0,0, 5,0 - KERNEL2x8_4 0,0, 6,0 - KERNEL2x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x8_SUB2_8 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x8_SUB2_4 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x8_SUB2_2 - KERNEL2x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x8_SUB2_1 - KERNEL2x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -LSGEMM_L2x8_SAVE: - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 -#endif - MY_ALIGN -LSGEMM_L2x8_END: - andi. I, M, 4 - ble LSGEMM_L2x4_END - - MY_ALIGN -LSGEMM_L2x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x4 - ble LSGEMM_L2x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x4_LOOP: - - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,0 - KERNEL2x4_4 0,0, 8,0 - KERNEL2x4_4 0,0, 9,0 - KERNEL2x4_4 0,0, 10,0 - KERNEL2x4_4 0,0, 11,0 - KERNEL2x4_4 0,0, 12,0 - KERNEL2x4_4 0,0, 13,0 - KERNEL2x4_4 0,0, 14,0 - KERNEL2x4_4 0,0, 15,1 - - bdnz LSGEMM_L2x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x4_SAVE - MY_ALIGN -LSGEMM_L2x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_L2x4_SUB2_16 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x4_SUB2_8 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x4_SUB2_4 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x4_SUB2_2 - KERNEL2x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x4_SUB2_1 - KERNEL2x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x4_SAVE - KERNEL2x4 - - MY_ALIGN -LSGEMM_L2x4_SAVE: - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 -#endif - MY_ALIGN -LSGEMM_L2x4_END: - andi. I, M, 2 - ble LSGEMM_L2x2_END - - MY_ALIGN -LSGEMM_L2x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x2 - ble LSGEMM_L2x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x2_LOOP: - - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,0 - KERNEL2x2_4 0,0, 8,0 - KERNEL2x2_4 0,0, 9,0 - KERNEL2x2_4 0,0, 10,0 - KERNEL2x2_4 0,0, 11,0 - KERNEL2x2_4 0,0, 12,0 - KERNEL2x2_4 0,0, 13,0 - KERNEL2x2_4 0,0, 14,0 - KERNEL2x2_4 0,0, 15,1 - - bdnz LSGEMM_L2x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x2_SAVE - MY_ALIGN -LSGEMM_L2x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x2_SUB2_16 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x2_SUB2_8 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x2_SUB2_4 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x2_SUB2_2 - KERNEL2x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x2_SUB2_1 - KERNEL2x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -LSGEMM_L2x2_SAVE: - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 -#endif - MY_ALIGN -LSGEMM_L2x2_END: - andi. I, M, 1 - ble LSGEMM_L2x1_END - - MY_ALIGN -LSGEMM_L2x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. 
L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x1 - ble LSGEMM_L2x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x1_LOOP: - - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,0 - KERNEL2x1_4 0,0, 8,0 - KERNEL2x1_4 0,0, 9,0 - KERNEL2x1_4 0,0, 10,0 - KERNEL2x1_4 0,0, 11,0 - KERNEL2x1_4 0,0, 12,0 - KERNEL2x1_4 0,0, 13,0 - KERNEL2x1_4 0,0, 14,0 - KERNEL2x1_4 0,0, 15,1 - - bdnz LSGEMM_L2x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x1_SAVE - MY_ALIGN -LSGEMM_L2x1_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x1_SUB2_16 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x1_SUB2_8 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x1_SUB2_4 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x1_SUB2_2 - KERNEL2x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x1_SUB2_1 - KERNEL2x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -LSGEMM_L2x1_SAVE: - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 -#endif - MY_ALIGN -LSGEMM_L2x1_END: - slwi T1, K, 3 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LSGEMM_L2_END: - andi. T1, N, 1 - ble LSGEMM_END -LSGEMM_1_BEGIN: - - - mr AO, A - mr CO, C - add C, C, LDC - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_1x16_END - - MY_ALIGN -LSGEMM_1x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x16 - ble LSGEMM_1x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x16_LOOP: - - KERNEL1x16_4 -2048,0, 0,0 - KERNEL1x16_4 -2048,0, 1,0 - KERNEL1x16_4 -2048,0, 2,0 - KERNEL1x16_4 -2048,0, 3,0 - KERNEL1x16_4 -2048,0, 4,0 - KERNEL1x16_4 -2048,0, 5,0 - KERNEL1x16_4 -2048,0, 6,0 - KERNEL1x16_4 -2048,0, 7,0 - KERNEL1x16_4 -2048,0, 8,0 - KERNEL1x16_4 -2048,0, 9,0 - KERNEL1x16_4 -2048,0, 10,0 - KERNEL1x16_4 -2048,0, 11,0 - KERNEL1x16_4 -2048,0, 12,0 - KERNEL1x16_4 -2048,0, 13,0 - KERNEL1x16_4 -2048,0, 14,0 - KERNEL1x16_4 -2048,0, 15,1 - - bdnz LSGEMM_1x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x16_SAVE - MY_ALIGN -LSGEMM_1x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x16_SUB2_16 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,0 - KERNEL1x16_4 0,0, 4,0 - KERNEL1x16_4 0,0, 5,0 - KERNEL1x16_4 0,0, 6,0 - KERNEL1x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x16_SUB2_8 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x16_SUB2_8: - andi. 
T10,L, 8 - ble LSGEMM_1x16_SUB2_4 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x16_SUB2_2 - KERNEL1x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x16_SUB2_1 - KERNEL1x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x16_SAVE - KERNEL1x16 - - MY_ALIGN -LSGEMM_1x16_SAVE: - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt+ LSGEMM_1x16_BEGIN - MY_ALIGN -LSGEMM_1x16_END: - andi. I, M, 8 - ble LSGEMM_1x8_END - - MY_ALIGN -LSGEMM_1x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x8 - ble LSGEMM_1x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x8_LOOP: - - KERNEL1x8_4 -2048,0, 0,0 - KERNEL1x8_4 -2048,0, 1,0 - KERNEL1x8_4 -2048,0, 2,0 - KERNEL1x8_4 -2048,0, 3,0 - KERNEL1x8_4 -2048,0, 4,0 - KERNEL1x8_4 -2048,0, 5,0 - KERNEL1x8_4 -2048,0, 6,0 - KERNEL1x8_4 -2048,0, 7,0 - KERNEL1x8_4 -2048,0, 8,0 - KERNEL1x8_4 -2048,0, 9,0 - KERNEL1x8_4 -2048,0, 10,0 - KERNEL1x8_4 -2048,0, 11,0 - KERNEL1x8_4 -2048,0, 12,0 - KERNEL1x8_4 -2048,0, 13,0 - KERNEL1x8_4 -2048,0, 14,0 - KERNEL1x8_4 -2048,0, 15,1 - - bdnz LSGEMM_1x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x8_SAVE - MY_ALIGN -LSGEMM_1x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x8_SUB2_16 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,0 - KERNEL1x8_4 0,0, 4,0 - KERNEL1x8_4 0,0, 5,0 - KERNEL1x8_4 0,0, 6,0 - KERNEL1x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x8_SUB2_8 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x8_SUB2_4 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x8_SUB2_2 - KERNEL1x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x8_SUB2_1 - KERNEL1x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x8_SAVE - KERNEL1x8 - - MY_ALIGN -LSGEMM_1x8_SAVE: - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 -#endif - MY_ALIGN -LSGEMM_1x8_END: - andi. I, M, 4 - ble LSGEMM_1x4_END - - MY_ALIGN -LSGEMM_1x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x4 - ble LSGEMM_1x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x4_LOOP: - - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,0 - KERNEL1x4_4 0,0, 8,0 - KERNEL1x4_4 0,0, 9,0 - KERNEL1x4_4 0,0, 10,0 - KERNEL1x4_4 0,0, 11,0 - KERNEL1x4_4 0,0, 12,0 - KERNEL1x4_4 0,0, 13,0 - KERNEL1x4_4 0,0, 14,0 - KERNEL1x4_4 0,0, 15,1 - - bdnz LSGEMM_1x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x4_SAVE - MY_ALIGN -LSGEMM_1x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x4_SUB2_16 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x4_SUB2_8 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x4_SUB2_4 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x4_SUB2_2 - KERNEL1x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x4_SUB2_1 - KERNEL1x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x4_SAVE - KERNEL1x4 - - MY_ALIGN -LSGEMM_1x4_SAVE: - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 -#endif - MY_ALIGN -LSGEMM_1x4_END: - andi. I, M, 2 - ble LSGEMM_1x2_END - - MY_ALIGN -LSGEMM_1x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x2 - ble LSGEMM_1x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x2_LOOP: - - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,0 - KERNEL1x2_4 0,0, 8,0 - KERNEL1x2_4 0,0, 9,0 - KERNEL1x2_4 0,0, 10,0 - KERNEL1x2_4 0,0, 11,0 - KERNEL1x2_4 0,0, 12,0 - KERNEL1x2_4 0,0, 13,0 - KERNEL1x2_4 0,0, 14,0 - KERNEL1x2_4 0,0, 15,1 - - bdnz LSGEMM_1x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x2_SAVE - MY_ALIGN -LSGEMM_1x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x2_SUB2_16 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x2_SUB2_8 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x2_SUB2_4 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x2_SUB2_2 - KERNEL1x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x2_SUB2_1 - KERNEL1x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x2_SAVE - KERNEL1x2 - - MY_ALIGN -LSGEMM_1x2_SAVE: - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 -#endif - MY_ALIGN -LSGEMM_1x2_END: - andi. I, M, 1 - ble LSGEMM_1x1_END - - MY_ALIGN -LSGEMM_1x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x1 - ble LSGEMM_1x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x1_LOOP: - - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,0 - KERNEL1x1_16 0,0, 2,0 - KERNEL1x1_16 0,0, 3,1 - - bdnz LSGEMM_1x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x1_SAVE - MY_ALIGN -LSGEMM_1x1_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x1_SUB2_16 - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,1 - MY_ALIGN -LSGEMM_1x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x1_SUB2_8 - KERNEL1x1_16 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x1_SUB2_4 - KERNEL1x1_8 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x1_SUB2_2 - KERNEL1x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x1_SUB2_1 - KERNEL1x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x1_SAVE - KERNEL1x1 - - MY_ALIGN -LSGEMM_1x1_SAVE: - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 -#endif - MY_ALIGN -LSGEMM_1x1_END: - slwi T1, K, 2 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif +#define MY_ALIGN .align 3 +b L8 + + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_2 + MY_ALIGN + +LSGEMM_L8x16_LOOP: + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16_2 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 + blr + +L8: +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ +#else + mr T12, K + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + mtctr L + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 255 + cmpwi T11,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else + cmpwi K,128 +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 + ble LSGEMM_L8x16_SUB2_16 + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + bl LSGEMM_L8x16_L16_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 + + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. 
T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. 
L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. 
T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. 
T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 2c9e537c7..3750d338d 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -1,5575 +1,5575 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define unit_size 4 -#define DISP64(ind,disp) (ind*unit_size*64+disp) -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -/********************************************************************************************** -* Macros for N=8 and M=16 -**********************************************************************************************/ - - - -.macro KERNEL8x16_L1_L4 Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - -.macro LOAD8x16 OffsetA,OffsetB - - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endm - -.macro END8x16_NORMAL - END8x16 0, AO, BO, 64,32 -.endm - -.macro END8x16_WITHOUT_ADD - END8x16 0, AO,BO,0,0 -.endm - -.macro END8x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, 
vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.endif -.endm - -.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - -KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete - -.endm - -.macro KERNEL8x16 First - - LOAD8x16 0,0 - END8x16 \First, AO, BO, 64,32 -.endm - -.macro LOAD8x16_2 - LOAD8x16_2O AO,BO, 0,0 -.endm - -.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB - lxv vs8, (\OffsetB)(\BREG) - lxv vs12, (16+\OffsetB)(\BREG) - lxv vs24, (32+\OffsetB)(\BREG) - lxv vs28, (32+16+\OffsetB)(\BREG) - lxv vs4, (0+\OffsetA)(\AREG) - lxv vs5, (16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(\AREG) - lxv vs7, (48+\OffsetA)(\AREG) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(\AREG) - lxv vs1, (64+16+\OffsetA)(\AREG) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(\AREG) - lxv vs3, (64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - -.macro END8x16_2 - /*for load2 offset will be 128 and 64*/ - KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.if \Complete==0 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - -.if \Complete==0 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,\OffsetB) - addi \AREG, \AREG, DISP32(\Index,\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - - -.macro SAVE8x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - - - - /* permute to restore butterfly rank 1 updateto normal promoted one */ - /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ - /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ - /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ - /* permute 16 vs24 MEM(32+CO) vs25 
MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) -#endif - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 -#ifndef TRMMKERNEL - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) -#endif - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - - - -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 -#ifndef TRMMKERNEL - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 -#ifndef TRMMKERNEL - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - - - - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - - stxv vs32, 0(CO) - stxv vs33, 16(CO) -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - stxv vs34, 32(CO) - stxv vs35, 48(CO) -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T1) - stxv vs37, 16(T1) -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - stxv vs38, 32(T1) - stxv vs39, 48(T1) - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - - stxv vs40, 0(T2) - stxv vs41, 16(T2) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T2) - stxv vs43, 48(T2) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - stxv vs44, 0(T3) - stxv vs45, 16(T3) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - /*****the same with the second 8X8 ****/ - #ifndef TRMMKERNEL - lxv vs32, 0(T4) - lxv vs33, 16(T4) -#endif - xxmrglw vs8, vs48, vs60 - xxmrglw 
vs10, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs34, 32(T4) - lxv vs35, 48(T4) -#endif - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs36, 0(T5) - lxv vs37, 16(T5) -#endif - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 -#ifndef TRMMKERNEL - lxv vs38,32(T5) - lxv vs39, 48(T5) -#endif - - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 -#ifndef TRMMKERNEL - lxv vs40, 0(T6) - lxv vs41, 16(T6) -#endif - xxmrglw vs16, vs50, vs62 - xxmrglw vs18, vs54, vs58 -#ifndef TRMMKERNEL - lxv vs42, 32(T6) - lxv vs43, 48(T6) -#endif - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - xxmrghw vs4, vs54, vs58 - xxmrghw vs5, vs50, vs62 -#ifndef TRMMKERNEL - lxv vs44, 0(T7) - lxv vs45, 16(T7) -#endif - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 -#ifndef TRMMKERNEL - lxv vs46, 32(T7) - lxv vs47, 48(T7) -#endif - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 - - - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - #ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - stxv vs32, 0(T4) - stxv vs33, 16(T4) - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - stxv vs34, 32(T4) - stxv vs35, 48(T4) - -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T5) - stxv vs37, 16(T5) - -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - - - stxv vs38, 32(T5) - stxv vs39, 48(T5) - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T6) - stxv vs43, 48(T6) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - - stxv vs44, 0(T7) - stxv vs45, 16(T7) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - - stxv vs46, 32(T7) - stxv vs47, 48(T7) - - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=8 and M=8 -**********************************************************************************************/ - -.macro LOAD8x8_1 - LOAD8x8 1 -.endm - -.macro LOAD8x8_0 - LOAD8x8 0 
-.endm - -.macro KERNEL8x8_L1_L4 Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END8x8_NORMAL - END8x8 0, AO, BO, 32,32 -.endm - -.macro Zero8X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - -.endm - -.macro LOAD8x8 Zero - - lxv vs24, 0(BO) - lxv vs28, 16(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endif -.endm - - -.macro END8x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.endm - -.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - 
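The lxv/xxperm/xxpermdi group above loads the next eight packed B values and eight packed A values and broadcast-rotates the B lanes into vs9-vs11 and vs13-vs15, while the xvmaddasp wall that follows consumes the values loaded on the previous pass. Each pass of this macro therefore performs one k-step of an outer-product (rank-1) update on the 8x8 accumulator tile spread across vs32-vs61, held in a permuted "butterfly" layout that the SAVE macros untangle later. A minimal C sketch of the arithmetic only, assuming an MR=8 by NR=8 tile; the names acc, a_panel and b_panel are illustrative and not taken from this file:

/* One k-step of the micro-kernel: acc[j][i] += a_panel[i] * b_panel[j],
   i.e. an outer-product (rank-1) update of the accumulator tile.  The assembly
   performs the same update with vector FMAs on a permuted register layout and
   steps a_panel/b_panel through the packed panels via the DISPn(Index,...) offsets. */
enum { MR = 8, NR = 8 };

static void kernel_rank1_step(float acc[NR][MR],
                              const float *a_panel,  /* MR packed floats for this k */
                              const float *b_panel)  /* NR packed floats for this k */
{
    for (int j = 0; j < NR; j++)
        for (int i = 0; i < MR; i++)
            acc[j][i] += a_panel[i] * b_panel[j];
}

The L1_L4 variants repeat this step four times per invocation, which is what lets the loads for the next k overlap the FMAs of the current one.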
xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - -.if \Complete==0 - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endm - -.macro KERNEL8x8 First - - LOAD8x8 0 - END8x8 \First, AO, BO, 32,32 -.endm - -.macro KERNEL8x8_L1_L2_I 
AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endif - -.endm - - -.macro SAVE8x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) - lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - lxv vs50, 0(T4) - lxv vs51, 16(T4) - lxv vs54, 0(T5) - lxv vs55, 16(T5) - lxv vs58, 0(T6) - lxv vs59, 16(T6) - lxv vs62, 0(T7) - lxv vs63, 16(T7) -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, 
vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - xxmrglw vs8, vs48, vs60 - xxmrglw vs10, vs52, vs56 - - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 - stxv vs34, 0(CO) - stxv vs35, 16(CO) - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 - stxv vs38, 0(T1) - stxv vs39, 16(T1) - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 - stxv vs42, 0(T2) - stxv vs43, 16(T2) - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - stxv vs46, 0(T3) - stxv vs47, 16(T3) - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - #ifdef TRMMKERNEL - xvmulsp vs50, vs8, alpha_r - xvmulsp vs51, vs12, alpha_r - xvmulsp vs54, vs9, alpha_r - xvmulsp vs55, vs13, alpha_r - xvmulsp vs58, vs10, alpha_r - xvmulsp vs59, vs14, alpha_r - xvmulsp vs62, vs11, alpha_r - xvmulsp vs63, vs15, alpha_r -#else - xvmaddasp vs50, vs8, alpha_r - xvmaddasp vs51, vs12, alpha_r - xvmaddasp vs54, vs9, alpha_r - xvmaddasp vs55, vs13, alpha_r - xvmaddasp vs58, vs10, alpha_r - xvmaddasp vs59, vs14, alpha_r - xvmaddasp vs62, vs11, alpha_r - xvmaddasp vs63, vs15, alpha_r -#endif - - stxv vs50, 0(T4) - stxv vs51, 16(T4) - stxv vs54, 0(T5) - stxv vs55, 16(T5) - stxv vs58, 0(T6) - stxv vs59, 16(T6) - stxv vs62, 0(T7) - stxv vs63, 16(T7) - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=4 -**********************************************************************************************/ - -.macro LOAD8x4_1 - LOAD8x4 1 -.endm - -.macro LOAD8x4_0 - LOAD8x4 0 -.endm - -.macro KERNEL8x4_L1_L4 Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, 
\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - -.endm - -.macro LOAD8x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - lxv vs25, 16(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 -.endif -.endm - -.macro END8x4_NORMAL - END8x4 0, AO, BO, 16,32 -.endm - -.macro END8x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.endif -.endm - -.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, 
DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - -.macro KERNEL8x4 First - LOAD8x4 0 - END8x4 \First, AO, BO, 16,32 -.endm - -.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - xvmulsp vs48, vs27, vs4 - xvmulsp vs49, vs27, vs5 - xvmulsp vs50, vs27, vs6 - xvmulsp vs51, vs27, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - - -.macro SAVE8x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - add T4, T2, T10 - add T5, T3, T10 -#if !defined(TRMMKERNEL) - lxv vs40, 0(T4) - lxv vs41, 0(T5) -#endif - add T6, T4, T10 - add T7, T5, T10 -#if !defined(TRMMKERNEL) - lxv vs42, 0(T6) - lxv vs43, 0(T7) -#endif - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - - xxmrglw vs0, vs51,vs48 - xxmrglw vs1, vs50,vs49 - xxmrglw vs4, vs48,vs51 - xxmrglw vs5, vs49,vs50 - - xxmrghw vs2, vs51,vs48 - xxmrghw vs3, vs50,vs49 - xxmrghw vs6, vs48,vs51 - xxmrghw vs7, vs49,vs50 - - xxmrgld vs28, vs1, vs0 - xxmrghd vs29,vs5,vs4 - - xxmrgld vs30, vs2, vs3 - xxmrghd vs31,vs6,vs7 -#if defined(TRMMKERNEL) - - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, alpha_r - xvmulsp vs39, vs27, alpha_r - xvmulsp vs40, vs28, alpha_r - xvmulsp vs41, vs29, alpha_r - xvmulsp vs42, vs30, alpha_r - xvmulsp vs43, vs31, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - xvmaddasp vs40, vs28, alpha_r - xvmaddasp vs41, vs29, alpha_r - 
xvmaddasp vs42, vs30, alpha_r - xvmaddasp vs43, vs31, alpha_r -#endif - - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - stxv vs40, 0(T4) - stxv vs41, 0(T5) - stxv vs42, 0(T6) - stxv vs43, 0(T7) - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=2 -**********************************************************************************************/ - - -.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero8x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - -.endm - -.macro KERNEL8x2 - KERNEL8x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP8(\Index,32) - -.endm - -.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs1, vs29, vs10 - xvmulsp vs2, vs28, vs11 - xvmulsp vs3, vs29, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs1, vs29, vs10 - xvmaddasp vs2, vs28, vs11 - xvmaddasp vs3, vs29, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE8x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - lxssp v8,0(T4) - lxssp v9,4(T4) - - lxssp v10,0(T5) - lxssp v11,4(T5) - - lxssp v12,0(T6) - lxssp v13,4(T6) - - lxssp v14,0(T7) - lxssp v15,4(T7) -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - xscvspdp vs9, vs3 - xxspltw vs10, vs3, 1 - xxspltw vs11, vs3, 2 - xxspltw vs12, vs3, 3 - xscvspdp vs10,vs10 - xscvspdp vs11,vs11 - xscvspdp vs12,vs12 - - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 - - - - -#if defined(TRMMKERNEL) - 
xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - xsmuldp vs40,vs12, vs4 - xsmuldp vs41,vs31, vs4 - - xsmuldp vs42,vs11, vs4 - xsmuldp vs43,vs30, vs4 - - xsmuldp vs44,vs10, vs4 - xsmuldp vs45,vs29, vs4 - - xsmuldp vs46,vs9, vs4 - xsmuldp vs47,vs28, vs4 -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - xsmaddadp vs40,vs12, vs4 - xsmaddadp vs41,vs31, vs4 - - xsmaddadp vs42,vs11, vs4 - xsmaddadp vs43,vs30, vs4 - - xsmaddadp vs44,vs10, vs4 - xsmaddadp vs45,vs29, vs4 - - xsmaddadp vs46,vs9, vs4 - xsmaddadp vs47,vs28, vs4 -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - stxssp v8,0(T4) - stxssp v9,4(T4) - - stxssp v10,0(T5) - stxssp v11,4(T5) - - stxssp v12,0(T6) - stxssp v13,4(T6) - - stxssp v14,0(T7) - stxssp v15,4(T7) - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=1 -**********************************************************************************************/ -.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero8x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 -.endm - -.macro KERNEL8x1 - KERNEL8x1_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_2 - KERNEL8x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL8x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) - lxv vs28, 32(\BREG) - lxv vs29, 48(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 64 -.endm - -.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) - lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) - lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) - lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) - lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs1, vs31, vs10 - xvmulsp vs0, vs32, vs11 - xvmulsp vs1, vs33, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs0, vs32, vs11 - xvmaddasp vs1, vs33, vs11 - .endif -.if \IsLast==1 - addi 
\AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP32(\Index,128) -.endif -.endm - -.macro SAVE8x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) - lxssp v8,0(T4) - lxssp v10,0(T5) - lxssp v12,0(T6) - lxssp v14,0(T7) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 - xsmuldp vs40,vs31, vs4 - xsmuldp vs42,vs30, vs4 - xsmuldp vs44,vs29, vs4 - xsmuldp vs46,vs28, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 - xsmaddadp vs40,vs31, vs4 - xsmaddadp vs42,vs30, vs4 - xsmaddadp vs44,vs29, vs4 - xsmaddadp vs46,vs28, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - stxssp v8,0(T4) - stxssp v10,0(T5) - stxssp v12,0(T6) - stxssp v14,0(T7) - addi CO,CO,4 -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=16 -**********************************************************************************************/ - -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm - -.macro KERNEL4x16_L1_L4 Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor 
vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - -.endif -.endm - -.macro END4x16_NORMAL - END4x16 0, AO, BO, 64,16 -.endm - -.macro END4x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - -.endif -.endm - -.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, 
vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endm - -.macro KERNEL4x16 First - - LOAD4x16 0 - END4x16 \First, AO, BO, 64,16 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endif - -.endm - - -.macro SAVE4x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - 
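The TRMMKERNEL conditionals above select between the two save policies used by all of these SAVE macros: the TRMM path owns the output block and overwrites it with alpha times the accumulators (xvmulsp), while the GEMM path loads the existing C tile first and accumulates alpha times the result into it (xvmaddasp); any beta scaling is assumed to have been applied to C before this kernel runs. A rough C equivalent of the policy, with hypothetical names (acc, tile_m, tile_n, is_trmm) that do not appear in this file:

/* Write an MxN accumulator tile to C (leading dimension ldc).
   TRMM path: C = alpha * acc        (C is not read)
   GEMM path: C = C + alpha * acc    (beta assumed handled elsewhere) */
static void save_tile(float *c, int ldc, const float *acc,
                      int tile_m, int tile_n, float alpha, int is_trmm)
{
    for (int j = 0; j < tile_n; j++)
        for (int i = 0; i < tile_m; i++) {
            float v = alpha * acc[j * tile_m + i];
            c[j * ldc + i] = is_trmm ? v : c[j * ldc + i] + v;
        }
}

The stores that follow write rows 1 through 3 (T1-T3) in the same way row 0 was just written to CO.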
stxv vs36, 0(T1) - stxv vs37, 16(T1) - stxv vs38, 32(T1) - stxv vs39, 48(T1) - - stxv vs40, 0(T2) - stxv vs41, 16(T2) - stxv vs42, 32(T2) - stxv vs43, 48(T2) - stxv vs44, 0(T3) - stxv vs45, 16(T3) - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=8 -**********************************************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm - -.macro KERNEL4x8_L1_L4 Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END4x8_NORMAL - END4x8 0, AO, BO, 32,16 -.endm - -.macro Zero4X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endm - -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endif -.endm - - -.macro END4x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.endm - -.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, 
vs24, vs24,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - -.endm - -.macro KERNEL4x8 First - - LOAD4x8 0 - END4x8 \First, AO, BO, 32,16 -.endm - -.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - -.endif - -.endm - - -.macro SAVE4x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) 
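[Editor's note - reference sketch, not part of the patch.] The KERNEL4x8 / END4x8 macros above consume eight packed floats of A (vs0/vs1) and four packed floats of B (vs24) per k step and accumulate a 4x8 tile in vs32/33, vs36/37, vs40/41, vs44/45, with the B factors kept as permuted vector copies rather than scalar broadcasts; SAVE4x8 later unscrambles that layout and applies alpha (xvmaddasp into the loaded C for GEMM, xvmulsp overwrite when TRMMKERNEL is defined). The C below is only a rough scalar model of that arithmetic under the usual packed-panel, column-major assumptions; the function name and argument order are invented for illustration and do not correspond to a symbol in this file.

    /* Scalar reference for the 4x8 micro-tile computed by KERNEL4x8/SAVE4x8.
       A: packed panel, 8 floats per k.  B: packed panel, 4 floats per k.
       C: column-major, leading dimension ldc.  trmm != 0 models TRMMKERNEL. */
    void kernel4x8_ref(long K, const float *A, const float *B,
                       float *C, long ldc, float alpha, int trmm)
    {
        float acc[4][8] = {{0.0f}};
        for (long k = 0; k < K; k++)
            for (int j = 0; j < 4; j++)          /* one B value per column j  */
                for (int i = 0; i < 8; i++)      /* eight A values per row i  */
                    acc[j][i] += A[8*k + i] * B[4*k + j];
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 8; i++)
                C[j*ldc + i] = trmm ? alpha * acc[j][i]
                                    : C[j*ldc + i] + alpha * acc[j][i];
    }

The vector code reaches the same result, but each accumulator register holds lanes belonging to different (i, j) pairs, which is why SAVE4x8 needs the xxmrglw/xxmrghw merges and the save_permute_1/2 shuffles before storing.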
- lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - stxv vs34, 0(CO) - stxv vs35, 16(CO) - stxv vs38, 0(T1) - stxv vs39, 16(T1) - stxv vs42, 0(T2) - stxv vs43, 16(T2) - stxv vs46, 0(T3) - stxv vs47, 16(T3) - - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=4 -**********************************************************************************************/ - -.macro LOAD4x4_1 - LOAD4x4 1 -.endm - -.macro LOAD4x4_0 - LOAD4x4 0 -.endm - -.macro KERNEL4x4_L1_L4 Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - -.macro LOAD4x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endif -.endm - -.macro END4x4_NORMAL - END4x4 0, AO, BO, 16,16 -.endm - -.macro END4x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - 
xvmaddasp vs35, vs24, vs3 - - -.endif -.endm - -.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - -.macro KERNEL4x4 First - LOAD4x4 0 - END4x4 \First, AO, BO, 16,16 -.endm - -.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) - -.endif -.endif - - -.endm - - -.macro SAVE4x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - #if defined(TRMMKERNEL) - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, 
alpha_r - xvmulsp vs39, vs27, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - #endif - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=2 -**********************************************************************************************/ - - -.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero4x2 - xxlxor vs0, vs0, vs0 - xxlxor vs2, vs2, vs2 - -.endm - -.macro KERNEL4x2 - KERNEL4x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP4(\Index,16) - -.endm - -.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs2, vs28, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs2, vs28, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE4x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=1 -**********************************************************************************************/ -.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero4x1 - xxlxor vs0, vs0, 
vs0 -.endm - -.macro KERNEL4x1 - KERNEL4x1_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_2 - KERNEL4x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 16 -.endm - -.macro KERNEL4x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs28, 16(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs0, vs32, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs0, vs32, vs11 - .endif -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif -.endm - -.macro SAVE4x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - addi CO,CO,4 -.endm - -/****************************N=2 section*****************/ - -.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - xxlxor vs6, vs6, vs6 - xxlxor vs7, vs7, vs7 -.endm - -.macro KERNEL2x16 - KERNEL2x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - xvmulsp vs6, vs28, vs9 - xvmulsp vs7, vs29, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, 
vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs2, vs32, vs12 - xvmaddasp vs3, vs33, vs12 - - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - xvmaddasp vs6, vs32, vs13 - xvmaddasp vs7, vs33, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs2, vs36, vs14 - xvmaddasp vs3, vs37, vs14 - - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - xvmaddasp vs6, vs36, vs15 - xvmaddasp vs7, vs37, vs15 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE2x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv 
vs19, 48(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - lxv vs28, 32(T1) - lxv vs29, 48(T1) -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r - xvmulsp vs28, vs6, alpha_r - xvmulsp vs29, vs7, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r - xvmaddasp vs28, vs6, alpha_r - xvmaddasp vs29, vs7, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - stxv vs28, 32(T1) - stxv vs29, 48(T1) - - addi CO,CO,64 - -.endm - -/* M=8 N=2 */ - -.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x8 - KERNEL2x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs17, 
DISP16(\Index,48+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE2x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - - addi CO,CO,32 - -.endm - - -/*M=4*/ - - -.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - /* we will aggregate on save vs0 +vs4 vs11+vs5 */ -.macro Zero2x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x4 - KERNEL2x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs30, vs13 - xvmaddasp vs4, vs34, vs14 - xvmaddasp vs5, vs34, vs15 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE2x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - -#endif - /*aggregate vectors*/ - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs26, vs1, alpha_r -#else - 
xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs26, vs1, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs26, 0(T1) - - addi CO,CO,16 - -.endm - - -/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ -.macro SWITCH_PERMUTE_INNER - xxpermdi permute_mask, permute_mask, permute_mask,2 -.endm - -.macro Zero2x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - SWITCH_PERMUTE_INNER -.endm - -.macro KERNEL2x2 - KERNEL2x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxperm vs9, vs36, permute_mask - lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs37, vs36 - xvmulsp vs1, vs37, vs9 - -.else - xvmaddasp vs0, vs37, vs36 - xvmaddasp vs1, vs37, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP2(\Index,8) - -.endm - - - - -.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - xxperm vs11, vs10, permute_mask - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs16, vs11 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - -.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP4(\Index,16) -.endif -.endm - - -.macro SAVE2x2 - -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) - -#endif - /*aggregate vectors*/ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - /* */ - /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ - xxperm vs1,vs1, permute_mask - - - xxmrghw vs2 ,vs1,vs0 - xxpermdi vs2,vs2,vs2,2 - xxmrghw vs3 ,vs0,vs1 -#if defined(TRMMKERNEL) - xvmulsp vs36, vs2, alpha_r - xvmulsp vs37, vs3, alpha_r -#else - xvmaddasp vs36, vs2, alpha_r - xvmaddasp vs37, vs3, alpha_r -#endif - /**** store last two words*/ - - - stxsd v4, 0(CO) - stxsd v5, 0(T1) - - addi CO,CO,8 - -.endm - -/*--------------------------- M=1 N=2 */ -.macro Zero2x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL2x1 - KERNEL2x1_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs2, vs37, vs35 - xvmulsp vs3, vs37, vs36 - -.else - xsmaddadp vs2, 
vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP2(\Index,8) -.endm - - -.macro SAVE2x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxssp v5 , 0(T1) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 2x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 0(T1) - - addi CO,CO,4 - -.endm - - - -/****************************N=1 section*****************/ - -.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x16 - KERNEL1x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, 
vs38, 2 - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs2, vs32, vs10 - xvmaddasp vs3, vs33, vs10 - - - xvmaddasp vs0, vs34, vs11 - xvmaddasp vs1, vs35, vs11 - xvmaddasp vs2, vs36, vs11 - xvmaddasp vs3, vs37, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE1x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv vs19, 48(CO) -#endif - - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - addi CO,CO,64 - -.endm - -/* M=8 N=1 */ - -.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x8 - KERNEL1x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, 
DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - - - xvmaddasp vs2, vs34, vs11 - xvmaddasp vs3, vs35, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE1x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - /* aggregate vs0 vs2 and vs1 vs3*/ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - addi CO,CO,32 - -.endm -/*M=4*/ - -.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x4 - KERNEL1x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - - xvmaddasp vs1, vs27, vs9 - - xvmaddasp vs2, vs30, vs10 - - - xvmaddasp vs3, vs31, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs9 - - -.if \IsLast==1 - addi \BREG, 
\BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE1x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - /* aggregate */ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 - xvaddsp vs0,vs1,vs0 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r -#endif - stxv vs16, 0(CO) - - addi CO,CO,16 - -.endm - -/* M=2 N=1*/ -.macro Zero1x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL1x2 - KERNEL1x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs2, vs37, vs35 - xvmuldp vs3, vs37, vs36 - -.else - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x2 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - lxssp v5 , 4(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 1x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 4(CO) - - addi CO,CO,8 - -.endm -/*///////////////// N=1 M=1 //////////////////*/ -.macro Zero1x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2,vs2 - xxlxor vs3,vs3,vs3 - xxlxor vs4,vs4,vs4 -.endm - -.macro KERNEL1x1 - KERNEL1x1_1 AO,BO, 1, 0,0,0 -.endm - -.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro 
KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone ( FIRST==1 to zero vs4) - */ -.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs4, vs37, vs35 - -.else - xsmaddadp vs4, vs37, vs35 - .endif - - addi \AREG, \AREG, DISP1(\Index,4) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - -.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) - lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) - lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) - lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) - lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - xvmaddasp vs2, vs10, vs17 - xvmaddasp vs3, vs11, vs18 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs8, vs26 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) - lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs36, vs37 - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors */ - xvaddsp vs0,vs0,vs1 - xvaddsp vs2,vs2,vs3 - xvaddsp vs0,vs0,vs2 - - xxpermdi vs7,vs0,vs0,2 - xvaddsp vs0,vs0,vs7 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs7,vs5,vs6 - xsadddp vs4,vs4,vs7 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs4, vs16 - -#else - xsmaddadp vs36,vs4, vs16 -#endif - - stxssp v4, 0(CO) - - addi CO,CO,4 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 3 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 2 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // 
ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + + + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 OffsetA,OffsetB + + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + 
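[Editor's note - reference sketch, not part of the patch.] The DISPn(ind,disp) helpers defined at the top of the new file turn an unroll index into a byte displacement: unit_size is sizeof(float) and n is the number of floats one unroll step consumes, so the unrolled kernels can address several iterations ahead and bump AO/BO only once when IsLast is set. A small C model follows; the loop bound and printed values are illustrative assumptions, only the macro arithmetic is taken from the patch.

    #include <stdio.h>

    #define unit_size 4
    #define DISP32(ind,disp) ((ind)*unit_size*32+(disp))   /* 32 floats per step */
    #define DISP16(ind,disp) ((ind)*unit_size*16+(disp))   /* 16 floats per step */

    int main(void)
    {
        /* KERNEL8x16_2 reads A through DISP32 and B through DISP16, so each
           unroll index advances 128 bytes into the packed A panel and 64 bytes
           into the packed B panel - matching the final
           addi AO,AO,DISP32(Index,128) / addi BO,BO,DISP16(Index,64). */
        for (int index = 0; index < 4; index++)
            printf("Index=%d  A byte offset=%d  B byte offset=%d\n",
                   index, DISP32(index, 0), DISP16(index, 0));
        return 0;
    }
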
xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0,0 + END8x16 \First, AO, BO, 64,32 +.endm + +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm + +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, 
DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore the butterfly rank 1 update to the normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(48+CO) vs25 MEM(48+CO+LDC) vs26 MEM(48+CO+2*LDC) vs27 MEM(48+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5,
vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 +#ifndef TRMMKERNEL + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + + + + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + stxv vs38, 32(T1) + stxv vs39, 48(T1) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + + stxv vs40, 0(T2) + stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T2) + stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T3) + stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ + #ifndef TRMMKERNEL + lxv vs32, 0(T4) + lxv vs33, 16(T4) +#endif + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) +#endif + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 
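+/* vs48-vs63 carry the results for the last four output columns (T4-T7); they
+   go through the same merge/permute sequence as vs32-vs47 above, interleaved
+   with the loads of the C tiles they will be added to */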
+#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + + + stxv vs38, 32(T5) + stxv vs39, 48(T5) + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + stxv vs40, 0(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T6) + stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + + stxv vs44, 0(T7) + stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro 
KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + lxv vs24, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + +.if \Complete==0 + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, 
vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + 
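+/* the save_permute_1/save_permute_2 masks rearrange the merged words so that
+   the results match the memory layout: vs8/vs12 form the column stored at CO,
+   vs9/vs13 the one at T1, vs10/vs14 at T2 and vs11/vs15 at T3 */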
xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + 
lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + +.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + 
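+/* vs4 holds the next four packed A values and vs26/vs27 the matching eight B
+   values; vs5-vs7 built below are lane permutations of vs4, so across the
+   product registers every A value gets multiplied with every B lane */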
xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 
OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + 
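+/* the 8x2 tile is finished element by element: each accumulator lane was
+   converted to double precision above and is combined here with the converted
+   alpha (vs4) before being stored back with stxssp */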
xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp 
v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + 
xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi 
vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, 
vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 
+**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, 
vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, 
vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + 
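(For orientation while reading the N=4/M=4 hunk above.) A minimal scalar sketch of what the KERNEL4x4/SAVE4x4 pair is expected to compute: each k step multiplies four packed A values by four packed B values into a 4x4 accumulator, and the save step scales by alpha and either adds into C (GEMM) or overwrites it (the TRMMKERNEL branch). This is an illustrative reference model only, assuming simple column-packed ap/bp buffers and a hypothetical kernel4x4_ref helper; it does not mirror the VSX register permutes used by the real code.

#include <stdio.h>

/* Reference model (hypothetical helper, not part of the kernel):
 * ap: packed A, 4 floats per k step; bp: packed B, 4 floats per k step;
 * c:  column-major 4x4 block of C with leading dimension ldc.           */
static void kernel4x4_ref(int k, const float *ap, const float *bp,
                          float *c, int ldc, float alpha, int trmm)
{
    float acc[4][4] = {{0.0f}};

    for (int l = 0; l < k; l++)              /* K loop                    */
        for (int i = 0; i < 4; i++)          /* 4 packed A values (rows)  */
            for (int j = 0; j < 4; j++)      /* 4 packed B values (cols)  */
                acc[i][j] += ap[4 * l + i] * bp[4 * l + j];

    for (int j = 0; j < 4; j++)              /* columns CO, T1, T2, T3    */
        for (int i = 0; i < 4; i++)
            if (trmm)
                c[j * ldc + i] = alpha * acc[i][j];   /* TRMMKERNEL: overwrite   */
            else
                c[j * ldc + i] += alpha * acc[i][j];  /* GEMM: accumulate into C */
}

int main(void)
{
    /* tiny smoke test: k = 2 */
    float ap[8] = {1, 0, 0, 0, 0, 1, 0, 0};
    float bp[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float c[16] = {0};

    kernel4x4_ref(2, ap, bp, c, 4, 1.0f, 0);
    printf("c[0]=%g c[4]=%g\n", c[0], c[4]);  /* expect 1 and 2 */
    return 0;
}

The trmm flag here corresponds to the #if defined(TRMMKERNEL) branch in SAVE4x4: the freshly computed block is written out as alpha*AB rather than accumulated into the existing C values. Row/column assignment within the packed buffers is a simplifying assumption; the real packing is done by the copy kernels elsewhere in the tree.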
+/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, 
\AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 
0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp 
vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, 
DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi 
permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + 
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 
64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 
0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r 
+#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + 
+ +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 5dfb18f5b..f5c1ba729 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -1,470 +1,470 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_n.c" - -#else - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float v_x4 = {x4,x4,x4,x4}; - __vector float v_x5 = {x5,x5,x5,x5}; - __vector float v_x6 = {x6,x6,x6,x6}; - __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i++) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i++ ) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - } - -} - - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = 
NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); - } - - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); - } - - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n 
& -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); - } - - - return(0); -} - -#endif - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" + +#else + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float v_x4 = {x4,x4,x4,x4}; + __vector float v_x5 = {x5,x5,x5,x5}; + __vector float v_x6 = {x6,x6,x6,x6}; + __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i++) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i++ ) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + } + +} + + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = 
NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n 
& -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + +#endif + diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c index 64696236a..0edb79129 100644 --- a/kernel/power/sgemv_n_8.c +++ b/kernel/power/sgemv_n_8.c @@ -1,514 +1,514 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. 
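Both the 4x file above and the unused 8x variant share the same NBMAX = 4096 blocking driver: y is updated at most NBMAX elements at a time so the active slice stays cache resident, with a temporary ybuffer used when inc_y != 1. A simplified sketch of that blocking, with hypothetical names (the real code folds the remainder into the m1/m2/m3 arithmetic and handles the final m % 4 rows separately), is:

#include <stddef.h>

#define NBMAX 4096   /* block size used by these sgemv_n kernels */

/* Hypothetical sketch of the row-blocking loop: process y in chunks of at
 * most NBMAX rows; the last, shorter chunk simply shrinks NB.  Matrix and
 * vector arguments are elided, only the block bookkeeping is shown. */
static void gemv_n_blocked(size_t m)
{
    size_t done = 0;
    while (done < m) {
        size_t NB = (m - done > NBMAX) ? NBMAX : (m - done);
        /* ... run the 4x8 / 4x4 / 4x2 / 4x1 kernels on rows [done, done+NB) ... */
        done += NB;
    }
}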
But it was not competitive enough to be added in production -It could be used and tested in future or could provide barebone for switching to inline assembly -*/ - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - register __vector float v_x0 = {x0,x0,x0,x0}; - register __vector float v_x1 = {x1,x1,x1,x1}; - register __vector float v_x2 = {x2,x2,x2,x2}; - register __vector float v_x3 = {x3,x3,x3,x3}; - register __vector float v_x4 = {x4,x4,x4,x4}; - register __vector float v_x5 = {x5,x5,x5,x5}; - register __vector float v_x6 = {x6,x6,x6,x6}; - register __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i+=2) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float vb0_1=vb0[i] ; - register __vector float vb0_2=vb0[i+1] ; - register __vector float vb1_1=vb1[i] ; - register __vector float vb1_2=vb1[i+1] ; - register __vector float vb2_1=vb2[i] ; - register __vector float vb2_2=vb2[i+1] ; - register __vector float vb3_1=vb3[i] ; - register __vector float vb3_2=vb3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i+=2 ) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; 
- register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } - -} - - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 7 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - - if ( m3 & 4 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - if ( lda == 4 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; - temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; - - temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; - temp1 += a_ptr[9] * 
x_ptr[2] + a_ptr[13] * x_ptr[3]; - temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; - temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; - - a_ptr += 16; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0] ; - a_ptr +=4; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - y_ptr += inc_y; - y_ptr[0] += alpha * temp3; - y_ptr += inc_y; - a += 4; - } - - - if ( m3 & 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - a += 2; - } - - if ( m3 & 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - - - } - - - return(0); -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
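The 8x kernels that follow differ from the 4x file mainly in unrolling the vector loop by two, so each iteration carries two independent accumulators per column and keeps more loads in flight. A portable sketch of that pattern using GCC vector extensions (hypothetical helper, assuming 16-byte aligned pointers and n a multiple of 8) is:

#include <stddef.h>

typedef float v4sf __attribute__((vector_size(16)));

/* Two-way unrolled axpy on one column: y[0..n) += xk * a[0..n).
 * Assumes 16-byte alignment and n % 8 == 0; the real kernels apply this
 * simultaneously to four (or eight) columns per call. */
static void saxpy_unroll2(size_t n, const float *a, float xk, float *y)
{
    v4sf vx = {xk, xk, xk, xk};              /* splat alpha*x[k] */
    const v4sf *va = (const v4sf *)a;
    v4sf *vy = (v4sf *)y;
    for (size_t i = 0; i < n / 4; i += 2) {  /* 8 floats per iteration */
        v4sf y0 = vy[i]     + vx * va[i];
        v4sf y1 = vy[i + 1] + vx * va[i + 1];
        vy[i]     = y0;
        vy[i + 1] = y1;
    }
}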
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could provide barebone for switching to inline assembly +*/ + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + register __vector float v_x0 = {x0,x0,x0,x0}; + register __vector float v_x1 = {x1,x1,x1,x1}; + register __vector float v_x2 = {x2,x2,x2,x2}; + register __vector float v_x3 = {x3,x3,x3,x3}; + register __vector float v_x4 = {x4,x4,x4,x4}; + register __vector float v_x5 = {x5,x5,x5,x5}; + register __vector float v_x6 = {x6,x6,x6,x6}; + register __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i+=2) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float vb0_1=vb0[i] ; + register __vector float vb0_2=vb0[i+1] ; + register __vector float vb1_1=vb1[i] ; + register __vector float vb1_2=vb1[i+1] ; + register __vector float vb2_1=vb2[i] ; + register __vector float vb2_2=vb2[i+1] ; + register __vector float vb3_1=vb3[i] ; + register __vector float vb3_2=vb3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 
= {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i+=2 ) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; + } + +} + + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] ; + v_y[i+1] += v_x0 * va0[i+1] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 7 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += 
lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + + if ( m3 & 4 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + if ( lda == 4 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; + temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; + + temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; + temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3]; + temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; + temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; + + a_ptr += 16; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0] ; + a_ptr +=4; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + y_ptr += inc_y; + y_ptr[0] += alpha * temp3; + y_ptr += inc_y; + a += 4; + } + + + if ( m3 & 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + a += 2; + } + + if ( m3 & 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + + + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 62c517a9d..c3fc8e77a 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -1,484 +1,484 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_t.c" - -#else - -#include "common.h" - -#define NBMAX 2048 - -#include - -static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - temp4 += v_x[i] * va4[i]; - temp5 += v_x[i] * va5[i]; - temp6 += v_x[i] * va6[i]; - temp7 += v_x[i] * va7[i]; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - 
__vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i] ; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - 
a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 == 3) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 3 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - aj += 3; - } - - } else { - - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr += inc_y; - aj += lda; - } - - } - - } - return (0); - } - - if (m3 == 2) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - return (0); - - } - - FLOAT xtemp = *x_ptr * alpha; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - 
for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - } - - return (0); - -} - -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
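The transposed kernels added below take the opposite shape: each column of A contributes one dot product with x, accumulated in a 4-lane vector and reduced lane-wise before being scaled by alpha and added to y[j]. A self-contained sketch of that reduction (hypothetical helper, same alignment and n % 4 == 0 assumptions as above) is:

#include <stddef.h>

typedef float v4sf __attribute__((vector_size(16)));

/* Vector dot product with a final horizontal (lane-wise) reduction,
 * mirroring the temp0[0] + temp0[1] + temp0[2] + temp0[3] sums in the
 * T kernels.  Assumes 16-byte alignment and n % 4 == 0. */
static float sdot_v4(size_t n, const float *a, const float *x)
{
    v4sf acc = {0.0f, 0.0f, 0.0f, 0.0f};
    const v4sf *va = (const v4sf *)a;
    const v4sf *vx = (const v4sf *)x;
    for (size_t i = 0; i < n / 4; i++)
        acc += vx[i] * va[i];
    return acc[0] + acc[1] + acc[2] + acc[3];
}

/* Used as: y[j] += alpha * sdot_v4(n, column_j, x); */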
+ *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else + +#include "common.h" + +#define NBMAX 2048 + +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + 
temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i] ; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * 
xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + +#endif diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index b90512162..1ee7c8aeb 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -1,508 +1,508 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. But it was not competitive enough to be added in production -It could be used and tested in future or could be used as base for switching to inline assembly -*/ - -#include "common.h" -#include -#define NBMAX 4096 - -#include - -static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i +=2) { - register __vector float vx1=v_x[i] ; - register __vector float vx2=v_x[i+1] ; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float va4_1=va4[i] ; - register __vector float va4_2=va4[i+1] ; - register __vector float va5_1=va5[i] ; - register __vector float va5_2=va5[i+1] ; - register __vector float va6_1=va6[i] ; - register __vector float va6_2=va6[i+1] ; - register __vector float va7_1=va7[i] ; - register __vector float va7_2=va7[i+1] ; - temp0 += vx1* va0_1 + vx2 * va0_2; - temp1 += vx1* va1_1 + vx2 * va1_2; - temp2 += vx1* va2_1 + vx2 * va2_2; - 
temp3 += vx1* va3_1 + vx2 * va3_2; - temp4 += vx1* va4_1 + vx2 * va4_2; - temp5 += vx1* va5_1 + vx2 * va5_2; - temp6 += vx1* va6_1 + vx2 * va6_2; - temp7 += vx1* va7_1 + vx2 * va7_2; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 7; - m1 = m - 
m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 & 4) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp3 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 4 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; - y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; - y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; - aj += 16; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - aj += 4; - } - - } else if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; - y_ptr += inc_y; - aj += lda; - } - - } - if 
(m3==4) return (0); - a_ptr += 4; - } - - if (m3 & 2 ) { - - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - if (m3==2) return (0); - a_ptr += 2; - } - if (m3 & 1) { - - FLOAT xtemp = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - - } - a_ptr += 1; - } - return (0); - -} - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could be used as base for switching to inline assembly +*/ + +#include "common.h" +#include +#define NBMAX 4096 + +#include + +static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i +=2) { + register __vector float vx1=v_x[i] ; + register __vector float vx2=v_x[i+1] ; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float va4_1=va4[i] ; + register __vector float va4_2=va4[i+1] ; + register __vector float va5_1=va5[i] ; + register __vector float va5_2=va5[i+1] ; + register __vector float va6_1=va6[i] ; + register __vector float va6_2=va6[i+1] ; + register __vector float va7_1=va7[i] ; + register __vector float va7_2=va7[i+1] ; + temp0 += vx1* va0_1 + vx2 * va0_2; + temp1 += vx1* va1_1 + vx2 * va1_2; + temp2 += vx1* va2_1 + vx2 * va2_2; + temp3 += vx1* va3_1 + vx2 * va3_2; + temp4 += vx1* va4_1 + vx2 * va4_2; + temp5 += vx1* va5_1 + vx2 * va5_2; + temp6 += vx1* va6_1 + vx2 * va6_2; + temp7 += vx1* va7_1 + vx2 * va7_2; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT 
*y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + } + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 7; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += 
ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 & 4) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp3 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 4 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; + y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; + y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; + aj += 16; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + aj += 4; + } + + } else if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; + y_ptr += inc_y; + aj += lda; + } + + } + if (m3==4) return (0); + a_ptr += 4; + } + + if (m3 & 2 ) { + + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + 
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + if (m3==2) return (0); + a_ptr += 2; + } + if (m3 & 1) { + + FLOAT xtemp = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + + } + a_ptr += 1; + } + return (0); + +} + diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index d1e60da6c..f9320d516 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -1,245 +1,245 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - -#define LOAD ld - -#define STACKSIZE 512 - -#define FZERO 312+192(SP) - -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ - -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - - -#define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 - -#define VECSAVE r11 - -#define FRAMEPOINTER r12 - -#define T10 r14 - -#define L r15 -#define T8 r16 -#define T5 r17 -#define T2 r19 -#define TEMP_REG r20 -#define T6 r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T7 r27 -#define T3 r28 -#define T4 r29 - -#define PRE r30 -#define T1 r31 - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - mflr r0 - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - xxspltd alpha_r,vs1,0 /*copy from register f1 */ - xxspltd alpha_i,vs2,0 /*copy from register f2 */ - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - std r0, FLINK_SAVE(SP) - - -#if defined(linux) || defined(__FreeBSD__) - ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) -#endif - - -#ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) -#endif -#endif - - -#include "zgemm_macros_power9.S" - - - - slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li r0, 0 - - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegdp alpha_r,alpha_r - xvnegdp alpha_i,alpha_i -#endif - .align 4 - -#include "zgemm_logic_power9.S" - -L999: - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - 
EPILOGUE +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) 
+ + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE #endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index fe5d8ade2..850b41aff 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -1,1891 +1,1891 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#define MY_ALIGN .align 3 -b ZGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -ZGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 -ZGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_L2 256,64,31,0 - KERNEL2x8_L2 256,64,32,0 - KERNEL2x8_L2 256,64,33,0 - KERNEL2x8_L2 256,64,34,0 - KERNEL2x8_L2 256,64,35,0 - KERNEL2x8_L2 256,64,36,0 - KERNEL2x8_L2 256,64,37,0 - KERNEL2x8_L2 256,64,38,0 - KERNEL2x8_L2 256,64,39,0 - KERNEL2x8_L2 256,64,40,0 - KERNEL2x8_L2 256,64,41,0 - KERNEL2x8_L2 256,64,42,0 - KERNEL2x8_L2 256,64,43,0 - KERNEL2x8_L2 256,64,44,0 - KERNEL2x8_L2 256,64,45,0 - KERNEL2x8_L2 256,64,46,0 - KERNEL2x8_L2 256,64,47,0 - KERNEL2x8_L2 256,64,48,0 - KERNEL2x8_L2 256,64,49,0 - KERNEL2x8_L2 256,64,50,0 - KERNEL2x8_L2 256,64,51,0 - KERNEL2x8_L2 256,64,52,0 - KERNEL2x8_L2 256,64,53,0 - KERNEL2x8_L2 256,64,54,0 - KERNEL2x8_L2 256,64,55,0 - KERNEL2x8_L2 256,64,56,0 - KERNEL2x8_L2 256,64,57,0 - KERNEL2x8_L2 256,64,58,0 - KERNEL2x8_L2 256,64,59,0 - KERNEL2x8_L2 256,64,60,0 - KERNEL2x8_L2 256,64,61,0 - KERNEL2x8_L2 256,64,62,0 - KERNEL2x8_L2 256,64,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -ZGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_E2 256,64,31,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt 
AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_E2 256,64,15,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_E2 256,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -ZGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,0,0 -ZGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_L2 128,64,7,0 - KERNEL2x4_L2 128,64,8,0 - KERNEL2x4_L2 128,64,9,0 - KERNEL2x4_L2 128,64,10,0 - KERNEL2x4_L2 128,64,11,0 - KERNEL2x4_L2 128,64,12,0 - KERNEL2x4_L2 128,64,13,0 - KERNEL2x4_L2 128,64,14,0 - KERNEL2x4_L2 128,64,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -ZGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_E2 128,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_E2 128,64,3,1 - blr - - -ZGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -ZGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,0,0 -ZGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_L2 64,64,7,0 - KERNEL2x2_L2 64,64,8,0 - KERNEL2x2_L2 64,64,9,0 - KERNEL2x2_L2 64,64,10,0 - KERNEL2x2_L2 64,64,11,0 - KERNEL2x2_L2 64,64,12,0 - KERNEL2x2_L2 64,64,13,0 - KERNEL2x2_L2 64,64,14,0 - KERNEL2x2_L2 64,64,15,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN - - -ZGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -ZGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_E2 64,64,7,1 - blr - MY_ALIGN -ZGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_E2 64,64,3,1 - blr - - -ZGEMM_2x1_LMAIN_SUB: 
-/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -ZGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,0,0 -ZGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_L2 32,64,7,0 - KERNEL2x1_L2 32,64,8,0 - KERNEL2x1_L2 32,64,9,0 - KERNEL2x1_L2 32,64,10,0 - KERNEL2x1_L2 32,64,11,0 - KERNEL2x1_L2 32,64,12,0 - KERNEL2x1_L2 32,64,13,0 - KERNEL2x1_L2 32,64,14,0 - KERNEL2x1_L2 32,64,15,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -ZGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_E2 32,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_E2 32,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -ZGEMM_L2: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 1 - ble ZGEMM_L2_END - - -ZGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - - -ZGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8O 128,32 - END2x8_WITHOUT_ADD - LOAD2x8_2O 256, 64 - mtctr T8 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-256 - LOAD2x8_2O 256,64 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - MY_ALIGN - - -ZGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_L2 256,64, 1,0 - KERNEL2x8_L2 256,64, 2,0 - KERNEL2x8_E2 256,64, 3,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_E2 256,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 256,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 - - -ZGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN - - -ZGEMM_L2x8_END: -/*----------------------------------------*/ - - -ZGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble ZGEMM_L2x4_SUB0 - bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 - - -ZGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4O 64,32 - END2x4_WITHOUT_ADD - LOAD2x4_2O 128, 64 - mtctr T8 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD2x4_2O 128,64 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x4_SUB2_4 - bl ZGEMM_2x4_L8_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 128,64, 0,0 - KERNEL2x4_E2 128,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 128,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 - - -ZGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -ZGEMM_L2x4_END: -/*----------------------------------------*/ - - -ZGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble ZGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble ZGEMM_L2x2_SUB0 - bl ZGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x2_SAVE - b ZGEMM_L2x2_SUB2 - - -ZGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2O 32,32 - END2x2_WITHOUT_ADD - LOAD2x2_2O 64, 64 - mtctr T8 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD2x2_2O 64,64 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x2_SUB2_8 - bl ZGEMM_2x2_L16_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x2_SUB2_4 - bl ZGEMM_2x2_L8_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 64,64, 0,0 - KERNEL2x2_E2 64,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 64,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 - - -ZGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -ZGEMM_L2x2_END: -/*----------------------------------------*/ - - -ZGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble ZGEMM_L2x1_SUB0 - bl ZGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x1_SAVE - b ZGEMM_L2x1_SUB2 - - -ZGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1O 16,32 - END2x1_WITHOUT_ADD - LOAD2x1_2O 32, 64 - mtctr T8 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD2x1_2O 32,64 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x1_SUB2_8 - bl ZGEMM_2x1_L16_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L2x1_SUB2_4 - bl ZGEMM_2x1_L8_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 32,64, 0,0 - KERNEL2x1_E2 32,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 32,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 - - -ZGEMM_L2x1_SAVE: -/*----------------------------------------*/ - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -ZGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - bgt ZGEMM_L2_BEGIN - - -ZGEMM_L2_END: - -b ZGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -ZGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 -ZGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_L2 256,32,31,0 - KERNEL1x8_L2 256,32,32,0 - KERNEL1x8_L2 256,32,33,0 - KERNEL1x8_L2 256,32,34,0 - KERNEL1x8_L2 256,32,35,0 - KERNEL1x8_L2 256,32,36,0 - KERNEL1x8_L2 256,32,37,0 - KERNEL1x8_L2 256,32,38,0 - KERNEL1x8_L2 256,32,39,0 - KERNEL1x8_L2 256,32,40,0 - KERNEL1x8_L2 256,32,41,0 - KERNEL1x8_L2 256,32,42,0 - KERNEL1x8_L2 256,32,43,0 - KERNEL1x8_L2 256,32,44,0 - KERNEL1x8_L2 256,32,45,0 - KERNEL1x8_L2 256,32,46,0 - KERNEL1x8_L2 256,32,47,0 - KERNEL1x8_L2 256,32,48,0 - KERNEL1x8_L2 256,32,49,0 - KERNEL1x8_L2 256,32,50,0 - KERNEL1x8_L2 256,32,51,0 - KERNEL1x8_L2 256,32,52,0 - KERNEL1x8_L2 256,32,53,0 - KERNEL1x8_L2 256,32,54,0 - KERNEL1x8_L2 256,32,55,0 - KERNEL1x8_L2 256,32,56,0 - KERNEL1x8_L2 256,32,57,0 - KERNEL1x8_L2 256,32,58,0 - KERNEL1x8_L2 256,32,59,0 - KERNEL1x8_L2 256,32,60,0 - KERNEL1x8_L2 256,32,61,0 - KERNEL1x8_L2 256,32,62,0 - KERNEL1x8_L2 256,32,63,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -ZGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - 
KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_E2 256,32,31,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_E2 256,32,15,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_E2 256,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN - - -ZGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,0,0 - - -ZGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_L2 128,32,7,0 - KERNEL1x4_L2 128,32,8,0 - KERNEL1x4_L2 128,32,9,0 - KERNEL1x4_L2 128,32,10,0 - KERNEL1x4_L2 128,32,11,0 - KERNEL1x4_L2 128,32,12,0 - KERNEL1x4_L2 128,32,13,0 - KERNEL1x4_L2 128,32,14,0 - KERNEL1x4_L2 128,32,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN - - -ZGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -ZGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_E2 128,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_E2 128,32,3,1 - blr - - -ZGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN - - -ZGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,0,0 - - -ZGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_L2 64,32,7,0 - KERNEL1x2_L2 64,32,8,0 - KERNEL1x2_L2 
64,32,9,0 - KERNEL1x2_L2 64,32,10,0 - KERNEL1x2_L2 64,32,11,0 - KERNEL1x2_L2 64,32,12,0 - KERNEL1x2_L2 64,32,13,0 - KERNEL1x2_L2 64,32,14,0 - KERNEL1x2_L2 64,32,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN - - -ZGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN - - -ZGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_E2 64,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_E2 64,32,3,1 - blr - - -ZGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN - - -ZGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,0,0 - - -ZGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_L2 32,32,7,0 - KERNEL1x1_L2 32,32,8,0 - KERNEL1x1_L2 32,32,9,0 - KERNEL1x1_L2 32,32,10,0 - KERNEL1x1_L2 32,32,11,0 - KERNEL1x1_L2 32,32,12,0 - KERNEL1x1_L2 32,32,13,0 - KERNEL1x1_L2 32,32,14,0 - KERNEL1x1_L2 32,32,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN - - -ZGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - MY_ALIGN - - -ZGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_E2 32,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_E2 32,32,3,1 - blr - - -/*----------------------N1 BEGINS---------*/ -ZGEMM_L1: -/*----------------------------------------*/ - andi. T1, N, 1 - ble ZGEMM_L1_END - -ZGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - bl ZGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L1x8_SAVE - b ZGEMM_L1x8_SUB2 - - -ZGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. 
L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8O 128,16 - END1x8_WITHOUT_ADD - LOAD1x8_2O 256, 32 - mtctr T8 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-256 - LOAD1x8_2O 256,32 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - MY_ALIGN - - -ZGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L1x8_SUB2_32 - bl ZGEMM_1x8_L64_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L1x8_SUB2_16 - bl ZGEMM_1x8_L32_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x8_SUB2_8 - bl ZGEMM_1x8_L16_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_L2 256,32, 1,0 - KERNEL1x8_L2 256,32, 2,0 - KERNEL1x8_E2 256,32, 3,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_E2 256,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 256,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - - -ZGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt ZGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - b ZGEMM_L1x4_BEGIN - MY_ALIGN - - -ZGEMM_L1x8_END: -/*----------------------------------------*/ - - -ZGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - bl ZGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE - b ZGEMM_L1x4_SUB2 - - -ZGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4O 64,16 - END1x4_WITHOUT_ADD - LOAD1x4_2O 128, 32 - mtctr T8 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD1x4_2O 128,32 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x4_SUB2_8 - bl ZGEMM_1x4_L16_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L1x4_SUB2_4 - bl ZGEMM_1x4_L8_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 128,32, 0,0 - KERNEL1x4_E2 128,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 128,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 - - -ZGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -ZGEMM_L1x4_END: -/*----------------------------------------*/ - - -ZGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble ZGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - bl ZGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE - b ZGEMM_L1x2_SUB2 - - -ZGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2O 32,16 - END1x2_WITHOUT_ADD - LOAD1x2_2O 64, 32 - mtctr T8 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD1x2_2O 64,32 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x2_SUB2_8 - bl ZGEMM_1x2_L16_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x2_SUB2_4 - bl ZGEMM_1x2_L8_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 64,32, 0,0 - KERNEL1x2_E2 64,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 64,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 - - -ZGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -ZGEMM_L1x2_END: -/*----------------------------------------*/ - - -ZGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - bl ZGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE - b ZGEMM_L1x1_SUB2 - - -ZGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. 
L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1O 16,16 - END1x1_WITHOUT_ADD - LOAD1x1_2O 32, 32 - mtctr T8 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD1x1_2O 32,32 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x1_SUB2_8 - bl ZGEMM_1x1_L16_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x1_SUB2_4 - bl ZGEMM_1x1_L8_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 32,32, 0,0 - KERNEL1x1_E2 32,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 32,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 - - -ZGEMM_L1x1_SAVE: -/*----------------------------------------*/ - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -ZGEMM_L1x1_END: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - - -ZGEMM_L1_END: -/*----------------------------------------*/ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +ZGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt 
AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +ZGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: 
+/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + bl ZGEMM_2x2_L8_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + bl ZGEMM_2x1_L8_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN + + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + 
KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 
64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + bl ZGEMM_1x4_L8_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + bl ZGEMM_1x2_L8_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1O 16,16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD1x1_2O 32,32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x1_SUB2_8 + bl ZGEMM_1x1_L16_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x1_SUB2_4 + bl ZGEMM_1x1_L8_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32,32, 0,0 + KERNEL1x1_E2 32,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 8670e9574..68024b826 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -1,1825 +1,1825 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/
-
-#define unit_size 16
-#define DISP32(ind,disp) (ind*unit_size*32+disp)
-#define DISP16(ind,disp) (ind*unit_size*16+disp)
-#define DISP8(ind,disp) (ind*unit_size*8+disp)
-#define DISP4(ind,disp) (ind*unit_size*4+disp)
-#define DISP2(ind,disp) (ind*unit_size*2+disp)
-#define DISP1(ind,disp) (ind*unit_size+disp)
-#define DISPX(disp) (disp)
-/* HELPERS FOR SAVE */
-/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
-
-
-.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
-#ifndef TRMMKERNEL
-    lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
-    lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
-    xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
-    xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
-#endif
-.endm
-/*from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
-
-
-.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
-    xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
-    xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
-.endm
-/*from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
-
-
-.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
-    xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
-    xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
-.endm
-/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
-
-
-.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
-    xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
-    xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
-#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
-    xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
-    xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
-#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
-    xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
-    xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
-#else // CC || CR || RC || RR
-    /*we will assume {-alpha_r,-alpha_i} for this case */
-    /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
-    xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
-    /*we will negate alpha imag instead to fix sign*/
-    xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
-#endif
-.endm
-/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
-
-
-.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
-#ifndef TRMMKERNEL
-    xvmsubadp \VSOUT1,\VSINII, alpha_i
-    xvmaddadp \VSOUT2,\VSINRR, alpha_i
-#else
-    xvmuldp \VSOUT1,\VSINII, alpha_i
-    xvmuldp \VSOUT2,\VSINRR, alpha_i
-#endif
-.endm
-/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
-
-
-.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
-    xvmsubadp \VSOUT1,\VSINRR, alpha_r
-    xvmaddadp \VSOUT2,\VSINII, alpha_r
-.endm
-/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
-
-
-.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
-    xxmrghd \VSOUT1,\VSIN2,\VSIN1
-    xxmrgld \VSOUT2,\VSIN2,\VSIN1
-.endm
-
-
-.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
-    stxv \VSIN1, DISPX(\LOFFSET)(\REG)
-    stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
-.endm
-
-
-.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
-    LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
-    LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
-    LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
-    LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
-    AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13
-    AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
-    MULT_APLHA_PART1 vs2,vs4, vs14,vs15
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
-    MULT_APLHA_PART1 vs6,vs8,vs16,vs17
-    MULT_APLHA_PART2 vs2,vs4,vs14,vs15
-    AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13
-    MULT_APLHA_PART2 vs6,vs8,vs16,vs17
-    AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
-    UNPACK_FOR_STORE vs14,vs15,vs7,vs9
-    MULT_APLHA_PART1 vs10,vs12, vs24,vs25
-    UNPACK_FOR_STORE vs16,vs17,vs3,vs5
-    MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27
-    STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
-    MULT_APLHA_PART2 vs10,vs12,vs24,vs25
-    STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
-    MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27
-    UNPACK_FOR_STORE vs24,vs25,vs10,vs12
-    UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3
-    STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12
-    STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
-.endm
-
-
-.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
-    LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
-    LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
-    AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
-    AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
-    MULT_APLHA_PART1 vs2,vs4, vs14,vs15
-    MULT_APLHA_PART1 vs6,vs8, vs16,vs17
-    MULT_APLHA_PART2 vs2,vs4, vs14,vs15
-    MULT_APLHA_PART2 vs6,vs8,vs16,vs17
-    UNPACK_FOR_STORE vs14,vs15,vs7,vs9
-    UNPACK_FOR_STORE vs16,vs17,vs3,vs5
-    STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
-    STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
-.endm
-
-
-
-.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
-    LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
-    AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
-    MULT_APLHA_PART1 vs2,vs4, vs14,vs15
-    MULT_APLHA_PART2 vs2,vs4, vs14,vs15
-    UNPACK_FOR_STORE vs14,vs15,vs7,vs9
-    STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
-.endm
-
-
-
-.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
-    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
-#ifndef TRMMKERNEL
-    lxv vs18, (\LOFFSET)(\BASE_REG)
-    xxmrgld vs14,vs18,vs18
-    xxmrghd vs15,vs18,vs18
-#endif
-    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5
-    AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
-    MULT_APLHA_PART1 vs2,vs4, vs14,vs15
-    MULT_APLHA_PART2 vs2,vs4, vs14,vs15
-    UNPACK_FOR_STORE vs14,vs15,vs7,vs9
-    xxmrghd vs7,vs15,vs14
-    stxv vs7, (\LOFFSET)(\BASE_REG)
-.endm
-/**********************************************************************************************
-*
-
-.macros for N=2 and M=8
-**********************************************************************************************/
-
-.macro Zero2x8
-    xxlxor vs32, vs32, vs32
-    xxlxor vs33, vs33, vs33
-    xxlxor vs34, vs34, vs34
-    xxlxor vs35, vs35, vs35
-    xxlxor vs36, vs36, vs36
-    xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,128,32 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x8_2 - /*for load2 offset will be 256 and 64*/ - KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 -.endm - - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs48, vs8, vs22 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs49, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs50, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs51, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs52, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs54, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs55, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs56, vs12, vs22 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs58, vs13, vs22 - xvmaddadp vs43, vs13, vs21 - xvmaddadp vs59, vs13, 
vs23 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs60, vs14, vs22 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs62, vs15, vs22 - xvmaddadp vs47, vs15, vs21 - xvmaddadp vs63, vs15, vs23 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 128,32 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,64,32 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 - -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_2 - /*for load2 offset will be 128 and 64*/ - KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs41, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs43, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs47, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 64,32 -.endm - - - -.macro SAVE2x4 - add T1, CO ,LDC - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 - addi CO, CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero2x2 - xxlxor 
vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,32,32 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 - -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_2 - /*for load2 offset will be 64 and 64*/ - KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 -.endm - - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs37, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs39, vs9, vs23 -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x2 - LOAD2x2 - 
END2x2 AO, BO, 32,32 -.endm - - - -.macro SAVE2x2 - add T1, CO ,LDC - SAVE2 vs32,vs33,vs34,vs35,CO,0 - SAVE2 vs36,vs37,vs38,vs39,T1,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,16,32 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_2 - /*for load2 offset will be 32 and 64*/ - KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 -.endm - - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs35, vs8, vs23 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 16,32 -.endm - - - -.macro SAVE2x1 - add T1, CO ,LDC - SAVE1 vs32,vs33,CO,0 - SAVE1 vs34,vs35,T1,0 - addi CO, CO, 16 -.endm - -/********************************************************************************************** -* - -.macros for N=1 and M=8 
-**********************************************************************************************/ - - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,128,16 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 - -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x8_2 - /*for load2 offset will be 256 and 32*/ - KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 -.endm - - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, 
vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs43, vs13, vs21 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs47, vs15, vs21 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 128,16 -.endm - - -.macro SAVE1x8 - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,64,16 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x4_2 - /*for load2 offset will be 128 and 32*/ - KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 -.endm - - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 64,16 -.endm - - - -.macro SAVE1x4 - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - addi CO, 
CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,32,16 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x2_2 - /*for load2 offset will be 64 and 32*/ - KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 -.endm - - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 32,16 -.endm - - - -.macro SAVE1x2 - SAVE2 vs32,vs33,vs34,vs35,CO,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A - xxswapd vs17, vs16 - -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,16,16 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro LOAD1x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x1_2 - /*for load2 offset will be 32 and 32*/ - KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 -.endm - - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 16,16 -.endm - - - -.macro SAVE1x1 - SAVE1 vs32,vs33,CO,0 - addi CO, CO, 16 -.endm - -/****************************TRMM POINTER REFRESH - -.macroSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 8 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 4 - .endif -.endm -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ - - -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ - - -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
- /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - #endif - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) +/* HELPERS FOR SAVE */ +/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm +/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm +/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm +/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm + + +.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + 
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs6,vs8,vs16,vs17 + MULT_APLHA_PART2 vs2,vs4,vs14,vs15 + AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + MULT_APLHA_PART1 vs10,vs12, vs24,vs25 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + MULT_APLHA_PART2 vs10,vs12,vs24,vs25 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 + UNPACK_FOR_STORE vs24,vs25,vs10,vs12 + UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART1 vs6,vs8, vs16,vs17 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 +.endm + + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 +.endm + + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 +#ifndef TRMMKERNEL + lxv vs18, (\LOFFSET)(\BASE_REG) + xxmrgld vs14,vs18,vs18 + xxmrghd vs15,vs18,vs18 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + xxmrghd vs7,vs15,vs14 + stxv vs7, (\LOFFSET)(\BASE_REG) +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 
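/* Illustrative scalar model (a sketch, assuming the NN case and TRMMKERNEL not
   defined) of the update SAVE1 performs on one complex element of C.  Its two
   inputs hold {ar*br, ai*bi} (VSRes1) and {ar*bi, ai*br} (VSRes2) as accumulated
   by the kernel:

       double rr = ar*br - ai*bi;                 // AGGREGATE_REALS_IMAGES
       double ii = ar*bi + ai*br;
       C_re = C_re + rr*alpha_r - ii*alpha_i;     // MULT_APLHA_PART1 + _PART2
       C_im = C_im + rr*alpha_i + ii*alpha_r;

   SAVE2/SAVE4/SAVE8 apply the same per-element update to 2/4/8 results; the
   other conjugation cases (CN/NC/CC families) flip the signs inside
   AGGREGATE_REALS_IMAGES as shown above. */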
+ xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,128,32 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, 
vs23 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 128,32 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,64,32 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 + +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs41, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs43, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs47, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 64,32 +.endm + + + +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero2x2 + xxlxor 
vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,32,32 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 + +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 +.endm + + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs37, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x2 + LOAD2x2 + 
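/*
 * What LOAD2x2 + END2x2 accumulate, sketched in C.  vs16/vs18 hold the
 * two B values as loaded and vs17/vs19 hold their xxswapd copies (the
 * two doublewords exchanged), so every A element feeds a pair of
 * accumulators: one for a*b lane by lane, one for a*swap(b).  The two
 * lanes of each accumulator are only combined (subtract for the real
 * part, add for the imaginary part in the plain case) by the SAVEn
 * macros defined earlier in this file, which also apply alpha.  Rough
 * C model of one pair (names are illustrative, not OpenBLAS
 * identifiers):
 *
 *   typedef struct { double x, y; } lanes2;        // one VSX register
 *
 *   static void cmadd_split(lanes2 *acc, lanes2 *acc_sw,
 *                           lanes2 a, lanes2 b)
 *   {
 *       acc->x    += a.x * b.x;     // xvmaddadp acc,    a, b
 *       acc->y    += a.y * b.y;
 *       acc_sw->x += a.x * b.y;     // xvmaddadp acc_sw, a, swap(b)
 *       acc_sw->y += a.y * b.x;
 *   }
 */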
END2x2 AO, BO, 32,32 +.endm + + + +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,16,32 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs35, vs8, vs23 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=8 
+**********************************************************************************************/ + + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,128,16 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 +.endm + + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, 
vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 128,16 +.endm + + +.macro SAVE1x8 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,64,16 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 +.endm + + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 64,16 +.endm + + + +.macro SAVE1x4 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, 
CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,32,16 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 +.endm + + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 32,16 +.endm + + + +.macro SAVE1x2 + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,16,16 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
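/*
 * Byte arithmetic behind SHIFT_REG and the REFRESH_* macros above and
 * below: one double-complex element is 16 bytes, so an element count
 * of n per K step turns the offset "off" into off * n * 16 bytes, which
 * SHIFT_REG does with a single slwi (16 -> <<8, 8 -> <<7, 4 -> <<6,
 * 2 -> <<5, 1 -> <<4).  Rough C rendering (helper name is illustrative
 * only):
 *
 *   static long shift_bytes(long off, int n)      // n = 16, 8, 4, 2, 1
 *   {
 *       return off * n * 16;                      // the slwi above
 *   }
 *
 *   // REFRESH_POINTERS for an m x n tile (C_A = m, C_B = n):
 *   // #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 *   //     ptrbb = bb;
 *   // #else
 *   //     ptrba += shift_bytes(off, C_A);
 *   //     ptrbb  = bb + shift_bytes(off, C_B);
 *   // #endif
 */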
+ /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index ef156fd27..76ea12fee 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -1,6806 +1,6806 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/********************************************************************* -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* SGEMM_DEFAULT_UNROLL_N 4 -* SGEMM_DEFAULT_UNROLL_M 16 -* SGEMM_DEFAULT_P 768 -* SGEMM_DEFAULT_Q 384 -* A_PR1 512 -* B_PR1 512 -* -* -* 2014/07/28 Saar -* Performance at 9216x9216x9216: -* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) -* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) -* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) -* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) -* -*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define BO2 %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#if defined(OS_WINDOWS) -#define L_BUFFER_SIZE 8192 -#else -#define L_BUFFER_SIZE 12288 -#endif - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 - -#else - -#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 6 lines of N -*******************************************************************************************/ - -.macro KERNEL16x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vmovups -8 * SIZE(AO), %ymm1 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - prefetcht0 A_PR1(AO) - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) - 
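/*
 * VFMADD231PS_ above hides the FMA4 vs FMA3 operand order (Bulldozer
 * vfmaddps vs vfmadd231ps); both forms compute acc += a_vec * b_bcast.
 * One KERNEL16x6_SUB is therefore a single rank-1 update of a 16 x 6
 * tile of C kept in %ymm4-%ymm15, two 8-wide FMAs per column.  Rough C
 * equivalent of one k step (function and array names are illustrative):
 *
 *   static void kernel16x6_step(float acc[6][16],
 *                               const float *a,     // 16 floats of A
 *                               const float *b)     // 6 floats of B
 *   {
 *       for (int j = 0; j < 6; j++)
 *           for (int i = 0; i < 16; i++)
 *               acc[j][i] += a[i] * b[j];
 *   }
 */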
- addq $ 6*SIZE, BO - addq $ 16*SIZE, AO - decq %rax -.endm - -.macro SAVE16x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm13, %ymm13 - vmulps %ymm0 , %ymm14, %ymm14 - vmulps %ymm0 , %ymm15, %ymm15 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 - - vaddps (CO2), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2), %ymm11,%ymm11 - - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 - - vaddps (CO2, LDC,2), %ymm14,%ymm14 - vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) - - vmovups %ymm10, (CO2) - vmovups %ymm11, 8 * SIZE(CO2) - - vmovups %ymm12, (CO2, LDC) - vmovups %ymm13, 8 * SIZE(CO2, LDC) - - vmovups %ymm14, (CO2, LDC,2) - vmovups %ymm15, 8 * SIZE(CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE8x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm14, %ymm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps (CO2), %ymm10,%ymm10 - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps (CO2, LDC,2), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm10, (CO2) - vmovups %ymm12, (CO2, LDC) - vmovups %ymm14, (CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x6_SUB - vmovups -16 * SIZE(AO), %xmm0 - vbroadcastss -4 * SIZE(BO), %xmm2 - vbroadcastss -3 * SIZE(BO), %xmm3 - - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - - vbroadcastss -2 * SIZE(BO), %xmm2 - vbroadcastss -1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - - vbroadcastss 0 * SIZE(BO), %xmm2 - vbroadcastss 1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE4x6 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - vmulps %xmm0 , 
%xmm12, %xmm12 - vmulps %xmm0 , %xmm14, %xmm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO1, LDC,2), %xmm8,%xmm8 - vaddps (CO2), %xmm10,%xmm10 - vaddps (CO2, LDC), %xmm12,%xmm12 - vaddps (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO1, LDC,2) - vmovups %xmm10, (CO2) - vmovups %xmm12, (CO2, LDC) - vmovups %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -15 * SIZE(AO), %xmm1 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE2x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm13, %xmm13 - vmulss %xmm0 , %xmm14, %xmm14 - vmulss %xmm0 , %xmm15, %xmm15 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 - - vaddss (CO2), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2), %xmm11,%xmm11 - - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 - - vaddss (CO2, LDC,2), %xmm14,%xmm14 - vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) - - vmovss %xmm10, (CO2) - vmovss %xmm11, 1 * SIZE(CO2) - - vmovss %xmm12, (CO2, LDC) - vmovss %xmm13, 1 * SIZE(CO2, LDC) - - vmovss %xmm14, (CO2, LDC,2) - vmovss %xmm15, 1 * SIZE(CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 1*SIZE, AO - decq %rax -.endm - -.macro SAVE1x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm14, %xmm14 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 
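/*
 * Every SAVEmxn macro in this file follows the pattern being executed
 * here: scale the accumulators by ALPHA, add the existing C tile when
 * TRMMKERNEL is not defined (plain GEMM), then store; a TRMM build
 * skips the add and overwrites C.  Per element, in C (sketch only):
 *
 *   float out = alpha * acc;
 *   #if !defined(TRMMKERNEL)
 *   out += C[i + j * ldc];       // the vaddss/vaddps block
 *   #endif
 *   C[i + j * ldc] = out;        // the vmovss/vmovups block
 */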
- vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss (CO2), %xmm10,%xmm10 - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm10, (CO2) - vmovss %xmm12, (CO2, LDC) - vmovss %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( 
%xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N 
-*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - 
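/*
 * Unlike the 6-column kernels above, which post-increment AO and BO,
 * these 4/2/1-column kernels keep AO and BO fixed inside the k loop
 * and walk two indices instead: %rax over A (-16*SIZE(AO,%rax,SIZE))
 * and BI over B (-4*SIZE(BO,BI,SIZE)), bumped by the tile's M and N
 * width each step.  Rough C model of this KERNEL1x2_SUB bookkeeping
 * (variable names are illustrative):
 *
 *   long ai = 0, bi = 0;                   // %rax and BI
 *   for (long k = 0; k < kk; k++) {
 *       float a0 = AO[ai - 16];            // -16*SIZE(AO,%rax,SIZE)
 *       acc4 += a0 * BO[bi - 4];           //  -4*SIZE(BO,BI,SIZE)
 *       acc6 += a0 * BO[bi - 3];
 *       ai += 1;                           // addq $1, %rax
 *       bi += 2;                           // addq $2, BI
 *   }
 */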
addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - -/************************************************************************************* -* GEMM Kernel -*************************************************************************************/ - - - PROLOGUE - 
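/*
 * The prologue below saves the callee-saved registers, picks up LDC
 * (and, for TRMM, the offset) from the stack per the ABI in use, then
 * carves 128 + L_BUFFER_SIZE bytes off the stack for the packed B
 * panel (BUFFER1) and aligns %rsp to 4 KB.  On Windows, STACK_TOUCH
 * (defined near the top of this file) writes a zero into each 4 KB
 * page of that fresh region, starting with the page nearest the
 * already-committed stack, so the OS guard page is extended one page
 * at a time.  Roughly, in C (names are illustrative only):
 *
 *   volatile char *sp = new_stack_top;             // after sub/and of %rsp
 *   for (int page = PAGES; page >= 1; page--)      // nearest committed first
 *       *(volatile int *)(sp + page * 4096) = 0;   // movl $0, 4096*page(%rsp)
 *   // PAGES depends on L_BUFFER_SIZE (up to 4 pages here)
 */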
PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv6 // N / 12 - movq %rdx, Nmod6 // N % 12 - - movq Ndiv6, J - cmpq $0, J - je .L4_00 - ALIGN_4 - - -/*******************************************************************************************/ - -.L6_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq BO2, B // next offset of B - movq K, %rax - - ALIGN_4 - - -.L6_02c: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 4*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_02c - - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - - ALIGN_4 - -.L6_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x6_SUB - - jnz .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L6_60 // to next 6 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - 
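/*
 * This run of KERNEL8x6_SUBs (like .L6_12 above for the 16-wide tile)
 * is the k loop unrolled by eight: the trip count is first masked with
 * andq $-8 and burned down here, then the k & 7 leftover is finished
 * one call at a time in .L6_20_7.  Roughly, in C (helper name is
 * illustrative; each "step" is one KERNEL8x6_SUB):
 *
 *   for (long k = K & ~7L; k > 0; k -= 8)    // andq $-8, %rax
 *       for (int u = 0; u < 8; u++)
 *           step();
 *   for (long k = K & 7; k > 0; k--)         // andq $7, %rax
 *       step();                              // .L6_20_7 tail
 */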
KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - ALIGN_4 - -.L6_20_7: - - KERNEL8x6_SUB - - jnz .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x6_SUB - - jnz .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - - ALIGN_4 - -.L6_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x6_SUB - - jnz .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L6_60 // to next 4 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - - ALIGN_4 - -.L6_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x6_SUB - - jnz .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L6_60: - - -/*******************************************************************************************/ - - -.L7_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq K, %rax - - ALIGN_4 
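The second half of the 12-column block is packed by the .L7_02c loop that follows: it picks up the two columns the first pass skipped (the load at 2*SIZE into what is now BO1) plus the full third strip. A sketch under the same packing assumption as above:

/* Sketch of .L7_02c: interleave columns 6..11 of the 12-column block.
 * b points where the first pass left B, i.e. at the second 4-column strip. */
static void pack_b6_second(const float *b, float *buf, long k)
{
    const float *p0 = b;          /* BO1: strip with columns 4..7  */
    const float *p1 = b + 4 * k;  /* BO2: strip with columns 8..11 */

    for (long l = 0; l < k; l++) {
        buf[0] = p0[2];  buf[1] = p0[3];  /* vmovsd 2*SIZE(BO1): cols 6,7 */
        buf[2] = p1[0];  buf[3] = p1[1];  /* vmovups (BO2): cols 8..11    */
        buf[4] = p1[2];  buf[5] = p1[3];
        p0 += 4;
        p1 += 4;
        buf += 6;
    }
}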
- - -.L7_02c: - - vmovsd 2*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_02c - - movq BO2, B // next offset of B - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - - ALIGN_4 - -.L7_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_4 - -.L7_17: - - KERNEL16x6_SUB - - jnz .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 6 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x6_SUB - - jnz .L7_20_7 - ALIGN_4 - - -.L7_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - -.L7_27: - - KERNEL4x6_SUB - - jnz .L7_27 - ALIGN_4 - - -.L7_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 
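The row dimension is peeled the same way in every pass, as the .L6_20/.L7_20 cascades show: the main loop handles M/16 full-height tiles, then single bits of M select at most one 8-, 4-, 2- and 1-row tile each. A small illustrative helper makes the decomposition explicit (the assembly inlines each case with its own loop):

/* How many tiles of each height one pass runs for a given M. */
static void row_tiles(long m, long *n16, long *n8, long *n4, long *n2, long *n1)
{
    *n16 = m >> 4;        /* sarq $4, I ; decq I ; jg ...  */
    *n8  = (m >> 3) & 1;  /* testq $8, M                   */
    *n4  = (m >> 2) & 1;  /* testq $4, M                   */
    *n2  = (m >> 1) & 1;  /* testq $2, M                   */
    *n1  =  m       & 1;  /* testq $1, M                   */
}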
- - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x6_SUB - - jnz .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 4 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - - ALIGN_4 - -.L7_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x6_SUB - - jnz .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L7_60: - - decq J // j -- - jg .L6_01 // next 12 lines of N - - - - -/*******************************************************************************************/ -.L4_00: - - movq Nmod6, J - sarq $2, J // j = j / 4 - cmpq $ 0, J - je .L2_00 - ALIGN_4 - - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && 
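Column blocking in this (non-TRMM) half works the same way one level up: the prologue divides N by 12, each quotient step runs the two 6-wide passes above, and the remainder is peeled by the .L4, .L2 and .L1 passes. Note that despite their names, Ndiv6 and Nmod6 hold N/12 and N%12 here. A sketch of the split, with an illustrative helper name:

/* Column-block counts used by the driver loops of this configuration. */
static void column_blocks(long n, long *blk12, long *blk4, long *blk2, long *blk1)
{
    *blk12 = n / 12;        /* divq $12 -> Ndiv6, looped over .L6_01/.L7_01 */
    long r = n % 12;        /* Nmod6                                        */
    *blk4  = r >> 2;        /* .L4_00: sarq $2, J ; jg .L4_01               */
    *blk2  = (r >> 1) & 1;  /* .L2_00: andq $2, J                           */
    *blk1  =  r       & 1;  /* .L1_0 : andq $1, J                           */
}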
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = 
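The #if ladder just above, repeated before every tile in the TRMM configuration, picks how many k updates the tile actually needs: either the full K minus the running offset, or the offset plus the tile extent, depending on the triangular side and transposition. A sketch of that selection, where mr and nr stand for the 8 and 4 of the surrounding block:

/* TRMM per-tile trip count (the value the assembly stores in KKK). */
static long trmm_trip_count(long k, long kk, long mr, long nr,
                            int left, int transa)
{
    if ((left && !transa) || (!left && transa))
        return k - kk;              /* movq K, %rax ; subq KK, %rax   */
    return kk + (left ? mr          /* addq $8, %rax: values in A     */
                      : nr);        /* addq $4, %rax: values in BO    */
}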
BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_00: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - 
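At this point, as before every other unrolled loop, the k trip count is converted into element offsets for the two streams: the leaq that follows scales the copy in BI by the panel width (doubling it here for 2 columns, (,BI,4) in the 4-column pass, unchanged in the 1-column pass), and the salq scales %rax by the tile height. The offsets then advance AO and BO to the end of the block, setting up the negative-index loop described next. A sketch with an illustrative helper name; one aside: the comment "BI = BI * 1" in the 4x2 block further down looks like a typo, since leaq (BI,BI,1) doubles BI there as well.

/* Convert a trip count into element offsets and park the stream pointers
 * at the end of the block. */
static void advance_streams(const float **ao, const float **bo,
                            long kiters, long mr, long nr)
{
    long a_off = kiters * mr;   /* salq $4 (x16), $3 (x8), $2 (x4), $1 (x2)  */
    long b_off = kiters * nr;   /* leaq (,BI,4), leaq (BI,BI,1), or identity */
    *ao += a_off;               /* leaq (AO, %rax, SIZE), AO                 */
    *bo += b_off;               /* leaq (BO, BI,  SIZE), BO                  */
}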
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - 
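The negq pair right here (BI above, %rax just below) completes that setup: with AO and BO already past the block, the negated counts serve as indices, every load addresses backwards from the end, and the addq inside each _SUB both moves the index and sets the flags that the embedded je/jl tests. A minimal C rendering of the idiom (function name illustrative):

/* Negative-index loop: pointers sit at the end of the block, the index
 * runs from -count up to 0, and the increment doubles as the exit test. */
static float dot_negidx(const float *a, const float *b, long k)
{
    const float *ae = a + k;     /* leaq (AO, %rax, SIZE), AO              */
    const float *be = b + k;     /* leaq (BO, BI,  SIZE), BO               */
    long ia = -k, ib = -k;       /* negq %rax ; negq BI                    */
    float acc = 0.0f;

    while (ia != 0) {            /* je/jl on the flags left by the addq    */
        acc += ae[ia] * be[ib];  /* loads like -16 * SIZE(AO, %rax, SIZE)  */
        ia += 1;                 /* the asm adds the tile extent instead,  */
        ib += 1;                 /* since %rax and BI count raw elements   */
    }
    return acc;
}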
negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // 
Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - 
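The K loops in this pass follow the shape used throughout the file: the trip count is rounded down to a multiple of 8 (andq $-8) and negated, the body holds sixteen _SUB copies with a je after each group of eight, and the k & 7 leftovers run one at a time in the short tail loop (.L1_17 here). Roughly, in C, where the step callback stands in for one KERNEL16x1_SUB expansion:

/* Shape of the unrolled K loops (e.g. .L1_12 / .L1_16 / .L1_17). */
static void k_loop(long k, void (*step)(void))
{
    long i = -(k & ~7L);              /* andq $-8, %rax ; negq %rax        */
    while (i != 0) {                  /* je .L1_16 after each group of 8   */
        for (int u = 0; u < 8; u++) {
            step();                   /* KERNEL16x1_SUB                    */
            i += 1;                   /* addq inside the macro             */
        }
    }

    long tail = k & 7;                /* andq $7, %rax                     */
    for (long t = -tail; t != 0; t++) /* negated again, jl .L1_17          */
        step();
}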
KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if 
!defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax 
-#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#else - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq 
%rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq %rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of 
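The TRMM build of the kernel, entered through this second PROLOGUE, differs from the GEMM half mainly in its setup: N is blocked by 4 rather than 12 (so Ndiv6 and Nmod6 hold N/4 and N%4 here), and the caller's OFFSET seeds the running KK, negated when the triangular matrix is on the right. A sketch of that initialisation, with an illustrative helper name:

/* TRMM-side setup done in this prologue. */
static void trmm_setup(long n, long offset, int left,
                       long *nblk4, long *nrem, long *kk)
{
    *nblk4 = n / 4;          /* divq $4 -> Ndiv6 (really N/4 in this half)  */
    *nrem  = n % 4;          /* Nmod6 (really N%4), peeled by .L2_0 / .L1_0 */
    *kk    = offset;         /* vmovsd %xmm12, OFFSET and KK                */
    if (!left)
        *kk = -*kk;          /* negq KK when the triangle is in B           */
}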
values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq 
(BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 
4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - 
addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax 
// number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, 
%rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB 
- - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#endif - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + 
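(Illustrative note, not part of the patch.) The VFMADD231PS_ / VFMADD231SS_ wrappers defined above only hide the encoding difference between Bulldozer's four-operand FMA4 form (vfmaddps y0,y1,y2,y0, i.e. y0 = y1*y2 + y0) and the three-operand FMA3 form (vfmadd231ps y1,y2,y0 in AT&T order) used on later cores; both compute acc += a*b. A minimal C-intrinsics sketch of that accumulate step, assuming FMA3 hardware; the function and parameter names here are hypothetical and do not appear in the patch:

    /* Sketch only: the acc += a*b step that VFMADD231PS_( acc, b, a ) performs,
       with a loaded from 8 consecutive A values and b broadcast from one B value.
       Compile with e.g. -mavx -mfma. */
    #include <immintrin.h>

    static inline __m256 fma_accumulate(__m256 acc, const float *a_ptr, const float *b_ptr)
    {
        __m256 a = _mm256_loadu_ps(a_ptr);      /* vmovups: eight packed floats from A  */
        __m256 b = _mm256_broadcast_ss(b_ptr);  /* vbroadcastss: one B value to 8 lanes */
        return _mm256_fmadd_ps(a, b, acc);      /* acc = a * b + acc                    */
    }

On BULLDOZER the same step is emitted as a single FMA4 instruction; the arithmetic result is identical.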
+/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(AO), %ymm1 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm13, %ymm13 + vmulps %ymm0 , %ymm14, %ymm14 + vmulps %ymm0 , %ymm15, %ymm15 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 + + vaddps (CO2), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2), %ymm11,%ymm11 + + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 + + vaddps (CO2, LDC,2), %ymm14,%ymm14 + vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) + + vmovups %ymm10, (CO2) + vmovups %ymm11, 8 * SIZE(CO2) + + vmovups %ymm12, (CO2, LDC) + vmovups %ymm13, 8 * SIZE(CO2, LDC) + + vmovups %ymm14, (CO2, LDC,2) + vmovups %ymm15, 8 * SIZE(CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, 
LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss 
%xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + 
VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + 
VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( 
%xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * 
SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + 
+ andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + 
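+	// 1x6 micro-kernel body: the K & ~7 part is covered by two groups of
+	// eight KERNEL1x6_SUB calls per pass; the K % 8 remainder is handled
+	// by the single-step loop at .L6_47 below.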
prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + 
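+	// The M & 8 tail of this second 6-column half of the N panel is done;
+	// the M & 4, M & 2 and M & 1 tails follow (.L7_21, .L7_31, .L7_41),
+	// mirroring the corresponding .L6_* blocks above.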
+/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: 
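+	// The 4-wide B panel is now packed contiguously in BUFFER1:
+	// .L4_01a copies four K-steps (16 values) at a time and .L4_02c
+	// copies the K % 4 leftovers, four values per K step.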
+ + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + 
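+	// End of the 16-row M loop for this 4-column panel; the M % 16
+	// leftovers (8, 4, 2 and 1 rows) are handled by the blocks below.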
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, 
SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 
2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + 
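+	// 4x2 micro-kernel body: two groups of eight KERNEL4x2_SUB calls per
+	// pass over the K & ~7 part; the K % 8 tail is handled at .L2_27.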
KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * 
SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + 
KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + 
movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, 
%rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + 
andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of 
values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + 
+.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + 
movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + 
movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h index 36b7aa1a3..970d63578 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h @@ -1,226 +1,226 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ -/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ -/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ - -#define init_m8n4(c1,c2,c3,c4)\ - "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ - "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" -#define INIT_m8n4 init_m8n4(4,5,6,7) -#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) -#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) - -#define init_m4n4(c1,c2,c3,c4)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ - "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" -#define INIT_m4n4 init_m4n4(4,5,6,7) -#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) -#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) - -#define init_m2n4(c1,c2)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" -#define INIT_m2n4 init_m2n4(4,5) -#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) -#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) - -#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" -#define INIT_m1n4 init_m1n4(4) -#define INIT_m1n8 INIT_m1n4 init_m1n4(5) -#define INIT_m1n12 INIT_m1n8 init_m1n4(6) - -#define GEMM_KERNEL_k1m8n4 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ - "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ - "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" -#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" -#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" - -#define GEMM_KERNEL_k1m4n4 \ - "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ - "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ - "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ - "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ - "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" -#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ - 
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" - -#define GEMM_KERNEL_k1m2n4 \ - "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ - "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" -#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ - "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ - "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" - -#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" -#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" -#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" - -#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" - -#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ - "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" - -#define GEMM_SUM_REORDER_2x4(c1,c2)\ - "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ - "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ - "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ - -#define GEMM_SUM_REORDER_1x4(c1)\ - "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" - -#define SOLVE_le_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovsldup %%ymm"#c1",%%ymm1;" - -#define SOLVE_le_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" - -#define SOLVE_leri_m4n2(b_off,c1,...) 
SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_ri_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovshdup %%ymm"#c1",%%ymm1;" - -#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" - -#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) 
SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ - "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ - "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ - "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m4n2(c1,a_off)\ - "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ - "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m1n4(c1,a_off)\ - "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ + "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ + "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; 
vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" + +#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ + "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2)\ + "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ + "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; 
vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define SOLVE_le_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_le_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_ri_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_ltor_m1n4(b_off,c1,...) 
SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m4n2(c1,a_off)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m1n4(c1,a_off)\ + "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" diff --git a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S index 94e2f6117..6c8b4c872 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S +++ b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S @@ -1,1404 +1,1404 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, 
SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq 
$16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K 
= K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - 
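For reference, the .L2_19 epilogue above reduces the split accumulators with vshufpd/vaddsubpd: xmm8/xmm10/xmm12/xmm14 carry the partial products against the real part of B, xmm9/xmm11/xmm13/xmm15 those against the imaginary part, and the "swap high and low 64 bytes" comments refer to the two 64-bit halves of each xmm register. The scalar C sketch below is one reading of that sequence for the plain (NN-family) setting of VFMADD_R/VFMADD_I; the conjugating variants only change signs inside the K loop. The function name and argument layout are invented for illustration and are not part of this patch.

#include <complex.h>

/* Scalar model of one 2-element xmm pair.  Assumed layout, matching the
   VFMADD_R/VFMADD_I accumulation above:
     acc_br = { re(a)*re(b), im(a)*re(b) }   (xmm8-style register)
     acc_bi = { re(a)*im(b), im(a)*im(b) }   (xmm9-style register)          */
static double complex
combine_and_scale(const double acc_br[2], const double acc_bi[2],
                  double complex alpha, double complex c_old)
{
    /* vshufpd $0x01 swaps the two 64-bit halves of the imaginary-side
       accumulator; vaddsubpd then subtracts in the low lane and adds in
       the high lane, yielding the complex dot product t = sum(a*b).        */
    double t_re = acc_br[0] - acc_bi[1];
    double t_im = acc_br[1] + acc_bi[0];

    /* The second vshufpd/vmulpd/vaddsubpd pass applies the complex alpha:
       re = t_re*alpha_re - t_im*alpha_im,
       im = t_im*alpha_re + t_re*alpha_im.                                  */
    double r_re = t_re * creal(alpha) - t_im * cimag(alpha);
    double r_im = t_im * creal(alpha) + t_re * cimag(alpha);

    /* Non-TRMM path: vaddpd (CO1) accumulates into C before the store.     */
    return c_old + (r_re + r_im * I);
}

The same swap-and-addsub idiom thus appears twice per register: once to fold the two partial-product accumulators into one complex value, and once to apply alpha.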
-#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd 
(CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq 
BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp 
.L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I 
%xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), 
%xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define 
KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + 
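For orientation, the labels above and below (.L2_01/.L2_02b/.L2_10/.L2_11/.L2_40 and the .L1_* tail) lay out a conventional blocked driver: the outer loop walks N two columns at a time, packs those columns of B into the on-stack BUFFER1, runs the 2x2 micro-kernel over M, and handles the M%2 and N%2 edges with narrower kernels. The C outline below is only a reading aid under that interpretation; the function and variable names are invented here and are not OpenBLAS symbols.

/* Hypothetical outline of the driver structure; names are illustrative only. */
void zgemm_2x2_driver_outline(long m, long n, long k,
                              const double *A, const double *B,
                              double *C, long ldc)
{
    (void)A; (void)B; (void)k;           /* touched only by the real kernels */

    for (long j = 0; j < n / 2; j++) {   /* .L2_01: two columns per pass     */
        /* .L2_02b: pack the next two columns of B into BUFFER1              */
        for (long i = 0; i < m / 2; i++) {
            /* .L2_11: 2x2 micro-kernel, K unrolled by 8 (.L2_12),
               K % 8 remainder handled in .L2_17                             */
        }
        if (m & 1) {
            /* .L2_41: 1x2 edge kernel for the last row                      */
        }
        C += 2 * ldc;                    /* leaq (C, LDC, 2), C              */
    }
    if (n & 1) {                         /* Nmod6 = N % 2, .L1_* path        */
        for (long i = 0; i < m / 2; i++) {
            /* .L1_11: 2x1 kernel */
        }
        if (m & 1) {
            /* .L1_41: 1x1 kernel */
        }
    }
}

Despite what the names suggest, Ndiv6 and Nmod6 here simply hold N/2 and N%2, as computed by the divq by $2 in the prologue above.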
+.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + 
vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + 
vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef 
LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S index 848b6f237..bffe5439d 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S +++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S @@ -1,1429 +1,1429 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/30 Saar -* -* Parameter: -* UNROLL_M 2 -* UNROLL_N 2 -* ZGEMM_P 384 -* ZGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) -* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) -* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) -* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) -* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) -* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) -* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) -* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) -* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - 
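The register #defines and the WINDOWS_ABI argument offsets in the block above pin the kernel's parameters to fixed registers and stack slots. For orientation only, a hedged sketch of the C-level entry point this assembly implements, assuming the usual OpenBLAS gemm/trmm kernel interface (the symbol name, BLASLONG width and FLOAT type come from common.h and the build configuration, not from this file):

    /* Sketch only: the assembly above is the body of a function shaped like this. */
    typedef long   BLASLONG;   /* assumption: LP64 build */
    typedef double FLOAT;      /* zgemm kernel: double-precision real/imaginary parts */

    int zgemm_kernel_2x2(      /* the real symbol is produced by the CNAME macro */
            BLASLONG m, BLASLONG n, BLASLONG k,
            FLOAT alpha_r, FLOAT alpha_i,
            FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc
    #ifdef TRMMKERNEL
            , BLASLONG offset  /* triangular-update offset, kept in OFFSET/KK above */
    #endif
            );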
-#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I 
%xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI 
- movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - 
KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq 
(,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - 
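In the .L2_19 and .L2_49 epilogues above, each output element is carried as two accumulators: one built from the real part of B (xmm8/xmm10/xmm12/xmm14) and one from the imaginary part (xmm9/xmm11/xmm13/xmm15). vshufpd $0x01 exchanges the two 64-bit halves (the real and imaginary doubles) of an xmm register, and the following vaddsubpd folds the two accumulators into the complex product, with the sign pattern chosen by the conjugation #if block; a second shuffle, the two vmulpd by ALPHA_R/ALPHA_I and a final vaddsubpd then apply the complex alpha before the optional load/add of C. A minimal scalar C sketch of that arithmetic for the non-conjugated branch; the names and test values are invented for illustration:

    #include <stdio.h>

    typedef struct { double re, im; } zdouble;

    /* acc_r = (sum a_re*b_re, sum a_im*b_re), acc_i = (sum a_re*b_im, sum a_im*b_im):
     * the lane layout the kernel accumulates in xmm8 / xmm9. */
    static zdouble zgemm_epilogue_nn(const double acc_r[2], const double acc_i[2],
                                     double alpha_r, double alpha_i)
    {
        /* vshufpd $0x01 on acc_i followed by vaddsubpd: the complex product a*b */
        double prod_re = acc_r[0] - acc_i[1];   /* re*re - im*im */
        double prod_im = acc_r[1] + acc_i[0];   /* im*re + re*im */

        /* second vshufpd + the two vmulpd + vaddsubpd: multiply by alpha */
        zdouble out;
        out.re = prod_re * alpha_r - prod_im * alpha_i;
        out.im = prod_im * alpha_r + prod_re * alpha_i;
        return out;
    }

    int main(void)
    {
        /* a = 1+2i, b = 3+4i  =>  a*b = -5+10i; alpha = 1+0i leaves it unchanged */
        double acc_r[2] = { 1.0 * 3.0, 2.0 * 3.0 };
        double acc_i[2] = { 1.0 * 4.0, 2.0 * 4.0 };
        zdouble c = zgemm_epilogue_nn(acc_r, acc_i, 1.0, 0.0);
        printf("%g %g\n", c.re, c.im);          /* prints: -5 10 */
        return 0;
    }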
-.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - 
vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, 
%xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/30 Saar +* +* Parameter: +* UNROLL_M 2 +* UNROLL_N 2 +* ZGEMM_P 384 +* ZGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) +* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) +* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) +* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) +* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) +* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) +* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) +* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) +* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 
-5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), 
%xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + 
vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax 
+#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO 
// first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff 
--git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index f91bfa89b..29729b101 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -1,3881 +1,3881 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/******************************************************************************** -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* ZGEMM_DEFAULT_UNROLL_N 2 -* ZGEMM_DEFAULT_UNROLL_M 4 -* ZGEMM_DEFAULT_P 256 -* ZGEMM_DEFAULT_Q 128 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/28 Saar -* Performance at 4608x4608x4608: -* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) -* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) -* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) -* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) -* -********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(BULLDOZER) - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#endif - -#else - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#endif - -#endif - -#define A_PR1 512 -#define B_PR1 512 - - - -/***************************************************************************************************/ - -.macro KERNEL4x3_SUB - vmovups (AO), %ymm0 - vmovups 4 * SIZE(AO), %ymm1 - prefetcht0 A_PR1(AO) - - vbroadcastsd (BO), %ymm2 - vbroadcastsd 1 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) - - vbroadcastsd 2 * SIZE(BO), %ymm2 - vbroadcastsd 3 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) - - vbroadcastsd 4 * SIZE(BO), %ymm2 - vbroadcastsd 5 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 - vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - vaddsubpd %ymm4 , %ymm5 ,%ymm5 - vaddsubpd %ymm6 , %ymm7 ,%ymm7 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - vmovapd %ymm5 , %ymm4 - vmovapd %ymm7 , %ymm6 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - vmulpd %ymm4 , %ymm0, %ymm4 - vmulpd %ymm6 , %ymm0, %ymm6 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - vmulpd %ymm5 , %ymm1, %ymm5 - vmulpd %ymm7 , %ymm1, %ymm7 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - - vaddpd (CO1, LDC,2), %ymm4 , %ymm4 - vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - 
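
In the ymm variants of these macros each 256-bit register carries two adjacent complex doubles, one per 128-bit lane. vshufpd with immediate 0x05 (binary 0101) swaps the real/imaginary pair independently inside each lane, and vaddsubpd subtracts in the even slots and adds in the odd slots, so the scalar combine sketched earlier runs on two output elements at once. A small model of the lane-wise behaviour (array names are illustrative):

/* One ymm register modelled as double v[4] = { re0, im0, re1, im1 }. */
static void swap_within_lanes(const double v[4], double out[4])
{
    /* vshufpd $0x05: swap the pair inside each 128-bit lane */
    out[0] = v[1];  out[1] = v[0];
    out[2] = v[3];  out[3] = v[2];
}

static void addsub_pd(const double a[4], const double b[4], double out[4])
{
    /* vaddsubpd: subtract in even positions, add in odd positions */
    out[0] = a[0] - b[0];  out[1] = a[1] + b[1];
    out[2] = a[2] - b[2];  out[3] = a[3] + b[3];
}
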
- vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - vmovups %ymm4 , (CO1, LDC, 2) - vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - - - -/***************************************************************************************************/ - -.macro KERNEL2x3_SUB - vmovups (AO), %xmm0 - vmovups 2 * SIZE(AO), %xmm1 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE2x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - vaddsubpd %xmm4, %xmm5 ,%xmm5 - vaddsubpd %xmm6, %xmm7 ,%xmm7 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - vmovapd %xmm5, %xmm4 - vmovapd %xmm7, %xmm6 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - vmulpd %xmm4 , %xmm0, %xmm4 - vmulpd %xmm6 , %xmm0, %xmm6 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - vmulpd %xmm5 , %xmm1, %xmm5 - vmulpd %xmm7 , %xmm1, %xmm7 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - - vaddpd (CO1, LDC,2), %xmm4 , %xmm4 - vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * 
SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - - vmovups %xmm4 , (CO1, LDC,2) - vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) - -.endm - - -/************************************************************************************************/ - - -.macro KERNEL1x3_SUB - vmovups (AO), %xmm0 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE1x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - vaddsubpd %xmm4, %xmm5, %xmm5 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm5, %xmm4 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm4 , %xmm0, %xmm4 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm5 , %xmm1, %xmm5 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - -#ifndef TRMMKERNEL - - vaddpd (CO1) , %xmm8 , %xmm8 - vaddpd (CO1, LDC) , %xmm10, %xmm10 - vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm4 , (CO1, LDC,2) - -.endm - - - - -/***************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - - vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) - - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, 
%xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.macro KERNEL1x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 - vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13,%ymm12 , %ymm12 - - vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - -#else - vaddsubpd %ymm8, %ymm9 , %ymm9 - vaddsubpd %ymm12,%ymm13, %ymm13 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm13, %ymm12 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - 
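
Note that none of the KERNEL*_SUB macros shuffle inside the hot loop: every k step broadcasts the real and the imaginary part of a B element separately (vmovddup / vbroadcastsd) and accumulates the raw products into two independent register sets, deferring all real/imaginary recombination to the SAVE* stage. A scalar sketch of one such update for the plain case (names are illustrative; the conjugated cases flip the sign of one or both accumulations, which is exactly what the fmadd/fnmadd selection in VFMADDPD_R / VFMADDPD_I encodes):

/* One k step of a complex micro-kernel for a single output element.
   a    = one complex element of the packed A panel (re, im)
   b    = one complex element of the packed B panel (re, im)
   accR = accumulator fed by the broadcast real part of b
   accI = accumulator fed by the broadcast imaginary part of b */
static void kernel_step(const double a[2], const double b[2],
                        double accR[2], double accI[2])
{
    accR[0] += a[0] * b[0];   /* a_re * b_re */
    accR[1] += a[1] * b[0];   /* a_im * b_re */
    accI[0] += a[0] * b[1];   /* a_re * b_im */
    accI[1] += a[1] * b[1];   /* a_im * b_im */
}
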
vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm13, %ymm1, %ymm13 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13, %ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 ,4 * SIZE(CO1) - -.endm - - - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - - -/************************************************************************************************/ - - - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 
128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -/************************************************************************************************/ -.L6_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L2_00_0 - ALIGN_4 - - - -.L6_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq BO2, B // next offset of B - movq K, %rax - ALIGN_4 - -.L6_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups (BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_00_02b - -.L6_00_02c: - - - -.L6_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L6_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L6_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_4_16 - ALIGN_4 - -.L6_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - jmp .L6_4_12 - ALIGN_4 - -.L6_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_4_19 - ALIGN_4 - -.L6_4_17: - - KERNEL4x3_SUB - - jnz .L6_4_17 - ALIGN_4 - - -.L6_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L6_2_10: - testq $ 2, M - jz .L6_2_40 // to next 2 lines of N - -.L6_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_16 - ALIGN_4 - -.L6_2_12: - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - jmp .L6_2_12 - 
ALIGN_4 - -.L6_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_19 - ALIGN_4 - -.L6_2_17: - - KERNEL2x3_SUB - - jnz .L6_2_17 - ALIGN_4 - - -.L6_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_2_40: - testq $ 1, M - jz .L6_2_60 // to next 2 lines of N - - ALIGN_4 - -.L6_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_46 - - ALIGN_4 - -.L6_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - jmp .L6_2_42 - ALIGN_4 - -.L6_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_49 - - ALIGN_4 - -.L6_2_47: - - KERNEL1x3_SUB - - jnz .L6_2_47 - ALIGN_4 - - -.L6_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L6_2_41 - ALIGN_4 - - - - -.L6_2_60: - - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.L7_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq K, %rax - ALIGN_4 - -.L7_00_02b: - - vmovups 2 * SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_00_02b - -.L7_00_02c: - - movq BO2, B // next offset of B - - -.L7_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L7_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L7_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_4_16 - ALIGN_4 - -.L7_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - jmp .L7_4_12 - ALIGN_4 - -.L7_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_4_19 - - ALIGN_4 - -.L7_4_17: - - KERNEL4x3_SUB - - jnz .L7_4_17 - ALIGN_4 - - -.L7_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L7_2_10: - testq $ 2, M - jz .L7_2_40 // to next 2 lines of N - -.L7_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_16 - ALIGN_4 - -.L7_2_12: - - 
KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - jmp .L7_2_12 - ALIGN_4 - -.L7_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_19 - - ALIGN_4 - -.L7_2_17: - - KERNEL2x3_SUB - - jnz .L7_2_17 - ALIGN_4 - - -.L7_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_2_40: - testq $ 1, M - jz .L7_2_60 // to next 2 lines of N - - ALIGN_4 - -.L7_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_46 - - ALIGN_4 - -.L7_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - jmp .L7_2_42 - ALIGN_4 - -.L7_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_49 - ALIGN_4 - -.L7_2_47: - - KERNEL1x3_SUB - - jnz .L7_2_47 - ALIGN_4 - - -.L7_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L7_2_41 - ALIGN_4 - - - - -.L7_2_60: - - decq J // j -- - jg .L6_00_01 // next 6 lines of N - -/************************************************************************************************/ - - - -/************************************************************************************************/ -.L2_00_0: - - movq Nmod6, J - sarq $1, J // j = j / 2 - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = 
K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - 
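
The K loops in this kernel all use the same counting scheme: K is rounded down to a multiple of 8 (andq $-8), the A and B pointers are advanced past that many iterations up front, and the loop index is negated so the unrolled body can simply test the index against zero (je/jl) instead of keeping a separate counter; the K % 8 leftover runs through the *_SUB tail loop with the same negative-index trick. A C-level sketch of the control flow (per-tile strides omitted; step() stands in for one kernel macro invocation):

/* Stands in for one KERNEL*_SUB; the real code also advances the
   A/B indices by the tile strides. */
static void step(long *i) { *i += 1; }

static void k_loop(long K)
{
    long k8 = K & ~7L;                    /* andq $-8, %rax           */
    long i  = -k8;                        /* negq: index runs up to 0 */
    if (k8) {
        do {
            for (int u = 0; u < 8; u++)   /* eight unrolled steps     */
                step(&i);
        } while (i < 0);                  /* je / jl on the index     */
    }
    i = -(K & 7L);                        /* andq $7: remainder count */
    while (i < 0)
        step(&i);                         /* KERNEL*_SUB tail loop    */
}
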
- andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - 
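
Under TRMMKERNEL the same loops are trimmed per tile rather than always running the full K: KKK holds the effective iteration count for the current tile, the packed A/B pointers are advanced past the KK iterations the tile must skip, and KK itself grows with the M loop (LEFT) or with each block of N columns (!LEFT). A rough model for one m_tile x n_tile block, showing only the (LEFT && !TRANSA) || (!LEFT && TRANSA) flavour; in the other combination the tile runs KK plus the tile width iterations and the panel pointers are advanced by the unused K - KKK iterations after the tile instead (function and parameter names are illustrative):

/* Rough TRMM bookkeeping for one m_tile x n_tile block
   (2 doubles per complex value in the packed panels). */
static long trmm_tile(long K, long *KK, int LEFT,
                      long m_tile, long n_tile,
                      const double **ao, const double **bo)
{
    long KKK = K - *KK;              /* effective K for this tile        */

    *ao += *KK * 2 * m_tile;         /* skip KK iterations of packed A   */
    *bo += *KK * 2 * n_tile;         /* ... and of packed B              */

    /* ... run the K loop for KKK iterations ... */

    if (LEFT)
        *KK += m_tile;               /* with !LEFT, KK grows by n_tile
                                        once per block of N instead      */
    return KKK;
}
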
KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, 
KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - 
movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************************ - TRMM Kernel -************************************************************************************************/ - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - 
addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO 
// first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax 
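
The pointer setup that ends just above (scale %rax and BI by the element counts, advance AO and BO to the end of the panel, then negq both) is the negative-index idiom used by the 2- and 1-column K-loops: the KERNEL*_SUB macros address their operands at a negative offset from the advanced pointers, and the addq that each macro performs on %rax drives the zero flag tested by the je exits inside the unrolled loop. A rough C analogue, with a, b, k, mr and nr as placeholder names rather than kernel symbols:

    /* Rough sketch of the counting-up-to-zero addressing used by the
     * unrolled K-loops; names are illustrative only. */
    static double k_loop_sketch(const double *a, const double *b,
                                long k, long mr, long nr)
    {
        const double *a_end = a + k * mr;  /* leaq (AO, %rax, SIZE), AO   */
        const double *b_end = b + k * nr;  /* leaq (BO, BI,   SIZE), BO   */
        long ia = -k * mr;                 /* negq %rax                   */
        long ib = -k * nr;                 /* negq BI                     */
        double acc = 0.0;

        while (ia != 0) {                  /* je fires once ia reaches 0  */
            acc += a_end[ia] * b_end[ib];  /* stands in for KERNELmxn_SUB */
            ia += mr;                      /* addq inside the macro (ZF)  */
            ib += nr;
        }
        return acc;
    }
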
- ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL 
- movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - -#endif - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/******************************************************************************** +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* ZGEMM_DEFAULT_UNROLL_N 2 +* ZGEMM_DEFAULT_UNROLL_M 4 +* ZGEMM_DEFAULT_P 256 +* ZGEMM_DEFAULT_Q 128 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/28 Saar +* Performance at 4608x4608x4608: +* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) +* +********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#endif + +#endif + +#define A_PR1 512 +#define B_PR1 512 + + + +/***************************************************************************************************/ + +.macro KERNEL4x3_SUB + vmovups (AO), %ymm0 + vmovups 4 * SIZE(AO), %ymm1 + prefetcht0 A_PR1(AO) + + vbroadcastsd (BO), %ymm2 + vbroadcastsd 1 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) + + vbroadcastsd 2 * SIZE(BO), %ymm2 + vbroadcastsd 3 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) + + vbroadcastsd 4 * SIZE(BO), %ymm2 + vbroadcastsd 5 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 + vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + vaddsubpd %ymm4 , %ymm5 ,%ymm5 + vaddsubpd %ymm6 , %ymm7 ,%ymm7 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + vmovapd %ymm5 , %ymm4 + vmovapd %ymm7 , %ymm6 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + vmulpd %ymm4 , %ymm0, %ymm4 + vmulpd %ymm6 , %ymm0, %ymm6 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + vmulpd %ymm5 , %ymm1, %ymm5 + vmulpd %ymm7 , %ymm1, %ymm7 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + + vaddpd (CO1, LDC,2), %ymm4 , %ymm4 + vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + 
+ vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + vmovups %ymm4 , (CO1, LDC, 2) + vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + + + +/***************************************************************************************************/ + +.macro KERNEL2x3_SUB + vmovups (AO), %xmm0 + vmovups 2 * SIZE(AO), %xmm1 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE2x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + vaddsubpd %xmm4, %xmm5 ,%xmm5 + vaddsubpd %xmm6, %xmm7 ,%xmm7 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + vmovapd %xmm5, %xmm4 + vmovapd %xmm7, %xmm6 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + vmulpd %xmm4 , %xmm0, %xmm4 + vmulpd %xmm6 , %xmm0, %xmm6 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + vmulpd %xmm5 , %xmm1, %xmm5 + vmulpd %xmm7 , %xmm1, %xmm7 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + + vaddpd (CO1, LDC,2), %xmm4 , %xmm4 + vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * 
SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + + vmovups %xmm4 , (CO1, LDC,2) + vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) + +.endm + + +/************************************************************************************************/ + + +.macro KERNEL1x3_SUB + vmovups (AO), %xmm0 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE1x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + vaddsubpd %xmm4, %xmm5, %xmm5 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm5 , %xmm1, %xmm5 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + +#ifndef TRMMKERNEL + + vaddpd (CO1) , %xmm8 , %xmm8 + vaddpd (CO1, LDC) , %xmm10, %xmm10 + vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm4 , (CO1, LDC,2) + +.endm + + + + +/***************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + + vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) + + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, 
%xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.macro KERNEL1x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 + vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13,%ymm12 , %ymm12 + + vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + +#else + vaddsubpd %ymm8, %ymm9 , %ymm9 + vaddsubpd %ymm12,%ymm13, %ymm13 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm13, %ymm12 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + 
vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm13, %ymm1, %ymm13 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13, %ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 ,4 * SIZE(CO1) + +.endm + + + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + + +/************************************************************************************************/ + + + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 
128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +/************************************************************************************************/ +.L6_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00_0 + ALIGN_4 + + + +.L6_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq BO2, B // next offset of B + movq K, %rax + ALIGN_4 + +.L6_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups (BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_00_02b + +.L6_00_02c: + + + +.L6_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L6_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + ALIGN_4 + +.L6_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L6_2_10: + testq $ 2, M + jz .L6_2_40 // to next 2 lines of N + +.L6_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_16 + ALIGN_4 + +.L6_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + jmp .L6_2_12 + 
ALIGN_4 + +.L6_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_19 + ALIGN_4 + +.L6_2_17: + + KERNEL2x3_SUB + + jnz .L6_2_17 + ALIGN_4 + + +.L6_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_2_40: + testq $ 1, M + jz .L6_2_60 // to next 2 lines of N + + ALIGN_4 + +.L6_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_46 + + ALIGN_4 + +.L6_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + jmp .L6_2_42 + ALIGN_4 + +.L6_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_49 + + ALIGN_4 + +.L6_2_47: + + KERNEL1x3_SUB + + jnz .L6_2_47 + ALIGN_4 + + +.L6_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_2_41 + ALIGN_4 + + + + +.L6_2_60: + + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.L7_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq K, %rax + ALIGN_4 + +.L7_00_02b: + + vmovups 2 * SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_00_02b + +.L7_00_02c: + + movq BO2, B // next offset of B + + +.L7_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L7_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + ALIGN_4 + +.L7_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L7_2_10: + testq $ 2, M + jz .L7_2_40 // to next 2 lines of N + +.L7_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_16 + ALIGN_4 + +.L7_2_12: + + 
KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + jmp .L7_2_12 + ALIGN_4 + +.L7_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_19 + + ALIGN_4 + +.L7_2_17: + + KERNEL2x3_SUB + + jnz .L7_2_17 + ALIGN_4 + + +.L7_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_2_40: + testq $ 1, M + jz .L7_2_60 // to next 2 lines of N + + ALIGN_4 + +.L7_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_46 + + ALIGN_4 + +.L7_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + jmp .L7_2_42 + ALIGN_4 + +.L7_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_49 + ALIGN_4 + +.L7_2_47: + + KERNEL1x3_SUB + + jnz .L7_2_47 + ALIGN_4 + + +.L7_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L7_2_41 + ALIGN_4 + + + + +.L7_2_60: + + decq J // j -- + jg .L6_00_01 // next 6 lines of N + +/************************************************************************************************/ + + + +/************************************************************************************************/ +.L2_00_0: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = 
K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + 
+ andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + 
KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, 
KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + 
movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, 
BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************************ + TRMM Kernel +************************************************************************************************/ + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + 
addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO 
// first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq 
BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
+ ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL 
+ movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + +#endif + + diff --git a/relapack/src/CMakeLists.txt b/relapack/src/CMakeLists.txt index 78fb1431f..b92089418 100644 --- a/relapack/src/CMakeLists.txt +++ b/relapack/src/CMakeLists.txt @@ -1,86 +1,86 @@ -include_directories(${PROJECT_SOURCE_DIR}) -include_directories(${PROJECT_BINARY_DIR}) -include_directories(${PROJECT_SOURCE_DIR}/relapack) - -set(RELAFILES -clauum.c -ctrsyl_rec2.c -dsytrf.c -spbtrf.c -strsyl_rec2.c -zhetrf_rook_rec2.c -ztrsyl.c -cgbtrf.c -cpbtrf.c -ctrtri.c -dsytrf_rec2.c -spotrf.c -strtri.c -zlauum.c -ztrsyl_rec2.c -cgemmt.c -cpotrf.c -dgbtrf.c -dsytrf_rook.c -lapack_wrappers.c -ssygst.c -zgbtrf.c -zpbtrf.c -ztrtri.c -cgetrf.c -csytrf.c -dgemmt.c -dsytrf_rook_rec2.c -ssytrf.c -zgemmt.c -zpotrf.c -chegst.c -csytrf_rec2.c -dgetrf.c -dtgsyl.c -ssytrf_rec2.c -zgetrf.c -zsytrf.c -chetrf.c -csytrf_rook.c -dlauum.c -dtrsyl.c -sgbtrf.c -ssytrf_rook.c -zhegst.c -zsytrf_rec2.c -chetrf_rec2.c -csytrf_rook_rec2.c -dpbtrf.c -dtrsyl_rec2.c -sgemmt.c -ssytrf_rook_rec2.c -zhetrf.c -zsytrf_rook.c -chetrf_rook.c -ctgsyl.c -dpotrf.c -dtrtri.c -sgetrf.c -stgsyl.c -zhetrf_rec2.c -zsytrf_rook_rec2.c -chetrf_rook_rec2.c -ctrsyl.c -dsygst.c -f2c.c -slauum.c -strsyl.c -zhetrf_rook.c -ztgsyl.c -) - - - -# add relapack folder to the sources -set(RELA_SOURCES "") -foreach (RELA_FILE ${RELAFILES}) - list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") -endforeach () -add_library(relapack_src OBJECT ${RELA_SOURCES}) -set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) +include_directories(${PROJECT_SOURCE_DIR}/relapack) + +set(RELAFILES +clauum.c +ctrsyl_rec2.c +dsytrf.c +spbtrf.c +strsyl_rec2.c +zhetrf_rook_rec2.c +ztrsyl.c +cgbtrf.c +cpbtrf.c +ctrtri.c +dsytrf_rec2.c +spotrf.c +strtri.c +zlauum.c +ztrsyl_rec2.c +cgemmt.c +cpotrf.c +dgbtrf.c +dsytrf_rook.c +lapack_wrappers.c +ssygst.c +zgbtrf.c +zpbtrf.c +ztrtri.c +cgetrf.c +csytrf.c +dgemmt.c +dsytrf_rook_rec2.c +ssytrf.c +zgemmt.c 
+zpotrf.c +chegst.c +csytrf_rec2.c +dgetrf.c +dtgsyl.c +ssytrf_rec2.c +zgetrf.c +zsytrf.c +chetrf.c +csytrf_rook.c +dlauum.c +dtrsyl.c +sgbtrf.c +ssytrf_rook.c +zhegst.c +zsytrf_rec2.c +chetrf_rec2.c +csytrf_rook_rec2.c +dpbtrf.c +dtrsyl_rec2.c +sgemmt.c +ssytrf_rook_rec2.c +zhetrf.c +zsytrf_rook.c +chetrf_rook.c +ctgsyl.c +dpotrf.c +dtrtri.c +sgetrf.c +stgsyl.c +zhetrf_rec2.c +zsytrf_rook_rec2.c +chetrf_rook_rec2.c +ctrsyl.c +dsygst.c +f2c.c +slauum.c +strsyl.c +zhetrf_rook.c +ztgsyl.c +) + + + +# add relapack folder to the sources +set(RELA_SOURCES "") +foreach (RELA_FILE ${RELAFILES}) + list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") +endforeach () +add_library(relapack_src OBJECT ${RELA_SOURCES}) +set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
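
The CMakeLists.txt hunk above compiles the ReLAPACK sources into a CMake OBJECT library (add_library(relapack_src OBJECT ...)) and attaches ${LAPACK_CFLAGS} to each source file via set_source_files_properties. As a point of reference, a minimal standalone sketch of that same pattern is shown below; the project name, the demo target names and the single source file are hypothetical placeholders for illustration, not OpenBLAS's actual top-level build.

    # sketch.cmake -- minimal OBJECT-library pattern, hypothetical names
    cmake_minimum_required(VERSION 3.5)
    project(objectlib_demo C)

    # Collect sources into an OBJECT library: the files are compiled once,
    # but no archive or shared library is produced for this target itself.
    set(DEMO_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/dummy.c)   # hypothetical source file
    add_library(relapack_demo OBJECT ${DEMO_SOURCES})

    # Per-source compile flags, mirroring the
    # set_source_files_properties(... COMPILE_FLAGS "${LAPACK_CFLAGS}") call above.
    set_source_files_properties(${DEMO_SOURCES} PROPERTIES COMPILE_FLAGS "-O2")

    # A parent target folds the compiled objects in through the
    # $<TARGET_OBJECTS:...> generator expression, which is the usual way
    # an OBJECT library is consumed by the enclosing project.
    add_library(demo_blas SHARED $<TARGET_OBJECTS:relapack_demo>)

The advantage of the OBJECT-library form used here is that the ReLAPACK translation units are compiled exactly once with their own flags and then linked into whichever OpenBLAS library targets the top-level build defines, without creating an intermediate static archive.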