diff --git a/.drone.yml b/.drone.yml
index 3bbd8fc88..b1c211d14 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -8,7 +8,7 @@ platform:

 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
@@ -32,7 +32,7 @@ platform:

 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
@@ -152,7 +152,31 @@ platform:

 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
+  environment:
+    CC: gcc
+    COMMON_FLAGS: 'USE_OPENMP=1'
+  commands:
+  - echo "MAKE_FLAGS:= $COMMON_FLAGS"
+  - apt-get update -y
+  - apt-get install -y make $CC gfortran perl python g++
+  - $CC --version
+  - make QUIET_MAKE=1 $COMMON_FLAGS
+  - make -C test $COMMON_FLAGS
+  - make -C ctest $COMMON_FLAGS
+  - make -C utest $COMMON_FLAGS
+  - make -C cpp_thread_test dgemm_tester
+---
+kind: pipeline
+name: epyc_native_test
+
+platform:
+  os: linux
+  arch: amd64
+
+steps:
+- name: Build and Test
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'USE_OPENMP=1'
diff --git a/README.md b/README.md
index 04f43f4c7..61393bd8f 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,11 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev

 AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)

+Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)
+
 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)

+
 ## Introduction

 OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
@@ -140,6 +143,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **ThunderX**: Optimized some Level-1 functions
 - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
 - **TSV110**: Optimized some Level-3 helper functions
+- **EMAG 8180**: preliminary support based on A57

 #### PPC/PPC64

@@ -154,11 +158,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 ### Support for multiple targets in a single library

 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
+
 For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows specifying an individual list of targets to include instead of the default.
+
 DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
+
 On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
+For **POWER**, the list encompasses POWER6, POWER8 and POWER9; on **ZARCH** it comprises Z13 and Z14.
+
 The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the common code in the library; usually you will want to set this to the oldest model you expect to encounter.
 Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
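The DYNAMIC_ARCH selection documented in the README hunk above is resolved at run time and can be checked from a client program. A minimal sketch, assuming a library built with DYNAMIC_ARCH=1 and linked via -lopenblas; openblas_get_config() and openblas_get_corename() are existing OpenBLAS entry points:

```c
#include <stdio.h>

/* Exported by libopenblas; also declared in cblas.h. */
extern char *openblas_get_config(void);
extern char *openblas_get_corename(void);

int main(void) {
    /* Build options baked into the library, e.g. "... DYNAMIC_ARCH ..." */
    printf("config: %s\n", openblas_get_config());
    /* Kernel set chosen for this machine at run time, e.g. "Haswell"   */
    printf("core:   %s\n", openblas_get_corename());
    return 0;
}
```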
diff --git a/benchmark/potrf.c b/benchmark/potrf.c
index 580e46072..cb4c23bab 100644
--- a/benchmark/potrf.c
+++ b/benchmark/potrf.c
@@ -193,14 +193,14 @@ int main(int argc, char *argv[]){
       a[((long)j + (long)j * (long)m) * 2 + 1] = 0.;
       for(i = j + 1; i < m; i++) {
-        a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
+        a[((long)i + (long)j * (long)m) * 2 + 0] = 0;
         a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
       }
     }
   } else {
     for (j = 0; j < m; j++) {
       for(i = 0; i < j; i++) {
-        a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
+        a[((long)i + (long)j * (long)m) * 2 + 0] = 0.;
         a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
       }
diff --git a/common_alpha.h b/common_alpha.h
index 9739c941d..f1ea8ff94 100644
--- a/common_alpha.h
+++ b/common_alpha.h
@@ -43,6 +43,7 @@

 #define MB asm("mb")
 #define WMB asm("wmb")
+#define RMB asm("rmb")

 static void __inline blas_lock(unsigned long *address){

 #ifndef __DECC
diff --git a/common_arm.h b/common_arm.h
index 8411e6dd6..682315de5 100644
--- a/common_arm.h
+++ b/common_arm.h
@@ -37,11 +37,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define MB
 #define WMB
+#define RMB

 #else

 #define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
 #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
+#define RMB __asm__ __volatile__ ("dmb ish" : : : "memory")

 #endif
diff --git a/common_arm64.h b/common_arm64.h
index 99e0cee57..314946282 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
 #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
-
+#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory")

 #define INLINE inline
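The new RMB macro pairs with the existing WMB: a writer publishes data behind a store barrier, and a reader issues a load barrier before consuming it. A minimal sketch of that pairing using the AArch64 definitions just added (dmb ishst orders stores, dmb ishld orders subsequent loads); the payload/ready names are illustrative only, not OpenBLAS fields:

```c
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") /* store-store */
#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") /* load-load/store */

static int payload;           /* illustrative names only */
static volatile int ready;

void publish(void) {
    payload = 42;  /* write the data                        */
    WMB;           /* make the data visible before the flag */
    ready = 1;     /* publish                               */
}

int consume(void) {
    while (!ready) /* spin until the flag is seen           */
        ;
    RMB;           /* keep the payload load from being
                      satisfied ahead of the flag load      */
    return payload;
}
```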
diff --git a/common_ia64.h b/common_ia64.h
index 72b75fc4e..59aefbd6d 100644
--- a/common_ia64.h
+++ b/common_ia64.h
@@ -47,6 +47,7 @@

 #define MB
 #define WMB
+#define RMB

 #ifdef __ECC
 #include <ia64intrin.h>
diff --git a/common_mips.h b/common_mips.h
index 35bff5083..2cc923043 100644
--- a/common_mips.h
+++ b/common_mips.h
@@ -35,6 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define MB __sync_synchronize()
 #define WMB __sync_synchronize()
+#define RMB __sync_synchronize()

 #define INLINE inline
diff --git a/common_mips64.h b/common_mips64.h
index 1163413dc..af638d60c 100644
--- a/common_mips64.h
+++ b/common_mips64.h
@@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define MB __sync_synchronize()
 #define WMB __sync_synchronize()
+#define RMB __sync_synchronize()

 #define INLINE inline
diff --git a/common_power.h b/common_power.h
index e7caf9adf..e29d0f382 100644
--- a/common_power.h
+++ b/common_power.h
@@ -71,9 +71,11 @@
 #if defined(POWER8) || defined(POWER9)
 #define MB __asm__ __volatile__ ("eieio":::"memory")
 #define WMB __asm__ __volatile__ ("eieio":::"memory")
+#define RMB __asm__ __volatile__ ("eieio":::"memory")
 #else
 #define MB __asm__ __volatile__ ("sync")
 #define WMB __asm__ __volatile__ ("sync")
+#define RMB __asm__ __volatile__ ("sync")
 #endif

 #define INLINE inline
diff --git a/common_sparc.h b/common_sparc.h
index f99972db9..85e29fffa 100644
--- a/common_sparc.h
+++ b/common_sparc.h
@@ -41,6 +41,7 @@

 #define MB __asm__ __volatile__ ("nop")
 #define WMB __asm__ __volatile__ ("nop")
+#define RMB __asm__ __volatile__ ("nop")

 #ifndef ASSEMBLER
diff --git a/common_x86.h b/common_x86.h
index 99adc9f5b..ec928e236 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -47,6 +47,7 @@

 #define MB
 #define WMB
+#define RMB

 #ifdef C_SUN
 #define __asm__ __asm
diff --git a/common_x86_64.h b/common_x86_64.h
index 958e9caed..0247674cd 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -63,13 +63,16 @@

 #ifdef __GNUC__
 #define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
 #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
+#define RMB
 #else
 #define MB do {} while (0)
 #define WMB do {} while (0)
+#define RMB
 #endif

 static void __inline blas_lock(volatile BLASULONG *address){
+
 #ifndef C_MSVC
   int ret;
 #else
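On x86-64 the hardware memory model (TSO) never reorders a load with an older load, which is presumably why the hunk above can leave RMB empty; note, though, that MB and WMB in the same header are at least compiler barriers under __GNUC__. A compiler-barrier-only read fence, shown purely as an assumption-flagged alternative and not as what the patch defines, would look like:

```c
/* NOT what the patch defines (it leaves RMB empty): a conservative,
   compiler-barrier-only variant for x86-64, where the CPU itself
   already preserves load-load order. */
#define RMB_COMPILER_ONLY do { __asm__ __volatile__("" : : : "memory"); } while (0)
```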
diff --git a/common_zarch.h b/common_zarch.h
index b5503a7a4..442bae821 100644
--- a/common_zarch.h
+++ b/common_zarch.h
@@ -34,9 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define COMMON_ZARCH

 #define MB
-//__asm__ __volatile__ ("dmb ish" : : : "memory")
 #define WMB
-//__asm__ __volatile__ ("dmb ishst" : : : "memory")
+#define RMB

 #define INLINE inline
diff --git a/driver/others/memory.c b/driver/others/memory.c
index a49fb1fa1..5abcbf3a4 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -2741,6 +2741,7 @@ void *blas_memory_alloc(int procpos){
   LOCK_COMMAND(&alloc_lock);
 #endif
   do {
+    RMB;
 #if defined(USE_OPENMP)
     if (!memory[position].used) {
       blas_lock(&memory[position].lock);
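The single RMB added to blas_memory_alloc sits at the top of the scan over the buffer table, so each iteration re-reads the `used` flags with load ordering relative to what earlier iterations observed, instead of acting on a stale view. A condensed model of the surrounding loop; the field and macro names follow memory.c, but the control flow is simplified:

```c
do {
    RMB;                              /* refresh our view of the table    */
#if defined(USE_OPENMP)
    if (!memory[position].used) {     /* slot looks free                  */
        blas_lock(&memory[position].lock);
        if (!memory[position].used)   /* re-check under the per-slot lock */
            break;                    /* claim this slot                  */
        blas_unlock(&memory[position].lock);
    }
#endif
    position++;
} while (position < NUM_BUFFERS);
```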
diff --git a/kernel/x86_64/copy_sse2.S b/kernel/x86_64/copy_sse2.S
index 200daafd9..a5ab2ea91 100644
--- a/kernel/x86_64/copy_sse2.S
+++ b/kernel/x86_64/copy_sse2.S
@@ -54,7 +54,7 @@
 #ifdef OPTERON
 #define LOAD(OFFSET, ADDR, REG)	xorps	REG, REG; addpd	OFFSET(ADDR), REG
 #else
-#define LOAD(OFFSET, ADDR, REG)	movaps	OFFSET(ADDR), REG
+#define LOAD(OFFSET, ADDR, REG)	movups	OFFSET(ADDR), REG
 #endif

	PROLOGUE
@@ -104,14 +104,14 @@
	sarq	$4, %rax
	jle	.L13

-	movaps	-16 * SIZE(X), %xmm0
-	movaps	-14 * SIZE(X), %xmm1
-	movaps	-12 * SIZE(X), %xmm2
-	movaps	-10 * SIZE(X), %xmm3
-	movaps	-8 * SIZE(X), %xmm4
-	movaps	-6 * SIZE(X), %xmm5
-	movaps	-4 * SIZE(X), %xmm6
-	movaps	-2 * SIZE(X), %xmm7
+	movups	-16 * SIZE(X), %xmm0
+	movups	-14 * SIZE(X), %xmm1
+	movups	-12 * SIZE(X), %xmm2
+	movups	-10 * SIZE(X), %xmm3
+	movups	-8 * SIZE(X), %xmm4
+	movups	-6 * SIZE(X), %xmm5
+	movups	-4 * SIZE(X), %xmm6
+	movups	-2 * SIZE(X), %xmm7

	decq	%rax
	jle	.L12
@@ -122,36 +122,36 @@
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
 #endif

-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
	LOAD( 0 * SIZE, X, %xmm0)
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
	LOAD( 2 * SIZE, X, %xmm1)

 #ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
 #endif

-	movaps	%xmm2, -12 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
	LOAD( 4 * SIZE, X, %xmm2)
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)
	LOAD( 6 * SIZE, X, %xmm3)

 #if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
 #endif

-	movaps	%xmm4, -8 * SIZE(Y)
+	movups	%xmm4, -8 * SIZE(Y)
	LOAD( 8 * SIZE, X, %xmm4)
-	movaps	%xmm5, -6 * SIZE(Y)
+	movups	%xmm5, -6 * SIZE(Y)
	LOAD(10 * SIZE, X, %xmm5)

 #if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
 #endif

-	movaps	%xmm6, -4 * SIZE(Y)
+	movups	%xmm6, -4 * SIZE(Y)
	LOAD(12 * SIZE, X, %xmm6)
-	movaps	%xmm7, -2 * SIZE(Y)
+	movups	%xmm7, -2 * SIZE(Y)
	LOAD(14 * SIZE, X, %xmm7)

	subq	$-16 * SIZE, Y
@@ -161,14 +161,14 @@
	ALIGN_3

.L12:
-	movaps	%xmm0, -16 * SIZE(Y)
-	movaps	%xmm1, -14 * SIZE(Y)
-	movaps	%xmm2, -12 * SIZE(Y)
-	movaps	%xmm3, -10 * SIZE(Y)
-	movaps	%xmm4, -8 * SIZE(Y)
-	movaps	%xmm5, -6 * SIZE(Y)
-	movaps	%xmm6, -4 * SIZE(Y)
-	movaps	%xmm7, -2 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)
+	movups	%xmm4, -8 * SIZE(Y)
+	movups	%xmm5, -6 * SIZE(Y)
+	movups	%xmm6, -4 * SIZE(Y)
+	movups	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
@@ -179,15 +179,15 @@
	jle	.L14
	ALIGN_3

-	movaps	-16 * SIZE(X), %xmm0
-	movaps	-14 * SIZE(X), %xmm1
-	movaps	-12 * SIZE(X), %xmm2
-	movaps	-10 * SIZE(X), %xmm3
+	movups	-16 * SIZE(X), %xmm0
+	movups	-14 * SIZE(X), %xmm1
+	movups	-12 * SIZE(X), %xmm2
+	movups	-10 * SIZE(X), %xmm3

-	movaps	%xmm0, -16 * SIZE(Y)
-	movaps	%xmm1, -14 * SIZE(Y)
-	movaps	%xmm2, -12 * SIZE(Y)
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
@@ -198,11 +198,11 @@
	jle	.L15
	ALIGN_3

-	movaps	-16 * SIZE(X), %xmm0
-	movaps	-14 * SIZE(X), %xmm1
+	movups	-16 * SIZE(X), %xmm0
+	movups	-14 * SIZE(X), %xmm1

-	movaps	%xmm0, -16 * SIZE(Y)
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
@@ -213,8 +213,8 @@
	jle	.L16
	ALIGN_3

-	movaps	-16 * SIZE(X), %xmm0
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	-16 * SIZE(X), %xmm0
+	movups	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
@@ -246,13 +246,13 @@
	sarq	$4, %rax
	jle	.L23

-	movaps	-15 * SIZE(X), %xmm1
-	movaps	-13 * SIZE(X), %xmm2
-	movaps	-11 * SIZE(X), %xmm3
-	movaps	-9 * SIZE(X), %xmm4
-	movaps	-7 * SIZE(X), %xmm5
-	movaps	-5 * SIZE(X), %xmm6
-	movaps	-3 * SIZE(X), %xmm7
+	movups	-15 * SIZE(X), %xmm1
+	movups	-13 * SIZE(X), %xmm2
+	movups	-11 * SIZE(X), %xmm3
+	movups	-9 * SIZE(X), %xmm4
+	movups	-7 * SIZE(X), %xmm5
+	movups	-5 * SIZE(X), %xmm6
+	movups	-3 * SIZE(X), %xmm7

	decq	%rax
	jle	.L22
@@ -264,11 +264,11 @@
 #endif

	SHUFPD_1 %xmm1, %xmm0
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
	LOAD(-1 * SIZE, X, %xmm0)

	SHUFPD_1 %xmm2, %xmm1
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
	LOAD( 1 * SIZE, X, %xmm1)

 #ifdef PREFETCH
@@ -276,11 +276,11 @@
 #endif

	SHUFPD_1 %xmm3, %xmm2
-	movaps	%xmm2, -12 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
	LOAD( 3 * SIZE, X, %xmm2)

	SHUFPD_1 %xmm4, %xmm3
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)
	LOAD( 5 * SIZE, X, %xmm3)

 #if defined(PREFETCHW) && !defined(FETCH128)
@@ -288,11 +288,11 @@
 #endif

	SHUFPD_1 %xmm5, %xmm4
-	movaps	%xmm4, -8 * SIZE(Y)
+	movups	%xmm4, -8 * SIZE(Y)
	LOAD( 7 * SIZE, X, %xmm4)

	SHUFPD_1 %xmm6, %xmm5
-	movaps	%xmm5, -6 * SIZE(Y)
+	movups	%xmm5, -6 * SIZE(Y)
	LOAD( 9 * SIZE, X, %xmm5)

 #if defined(PREFETCH) && !defined(FETCH128)
@@ -300,11 +300,11 @@
 #endif

	SHUFPD_1 %xmm7, %xmm6
-	movaps	%xmm6, -4 * SIZE(Y)
+	movups	%xmm6, -4 * SIZE(Y)
	LOAD(11 * SIZE, X, %xmm6)

	SHUFPD_1 %xmm0, %xmm7
-	movaps	%xmm7, -2 * SIZE(Y)
+	movups	%xmm7, -2 * SIZE(Y)
	LOAD(13 * SIZE, X, %xmm7)

	subq	$-16 * SIZE, X
@@ -315,26 +315,26 @@
.L22:
	SHUFPD_1 %xmm1, %xmm0
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
	LOAD(-1 * SIZE, X, %xmm0)

	SHUFPD_1 %xmm2, %xmm1
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)

	SHUFPD_1 %xmm3, %xmm2
-	movaps	%xmm2, -12 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)

	SHUFPD_1 %xmm5, %xmm4
-	movaps	%xmm4, -8 * SIZE(Y)
+	movups	%xmm4, -8 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
-	movaps	%xmm5, -6 * SIZE(Y)
+	movups	%xmm5, -6 * SIZE(Y)

	SHUFPD_1 %xmm7, %xmm6
-	movaps	%xmm6, -4 * SIZE(Y)
+	movups	%xmm6, -4 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
-	movaps	%xmm7, -2 * SIZE(Y)
+	movups	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
@@ -345,24 +345,24 @@
	jle	.L24
	ALIGN_3

-	movaps	-15 * SIZE(X), %xmm1
-	movaps	-13 * SIZE(X), %xmm2
-	movaps	-11 * SIZE(X), %xmm3
-	movaps	-9 * SIZE(X), %xmm8
+	movups	-15 * SIZE(X), %xmm1
+	movups	-13 * SIZE(X), %xmm2
+	movups	-11 * SIZE(X), %xmm3
+	movups	-9 * SIZE(X), %xmm8

	SHUFPD_1 %xmm1, %xmm0
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
	SHUFPD_1 %xmm2, %xmm1
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)

	SHUFPD_1 %xmm3, %xmm2
-	movaps	%xmm2, -12 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
	SHUFPD_1 %xmm8, %xmm3
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)

-	movaps	%xmm8, %xmm0
+	movups	%xmm8, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
@@ -373,15 +373,15 @@
	jle	.L25
	ALIGN_3

-	movaps	-15 * SIZE(X), %xmm1
-	movaps	-13 * SIZE(X), %xmm2
+	movups	-15 * SIZE(X), %xmm1
+	movups	-13 * SIZE(X), %xmm2

	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1

-	movaps	%xmm0, -16 * SIZE(Y)
-	movaps	%xmm1, -14 * SIZE(Y)
-	movaps	%xmm2, %xmm0
+	movups	%xmm0, -16 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
+	movups	%xmm2, %xmm0

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
@@ -392,10 +392,10 @@
	jle	.L26
	ALIGN_3

-	movaps	-15 * SIZE(X), %xmm1
+	movups	-15 * SIZE(X), %xmm1

	SHUFPD_1 %xmm1, %xmm0
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
@@ -424,14 +424,14 @@
	sarq	$4, %rax
	jle	.L23

-	movaps	-16 * SIZE(X), %xmm0
-	movaps	-14 * SIZE(X), %xmm1
-	movaps	-12 * SIZE(X), %xmm2
-	movaps	-10 * SIZE(X), %xmm3
-	movaps	-8 * SIZE(X), %xmm4
-	movaps	-6 * SIZE(X), %xmm5
-	movaps	-4 * SIZE(X), %xmm6
-	movaps	-2 * SIZE(X), %xmm7
+	movups	-16 * SIZE(X), %xmm0
+	movups	-14 * SIZE(X), %xmm1
+	movups	-12 * SIZE(X), %xmm2
+	movups	-10 * SIZE(X), %xmm3
+	movups	-8 * SIZE(X), %xmm4
+	movups	-6 * SIZE(X), %xmm5
+	movups	-4 * SIZE(X), %xmm6
+	movups	-2 * SIZE(X), %xmm7

	decq	%rax
	jle	.L22
@@ -515,16 +515,16 @@
	jle	.L24
	ALIGN_3

-	movaps	-16 * SIZE(X), %xmm0
+	movups	-16 * SIZE(X), %xmm0
	movlps	%xmm0, -16 * SIZE(Y)
	movhps	%xmm0, -15 * SIZE(Y)
-	movaps	-14 * SIZE(X), %xmm1
+	movups	-14 * SIZE(X), %xmm1
	movlps	%xmm1, -14 * SIZE(Y)
	movhps	%xmm1, -13 * SIZE(Y)
-	movaps	-12 * SIZE(X), %xmm2
+	movups	-12 * SIZE(X), %xmm2
	movlps	%xmm2, -12 * SIZE(Y)
	movhps	%xmm2, -11 * SIZE(Y)
-	movaps	-10 * SIZE(X), %xmm3
+	movups	-10 * SIZE(X), %xmm3
	movlps	%xmm3, -10 * SIZE(Y)
	movhps	%xmm3, -9 * SIZE(Y)
@@ -537,10 +537,10 @@
	jle	.L25
	ALIGN_3

-	movaps	-16 * SIZE(X), %xmm0
+	movups	-16 * SIZE(X), %xmm0
	movlps	%xmm0, -16 * SIZE(Y)
	movhps	%xmm0, -15 * SIZE(Y)
-	movaps	-14 * SIZE(X), %xmm1
+	movups	-14 * SIZE(X), %xmm1
	movlps	%xmm1, -14 * SIZE(Y)
	movhps	%xmm1, -13 * SIZE(Y)
@@ -553,7 +553,7 @@
	jle	.L26
	ALIGN_3

-	movaps	-16 * SIZE(X), %xmm0
+	movups	-16 * SIZE(X), %xmm0
	movlps	%xmm0, -16 * SIZE(Y)
	movhps	%xmm0, -15 * SIZE(Y)
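The blanket movaps-to-movups change in this kernel swaps the aligned-only move for its unaligned-tolerant twin: movaps raises a #GP fault if its memory operand is not 16-byte aligned, while movups accepts any address and, on recent x86 cores, performs the same as movaps when the data happens to be aligned. The intrinsic analogue of the new code path, as a sketch (_mm_loadu_pd and _mm_storeu_pd compile to unaligned moves such as movups/movupd):

```c
#include <emmintrin.h>

/* Copy one double-precision complex element (2 doubles, 16 bytes)
   without assuming 16-byte alignment of x or y. */
void copy_elem(const double *x, double *y) {
    __m128d v = _mm_loadu_pd(x);  /* unaligned load (movups/movupd)       */
    /* _mm_load_pd(x) would emit an aligned move (movaps/movapd) and
       fault on a misaligned x, which is what the patch above avoids. */
    _mm_storeu_pd(y, v);          /* unaligned store (movups/movupd)      */
}
```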