Browse Source

Merge remote branch 'origin/armv7' into develop

tags/v0.2.9.rc1
wernsaar 12 years ago
parent
commit
a74ac84981
98 changed files with 40184 additions and 0 deletions
  1. +12
    -0
      Makefile.arm
  2. +7
    -0
      Makefile.arm64
  3. +303
    -0
      cblas_noconst.h
  4. +169
    -0
      common_arm.h
  5. +169
    -0
      common_arm64.h
  6. +262
    -0
      cpuid_arm.c
  7. +9
    -0
      ctest.c
  8. +46
    -0
      kernel/arm/KERNEL
  9. +142
    -0
      kernel/arm/KERNEL.ARMV6
  10. +141
    -0
      kernel/arm/KERNEL.ARMV7
  11. +2
    -0
      kernel/arm/Makefile
  12. +73
    -0
      kernel/arm/amax.c
  13. +73
    -0
      kernel/arm/amin.c
  14. +67
    -0
      kernel/arm/asum.c
  15. +481
    -0
      kernel/arm/asum_vfp.S
  16. +64
    -0
      kernel/arm/axpy.c
  17. +503
    -0
      kernel/arm/axpy_vfp.S
  18. +222
    -0
      kernel/arm/ccopy_vfp.S
  19. +284
    -0
      kernel/arm/cdot_vfp.S
  20. +1252
    -0
      kernel/arm/cgemm_kernel_2x2_vfp.S
  21. +1309
    -0
      kernel/arm/cgemm_kernel_2x2_vfpv3.S
  22. +258
    -0
      kernel/arm/cgemm_ncopy_2_vfp.S
  23. +243
    -0
      kernel/arm/cgemm_tcopy_2_vfp.S
  24. +697
    -0
      kernel/arm/cgemv_n_vfp.S
  25. +607
    -0
      kernel/arm/cgemv_t_vfp.S
  26. +59
    -0
      kernel/arm/copy.c
  27. +1455
    -0
      kernel/arm/ctrmm_kernel_2x2_vfp.S
  28. +1476
    -0
      kernel/arm/ctrmm_kernel_2x2_vfpv3.S
  29. +222
    -0
      kernel/arm/dcopy_vfp.S
  30. +248
    -0
      kernel/arm/ddot_vfp.S
  31. +806
    -0
      kernel/arm/dgemm_kernel_4x2_vfp.S
  32. +1483
    -0
      kernel/arm/dgemm_kernel_4x4_vfpv3.S
  33. +225
    -0
      kernel/arm/dgemm_ncopy_2_vfp.S
  34. +349
    -0
      kernel/arm/dgemm_ncopy_4_vfp.S
  35. +408
    -0
      kernel/arm/dgemm_tcopy_4_vfp.S
  36. +64
    -0
      kernel/arm/dot.c
  37. +1089
    -0
      kernel/arm/dtrmm_kernel_4x2_vfp.S
  38. +1953
    -0
      kernel/arm/dtrmm_kernel_4x4_vfpv3.S
  39. +67
    -0
      kernel/arm/gemv_n.c
  40. +740
    -0
      kernel/arm/gemv_n_vfp.S
  41. +781
    -0
      kernel/arm/gemv_n_vfpv3.S
  42. +67
    -0
      kernel/arm/gemv_t.c
  43. +750
    -0
      kernel/arm/gemv_t_vfp.S
  44. +732
    -0
      kernel/arm/gemv_t_vfpv3.S
  45. +75
    -0
      kernel/arm/iamax.c
  46. +478
    -0
      kernel/arm/iamax_vfp.S
  47. +75
    -0
      kernel/arm/iamin.c
  48. +67
    -0
      kernel/arm/imax.c
  49. +65
    -0
      kernel/arm/imin.c
  50. +81
    -0
      kernel/arm/izamax.c
  51. +81
    -0
      kernel/arm/izamin.c
  52. +63
    -0
      kernel/arm/max.c
  53. +63
    -0
      kernel/arm/min.c
  54. +88
    -0
      kernel/arm/nrm2.c
  55. +565
    -0
      kernel/arm/nrm2_vfp.S
  56. +508
    -0
      kernel/arm/nrm2_vfpv3.S
  57. +62
    -0
      kernel/arm/rot.c
  58. +584
    -0
      kernel/arm/rot_vfp.S
  59. +58
    -0
      kernel/arm/scal.c
  60. +376
    -0
      kernel/arm/scal_vfp.S
  61. +224
    -0
      kernel/arm/scopy_vfp.S
  62. +347
    -0
      kernel/arm/sdot_vfp.S
  63. +797
    -0
      kernel/arm/sgemm_kernel_4x2_vfp.S
  64. +1436
    -0
      kernel/arm/sgemm_kernel_4x4_vfpv3.S
  65. +225
    -0
      kernel/arm/sgemm_ncopy_2_vfp.S
  66. +353
    -0
      kernel/arm/sgemm_ncopy_4_vfp.S
  67. +430
    -0
      kernel/arm/sgemm_tcopy_4_vfp.S
  68. +1081
    -0
      kernel/arm/strmm_kernel_4x2_vfp.S
  69. +1884
    -0
      kernel/arm/strmm_kernel_4x4_vfpv3.S
  70. +62
    -0
      kernel/arm/swap.c
  71. +354
    -0
      kernel/arm/swap_vfp.S
  72. +81
    -0
      kernel/arm/zamax.c
  73. +81
    -0
      kernel/arm/zamin.c
  74. +71
    -0
      kernel/arm/zasum.c
  75. +72
    -0
      kernel/arm/zaxpy.c
  76. +63
    -0
      kernel/arm/zcopy.c
  77. +223
    -0
      kernel/arm/zcopy_vfp.S
  78. +78
    -0
      kernel/arm/zdot.c
  79. +286
    -0
      kernel/arm/zdot_vfp.S
  80. +1299
    -0
      kernel/arm/zgemm_kernel_2x2_vfp.S
  81. +1345
    -0
      kernel/arm/zgemm_kernel_2x2_vfpv3.S
  82. +254
    -0
      kernel/arm/zgemm_ncopy_2_vfp.S
  83. +245
    -0
      kernel/arm/zgemm_tcopy_2_vfp.S
  84. +157
    -0
      kernel/arm/zgemv_n.c
  85. +699
    -0
      kernel/arm/zgemv_n_vfp.S
  86. +140
    -0
      kernel/arm/zgemv_t.c
  87. +608
    -0
      kernel/arm/zgemv_t_vfp.S
  88. +106
    -0
      kernel/arm/znrm2.c
  89. +68
    -0
      kernel/arm/zrot.c
  90. +64
    -0
      kernel/arm/zscal.c
  91. +70
    -0
      kernel/arm/zswap.c
  92. +1537
    -0
      kernel/arm/ztrmm_kernel_2x2_vfp.S
  93. +1538
    -0
      kernel/arm/ztrmm_kernel_2x2_vfpv3.S
  94. +46
    -0
      kernel/arm64/KERNEL
  95. +134
    -0
      kernel/arm64/KERNEL.ARMV8
  96. +2
    -0
      kernel/arm64/Makefile
  97. +33
    -0
      lapack/laswp/arm/Makefile
  98. +33
    -0
      lapack/laswp/arm64/Makefile

+ 12
- 0
Makefile.arm View File

@@ -0,0 +1,12 @@

# Per-core compiler flags for 32-bit ARM builds.  The same flag set is
# appended to both CCOMMON_OPT and FCOMMON_OPT (presumably the C and Fortran
# option lists -- confirm in the including Makefile).

# ARMV7: ARM-state code (-marm), VFPv3 FPU, hard-float ABI, ARMv7-A ISA.
ifeq ($(CORE), ARMV7)
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif

# ARMV6: ARM-state code (-marm), base VFP FPU, hard-float ABI, ARMv6 ISA.
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
endif



+ 7
- 0
Makefile.arm64 View File

@@ -0,0 +1,7 @@

# ARMV8: select the ARMv8-A architecture; the same flag is appended to both
# CCOMMON_OPT and FCOMMON_OPT.
ifeq ($(CORE), ARMV8)
CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a
endif



+ 303
- 0
cblas_noconst.h View File

@@ -0,0 +1,303 @@
#ifndef CBLAS_H
#define CBLAS_H

#include <stddef.h>
#include "common.h"

#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */

/* Set the number of threads at run time.  Two entry points with the same
 * signature are declared: the OpenBLAS name and the goto_ name. */
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);

/* Get the build configuration at run time, as a character string. */
char* openblas_get_config(void);

/* Get the parallelization type used by OpenBLAS.
 * NOTE(review): the return value is presumably one of the OPENBLAS_*
 * constants defined below -- confirm in the implementation. */
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use. */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using the normal threading model. */
#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using the OpenMP threading model. */
#define OPENBLAS_OPENMP 2


/* Return type of the cblas_i?amax index routines declared in this header. */
#define CBLAS_INDEX size_t

/* Option enums for the CBLAS interface.  Each enum has its own block of
 * numeric values (101.., 111.., 121.., 131.., 141..), so no two options
 * from different enums share a value. */
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;

/*
 * Vector (BLAS Level 1) prototypes.
 *
 * None of the pointer parameters in this header is const-qualified.
 * Family labels below follow the standard BLAS routine names; the exact
 * semantics are defined by the implementations, not by this header.
 * Complex routines take their data as float* / double* pointers.
 */

/* Dot products with a real result returned by value. */
float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);

/* Complex dot products (dotu / dotc) with the result returned by value. */
openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);

/* _sub variants: same inputs, void return, plus a trailing `ret` pointer
 * (presumably where the result is written -- confirm in the implementation). */
void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);

/* asum family. */
float cblas_sasum (blasint n, float *x, blasint incx);
double cblas_dasum (blasint n, double *x, blasint incx);
float cblas_scasum(blasint n, float *x, blasint incx);
double cblas_dzasum(blasint n, double *x, blasint incx);

/* nrm2 family. */
float cblas_snrm2 (blasint N, float *X, blasint incX);
double cblas_dnrm2 (blasint N, double *X, blasint incX);
float cblas_scnrm2(blasint N, float *X, blasint incX);
double cblas_dznrm2(blasint N, double *X, blasint incX);

/* i?amax family; the result type is CBLAS_INDEX. */
CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);

/* axpy family; the complex variants take alpha by pointer. */
void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy);
void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy);
void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy);

/* copy family. */
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);

/* swap family. */
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);

/* rot family: c and s are passed by value. */
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s);

/* rotg family: all four arguments are pointers. */
void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s);

/* rotm family: takes a parameter array P. */
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);

/* rotmg family: b2 is the only argument passed by value. */
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);

/* scal family: cscal/zscal take alpha by pointer, while csscal/zdscal
 * take a float/double alpha by value with the same X pointer types. */
void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);

/*
 * Matrix-vector (BLAS Level 2) prototypes.
 *
 * Every routine takes the storage order first, followed by the option
 * enums it needs (UPLO / TRANSPOSE / DIAG).  Family labels follow the
 * standard BLAS routine names.  In the c/z variants, alpha and beta are
 * passed by pointer unless noted otherwise.
 */

/* gemv family. */
void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy);
void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy);
void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy);
void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy);

/* ger / geru / gerc family. */
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);

/* trsv family. */
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);

/* trmv family. */
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);

/* syr / her family: alpha is a float/double passed by value in all four. */
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);

/* syr2 / her2 family. */
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
float *Y, blasint incY, float *A, blasint lda);
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
double *Y, blasint incY, double *A, blasint lda);

/* gbmv family: takes the extra KL / KU arguments. */
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);

/* sbmv family (real types only in this header). */
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);


/* tbmv family. */
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);

/* tbsv family. */
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);

/* tpmv family: the matrix argument Ap has no leading-dimension parameter. */
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);

/* tpsv family. */
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);

/* symv / hemv family. */
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);


/* spmv family (real types only in this header). */
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
double *X, blasint incX, double beta, double *Y, blasint incY);

/* spr family. */
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);

/* hpr family: alpha is a float/double passed by value. */
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);

/* spr2 / hpr2 family. */
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);

/* hbmv family. */
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);

/* hpmv family. */
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);

/*
 * Matrix-matrix (BLAS Level 3) prototypes, followed by the error hook.
 *
 * Every routine takes the storage order first, then its option enums
 * (SIDE / UPLO / TRANSPOSE / DIAG), the dimensions, and each matrix as a
 * pointer plus leading dimension (lda / ldb / ldc).  Family labels follow
 * the standard BLAS routine names.  In the c/z variants, alpha and beta
 * are passed by pointer unless noted otherwise.
 */

/* gemm family. */
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);

/* symm family. */
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);

/* syrk family. */
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);

/* syr2k family. */
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);

/* trmm family: no beta and no C argument. */
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);

/* trsm family: same argument list as trmm. */
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);

/* hemm family. */
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);

/* herk family: both alpha and beta are float/double passed by value. */
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);

/* her2k family: alpha is passed by pointer, but beta is a float/double
 * passed by value. */
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);

/* Error hook taking an integer `p`, a string `rout`, a string `form` and
 * variadic arguments.
 * NOTE(review): presumably the index of the offending parameter, the
 * routine name and a printf-style format -- confirm in the implementation. */
void cblas_xerbla(blasint p, char *rout, char *form, ...);

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif

+ 169
- 0
common_arm.h View File

@@ -0,0 +1,169 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science, ISCAS
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#ifndef COMMON_ARM
#define COMMON_ARM

/* Memory-barrier hooks.  Both expand to nothing in this header, so any
 * code that writes MB or WMB gets no barrier instruction from them here. */
#define MB
#define WMB

/* INLINE expands to the C `inline` keyword. */
#define INLINE inline

/* Feature flag, defined without a value.
 * NOTE(review): presumably tells shared code that complex results are
 * returned by value on this target -- confirm where it is tested. */
#define RETURN_BY_COMPLEX

#ifndef ASSEMBLER

/*
 * Acquire the spin lock stored at *address.
 *
 * The lock word is treated as free when it is 0 and held when it is
 * non-zero.  The function returns only once this caller has changed the
 * word from 0 to 1.
 *
 * address: pointer to the lock word; must be valid for LDREX/STREX.
 *
 * The previous implementation stored 0 into the lock word and ignored the
 * value read by LDREX, so the lock was never actually taken.  This version
 * stores 1 and retries unless both (a) the value read was 0 and (b) the
 * exclusive store succeeded.
 *
 * Note: MB is defined empty in this header, so no hardware barrier is
 * issued here; only the "memory" clobber orders the compiler's accesses.
 */
static void __inline blas_lock(volatile BLASULONG *address){

  register int ret;

  do {
    /* Read-only spin first: avoids hammering the exclusive monitor
       while another thread still holds the lock. */
    while (*address) {YIELDING;};

    __asm__ __volatile__(
      "ldrex r2, [%1]       \n\t"  /* r2 = current lock value            */
      "strex %0, %2, [%1]   \n\t"  /* try to store 1; %0 = 0 on success  */
      "orr   %0, %0, r2     \n\t"  /* also retry if it was already held  */
      : "=&r"(ret)                 /* early clobber: STREX status must
                                      not share a register with inputs   */
      : "r"(address), "r"(1)
      : "memory", "r2"
    );

  } while (ret);

}


/*
 * Time stamp built from gettimeofday().
 *
 * Returns the current wall-clock time (seconds plus microseconds since the
 * epoch reported by gettimeofday) multiplied by 1000 and truncated, i.e. a
 * millisecond count.
 */
static inline unsigned long long rpcc(void){
  struct timeval tv;
  double seconds;

  gettimeofday(&tv, NULL);
  seconds = (double) tv.tv_sec + (double) tv.tv_usec * 1e-6;

  /* Plain double constant: the d suffix is a GNU extension, not standard C. */
  return (unsigned long long) (seconds * 1000.0);
}

/*
 * Integer quotient x / y using the ordinary C division operator.
 * No check is made for y == 0.
 */
static inline int blas_quickdivide(blasint x, blasint y){
  int quotient = x / y;

  return quotient;
}

/*
 * GET_IMAGE(res): store VFP register d1 (DOUBLE builds) or s1 (other builds)
 * into the memory operand res.
 * NOTE(review): presumably used to pick up the imaginary part of a complex
 * value returned in registers by an assembly kernel -- confirm against callers.
 */
#if defined(DOUBLE)
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
#endif

/* Expands to nothing on this target. */
#define GET_IMAGE_CANCEL

#endif


/* REALNAME is the exported symbol: ASMNAME normally, ASMFNAME when F_INTERFACE is defined. */
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif

/* Assembly-only helpers (skipped when NEEDPARAM is defined). */
#if defined(ASSEMBLER) && !defined(NEEDPARAM)

/*
 * PROLOGUE opens an assembly routine: it selects the ARM instruction set
 * (.arm), exports REALNAME (.global), opens a .func record and emits the
 * entry label.
 */
#define PROLOGUE \
.arm ;\
.global REALNAME ;\
.func REALNAME ;\
REALNAME:

/* EPILOGUE is empty, so the .func opened by PROLOGUE gets no closing directive from here. */
#define EPILOGUE

/* PROFCODE expands to nothing. */
#define PROFCODE

#endif


/* Defined without a value; only its presence can be tested. */
#define SEEK_ADDRESS

/* Default page size: 4 << 10 = 4096 bytes, unless PAGESIZE was set earlier. */
#ifndef PAGESIZE
#define PAGESIZE ( 4 << 10)
#endif
/* 4 << 20 = 4 MiB. */
#define HUGE_PAGESIZE ( 4 << 20)

/* 16 << 20 = 16 MiB. */
#define BUFFER_SIZE (16 << 20)


/* START_ADDRESS and MAX_CPU_NUMBER are not defined in this header; they must be provided elsewhere. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

/* Fall back to the MAP_ANON spelling where MAP_ANONYMOUS is missing. */
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif

#endif

+ 169
- 0
common_arm64.h View File

@@ -0,0 +1,169 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#ifndef COMMON_ARM64
#define COMMON_ARM64

/* MB and WMB expand to nothing in this header: no barrier instruction is emitted. */
/* NOTE(review): presumably the memory-barrier / write-barrier hooks used by the threading code -- confirm whether this target needs real barriers. */
#define MB
#define WMB

/* INLINE maps to the plain C inline keyword. */
#define INLINE inline

/* NOTE(review): presumably signals that complex results are returned by value -- confirm against the users of this macro. */
#define RETURN_BY_COMPLEX

#ifndef ASSEMBLER

/*
 * Acquire the spin lock stored at address.
 *
 * The lock word is treated as free when it is zero and as held when it is
 * non-zero (the convention of the disabled 32-bit code this replaces).
 * The function returns only after this thread has atomically changed the
 * word from 0 to a non-zero value.
 *
 * address: lock word shared between threads.
 */
static void __inline blas_lock(volatile BLASULONG *address){

  do {
    /* Read-only wait while another thread holds the lock. */
    while (*address) { /* spin */ }

    /*
     * Atomically write 1 and fetch the previous value.  A previous value
     * of 0 means the lock was free and now belongs to this thread; any
     * other value means somebody else got there first, so wait again.
     */
  } while (__sync_lock_test_and_set(address, 1));

}


/*
 * Time stamp built from gettimeofday().
 *
 * Returns the current wall-clock time (seconds plus microseconds since the
 * epoch reported by gettimeofday) multiplied by 1000 and truncated, i.e. a
 * millisecond count.
 */
static inline unsigned long long rpcc(void){
  struct timeval tv;
  double seconds;

  gettimeofday(&tv, NULL);
  seconds = (double) tv.tv_sec + (double) tv.tv_usec * 1e-6;

  /* Plain double constant: the d suffix is a GNU extension, not standard C. */
  return (unsigned long long) (seconds * 1000.0);
}

/*
 * Integer quotient x / y using the ordinary C division operator.
 * No check is made for y == 0.
 */
static inline int blas_quickdivide(blasint x, blasint y){
  int quotient = x / y;

  return quotient;
}

/*
 * GET_IMAGE(res): store register d1 (DOUBLE builds) or s1 (other builds)
 * into the memory operand res.
 * NOTE(review): vstr.f64 / vstr.f32 look like 32-bit ARM VFP mnemonics; an
 * AArch64 assembler is not expected to accept them -- confirm before this
 * header is used for a real arm64 build.
 */
#if defined(DOUBLE)
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
#endif

/* Expands to nothing on this target. */
#define GET_IMAGE_CANCEL

#endif


/* REALNAME is the exported symbol: ASMNAME normally, ASMFNAME when F_INTERFACE is defined. */
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif

/* Assembly-only helpers (skipped when NEEDPARAM is defined). */
#if defined(ASSEMBLER) && !defined(NEEDPARAM)

/*
 * PROLOGUE opens an assembly routine: it emits .arm, exports REALNAME
 * (.global), opens a .func record and emits the entry label.
 * NOTE(review): .arm is a 32-bit ARM directive; it is not expected to be
 * valid for an AArch64 assembler -- confirm before adding arm64 kernels.
 */
#define PROLOGUE \
.arm ;\
.global REALNAME ;\
.func REALNAME ;\
REALNAME:

/* EPILOGUE is empty, so the .func opened by PROLOGUE gets no closing directive from here. */
#define EPILOGUE

/* PROFCODE expands to nothing. */
#define PROFCODE

#endif


/* Defined without a value; only its presence can be tested. */
#define SEEK_ADDRESS

/* Default page size: 4 << 10 = 4096 bytes, unless PAGESIZE was set earlier. */
#ifndef PAGESIZE
#define PAGESIZE ( 4 << 10)
#endif
/* 4 << 20 = 4 MiB. */
#define HUGE_PAGESIZE ( 4 << 20)

/* 16 << 20 = 16 MiB. */
#define BUFFER_SIZE (16 << 20)


/* START_ADDRESS and MAX_CPU_NUMBER are not defined in this header; they must be provided elsewhere. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

/* Fall back to the MAP_ANON spelling where MAP_ANONYMOUS is missing. */
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif

#endif

+ 262
- 0
cpuid_arm.c View File

@@ -0,0 +1,262 @@
/**************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <string.h>

#define CPU_UNKNOWN 0
#define CPU_ARMV6 1
#define CPU_ARMV7 2
#define CPU_CORTEXA15 3

/*
 * Printable core names, indexed by the CPU_* identifiers of this file
 * (CPU_UNKNOWN = 0 ... CPU_CORTEXA15 = 3).  The first entry is spelled
 * "UNKNOWN", the same string get_subarchitecture() prints for an
 * unrecognised core.
 */
static char *cpuname[] = {
	"UNKNOWN",
	"ARMV6",
	"ARMV7",
	"CORTEXA15"
};


/*
 * Report whether the "Features" line of /proc/cpuinfo lists a given word.
 *
 * search: feature name to look for, compared as a whole word.
 *
 * Returns 1 when the word is present, 0 otherwise.  0 is also returned when
 * the code is not built for Linux, when /proc/cpuinfo cannot be opened, or
 * when it contains no "Features" line.
 */
int get_feature(char *search)
{

#if defined(linux) || defined(__linux__)
	FILE *infile;
	char buffer[2048], *p, *t;

	p = (char *) NULL;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL) return(0);

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Step past the colon; strtok below skips the blanks after it. */
			p = strchr(buffer, ':');
			if (p != NULL) p++;
			break;
		}
	}

	fclose(infile);

	if (p == NULL) return(0);

	/*
	 * Every token is compared, including the first one, and the newline
	 * kept by fgets counts as a separator so the last word can match too.
	 */
	for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
	{
		if (!strcmp(t, search)) { return(1); }
	}

#endif
	return(0);
}


int detect(void)
{

#ifdef linux

FILE *infile;
char buffer[512], *p;
p = (char *) NULL ;

infile = fopen("/proc/cpuinfo", "r");

while (fgets(buffer, sizeof(buffer), infile))
{

if (!strncmp("model name", buffer, 10))
{
p = strchr(buffer, ':') + 2;
break;
}
}

fclose(infile);

if(p != NULL)
{

if (strstr(p, "ARMv7"))
{
if ( get_feature("vfpv4"))
return CPU_ARMV7;

if ( get_feature("vfpv3"))
return CPU_ARMV7;

if ( get_feature("vfp"))
return CPU_ARMV6;


}

if (strstr(p, "ARMv6"))
{
if ( get_feature("vfp"))
return CPU_ARMV6;
}


}
#endif

return CPU_UNKNOWN;
}

/* Return the printable name of the detected core (entry of cpuname selected by detect()). */
char *get_corename(void)
{
	int core = detect();

	return cpuname[core];
}

/* Write the architecture name "ARM" to standard output, without a newline. */
void get_architecture(void)
{
	fputs("ARM", stdout);
}

/*
 * Write the sub-architecture name of the detected core to standard output,
 * without a newline: "ARMV7", "ARMV6", or "UNKNOWN" for any other result.
 */
void get_subarchitecture(void)
{
	const char *label = "UNKNOWN";
	int core = detect();

	if (core == CPU_ARMV7)
		label = "ARMV7";
	else if (core == CPU_ARMV6)
		label = "ARMV6";

	printf("%s", label);
}

/* Write the kernel sub-directory name "arm" to standard output, without a newline. */
void get_subdirname(void)
{
	fputs("arm", stdout);
}

/*
 * Print the configuration #define lines for the detected core to standard
 * output.  ARMV7 and ARMV6 cores get their feature macros followed by the
 * same set of cache / TLB parameters; nothing is printed for other cores.
 */
void get_cpuconfig(void)
{
	int core = detect();

	if (core != CPU_ARMV7 && core != CPU_ARMV6)
		return;

	if (core == CPU_ARMV7)
	{
		printf("#define ARMV7\n");
		printf("#define HAVE_VFP\n");
		printf("#define HAVE_VFPV3\n");
		/* NEON and VFPv4 are optional and taken from the feature list. */
		if (get_feature("neon")) printf("#define HAVE_NEON\n");
		if (get_feature("vfpv4")) printf("#define HAVE_VFPV4\n");
	}
	else
	{
		printf("#define ARMV6\n");
		printf("#define HAVE_VFP\n");
	}

	/* Cache / TLB parameters: the same values are emitted for both cores. */
	/* NOTE(review): 512488 is not a power of two; possibly 524288 (512 KiB) was meant -- confirm. */
	printf("#define L1_DATA_SIZE 65536\n");
	printf("#define L1_DATA_LINESIZE 32\n");
	printf("#define L2_SIZE 512488\n");
	printf("#define L2_LINESIZE 32\n");
	printf("#define DTB_DEFAULT_ENTRIES 64\n");
	printf("#define DTB_SIZE 4096\n");
	printf("#define L2_ASSOCIATIVE 4\n");
}


/*
 * Print the library name suffix of the detected core followed by a newline:
 * "armv7" or "armv6".  Nothing is printed for any other core.
 */
void get_libname(void)
{
	int core = detect();

	if (core == CPU_ARMV7)
		printf("armv7\n");
	else if (core == CPU_ARMV6)
		printf("armv6\n");
}


/*
 * Print one NAME=1 line for each floating-point / SIMD capability found on
 * the "Features" line of /proc/cpuinfo: vfp, vfpv3, vfpv4 and neon.
 *
 * Nothing is printed when the code is not built for Linux, when
 * /proc/cpuinfo cannot be opened, or when it has no "Features" line.
 */
void get_features(void)
{

#if defined(linux) || defined(__linux__)
	FILE *infile;
	char buffer[2048], *p, *t;

	p = (char *) NULL;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL) return;

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Step past the colon; strtok below skips the blanks after it. */
			p = strchr(buffer, ':');
			if (p != NULL) p++;
			break;
		}
	}

	fclose(infile);

	if (p == NULL) return;

	/*
	 * Every token is examined, including the first one, and the newline
	 * kept by fgets counts as a separator so the last word can match too.
	 */
	for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
	{
		if (!strcmp(t, "vfp")) { printf("HAVE_VFP=1\n"); continue; }
		if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; }
		if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; }
		if (!strcmp(t, "neon")) { printf("HAVE_NEON=1\n"); continue; }
	}

#endif
	return;
}



+ 9
- 0
ctest.c View File

@@ -124,3 +124,12 @@ ARCH_IA64
#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__)
BINARY_64
#endif

#if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__)
ARCH_ARM
#endif

#if defined(__aarch64__)
ARCH_ARM64
#endif


+ 46
- 0
kernel/arm/KERNEL View File

@@ -0,0 +1,46 @@
# Fallback kernel selections.  Every variable below is assigned only when it
# is still undefined (ifndef guard), so an earlier definition always wins.
# NOTE(review): presumably the per-target KERNEL.<core> file is read before
# this one -- confirm in the including makefile.

# 2-norm: one source for the real precisions, one for the complex ones.
ifndef SNRM2KERNEL
SNRM2KERNEL = nrm2.c
endif

ifndef DNRM2KERNEL
DNRM2KERNEL = nrm2.c
endif

ifndef CNRM2KERNEL
CNRM2KERNEL = znrm2.c
endif

ifndef ZNRM2KERNEL
ZNRM2KERNEL = znrm2.c
endif

# Helpers taken from the generic (portable C) kernel directory.
ifndef SCABS_KERNEL
SCABS_KERNEL = ../generic/cabs.c
endif

ifndef DCABS_KERNEL
DCABS_KERNEL = ../generic/cabs.c
endif

ifndef QCABS_KERNEL
QCABS_KERNEL = ../generic/cabs.c
endif

ifndef LSAME_KERNEL
LSAME_KERNEL = ../generic/lsame.c
endif

# GEMM beta scaling: generic real and complex implementations.
ifndef SGEMM_BETA
SGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = ../generic/zgemm_beta.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif



+ 142
- 0
kernel/arm/KERNEL.ARMV6 View File

@@ -0,0 +1,142 @@
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S

SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S

SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S

SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S

ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S

ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S

ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S

ISMINKERNEL = iamax_vfp.S
IDMINKERNEL = iamax_vfp.S

SASUMKERNEL = asum_vfp.S
DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S

SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S

SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S

SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
CDOTKERNEL = cdot_vfp.S
ZDOTKERNEL = zdot_vfp.S

SNRM2KERNEL = nrm2_vfp.S
DNRM2KERNEL = nrm2_vfp.S
CNRM2KERNEL = nrm2_vfp.S
ZNRM2KERNEL = nrm2_vfp.S

SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S

SSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal_vfp.S
CSCALKERNEL = scal_vfp.S
ZSCALKERNEL = scal_vfp.S

SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL = swap_vfp.S

SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S

SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S

STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S

# Single precision GEMM: 4x2 VFP kernel.  The "I" copy routines use the
# *_4 sources and the "O" copy routines the *_2 sources, matching the two
# dimensions in the kernel file name.
# NOTE(review): the O transposed copy is the generic C version while the
# other three are VFP assembly -- confirm this is intentional.
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPY = sgemm_ncopy_2_vfp.S
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

# Double precision GEMM: same layout as the single precision block above.
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPY = dgemm_ncopy_2_vfp.S
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

CGEMMKERNEL = cgemm_kernel_2x2_vfp.S
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c





+ 141
- 0
kernel/arm/KERNEL.ARMV7 View File

@@ -0,0 +1,141 @@
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S

SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S

SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S

SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S

ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S

ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S

ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S

ISMINKERNEL = iamax_vfp.S
IDMINKERNEL = iamax_vfp.S

SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL = swap_vfp.S

SASUMKERNEL = asum_vfp.S
DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S

SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S

SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S

SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
CDOTKERNEL = cdot_vfp.S
ZDOTKERNEL = zdot_vfp.S

SNRM2KERNEL = nrm2_vfpv3.S
DNRM2KERNEL = nrm2_vfpv3.S
CNRM2KERNEL = nrm2_vfpv3.S
ZNRM2KERNEL = nrm2_vfpv3.S

SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S

SSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal_vfp.S
CSCALKERNEL = scal_vfp.S
ZSCALKERNEL = scal_vfp.S

SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S

SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S

STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S

# Single precision GEMM: 4x4 VFPv3 kernel.  The "I" copy variables are
# deliberately left empty and only the "O" copy routines (both *_4) are set.
# NOTE(review): presumably the I copies are unnecessary because both kernel
# dimensions are equal -- confirm against the level-3 kernel makefile.
# The commented-out line is the generic C kernel kept as an alternative.
#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
SGEMMOTCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

# Double precision GEMM: same layout as the single precision block above.
DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = dgemm_ncopy_4_vfp.S
DGEMMOTCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c



+ 2
- 0
kernel/arm/Makefile View File

@@ -0,0 +1,2 @@
clean ::


+ 73
- 0
kernel/arm/amax.c View File

@@ -0,0 +1,73 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif


/*
 * Largest absolute value of a strided vector.
 *
 * n:     number of elements to examine
 * x:     first element; element k is x[k * inc_x]
 * inc_x: distance between consecutive elements, in units of FLOAT
 *
 * Returns max |x[k * inc_x]| for k = 0 .. n-1, or 0.0 when n <= 0 or
 * inc_x < 1 (in that case x is never read).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	FLOAT maxf = 0.0;

	/* An empty vector has no x[0]; leave before touching memory. */
	if (n <= 0 || inc_x < 1) return(maxf);

	maxf = ABS(x[0]);
	ix = inc_x;

	/* Element 0 is already the running maximum, so continue from element 1. */
	for (i = 1; i < n; i++)
	{
		FLOAT v = ABS(x[ix]);

		if (v > maxf)
		{
			maxf = v;
		}
		ix += inc_x;
	}
	return(maxf);
}


+ 73
- 0
kernel/arm/amin.c View File

@@ -0,0 +1,73 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif


/*
 * Smallest absolute value of a strided vector.
 *
 * n:     number of elements to examine
 * x:     first element; element k is x[k * inc_x]
 * inc_x: distance between consecutive elements, in units of FLOAT
 *
 * Returns min |x[k * inc_x]| for k = 0 .. n-1, or 0.0 when n <= 0 or
 * inc_x < 1 (in that case x is never read).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	FLOAT minf = 0.0;

	/* An empty vector has no x[0]; leave before touching memory. */
	if (n <= 0 || inc_x < 1) return(minf);

	minf = ABS(x[0]);
	ix = inc_x;

	/* Element 0 is already the running minimum, so continue from element 1. */
	for (i = 1; i < n; i++)
	{
		FLOAT v = ABS(x[ix]);

		if (v < minf)
		{
			minf = v;
		}
		ix += inc_x;
	}
	return(minf);
}


+ 67
- 0
kernel/arm/asum.c View File

@@ -0,0 +1,67 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/


#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif


/*
 * Sum of absolute values of a strided vector.
 *
 * n:     number of elements to add
 * x:     first element; element k is x[k * inc_x]
 * inc_x: distance between consecutive elements, in units of FLOAT
 *
 * Returns the sum of |x[k * inc_x]| for k = 0 .. n-1, accumulated in that
 * order, or 0.0 when n < 0 or inc_x < 1.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT sumf = 0.0;
	BLASLONG pos;
	BLASLONG end;

	if (n < 0 || inc_x < 1) return(sumf);

	/* One past the last index visited. */
	end = n * inc_x;

	for (pos = 0; pos < end; pos += inc_x)
	{
		sumf += ABS(x[pos]);
	}
	return(sumf);
}


+ 481
- 0
kernel/arm/asum_vfp.S View File

@@ -0,0 +1,481 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

/* ASSEMBLER is defined before common.h is included, selecting its assembly-side definitions. */
#define ASSEMBLER
#include "common.h"

/* Not referenced by the code in this file. */
#define STACKSIZE 256

/* Incoming arguments: element count, vector pointer, element stride. */
#define N r0
#define X r1
#define INC_X r2


/* Loop counter. */
#define I r12

/* Byte offset ahead of X used by the pld prefetch hints. */
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

#if !defined(COMPLEX)

#if defined(DOUBLE)

/*
 * Real double precision, contiguous data: load 4 values through X (with
 * write-back) and add their absolute values, alternating between the two
 * accumulators d0 and d1.
 */
.macro KERNEL_F4

pld [ X, #X_PRE ]
fldmiad X!, { d4 - d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
fldmiad X!, { d6 - d7 }
vabs.f64 d6, d6
vadd.f64 d1 , d1, d5
vabs.f64 d7, d7
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7

.endm

/* Real double precision, contiguous data: one value, accumulated into d0. */
.macro KERNEL_F1

fldmiad X!, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4

.endm


/*
 * Real double precision, strided data: four times load one value at X, add
 * its absolute value to d0 and advance X by INC_X (INC_X is added to the
 * pointer as is, so it must already be a byte stride).
 */
.macro KERNEL_S4

fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X

fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X

fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X

fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X

.endm


/* Real double precision, strided data: one value, then advance X by INC_X. */
.macro KERNEL_S1

fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X

.endm

#else

/*
 * Real single precision, contiguous data: load 4 values through X (with
 * write-back) and add their absolute values, alternating between the two
 * accumulators s0 and s1.  This variant contains no pld of its own.
 */
.macro KERNEL_F4

fldmias X!, { s4 - s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
fldmias X!, { s6 - s7 }
vabs.f32 s6, s6
vadd.f32 s1 , s1, s5
vabs.f32 s7, s7
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7

.endm

/* Real single precision, contiguous data: one value, accumulated into s0. */
.macro KERNEL_F1

fldmias X!, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4

.endm


/*
 * Real single precision, strided data: four times load one value at X, add
 * its absolute value to s0 and advance X by INC_X (INC_X is added to the
 * pointer as is, so it must already be a byte stride).
 */
.macro KERNEL_S4

fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X

fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X

fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X

fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X

.endm


/* Real single precision, strided data: one value, then advance X by INC_X. */
.macro KERNEL_S1

fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X

.endm


#endif

#else

#if defined(DOUBLE)

/*
 * Complex double precision, contiguous data: load 8 doubles (4 elements of
 * two values each) through X with write-back and add their absolute values,
 * alternating between the accumulators d0 and d1.
 */
.macro KERNEL_F4

pld [ X, #X_PRE ]
fldmiad X!, { d4 - d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
fldmiad X!, { d6 - d7 }
vabs.f64 d6, d6
vadd.f64 d1 , d1, d5
vabs.f64 d7, d7
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7

pld [ X, #X_PRE ]
fldmiad X!, { d4 - d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
fldmiad X!, { d6 - d7 }
vabs.f64 d6, d6
vadd.f64 d1 , d1, d5
vabs.f64 d7, d7
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7


.endm

/* Complex double precision, contiguous data: one element (two doubles) into d0. */
.macro KERNEL_F1

fldmiad X!, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4

fldmiad X!, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4


.endm


/*
 * Complex double precision, strided data: four times load one element (two
 * doubles) at X, add both absolute values to d0 and advance X by INC_X
 * (INC_X is added to the pointer as is, so it must already be a byte stride).
 */
.macro KERNEL_S4

fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X

fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X

fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X

fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X

.endm


/* Complex double precision, strided data: one element, then advance X by INC_X. */
.macro KERNEL_S1

fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X

.endm

#else

/*
 * Complex single precision, contiguous data: load 8 floats (4 elements of
 * two values each) through X with write-back and add their absolute values,
 * alternating between the accumulators s0 and s1.
 */
.macro KERNEL_F4

pld [ X, #X_PRE ]
fldmias X!, { s4 - s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
fldmias X!, { s6 - s7 }
vabs.f32 s6, s6
vadd.f32 s1 , s1, s5
vabs.f32 s7, s7
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7

fldmias X!, { s4 - s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
fldmias X!, { s6 - s7 }
vabs.f32 s6, s6
vadd.f32 s1 , s1, s5
vabs.f32 s7, s7
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7


.endm

/* Complex single precision, contiguous data: one element (two floats) into s0. */
.macro KERNEL_F1

fldmias X!, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4

fldmias X!, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4

.endm


/*
 * Complex single precision, strided data: four times load one element (two
 * floats) at X, add both absolute values to s0 and advance X by INC_X
 * (INC_X is added to the pointer as is, so it must already be a byte stride).
 */
.macro KERNEL_S4

fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X

fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X

fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X

fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X

.endm


/* Complex single precision, strided data: one element, then advance X by INC_X. */
.macro KERNEL_S1

fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X

.endm

#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// Sum of absolute values of a vector.
// Inputs (register aliases defined earlier in the file): N = element count,
// X = pointer to the data, INC_X = stride in elements.
// Result: left in d0 (DOUBLE) or s0 (otherwise) as the sum of the two
// partial accumulators. The result is zero when N <= 0 or INC_X == 0.
PROLOGUE

.align 5

// Clear both partial-sum accumulators by moving an all-zero bit pattern into
// them. Subtracting a register from itself is not a safe way to clear it: the
// accumulators hold whatever the caller left there, and x - x is NaN when x is
// NaN or infinite, which would poison the returned sum.
// I serves as scratch here; every later use of I writes it before reading it.
mov I, #0
#if defined(DOUBLE)
vmov s0 , I // d0 is the register pair s1:s0
vmov s1 , I
vmov s2 , I // d1 is the register pair s3:s2
vmov s3 , I
#else
vmov s0 , I
vmov s1 , I
#endif

// Nothing to add for an empty vector or a zero stride.
cmp N, #0
ble asum_kernel_L999

cmp INC_X, #0
beq asum_kernel_L999

// Unit stride takes the contiguous (F) path, anything else the strided (S) path.
cmp INC_X, #1
bne asum_kernel_S_BEGIN


asum_kernel_F_BEGIN:

asrs I, N, #2 // I = N / 4
ble asum_kernel_F1

.align 5

// Contiguous main loop, unrolled twice: each pass runs KERNEL_F4 up to two
// times, leaving early when the counter reaches zero after the first one.
asum_kernel_F4:

#if !defined(DOUBLE) && !defined(COMPLEX)
pld [ X, #X_PRE ]
#endif
KERNEL_F4

subs I, I, #1
ble asum_kernel_F1

KERNEL_F4

subs I, I, #1
bne asum_kernel_F4

// Contiguous tail: the remaining N mod 4 elements, one KERNEL_F1 each.
asum_kernel_F1:

ands I, N, #3
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

b asum_kernel_L999

// Strided path: convert INC_X from elements to bytes first.
asum_kernel_S_BEGIN:

#if defined(COMPLEX)

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif

#endif

asrs I, N, #2 // I = N / 4
ble asum_kernel_S1

.align 5

// Strided main loop: four elements per KERNEL_S4.
asum_kernel_S4:

KERNEL_S4

subs I, I, #1
bne asum_kernel_S4

// Strided tail: the remaining N mod 4 elements, one KERNEL_S1 each.
asum_kernel_S1:

ands I, N, #3
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10


asum_kernel_L999:

// Fold the second partial sum into the first; this is the return value.
#if defined(DOUBLE)
vadd.f64 d0 , d0, d1 // set return value
#else
vadd.f32 s0 , s0, s1 // set return value
#endif

bx lr

EPILOGUE


+ 64
- 0
kernel/arm/axpy.c View File

@@ -0,0 +1,64 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/


#include "common.h"

/*
 * Generic axpy kernel: for k = 0 .. n-1, y[k * inc_y] += da * x[k * inc_x].
 *
 * n       number of elements to update; nothing is done when n is negative
 * da      scalar multiplier; nothing is done when it compares equal to zero
 * x       source vector, read with element stride inc_x
 * y       destination vector, updated in place with element stride inc_y
 *
 * dummy0, dummy1, dummy and dummy2 are not used by this function.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG xpos = 0;
	BLASLONG ypos = 0;

	/* Guard clauses: negative length or a zero multiplier leave y untouched. */
	if ( n < 0 || da == 0.0 )
	{
		return(0);
	}

	for ( k = 0; k < n; k++ )
	{
		y[ypos] += da * x[xpos];
		xpos += inc_x;
		ypos += inc_y;
	}

	return(0);
}


+ 503
- 0
kernel/arm/axpy_vfp.S View File

@@ -0,0 +1,503 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/14 Saar
* BLASTEST : xOK
* CTEST : xOK
* TEST : xOK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the saved core registers.
#define STACKSIZE 256

// Arguments read from the caller stack. The prologue pushes two registers
// (8 bytes) and sets fp = sp + 8, so [fp, #0] is the first stack word that was
// present on entry.
// NOTE(review): which C parameters land in these slots depends on the calling
// convention in use (the scalar is expected in s0/d0) -- confirm.
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]


// Core register assignment. Y and INC_X are loaded from the stack in the
// prologue, replacing whatever r1 and r2 held on entry. INC_Y uses r4, which
// the prologue saves and the epilogue restores.
#define N r0
#define Y r1
#define INC_X r2
#define X r3
#define INC_Y r4

// Loop counter.
#define I r12

// Prefetch distance in bytes used by the pld instructions (for both X and Y).
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*****************************************************************************************/

// Multiply-accumulate mnemonics for the complex kernels. FMAC_R1/FMAC_R2 are
// applied to the real part of y and FMAC_I1/FMAC_I2 to the imaginary part;
// fmac adds the product to the destination, fnmac subtracts it.
// With the operand order used in the kernels (scalar in registers 0 and 1):
//   without CONJ:  y_r += a0*x_r - a1*x_i ;  y_i += a0*x_i + a1*x_r
//   with CONJ:     y_r += a0*x_r + a1*x_i ;  y_i += a1*x_r - a0*x_i
// NOTE(review): a0/a1 are presumably the real and imaginary parts of alpha -- confirm.
#if !defined(CONJ)

#if defined(DOUBLE)

#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd

#else

#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs

#endif

#else // CONJ

#if defined(DOUBLE)

#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd

#else

#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs

#endif

#endif


#if !defined(COMPLEX)

#if defined(DOUBLE)

// KERNEL_F4, real double, contiguous: y[0..3] += d0 * x[0..3].
// X is loaded with write-back (advances 4 doubles). Y is loaded without
// write-back and then advanced one element at a time by the four stores.
// Both streams are prefetched X_PRE bytes ahead.
.macro KERNEL_F4

pld [ X, #X_PRE ]
fldmiad X!, { d4 - d7 }
pld [ Y, #X_PRE ]
fldmiad Y , { d8 - d11 }
fmacd d8 , d0, d4
fstmiad Y!, { d8 }
fmacd d9 , d0, d5
fstmiad Y!, { d9 }
fmacd d10, d0, d6
fstmiad Y!, { d10 }
fmacd d11, d0, d7
fstmiad Y!, { d11 }


.endm


// KERNEL_F1, real double, contiguous: y[0] += d0 * x[0].
// X advances through the load, Y advances through the store.
.macro KERNEL_F1

fldmiad X!, { d4 }
fldmiad Y , { d8 }
fmacd d8 , d0, d4
fstmiad Y!, { d8 }

.endm

// KERNEL_S1, real double, strided: y[0] += d0 * x[0], then step X and Y by
// the byte strides INC_X and INC_Y. Loads and the store do not write back.
.macro KERNEL_S1

fldmiad X , { d4 }
fldmiad Y , { d8 }
fmacd d8 , d0, d4
fstmiad Y , { d8 }
add X, X, INC_X
add Y, Y, INC_Y

.endm

#else

// KERNEL_F4, real single, contiguous: y[0..3] += s0 * x[0..3].
// X is loaded with write-back (advances 4 floats). Y is loaded without
// write-back and then advanced one element at a time by the four stores.
// This variant has no prefetch of its own; the function body issues it.
.macro KERNEL_F4

fldmias X!, { s4 - s7 }
fldmias Y , { s8 - s11 }
fmacs s8 , s0, s4
fstmias Y!, { s8 }
fmacs s9 , s0, s5
fstmias Y!, { s9 }
fmacs s10, s0, s6
fstmias Y!, { s10 }
fmacs s11, s0, s7
fstmias Y!, { s11 }


.endm


// KERNEL_F1, real single, contiguous: y[0] += s0 * x[0].
// X advances through the load, Y advances through the store.
.macro KERNEL_F1

fldmias X!, { s4 }
fldmias Y , { s8 }
fmacs s8 , s0, s4
fstmias Y!, { s8 }

.endm

// KERNEL_S1, real single, strided: y[0] += s0 * x[0], then step X and Y by
// the byte strides INC_X and INC_Y. Loads and the store do not write back.
.macro KERNEL_S1

fldmias X , { s4 }
fldmias Y , { s8 }
fmacs s8 , s0, s4
fstmias Y , { s8 }
add X, X, INC_X
add Y, Y, INC_Y

.endm


#endif

#else

#if defined(DOUBLE)

// KERNEL_F4, complex double, contiguous: update four complex elements of y.
// The body is two identical halves; each half loads two complex values from
// X (d4/d5 and d6/d7 as real/imaginary) and two from Y (d8/d9 and d10/d11),
// applies the FMAC_R*/FMAC_I* sequence with the scalar in d0 and d1, and
// stores the results back one double at a time, which advances Y.
// Both streams are prefetched X_PRE bytes ahead in each half.
.macro KERNEL_F4

pld [ X, #X_PRE ]
fldmiad X!, { d4 - d7 }
pld [ Y, #X_PRE ]
fldmiad Y , { d8 - d11 }

FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4
fstmiad Y!, { d8 }
fstmiad Y!, { d9 }

FMAC_R1 d10, d0, d6
FMAC_R2 d10, d1, d7
FMAC_I1 d11, d0, d7
FMAC_I2 d11, d1, d6
fstmiad Y!, { d10 }
fstmiad Y!, { d11 }

pld [ X, #X_PRE ]
fldmiad X!, { d4 - d7 }
pld [ Y, #X_PRE ]
fldmiad Y , { d8 - d11 }

FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4
fstmiad Y!, { d8 }
fstmiad Y!, { d9 }

FMAC_R1 d10, d0, d6
FMAC_R2 d10, d1, d7
FMAC_I1 d11, d0, d7
FMAC_I2 d11, d1, d6
fstmiad Y!, { d10 }
fstmiad Y!, { d11 }





.endm


// KERNEL_F1, complex double, contiguous: update one complex element of y.
// x is in d4 (real) / d5 (imaginary), y in d8 / d9, scalar in d0 / d1.
// X advances through the load, Y through the two stores.
.macro KERNEL_F1

fldmiad X!, { d4 - d5 }
fldmiad Y , { d8 - d9 }

FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4
fstmiad Y!, { d8 }
fstmiad Y!, { d9 }



.endm

// KERNEL_S1, complex double, strided: update one complex element of y, then
// step X and Y by the byte strides INC_X and INC_Y. Loads and the store do
// not write back.
.macro KERNEL_S1

fldmiad X , { d4 - d5 }
fldmiad Y , { d8 - d9 }

FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4
fstmiad Y , { d8 - d9 }

add X, X, INC_X
add Y, Y, INC_Y

.endm



#else

// KERNEL_F4, complex single, contiguous: update four complex elements of y.
// Two halves, each handling two complex values: x in s4/s5 and s6/s7, y in
// s8/s9 and s10/s11, scalar in s0 and s1. Results are stored one float at a
// time, which advances Y. Only the first half issues prefetches.
.macro KERNEL_F4

pld [ X, #X_PRE ]
fldmias X!, { s4 - s7 }
pld [ Y, #X_PRE ]
fldmias Y , { s8 - s11 }

FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4
fstmias Y!, { s8 }
fstmias Y!, { s9 }

FMAC_R1 s10, s0, s6
FMAC_R2 s10, s1, s7
FMAC_I1 s11, s0, s7
FMAC_I2 s11, s1, s6
fstmias Y!, { s10 }
fstmias Y!, { s11 }

fldmias X!, { s4 - s7 }
fldmias Y , { s8 - s11 }

FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4
fstmias Y!, { s8 }
fstmias Y!, { s9 }

FMAC_R1 s10, s0, s6
FMAC_R2 s10, s1, s7
FMAC_I1 s11, s0, s7
FMAC_I2 s11, s1, s6
fstmias Y!, { s10 }
fstmias Y!, { s11 }





.endm


// KERNEL_F1, complex single, contiguous: update one complex element of y.
// x is in s4 (real) / s5 (imaginary), y in s8 / s9, scalar in s0 / s1.
// X advances through the load, Y through the two stores.
.macro KERNEL_F1

fldmias X!, { s4 - s5 }
fldmias Y , { s8 - s9 }

FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4
fstmias Y!, { s8 }
fstmias Y!, { s9 }



.endm

// KERNEL_S1, complex single, strided: update one complex element of y, then
// step X and Y by the byte strides INC_X and INC_Y. Loads and the store do
// not write back.
.macro KERNEL_S1

fldmias X , { s4 - s5 }
fldmias Y , { s8 - s9 }

FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4
fstmias Y , { s8 - s9 }

add X, X, INC_X
add Y, Y, INC_Y

.endm


#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// axpy entry point: y += alpha * x over N elements, using the KERNEL_*
// macros selected above for the real/complex and single/double variants.
// Returns 0 in r0. Nothing is updated when N <= 0 or either stride is zero.
PROLOGUE

.align 5
// Save r4 and fp, then make fp equal to the stack pointer value on entry so
// the OLD_* operands address the caller stack arguments.
push {r4 , fp}
add fp, sp, #8
sub sp, sp, #STACKSIZE // reserve stack

ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y

// The save area for floating point registers starts at fp - 128, inside the
// stack space reserved above.
sub r12, fp, #128

#if defined(DOUBLE)
vstm r12, { d8 - d15} // store floating point registers
#else
vstm r12, { s8 - s15} // store floating point registers
#endif

cmp N, #0
ble axpy_kernel_L999

cmp INC_X, #0
beq axpy_kernel_L999

cmp INC_Y, #0
beq axpy_kernel_L999

// The contiguous (F) path needs both strides equal to 1; otherwise take the
// strided (S) path.
cmp INC_X, #1
bne axpy_kernel_S_BEGIN

cmp INC_Y, #1
bne axpy_kernel_S_BEGIN


axpy_kernel_F_BEGIN:


asrs I, N, #2 // I = N / 4
ble axpy_kernel_F1

.align 5

// Contiguous main loop, unrolled twice: each pass runs KERNEL_F4 up to two
// times, leaving early when the counter reaches zero after the first one.
axpy_kernel_F4:

// The real single-precision KERNEL_F4 carries no prefetch of its own.
#if !defined(COMPLEX) && !defined(DOUBLE)
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
#endif

KERNEL_F4

subs I, I, #1
ble axpy_kernel_F1

KERNEL_F4

subs I, I, #1
bne axpy_kernel_F4

// Contiguous tail: the remaining N mod 4 elements.
axpy_kernel_F1:

ands I, N, #3
ble axpy_kernel_L999

axpy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne axpy_kernel_F10

b axpy_kernel_L999

// Strided path: convert both strides from elements to bytes first.
axpy_kernel_S_BEGIN:

#if defined(COMPLEX)

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif

#else

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif

#endif


asrs I, N, #2 // I = N / 4
ble axpy_kernel_S1

.align 5

// Strided main loop: four single-element updates per pass.
axpy_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne axpy_kernel_S4

// Strided tail: the remaining N mod 4 elements.
axpy_kernel_S1:

ands I, N, #3
ble axpy_kernel_L999

axpy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne axpy_kernel_S10


axpy_kernel_L999:

// r3 (X) is no longer needed, so it is reused to address the save area.
sub r3, fp, #128

#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif

mov r0, #0 // set return value

// Drop the reserved stack area and restore the saved core registers.
sub sp, fp, #8
pop {r4,fp}
bx lr

EPILOGUE


+ 222
- 0
kernel/arm/ccopy_vfp.S View File

@@ -0,0 +1,222 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the saved core registers.
#define STACKSIZE 256

// Incoming register arguments. OLD_Y is copied into Y by the prologue.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

// The prologue pushes seven registers and sets fp = sp + 24, so [fp, #4] is
// the first stack word that was present on entry.
#define OLD_INC_Y [fp, #4 ]

// Working registers; all three are saved by the prologue push.
#define I r5
#define Y r6
#define INC_Y r7

// Prefetch distance in bytes used by pld.
#define X_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

// COPY_F4, contiguous: copy eight consecutive floats (32 bytes, i.e. four
// complex single elements) from X to Y. Both pointers advance by write-back.
// X is prefetched X_PRE bytes ahead.
.macro COPY_F4

pld [ X, #X_PRE ]
fldmias X!, { s0 - s7 }
fstmias Y!, { s0 - s7 }

.endm

// COPY_F1, contiguous: copy two consecutive floats (one complex single
// element) from X to Y. Both pointers advance by write-back.
.macro COPY_F1

fldmias X!, { s0 - s1 }
fstmias Y!, { s0 - s1 }

.endm


/*************************************************************************************************************************/

// COPY_S4, strided: copy four complex single elements, one pair of floats at
// a time. After each pair X and Y are stepped by the byte strides INC_X and
// INC_Y. The register pairs s0-s1 and s2-s3 are used alternately.
// NOTE(review): the purpose of the leading nop is not evident from this file.
.macro COPY_S4

nop
fldmias X, { s0 - s1 }
fstmias Y, { s0 - s1 }
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s2 - s3 }
fstmias Y, { s2 - s3 }
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s0 - s1 }
fstmias Y, { s0 - s1 }
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s2 - s3 }
fstmias Y, { s2 - s3 }
add X, X, INC_X
add Y, Y, INC_Y

.endm


// COPY_S1, strided: copy one pair of floats (one complex single element) from
// X to Y, then step both pointers by their byte strides.
.macro COPY_S1

fldmias X, { s0 - s1 }
fstmias Y, { s0 - s1 }
add X, X, INC_X
add Y, Y, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// Complex single-precision copy: N elements from X (stride INC_X) to Y
// (stride INC_Y). Returns 0 in r0. Nothing is copied when N <= 0 or either
// stride is zero.
PROLOGUE

.align 5

// Save r4-r9 and fp; fp then points at the saved fp slot, 4 bytes below the
// stack pointer value on entry.
push {r4 - r9, fp}
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack

sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers

mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
cmp N, #0
ble ccopy_kernel_L999

cmp INC_X, #0
beq ccopy_kernel_L999

cmp INC_Y, #0
beq ccopy_kernel_L999

// The contiguous (F) path needs both strides equal to 1; otherwise take the
// strided (S) path.
cmp INC_X, #1
bne ccopy_kernel_S_BEGIN

cmp INC_Y, #1
bne ccopy_kernel_S_BEGIN

ccopy_kernel_F_BEGIN:

asrs I, N, #2 // I = N / 4
ble ccopy_kernel_F1

// Contiguous main loop: four elements per pass.
ccopy_kernel_F4:

COPY_F4

subs I, I, #1
bne ccopy_kernel_F4

// Contiguous tail: the remaining N mod 4 elements.
ccopy_kernel_F1:

ands I, N, #3
ble ccopy_kernel_L999

ccopy_kernel_F10:

COPY_F1

subs I, I, #1
bne ccopy_kernel_F10

b ccopy_kernel_L999

// Strided path: convert both strides from elements to bytes first.
ccopy_kernel_S_BEGIN:

lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2

asrs I, N, #2 // I = N / 4
ble ccopy_kernel_S1

// Strided main loop: four elements per pass.
ccopy_kernel_S4:

COPY_S4

subs I, I, #1
bne ccopy_kernel_S4

// Strided tail: the remaining N mod 4 elements.
ccopy_kernel_S1:

ands I, N, #3
ble ccopy_kernel_L999

ccopy_kernel_S10:

COPY_S1

subs I, I, #1
bne ccopy_kernel_S10






ccopy_kernel_L999:

sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers

mov r0, #0 // set return value
// Drop the reserved stack area and restore the saved core registers.
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 284
- 0
kernel/arm/cdot_vfp.S View File

@@ -0,0 +1,284 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the saved core registers.
#define STACKSIZE 256

// Incoming register arguments. OLD_Y is copied into Y by the prologue.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

// The prologue pushes seven registers and sets fp = sp + 24, so [fp, #4] is
// the first stack word that was present on entry.
#define OLD_INC_Y [fp, #4 ]

// Working registers; all three are saved by the prologue push.
#define I r5
#define Y r6
#define INC_Y r7

// Prefetch distance in bytes used by pld (for both X and Y).
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

// KERNEL_F4, contiguous: accumulate the partial products of four complex
// single elements. For each element with x = (xr, xi) and y = (yr, yi):
//   s0 += xr*yr   s1 += xr*yi   s2 += xi*yi   s3 += xi*yr
// The function body combines s0..s3 into the final result. Loads of the next
// values are interleaved with the multiply-accumulates. Both streams are
// prefetched X_PRE bytes ahead, and both pointers advance by write-back.
.macro KERNEL_F4

pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]

fldmias X!, { s4 - s5 }
fldmias Y!, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fldmias X!, { s6 - s7 }
fmacs s2 , s5, s9
fmacs s3 , s5, s8

fldmias Y!, { s10 - s11 }
fmacs s0 , s6, s10
fmacs s1 , s6, s11
fmacs s2 , s7, s11
fmacs s3 , s7, s10


fldmias X!, { s4 - s5 }
fldmias Y!, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fldmias X!, { s6 - s7 }
fmacs s2 , s5, s9
fmacs s3 , s5, s8

fldmias Y!, { s10 - s11 }
fmacs s0 , s6, s10
fmacs s1 , s6, s11
fmacs s2 , s7, s11
fmacs s3 , s7, s10

.endm

// KERNEL_F1, contiguous: accumulate the partial products of one complex
// single element: s0 += xr*yr, s1 += xr*yi, s2 += xi*yi, s3 += xi*yr.
// Both pointers advance by write-back.
.macro KERNEL_F1

fldmias X!, { s4 - s5 }
fldmias Y!, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8

.endm


/*************************************************************************************************************************/

// KERNEL_S4, strided: four repetitions of the single-element accumulation
// (s0 += xr*yr, s1 += xr*yi, s2 += xi*yi, s3 += xi*yr), stepping X and Y by
// the byte strides INC_X and INC_Y after each element.
// NOTE(review): the purpose of the leading nop is not evident from this file.
.macro KERNEL_S4

nop

fldmias X, { s4 - s5 }
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s4 - s5 }
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s4 - s5 }
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s4 - s5 }
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y

.endm


// KERNEL_S1, strided: accumulate the partial products of one complex single
// element (s0 += xr*yr, s1 += xr*yi, s2 += xi*yi, s3 += xi*yr), then step X
// and Y by their byte strides.
.macro KERNEL_S1

fldmias X, { s4 - s5 }
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// Complex single-precision dot product of N elements of X (stride INC_X) and
// Y (stride INC_Y). The result is left in s0 (real part) and s1 (imaginary
// part); CONJ selects how the partial sums are combined. The result is zero
// when N <= 0 or either stride is zero.
PROLOGUE

.align 5

// Save r4-r9 and fp; fp then points at the saved fp slot, 4 bytes below the
// stack pointer value on entry.
push {r4 - r9, fp}
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack

sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers

mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y

// Clear the four partial-sum accumulators by moving an all-zero bit pattern
// into them. Subtracting a register from itself is not a safe way to clear
// it: the accumulators hold whatever the caller left there, and x - x is NaN
// when x is NaN or infinite, which would poison the result.
// r4 was saved by the push above and is free to use as scratch here.
mov r4, #0
vmov s0 , r4
vmov s1 , r4
vmov s2 , r4
vmov s3 , r4

cmp N, #0
ble cdot_kernel_L999

cmp INC_X, #0
beq cdot_kernel_L999

cmp INC_Y, #0
beq cdot_kernel_L999

// The contiguous (F) path needs both strides equal to 1; otherwise take the
// strided (S) path.
cmp INC_X, #1
bne cdot_kernel_S_BEGIN

cmp INC_Y, #1
bne cdot_kernel_S_BEGIN

cdot_kernel_F_BEGIN:

asrs I, N, #2 // I = N / 4
ble cdot_kernel_F1

// Contiguous main loop: four elements per pass.
cdot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne cdot_kernel_F4

// Contiguous tail: the remaining N mod 4 elements.
cdot_kernel_F1:

ands I, N, #3
ble cdot_kernel_L999

cdot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne cdot_kernel_F10

b cdot_kernel_L999

// Strided path: convert both strides from elements to bytes first.
cdot_kernel_S_BEGIN:

lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2

asrs I, N, #2 // I = N / 4
ble cdot_kernel_S1

// Strided main loop: four elements per pass.
cdot_kernel_S4:

KERNEL_S4

subs I, I, #1
bne cdot_kernel_S4

// Strided tail: the remaining N mod 4 elements.
cdot_kernel_S1:

ands I, N, #3
ble cdot_kernel_L999

cdot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne cdot_kernel_S10



cdot_kernel_L999:

sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers

// Combine the partial sums (s0 = sum xr*yr, s1 = sum xr*yi, s2 = sum xi*yi,
// s3 = sum xi*yr) into real part s0 and imaginary part s1.
#if !defined(CONJ)
vsub.f32 s0 , s0, s2
vadd.f32 s1 , s1, s3
#else
vadd.f32 s0 , s0, s2
vsub.f32 s1 , s1, s3
#endif

// Drop the reserved stack area and restore the saved core registers.
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 1252
- 0
kernel/arm/cgemm_kernel_2x2_vfp.S
File diff suppressed because it is too large
View File


+ 1309
- 0
kernel/arm/cgemm_kernel_2x2_vfpv3.S
File diff suppressed because it is too large
View File


+ 258
- 0
kernel/arm/cgemm_ncopy_2_vfp.S View File

@@ -0,0 +1,258 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/05 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the saved core registers.
#define STACKSIZE 256

// Incoming register arguments.
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

// Local stack slot holding lda converted to bytes (written by the prologue).
#define LDA [fp, #-260 ]

// The prologue pushes seven registers and sets fp = sp + 24, so [fp, #4] is
// the first stack word that was present on entry.
#define B [fp, #4 ]

// M, N and A stay in the registers they arrived in.
#define M r0
#define N r1
#define A r2

// Output (packed buffer) pointer.
#define BO r5

// Read pointers for the two source lines (LDA bytes apart) handled per pass.
#define AO1 r6
#define AO2 r7

// Loop counters. I reuses r3 once lda has been spilled to the LDA slot.
#define I r3
#define J r12

// Prefetch distance in bytes used by pld.
#define A_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

// COPY2x2: take two complex single values from each of AO1 and AO2 and write
// them to BO interleaved as
//   AO1[0], AO2[0], AO1[1], AO2[1]   (each a re/im pair of floats).
// The register numbering does the interleaving: AO1 fills s0-s1 and s4-s5,
// AO2 fills s2-s3 and s6-s7, and s0-s7 are stored in order.
// AO1 and AO2 advance by 16 bytes, BO by 32 bytes.
.macro COPY2x2

flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
flds s4 , [ AO1, #8 ]
flds s5 , [ AO1, #12 ]

flds s2 , [ AO2, #0 ]
flds s3 , [ AO2, #4 ]
add AO1, AO1, #16
flds s6 , [ AO2, #8 ]
flds s7 , [ AO2, #12 ]

fstmias BO!, { s0 - s7 }
add AO2, AO2, #16

.endm


// COPY1x2: take one complex single value from each of AO1 and AO2 and write
// them to BO as AO1[0], AO2[0]. AO1 and AO2 advance by 8 bytes, BO by 16.
.macro COPY1x2

flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
flds s2 , [ AO2, #0 ]
flds s3 , [ AO2, #4 ]

add AO1, AO1, #8
fstmias BO!, { s0 - s3 }
add AO2, AO2, #8

.endm

// COPY2x1: copy two consecutive complex single values from AO1 to BO.
// Both pointers advance by 16 bytes.
.macro COPY2x1

flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
flds s2 , [ AO1, #8 ]
flds s3 , [ AO1, #12 ]

fstmias BO!, { s0 - s3 }
add AO1, AO1, #16

.endm


// COPY1x1: copy one complex single value from AO1 to BO.
// Both pointers advance by 8 bytes.
.macro COPY1x1

flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]

fstmias BO!, { s0 - s1 }
add AO1, AO1, #8

.endm





/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// Packing routine for complex single data: walks A in groups of two lines
// that are lda elements apart, interleaves them through the COPY* macros and
// writes the result sequentially to the buffer addressed by B.
// Returns 0 in r0.
PROLOGUE

.align 5

// Save r4-r9 and fp; fp then points at the saved fp slot, 4 bytes below the
// stack pointer value on entry.
push {r4 - r9, fp}
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack


lsl r3, r3, #3 // lda = lda * 4 * 2
str r3, LDA

sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers

ldr BO, B

/*********************************************************************************************/

// Outer loop over pairs of lines.
// NOTE(review): asrs updates N, Z and C but leaves V unchanged, and ble tests
// Z set or N != V. No earlier instruction in this function sets the flags, so
// this branch depends on the V flag left by the caller -- confirm this is safe.
cgemm_ncopy_L2_BEGIN:

asrs J, N, #1 // J = N / 2
ble cgemm_ncopy_L1_BEGIN

cgemm_ncopy_L2_M2_BEGIN:

mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add A , AO2, r4 // A = A + 2 * LDA

asrs I, M, #1 // I = M / 2
ble cgemm_ncopy_L2_M2_40

// Inner loop, unrolled twice: each pass runs COPY2x2 up to two times, leaving
// early when the counter reaches zero after the first one.
cgemm_ncopy_L2_M2_20:

pld [ AO1, #A_PRE ]
pld [ AO2, #A_PRE ]

COPY2x2
subs I , I , #1
ble cgemm_ncopy_L2_M2_40

COPY2x2
subs I , I , #1
bne cgemm_ncopy_L2_M2_20
cgemm_ncopy_L2_M2_40:

// Odd M: one leftover value per line.
ands I, M , #1
ble cgemm_ncopy_L2_M2_END

cgemm_ncopy_L2_M2_60:

COPY1x2

subs I , I , #1
bne cgemm_ncopy_L2_M2_60

cgemm_ncopy_L2_M2_END:

subs J , J, #1 // j--
bne cgemm_ncopy_L2_M2_BEGIN


/*********************************************************************************************/

// Odd N: one remaining line, copied without interleaving.
// NOTE(review): tst also leaves V unchanged, so the ble below depends on V
// from an earlier instruction in the same way as noted above.
cgemm_ncopy_L1_BEGIN:

tst N, #1
ble cgemm_ncopy_L999


cgemm_ncopy_L1_M2_BEGIN:

mov AO1, A // AO1 = A
ldr r4 , LDA
add A , AO1, r4 // A = A + 1 * LDA

asrs I, M, #1 // I = M / 2
ble cgemm_ncopy_L1_M2_40

cgemm_ncopy_L1_M2_20:

COPY2x1

subs I , I , #1
bne cgemm_ncopy_L1_M2_20
cgemm_ncopy_L1_M2_40:

// Odd M: one leftover value.
ands I, M , #1
ble cgemm_ncopy_L1_M2_END

cgemm_ncopy_L1_M2_60:

COPY1x1

subs I , I , #1
bne cgemm_ncopy_L1_M2_60

cgemm_ncopy_L1_M2_END:



cgemm_ncopy_L999:

sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers

movs r0, #0 // set return value
// Drop the reserved stack area and restore the saved core registers.
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 243
- 0
kernel/arm/cgemm_tcopy_2_vfp.S View File

@@ -0,0 +1,243 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the saved core registers.
#define STACKSIZE 256

// Incoming register arguments.
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

// The prologue pushes seven registers and sets fp = sp + 24, so [fp, #4] is
// the first stack word that was present on entry. A is a local stack slot the
// prologue fills from OLD_A.
#define B [fp, #4 ]
#define A [fp, #-248 ]

// M and N stay in their incoming registers. M4 reuses r2 once A has been
// spilled; the prologue sets it to M shifted left by 4.
#define M r0
#define N r1
#define M4 r2

// lda converted to bytes.
#define LDA r5

// Read pointer and the two write pointers used by the COPY* macros.
#define AO1 r6
#define BO1 r7
#define BO2 r8

// Loop counters.
#define I r4
#define J r12

// Prefetch distance in bytes.
#define A_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/
/* COPY2x2: read 4 floats at AO1 and 4 floats at AO1 + LDA and write them
 * as 8 consecutive floats at BO1. Then AO1 += 16 bytes, BO1 += M4.
 * Clobbers r3 and s0 - s7. */
.macro COPY2x2

fldmias AO1, { s0 - s3 }

add r3, AO1, LDA // r3 = AO1 + LDA (second source line)
fldmias r3, { s4 - s7 }

fstmias BO1, { s0 - s7 }
add AO1, AO1, #16 // skip the 16 bytes just read
add BO1, BO1, M4 // next 2-wide destination slot

.endm

/* COPY1x2: read 2 floats at AO1 and 2 floats at AO1 + LDA and write them
 * as 4 consecutive floats at BO2. Then AO1 += 8 bytes, BO2 += 16 bytes.
 * Clobbers r3 and s0 - s3. */
.macro COPY1x2

fldmias AO1, { s0 -s1 }

add r3, AO1, LDA // r3 = AO1 + LDA (second source line)
fldmias r3, { s2 - s3 }

fstmias BO2, { s0 - s3 }
add AO1, AO1, #8 // skip the 8 bytes just read
add BO2, BO2, #16 // skip the 16 bytes just written

.endm

/*************************************************************************************************************************/
/* COPY2x1: copy 4 floats from AO1 to BO1 (single source line).
 * Then AO1 += 16 bytes, BO1 += M4. Clobbers s0 - s3. */
.macro COPY2x1

fldmias AO1, { s0 - s3 }

fstmias BO1, { s0 - s3 }
add AO1, AO1, #16 // skip the 16 bytes just read
add BO1, BO1, M4 // next 2-wide destination slot

.endm

/* COPY1x1: copy 2 floats from AO1 to BO2.
 * Then AO1 += 8 bytes, BO2 += 8 bytes. Clobbers s0 - s1. */
.macro COPY1x1

fldmias AO1, { s0 - s1 }

fstmias BO2, { s0 - s1 }
add AO1, AO1, #8
add BO2, BO2, #8

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Packing routine: copies the single precision complex matrix A into the
 * buffer B using the COPY2x2 / COPY1x2 / COPY2x1 / COPY1x1 macros.
 *
 * In : r0 = M, r1 = N, r2 = A, r3 = LDA (elements), first stack word = B
 * Out: r0 = 0
 *
 * The outer loop runs M / 2 times (plus one pass when M is odd); the inner
 * loop runs N / 2 times (plus one 1-wide copy when N is odd).
 *
 * Flag handling: ASR(S) and TST do not write the V flag, but the LE
 * condition reads it. Every "<= 0" test is therefore made with an explicit
 * cmp against zero, and every "bit clear" test after tst uses beq, so no
 * branch depends on flags inherited from the caller.
 */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack

str OLD_A, A // store A

lsl LDA, OLD_LDA, #3 // lda = lda * SIZE * 2

sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers

lsl r4 , M, #3 // M * SIZE * 2

ldr r3, B

and BO2 , N , #-2 // N rounded down to an even value

mul BO2, BO2, r4

add BO2 , BO2, r3 // BO2 = B + (N & -2) * M * SIZE * 2

lsl M4, M, #4 // M4 = M * 2 * SIZE * 2

cgemm_tcopy_L2_BEGIN:

asr J, M, #1 // J = M / 2
cmp J, #0 // defines N, Z and V for the signed test
ble cgemm_tcopy_L1_BEGIN

cgemm_tcopy_L2_M2_BEGIN:

ldr AO1, A // AO1 = A
lsl r3, LDA, #1 // r3 = 2 * LDA
add r3, r3 , AO1 // A = A + 2 * LDA
str r3, A // store A

ldr BO1, B
add r3, BO1, #32 // B = B + 4 * SIZE *2
str r3, B

asr I, N, #1 // I = N / 2
cmp I, #0
ble cgemm_tcopy_L2_M2_60

cgemm_tcopy_L2_M2_40:

COPY2x2
subs I, I, #1
bne cgemm_tcopy_L2_M2_40

cgemm_tcopy_L2_M2_60:

tst N , #1
beq cgemm_tcopy_L2_M2_END // N even: nothing left on this pass

COPY1x2


cgemm_tcopy_L2_M2_END:

subs J , J, #1 // j--
bne cgemm_tcopy_L2_M2_BEGIN

/*********************************************************************************************/

cgemm_tcopy_L1_BEGIN:

tst M, #1
beq cgemm_tcopy_L999 // M even: no single pass left


cgemm_tcopy_L1_M2_BEGIN:

ldr AO1, A // AO1 = A
add r3, LDA , AO1 // A = A + 1 * LDA
str r3, A // store A

ldr BO1, B
add r3, BO1, #16 // B = B + 2 * SIZE *2
str r3, B

asr I, N, #1 // I = N / 2
cmp I, #0
ble cgemm_tcopy_L1_M2_60


cgemm_tcopy_L1_M2_40:

COPY2x1
subs I, I, #1
bne cgemm_tcopy_L1_M2_40

cgemm_tcopy_L1_M2_60:

tst N , #1
beq cgemm_tcopy_L1_M2_END // N even: nothing left on this pass

COPY1x1


cgemm_tcopy_L1_M2_END:



cgemm_tcopy_L999:

sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers

mov r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 697
- 0
kernel/arm/cgemv_n_vfp.S View File

@@ -0,0 +1,697 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of local stack reserved by the prologue. */
#define STACKSIZE 256

/* Stack arguments. The prologue pushes 7 registers and sets fp = sp + 28,
 * i.e. fp is the value sp had at entry, so [fp, #0] is the first stack word. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
/* Register arguments (saved to the M and A slots before r0 / r3 are reused). */
#define OLD_A r3
#define OLD_M r0

/* AO1 reuses r0 once OLD_M has been stored; J is the column counter. */
#define AO1 r0
#define N r1
#define J r2

/* AO2 is only used as a prefetch address by KERNEL_F4X1. */
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9

/* Row-block counter. */
#define I r12

/* Local slots for alpha (stored from s0 / s1 by the prologue). */
#define ALPHA_I [fp, #-236]
#define ALPHA_R [fp, #-244]

/* Local slots for M and for the running base address into A. */
#define M [fp, #-252 ]
#define A [fp, #-256 ]


/* Byte offsets used by the pld prefetch hints. */
#define X_PRE 64
#define Y_PRE 0
#define A_PRE 0

/**************************************************************************************/

/*
 * Sign selection for the complex multiply-accumulate steps.
 * fmacs adds the product, fnmacs subtracts it.
 *
 * KMAC_R / KMAC_I: cross terms of a * x added to the accumulators
 *   (a = element of A, x = element of X; the first float of each
 *   element is treated as the real part, the second as the imaginary).
 * FMAC_R1/R2/I1/I2: terms of y += alpha * acc used by the SAVE macros.
 */

#if !defined(CONJ) && !defined(XCONJ)

/* acc += a * x ; y += alpha * acc */
#define KMAC_R fnmacs
#define KMAC_I fmacs

#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs

#elif defined(CONJ) && !defined(XCONJ)

/* acc += conj(a) * x ; y += alpha * acc */
#define KMAC_R fmacs
#define KMAC_I fnmacs

#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs

#elif !defined(CONJ) && defined(XCONJ)

/* acc += conj(a) * x ; y += alpha * conj(acc) */
#define KMAC_R fmacs
#define KMAC_I fnmacs

#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs

#else

/* acc += a * x ; y += alpha * conj(acc) */
#define KMAC_R fnmacs
#define KMAC_I fmacs

#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs

#endif

/* INIT_F4: prefetch Y and clear the eight accumulators s8 - s15.
 * The zero is moved in from an integer register. Computing it as
 * s8 - s8 is not safe: when s8 holds NaN or an infinity on entry the
 * difference is NaN and every accumulated result becomes NaN.
 * Clobbers r3, which is dead at every place this file expands the macro. */
.macro INIT_F4

pld [ YO, #Y_PRE ]
mov r3 , #0
vmov s8 , r3 // s8 = +0.0 (all bits zero)
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
vmov.f32 s14, s8
vmov.f32 s15, s8

.endm

/* KERNEL_F4X4: prefetch X, then run KERNEL_F4X1 four times
 * (four consecutive columns for the current block of four rows). */
.macro KERNEL_F4X4

pld [ XO, #X_PRE ]
KERNEL_F4X1
KERNEL_F4X1
KERNEL_F4X1
KERNEL_F4X1

.endm

/* KERNEL_F4X1: one column step for four rows, unit stride in X.
 * Loads four complex elements at AO1 and one complex element at XO and
 * accumulates the four products into (s8,s9) (s10,s11) (s12,s13) (s14,s15).
 * Then XO += 8 bytes, AO1 += LDA, AO2 += LDA.
 * AO2 is only used as the prefetch address. Clobbers s0 - s5. */
.macro KERNEL_F4X1

pld [ AO2, #A_PRE ]
flds s0 , [ AO1 ] // element 0
flds s1 , [ AO1, #4 ]
flds s2 , [ AO1, #8 ] // element 1
flds s3 , [ AO1, #12 ]

flds s4 , [ XO ] // x, first float
flds s5 , [ XO, #4 ] // x, second float

fmacs s8 , s0, s4
fmacs s9 , s0, s5
fmacs s10 , s2, s4
fmacs s11 , s2, s5

// cross terms, sign chosen by the CONJ / XCONJ selection
KMAC_R s8 , s1, s5
KMAC_I s9 , s1, s4
KMAC_R s10 , s3, s5
KMAC_I s11 , s3, s4

flds s0 , [ AO1, #16 ] // element 2
flds s1 , [ AO1, #20 ]
flds s2 , [ AO1, #24 ] // element 3
flds s3 , [ AO1, #28 ]

fmacs s12 , s0, s4
fmacs s13 , s0, s5
fmacs s14 , s2, s4
fmacs s15 , s2, s5

KMAC_R s12 , s1, s5
KMAC_I s13 , s1, s4
KMAC_R s14 , s3, s5
KMAC_I s15 , s3, s4

add XO , XO, #8
add AO1 , AO1, LDA
add AO2 , AO2, LDA

.endm

/* SAVE_F4: y += alpha * acc for four contiguous complex elements of Y.
 * alpha is reloaded from the ALPHA_R / ALPHA_I slots into s0 / s1.
 * YO is advanced by 32 bytes in total through the two writeback stores.
 * Clobbers s0, s1 and s4 - s7. */
.macro SAVE_F4

flds s0, ALPHA_R
flds s1, ALPHA_I

fldmias YO, { s4 - s7 } // y[0], y[1]

FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8

FMAC_R1 s6 , s0 , s10
FMAC_I1 s7 , s0 , s11
FMAC_R2 s6 , s1 , s11
FMAC_I2 s7 , s1 , s10

fstmias YO!, { s4 - s7 }

fldmias YO, { s4 - s7 } // y[2], y[3]

FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12

FMAC_R1 s6 , s0 , s14
FMAC_I1 s7 , s0 , s15
FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14

fstmias YO!, { s4 - s7 }

.endm




/* INIT_F1: clear the accumulator pair s8 / s9.
 * The zero is moved in from an integer register. Computing it as
 * s8 - s8 is not safe: when s8 holds NaN or an infinity on entry the
 * difference is NaN and the accumulated result becomes NaN.
 * Clobbers r3, which is dead at every place this file expands the macro. */
.macro INIT_F1

mov r3 , #0
vmov s8 , r3 // s8 = +0.0 (all bits zero)
vmov.f32 s9 , s8

.endm

/* KERNEL_F1X1: one column step for a single row, unit stride in X.
 * Accumulates the product of the complex element at AO1 and the complex
 * element at XO into (s8,s9). Then XO += 8 bytes, AO1 += LDA.
 * Clobbers s0, s1, s4, s5. */
.macro KERNEL_F1X1

flds s0 , [ AO1 ]
flds s1 , [ AO1, #4 ]

flds s4 , [ XO ]
flds s5 , [ XO, #4 ]

fmacs s8 , s0, s4
fmacs s9 , s0, s5

// cross terms, sign chosen by the CONJ / XCONJ selection
KMAC_R s8 , s1, s5
KMAC_I s9 , s1, s4

add XO , XO, #8
add AO1 , AO1, LDA


.endm

/* SAVE_F1: y += alpha * acc for one complex element of Y, then YO += 8.
 * alpha is reloaded from the ALPHA_R / ALPHA_I slots into s0 / s1.
 * Clobbers s0, s1, s4, s5. */
.macro SAVE_F1

flds s0, ALPHA_R
flds s1, ALPHA_I

fldmias YO, { s4 - s5 }

FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8

fstmias YO, { s4 - s5 }

add YO, YO, #8

.endm

/****************************************************************************************/

/* INIT_S4: clear the eight accumulators s8 - s15 (strided path).
 * The zero is moved in from an integer register. Computing it as
 * s8 - s8 is not safe: when s8 holds NaN or an infinity on entry the
 * difference is NaN and every accumulated result becomes NaN.
 * Clobbers r3, which is dead at every place this file expands the macro. */
.macro INIT_S4

mov r3 , #0
vmov s8 , r3 // s8 = +0.0 (all bits zero)
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
vmov.f32 s14, s8
vmov.f32 s15, s8

.endm

/* KERNEL_S4X4: run KERNEL_S4X1 four times (four consecutive columns). */
.macro KERNEL_S4X4

KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1

.endm

/* KERNEL_S4X1: one column step for four rows, strided X.
 * Same arithmetic as KERNEL_F4X1 without the prefetch; XO advances by
 * INC_X (bytes) instead of 8. AO2 is advanced but never read here.
 * Clobbers s0 - s5. */
.macro KERNEL_S4X1

flds s0 , [ AO1 ] // element 0
flds s1 , [ AO1, #4 ]
flds s2 , [ AO1, #8 ] // element 1
flds s3 , [ AO1, #12 ]

flds s4 , [ XO ]
flds s5 , [ XO, #4 ]

fmacs s8 , s0, s4
fmacs s9 , s0, s5
fmacs s10 , s2, s4
fmacs s11 , s2, s5

// cross terms, sign chosen by the CONJ / XCONJ selection
KMAC_R s8 , s1, s5
KMAC_I s9 , s1, s4
KMAC_R s10 , s3, s5
KMAC_I s11 , s3, s4

flds s0 , [ AO1, #16 ] // element 2
flds s1 , [ AO1, #20 ]
flds s2 , [ AO1, #24 ] // element 3
flds s3 , [ AO1, #28 ]

fmacs s12 , s0, s4
fmacs s13 , s0, s5
fmacs s14 , s2, s4
fmacs s15 , s2, s5

KMAC_R s12 , s1, s5
KMAC_I s13 , s1, s4
KMAC_R s14 , s3, s5
KMAC_I s15 , s3, s4

add XO , XO, INC_X
add AO1 , AO1, LDA
add AO2 , AO2, LDA

.endm

/* SAVE_S4: y += alpha * acc for four complex elements of Y that are
 * INC_Y bytes apart. YO is advanced by INC_Y after each element.
 * alpha is reloaded from the ALPHA_R / ALPHA_I slots into s0 / s1.
 * Clobbers s0, s1 and s4 - s7. */
.macro SAVE_S4

flds s0, ALPHA_R
flds s1, ALPHA_I

fldmias YO, { s4 - s5 } // element 0

FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8

fstmias YO, { s4 - s5 }

add YO, YO, INC_Y

fldmias YO, { s6 - s7 } // element 1

FMAC_R1 s6 , s0 , s10
FMAC_I1 s7 , s0 , s11
FMAC_R2 s6 , s1 , s11
FMAC_I2 s7 , s1 , s10

fstmias YO, { s6 - s7 }

add YO, YO, INC_Y

fldmias YO, { s4 - s5 } // element 2

FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12

fstmias YO, { s4 - s5 }

add YO, YO, INC_Y

fldmias YO, { s6 - s7 } // element 3

FMAC_R1 s6 , s0 , s14
FMAC_I1 s7 , s0 , s15
FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14

fstmias YO, { s6 - s7 }

add YO, YO, INC_Y

.endm




/* INIT_S1: clear the accumulator pair s8 / s9 (strided path).
 * The zero is moved in from an integer register. Computing it as
 * s8 - s8 is not safe: when s8 holds NaN or an infinity on entry the
 * difference is NaN and the accumulated result becomes NaN.
 * Clobbers r3, which is dead at every place this file expands the macro. */
.macro INIT_S1

mov r3 , #0
vmov s8 , r3 // s8 = +0.0 (all bits zero)
vmov.f32 s9 , s8

.endm

/* KERNEL_S1X1: one column step for a single row, strided X.
 * Same arithmetic as KERNEL_F1X1; XO advances by INC_X (bytes).
 * Clobbers s0, s1, s4, s5. */
.macro KERNEL_S1X1

flds s0 , [ AO1 ]
flds s1 , [ AO1, #4 ]

flds s4 , [ XO ]
flds s5 , [ XO, #4 ]

fmacs s8 , s0, s4
fmacs s9 , s0, s5

// cross terms, sign chosen by the CONJ / XCONJ selection
KMAC_R s8 , s1, s5
KMAC_I s9 , s1, s4

add XO , XO, INC_X
add AO1 , AO1, LDA


.endm

/* SAVE_S1: y += alpha * acc for one complex element of Y, then
 * YO += INC_Y (bytes). alpha is reloaded from the ALPHA_R / ALPHA_I
 * slots into s0 / s1. Clobbers s0, s1, s4, s5. */
.macro SAVE_S1

flds s0, ALPHA_R
flds s1, ALPHA_I

fldmias YO, { s4 - s5 }

FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8

fstmias YO, { s4 - s5 }

add YO, YO, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Single precision complex matrix-vector kernel, A not transposed:
 * for every row i, y[i] += alpha * sum over columns j of A[i, j] * x[j],
 * with conjugation selected by CONJ / XCONJ.
 *
 * In : r0 = M, r1 = N, r3 = A, s0 / s1 = alpha (stored to ALPHA_R / ALPHA_I),
 *      stack words = lda, x, inc_x, y, inc_y
 * Out: r0 = 0
 *
 * Consecutive rows are 8 bytes apart, consecutive columns are LDA bytes
 * apart. Rows are processed in blocks of four, then one at a time.
 * The F path is taken when inc_x == 1 and inc_y == 1, otherwise the S path.
 * Returns without touching Y when M <= 0, N <= 0, inc_x == 0 or inc_y == 0.
 */
PROLOGUE

.align 5
push {r4 - r9 , fp}
add fp, sp, #28 // fp = sp at entry
sub sp, sp, #STACKSIZE // reserve stack

sub r12, fp, #192 // base of the floating point save area

#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif

cmp OLD_M, #0
ble cgemvn_kernel_L999

cmp N, #0
ble cgemvn_kernel_L999

// spill the register arguments that are about to be reused
str OLD_A, A
str OLD_M, M
vstr s0 , ALPHA_R
vstr s1 , ALPHA_I


ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

cmp INC_X, #0
beq cgemvn_kernel_L999

cmp INC_Y, #0
beq cgemvn_kernel_L999

ldr LDA, OLD_LDA


#if defined(DOUBLE)
lsl LDA, LDA, #4 // LDA * SIZE * 2
#else
lsl LDA, LDA, #3 // LDA * SIZE * 2
#endif

// unit stride in both vectors selects the F path
cmp INC_X, #1
bne cgemvn_kernel_S4_BEGIN

cmp INC_Y, #1
bne cgemvn_kernel_S4_BEGIN


cgemvn_kernel_F4_BEGIN:

ldr YO , Y

ldr I, M
asrs I, I, #2 // I = M / 4
ble cgemvn_kernel_F1_BEGIN

cgemvn_kernel_F4X4:

ldr AO1, A
add AO2, AO1, LDA
add r3 , AO1, #32 // next block of four rows
str r3 , A

// AO2 = AO1 + 3 * LDA, used by the kernel as prefetch address
add AO2, AO2, LDA
add AO2, AO2, LDA

ldr XO , X

INIT_F4

asrs J, N, #2 // J = N / 4
ble cgemvn_kernel_F4X1


cgemvn_kernel_F4X4_10:

KERNEL_F4X4

subs J, J, #1
bne cgemvn_kernel_F4X4_10


cgemvn_kernel_F4X1:

ands J, N , #3 // J = N % 4
ble cgemvn_kernel_F4_END

cgemvn_kernel_F4X1_10:

KERNEL_F4X1

subs J, J, #1
bne cgemvn_kernel_F4X1_10


cgemvn_kernel_F4_END:

SAVE_F4

subs I , I , #1
bne cgemvn_kernel_F4X4


cgemvn_kernel_F1_BEGIN:

ldr I, M
ands I, I , #3 // I = M % 4
ble cgemvn_kernel_L999

cgemvn_kernel_F1X1:

ldr AO1, A
add r3, AO1, #8 // next row
str r3, A
ldr XO , X

INIT_F1

mov J, N // N > 0 was checked in the prologue


cgemvn_kernel_F1X1_10:

KERNEL_F1X1

subs J, J, #1
bne cgemvn_kernel_F1X1_10


cgemvn_kernel_F1_END:

SAVE_F1

subs I , I , #1
bne cgemvn_kernel_F1X1

b cgemvn_kernel_L999



/*************************************************************************************************************/

cgemvn_kernel_S4_BEGIN:

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif

ldr YO , Y

ldr I, M
asrs I, I, #2 // I = M / 4
ble cgemvn_kernel_S1_BEGIN

cgemvn_kernel_S4X4:

ldr AO1, A
add AO2, AO1, LDA
add r3 , AO1, #32 // next block of four rows
str r3 , A

ldr XO , X

INIT_S4

asrs J, N, #2 // J = N / 4
ble cgemvn_kernel_S4X1


cgemvn_kernel_S4X4_10:

KERNEL_S4X4

subs J, J, #1
bne cgemvn_kernel_S4X4_10


cgemvn_kernel_S4X1:

ands J, N , #3 // J = N % 4
ble cgemvn_kernel_S4_END

cgemvn_kernel_S4X1_10:

KERNEL_S4X1

subs J, J, #1
bne cgemvn_kernel_S4X1_10


cgemvn_kernel_S4_END:

SAVE_S4

subs I , I , #1
bne cgemvn_kernel_S4X4


cgemvn_kernel_S1_BEGIN:

ldr I, M
ands I, I , #3 // I = M % 4
ble cgemvn_kernel_L999

cgemvn_kernel_S1X1:

ldr AO1, A
add r3, AO1, #8 // next row
str r3, A
ldr XO , X

INIT_S1

mov J, N // N > 0 was checked in the prologue


cgemvn_kernel_S1X1_10:

KERNEL_S1X1

subs J, J, #1
bne cgemvn_kernel_S1X1_10


cgemvn_kernel_S1_END:

SAVE_S1

subs I , I , #1
bne cgemvn_kernel_S1X1


/*************************************************************************************************************/

cgemvn_kernel_L999:

sub r3, fp, #192

#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif

mov r0, #0 // set return value

sub sp, fp, #28 // back to the stack pointer right after the push
pop {r4 -r9 ,fp}
bx lr

EPILOGUE


+ 607
- 0
kernel/arm/cgemv_t_vfp.S View File

@@ -0,0 +1,607 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of local stack reserved by the prologue. */
#define STACKSIZE 256

/* Stack arguments. The prologue pushes 7 registers and sets fp = sp + 28,
 * i.e. fp is the value sp had at entry, so [fp, #0] is the first stack word. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
/* Register arguments (saved to the A and N slots before r3 / r1 are reused). */
#define OLD_A r3
#define OLD_N r1

/* M stays in r0; AO1 reuses r1 once OLD_N has been stored. */
#define M r0
#define AO1 r1
#define J r2

/* AO1 / AO2 walk the two columns handled together by the 2-wide kernels. */
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9

/* Inner (row) counter. */
#define I r12

/* Local slots for N and for the running base address into A. */
#define N [fp, #-252 ]
#define A [fp, #-256 ]


/* Prefetch distances; not referenced by the macros of this file. */
#define X_PRE 512
#define A_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * Sign selection for the complex multiply-accumulate steps.
 * fmacs adds the product, fnmacs subtracts it.
 *
 * KMAC_R / KMAC_I: cross terms of a * x added to the accumulators
 *   (a = element of A, x = element of X; the first float of each
 *   element is treated as the real part, the second as the imaginary).
 * FMAC_R1/R2/I1/I2: terms of y += alpha * acc used by the SAVE macros.
 */

#if !defined(CONJ) && !defined(XCONJ)

/* acc += a * x ; y += alpha * acc */
#define KMAC_R fnmacs
#define KMAC_I fmacs

#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs

#elif defined(CONJ) && !defined(XCONJ)

/* acc += conj(a) * x ; y += alpha * acc */
#define KMAC_R fmacs
#define KMAC_I fnmacs

#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs

#elif !defined(CONJ) && defined(XCONJ)

/* acc += conj(a) * x ; y += alpha * conj(acc) */
#define KMAC_R fmacs
#define KMAC_I fnmacs

#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs

#else

/* acc += a * x ; y += alpha * conj(acc) */
#define KMAC_R fnmacs
#define KMAC_I fmacs

#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs

#endif



/* INIT_F2: clear the four accumulators s12 - s15.
 * The zero is moved in from an integer register. Computing it as
 * s12 - s12 is not safe: when the register holds NaN or an infinity on
 * entry the difference is NaN and the accumulated result becomes NaN.
 * Clobbers r3, which is dead at every place this file expands the macro. */
.macro INIT_F2

mov r3 , #0
vmov s12, r3 // s12 = +0.0 (all bits zero)
vmov.f32 s13, s12
vmov.f32 s14, s12
vmov.f32 s15, s12

.endm

/* KERNEL_F2X4: run KERNEL_F2X1 four times (four consecutive rows). */
.macro KERNEL_F2X4

KERNEL_F2X1
KERNEL_F2X1
KERNEL_F2X1
KERNEL_F2X1

.endm

/* KERNEL_F2X1: one row step for two columns, unit stride in X.
 * Loads one complex element each from XO, AO1 and AO2 (all three
 * pointers post-increment by 8 bytes) and accumulates
 * (s12,s13) += [AO1] * x and (s14,s15) += [AO2] * x.
 * Clobbers s2 - s5, s8, s9. */
.macro KERNEL_F2X1

fldmias XO! , { s2 - s3 }
fldmias AO1!, { s4 - s5 }
fldmias AO2!, { s8 - s9 }

fmacs s12 , s4 , s2
fmacs s13 , s4 , s3
// cross terms, sign chosen by the CONJ / XCONJ selection
KMAC_R s12 , s5 , s3
KMAC_I s13 , s5 , s2

fmacs s14 , s8 , s2
fmacs s15 , s8 , s3
KMAC_R s14 , s9 , s3
KMAC_I s15 , s9 , s2

.endm

/* SAVE_F2: y += alpha * acc for two contiguous complex elements of Y;
 * YO advances by 16 bytes through the writeback store.
 * s0 / s1 are used as alpha; this file never writes them, so they hold
 * the values passed in at entry. Clobbers s4 - s7. */
.macro SAVE_F2

fldmias YO, { s4 - s7 }

FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12

FMAC_R1 s6 , s0 , s14
FMAC_I1 s7 , s0 , s15
FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14

fstmias YO!, { s4 - s7 }

.endm

/************************************************************************************************/

/* INIT_F1: clear the accumulator pair s12 / s13.
 * The zero is moved in from an integer register. Computing it as
 * s12 - s12 is not safe: when the register holds NaN or an infinity on
 * entry the difference is NaN and the accumulated result becomes NaN.
 * Clobbers r3, which is dead at every place this file expands the macro. */
.macro INIT_F1

mov r3 , #0
vmov s12, r3 // s12 = +0.0 (all bits zero)
vmov.f32 s13, s12

.endm

/* KERNEL_F1X4: run KERNEL_F1X1 four times (four consecutive rows). */
.macro KERNEL_F1X4

KERNEL_F1X1
KERNEL_F1X1
KERNEL_F1X1
KERNEL_F1X1

.endm

/* KERNEL_F1X1: one row step for a single column, unit stride in X.
 * (s12,s13) += [AO1] * [XO]; both pointers post-increment by 8 bytes.
 * Clobbers s2 - s5. */
.macro KERNEL_F1X1

fldmias XO! , { s2 - s3 }
fldmias AO1!, { s4 - s5 }

fmacs s12 , s4 , s2
fmacs s13 , s4 , s3
// cross terms, sign chosen by the CONJ / XCONJ selection
KMAC_R s12 , s5 , s3
KMAC_I s13 , s5 , s2

.endm

/* SAVE_F1: y += alpha * acc for one complex element of Y; YO advances
 * by 8 bytes through the writeback store. s0 / s1 are used as alpha.
 * Clobbers s4, s5. */
.macro SAVE_F1

fldmias YO, { s4 - s5 }

FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12

fstmias YO!, { s4 - s5 }

.endm

/************************************************************************************************/

/* INIT_S2: clear the four accumulators s12 - s15 (strided path).
 * The zero is moved in from an integer register. Computing it as
 * s12 - s12 is not safe: when the register holds NaN or an infinity on
 * entry the difference is NaN and the accumulated result becomes NaN.
 * Clobbers r3, which is dead at every place this file expands the macro. */
.macro INIT_S2

mov r3 , #0
vmov s12, r3 // s12 = +0.0 (all bits zero)
vmov.f32 s13, s12
vmov.f32 s14, s12
vmov.f32 s15, s12

.endm

/* KERNEL_S2X4: run KERNEL_S2X1 four times (four consecutive rows). */
.macro KERNEL_S2X4

KERNEL_S2X1
KERNEL_S2X1
KERNEL_S2X1
KERNEL_S2X1

.endm

/* KERNEL_S2X1: one row step for two columns, strided X.
 * Same arithmetic as KERNEL_F2X1; AO1 and AO2 post-increment by 8 bytes
 * while XO advances by INC_X (bytes). Clobbers s2 - s5, s8, s9. */
.macro KERNEL_S2X1

fldmias XO , { s2 - s3 }
fldmias AO1!, { s4 - s5 }
fldmias AO2!, { s8 - s9 }

fmacs s12 , s4 , s2
fmacs s13 , s4 , s3
// cross terms, sign chosen by the CONJ / XCONJ selection
KMAC_R s12 , s5 , s3
KMAC_I s13 , s5 , s2

fmacs s14 , s8 , s2
fmacs s15 , s8 , s3
KMAC_R s14 , s9 , s3
KMAC_I s15 , s9 , s2

add XO, XO, INC_X

.endm

/* SAVE_S2: y += alpha * acc for two complex elements of Y that are
 * INC_Y bytes apart; YO advances by INC_Y after each element.
 * s0 / s1 are used as alpha. Clobbers s4 - s7. */
.macro SAVE_S2

fldmias YO, { s4 - s5 } // element 0

FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12

fstmias YO, { s4 - s5 }

add YO, YO, INC_Y

fldmias YO, { s6 - s7 } // element 1

FMAC_R1 s6 , s0 , s14
FMAC_I1 s7 , s0 , s15
FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14

fstmias YO, { s6 - s7 }

add YO, YO, INC_Y

.endm

/************************************************************************************************/

/* INIT_S1: clear the accumulator pair s12 / s13 (strided path).
 * The zero is moved in from an integer register. Computing it as
 * s12 - s12 is not safe: when the register holds NaN or an infinity on
 * entry the difference is NaN and the accumulated result becomes NaN.
 * Clobbers r3, which is dead at every place this file expands the macro. */
.macro INIT_S1

mov r3 , #0
vmov s12, r3 // s12 = +0.0 (all bits zero)
vmov.f32 s13, s12

.endm

/* KERNEL_S1X4: run KERNEL_S1X1 four times (four consecutive rows). */
.macro KERNEL_S1X4

KERNEL_S1X1
KERNEL_S1X1
KERNEL_S1X1
KERNEL_S1X1

.endm

/* KERNEL_S1X1: one row step for a single column, strided X.
 * (s12,s13) += [AO1] * [XO]; AO1 post-increments by 8 bytes, XO advances
 * by INC_X (bytes). Clobbers s2 - s5. */
.macro KERNEL_S1X1

fldmias XO , { s2 - s3 }
fldmias AO1!, { s4 - s5 }

fmacs s12 , s4 , s2
fmacs s13 , s4 , s3
// cross terms, sign chosen by the CONJ / XCONJ selection
KMAC_R s12 , s5 , s3
KMAC_I s13 , s5 , s2

add XO, XO, INC_X

.endm

/* SAVE_S1: y += alpha * acc for one complex element of Y, then
 * YO += INC_Y (bytes). s0 / s1 are used as alpha. Clobbers s4, s5. */
.macro SAVE_S1

fldmias YO, { s4 - s5 }

FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12

fstmias YO, { s4 - s5 }

add YO, YO, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Single precision complex matrix-vector kernel, A transposed:
 * for every column j, y[j] += alpha * sum over rows i of A[i, j] * x[i],
 * with conjugation selected by CONJ / XCONJ.
 *
 * In : r0 = M, r1 = N, r3 = A, stack words = lda, x, inc_x, y, inc_y.
 *      s0 / s1 are used as alpha by the SAVE macros and are never
 *      written here, so they hold the values passed in at entry.
 * Out: r0 = 0
 *
 * Consecutive rows are 8 bytes apart, consecutive columns are LDA bytes
 * apart. Columns are processed two at a time, then a last single column
 * when N is odd. The F path is taken when inc_x == 1 and inc_y == 1,
 * otherwise the S path.
 * Returns without touching Y when M <= 0, N <= 0, inc_x == 0 or inc_y == 0.
 */
PROLOGUE

.align 5
push {r4 - r9 , fp}
add fp, sp, #28 // fp = sp at entry
sub sp, sp, #STACKSIZE // reserve stack

sub r12, fp, #192 // base of the floating point save area

#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif

cmp M, #0
ble cgemvt_kernel_L999

cmp OLD_N, #0
ble cgemvt_kernel_L999

// spill the register arguments that are about to be reused
str OLD_A, A
str OLD_N, N

ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

cmp INC_X, #0
beq cgemvt_kernel_L999

cmp INC_Y, #0
beq cgemvt_kernel_L999

ldr LDA, OLD_LDA


#if defined(DOUBLE)
lsl LDA, LDA, #4 // LDA * SIZE * 2
#else
lsl LDA, LDA, #3 // LDA * SIZE * 2
#endif

// unit stride in both vectors selects the F path
cmp INC_X, #1
bne cgemvt_kernel_S2_BEGIN

cmp INC_Y, #1
bne cgemvt_kernel_S2_BEGIN


cgemvt_kernel_F2_BEGIN:

ldr YO , Y

ldr J, N
asrs J, J, #1 // J = N / 2
ble cgemvt_kernel_F1_BEGIN

cgemvt_kernel_F2X4:

ldr AO1, A
add AO2, AO1, LDA // second column of the pair
add r3 , AO2, LDA // next pair of columns
str r3 , A

ldr XO , X

INIT_F2

asrs I, M, #2 // I = M / 4
ble cgemvt_kernel_F2X1


cgemvt_kernel_F2X4_10:

KERNEL_F2X4

subs I, I, #1
bne cgemvt_kernel_F2X4_10


cgemvt_kernel_F2X1:

ands I, M , #3 // I = M % 4
ble cgemvt_kernel_F2_END

cgemvt_kernel_F2X1_10:

KERNEL_F2X1

subs I, I, #1
bne cgemvt_kernel_F2X1_10


cgemvt_kernel_F2_END:

SAVE_F2

subs J , J , #1
bne cgemvt_kernel_F2X4


cgemvt_kernel_F1_BEGIN:

ldr J, N
ands J, J, #1 // odd N leaves one column
ble cgemvt_kernel_L999

cgemvt_kernel_F1X4:

ldr AO1, A

ldr XO , X

INIT_F1

asrs I, M, #2 // I = M / 4
ble cgemvt_kernel_F1X1


cgemvt_kernel_F1X4_10:

KERNEL_F1X4

subs I, I, #1
bne cgemvt_kernel_F1X4_10


cgemvt_kernel_F1X1:

ands I, M , #3 // I = M % 4
ble cgemvt_kernel_F1_END

cgemvt_kernel_F1X1_10:

KERNEL_F1X1

subs I, I, #1
bne cgemvt_kernel_F1X1_10


cgemvt_kernel_F1_END:

SAVE_F1

b cgemvt_kernel_L999



/*************************************************************************************************************/

cgemvt_kernel_S2_BEGIN:

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif

ldr YO , Y

ldr J, N
asrs J, J, #1 // J = N / 2
ble cgemvt_kernel_S1_BEGIN

cgemvt_kernel_S2X4:

ldr AO1, A
add AO2, AO1, LDA // second column of the pair
add r3 , AO2, LDA // next pair of columns
str r3 , A

ldr XO , X

INIT_S2

asrs I, M, #2 // I = M / 4
ble cgemvt_kernel_S2X1


cgemvt_kernel_S2X4_10:

KERNEL_S2X4

subs I, I, #1
bne cgemvt_kernel_S2X4_10


cgemvt_kernel_S2X1:

ands I, M , #3 // I = M % 4
ble cgemvt_kernel_S2_END

cgemvt_kernel_S2X1_10:

KERNEL_S2X1

subs I, I, #1
bne cgemvt_kernel_S2X1_10


cgemvt_kernel_S2_END:

SAVE_S2

subs J , J , #1
bne cgemvt_kernel_S2X4


cgemvt_kernel_S1_BEGIN:

ldr J, N
ands J, J, #1 // odd N leaves one column
ble cgemvt_kernel_L999

cgemvt_kernel_S1X4:

ldr AO1, A

ldr XO , X

INIT_S1

asrs I, M, #2 // I = M / 4
ble cgemvt_kernel_S1X1


cgemvt_kernel_S1X4_10:

KERNEL_S1X4

subs I, I, #1
bne cgemvt_kernel_S1X4_10


cgemvt_kernel_S1X1:

ands I, M , #3 // I = M % 4
ble cgemvt_kernel_S1_END

cgemvt_kernel_S1X1_10:

KERNEL_S1X1

subs I, I, #1
bne cgemvt_kernel_S1X1_10


cgemvt_kernel_S1_END:

SAVE_S1



/*************************************************************************************************************/

cgemvt_kernel_L999:

sub r3, fp, #192

#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif

mov r0, #0 // set return value

sub sp, fp, #28 // back to the stack pointer right after the push
pop {r4 -r9 ,fp}
bx lr

EPILOGUE


+ 59
- 0
kernel/arm/copy.c View File

@@ -0,0 +1,59 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

/*
 * Generic vector copy: y[k * inc_y] = x[k * inc_x] for k = 0 .. n-1.
 * Nothing is copied when n is zero or negative. Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG src = 0;
	BLASLONG dst = 0;
	BLASLONG left;

	for (left = n; left > 0; left--) {
		y[dst] = x[src];
		src += inc_x;
		dst += inc_y;
	}

	return 0;
}


+ 1455
- 0
kernel/arm/ctrmm_kernel_2x2_vfp.S
File diff suppressed because it is too large
View File


+ 1476
- 0
kernel/arm/ctrmm_kernel_2x2_vfpv3.S
File diff suppressed because it is too large
View File


+ 222
- 0
kernel/arm/dcopy_vfp.S View File

@@ -0,0 +1,222 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of local stack reserved by the prologue. */
#define STACKSIZE 256

/* Arguments as they arrive in registers. */
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

/* Stack argument, read from above the saved registers (the prologue
 * pushes 28 bytes and sets fp = sp + 24). */
#define OLD_INC_Y [fp, #4 ]

/* I: loop counter. Y / INC_Y: working copies of the destination
 * pointer and its stride. */
#define I r5
#define Y r6
#define INC_Y r7

/* Prefetch distance in bytes used by COPY_F4. */
#define X_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/* COPY_F4: copy four contiguous doubles from X to Y; both pointers
 * post-increment by 32 bytes. Prefetches X_PRE bytes ahead of X.
 * Clobbers d0 - d3. */
.macro COPY_F4

pld [ X, #X_PRE ]
fldmiad X!, { d0 - d3 }
fstmiad Y!, { d0 - d3 }

.endm

/* COPY_F1: copy one double from X to Y; both pointers post-increment
 * by 8 bytes. Clobbers d0. */
.macro COPY_F1

fldmiad X!, { d0 }
fstmiad Y!, { d0 }

.endm


/*************************************************************************************************************************/

/* COPY_S4: copy four doubles with strides; after each element X advances
 * by INC_X and Y by INC_Y (both in bytes). d0 and d1 are used alternately
 * as the transfer register. */
.macro COPY_S4

nop
fldmiad X, { d0 } // element 0
fstmiad Y, { d0 }
add X, X, INC_X
add Y, Y, INC_Y

fldmiad X, { d1 } // element 1
fstmiad Y, { d1 }
add X, X, INC_X
add Y, Y, INC_Y

fldmiad X, { d0 } // element 2
fstmiad Y, { d0 }
add X, X, INC_X
add Y, Y, INC_Y

fldmiad X, { d1 } // element 3
fstmiad Y, { d1 }
add X, X, INC_X
add Y, Y, INC_Y

.endm


/* COPY_S1: copy one double with strides; X advances by INC_X and Y by
 * INC_Y (both in bytes). Clobbers d0. */
.macro COPY_S1

fldmiad X, { d0 }
fstmiad Y, { d0 }
add X, X, INC_X
add Y, Y, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Double precision vector copy: copies N doubles from X to Y.
 *
 * In : r0 = N, r1 = X, r2 = INC_X, r3 = Y, first stack word = INC_Y
 * Out: r0 = 0
 *
 * Returns without copying when N <= 0, INC_X == 0 or INC_Y == 0.
 * When both increments are 1 the contiguous macros (COPY_F4 / COPY_F1)
 * are used, otherwise the strided ones (COPY_S4 / COPY_S1) with the
 * increments scaled to bytes.
 */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack

sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers

mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
cmp N, #0
ble dcopy_kernel_L999

cmp INC_X, #0
beq dcopy_kernel_L999

cmp INC_Y, #0
beq dcopy_kernel_L999

// unit stride in both vectors selects the contiguous path
cmp INC_X, #1
bne dcopy_kernel_S_BEGIN

cmp INC_Y, #1
bne dcopy_kernel_S_BEGIN

dcopy_kernel_F_BEGIN:

asrs I, N, #2 // I = N / 4
ble dcopy_kernel_F1

dcopy_kernel_F4:

COPY_F4

subs I, I, #1
bne dcopy_kernel_F4

dcopy_kernel_F1:

ands I, N, #3 // I = N % 4
ble dcopy_kernel_L999

dcopy_kernel_F10:

COPY_F1

subs I, I, #1
bne dcopy_kernel_F10

b dcopy_kernel_L999

dcopy_kernel_S_BEGIN:

lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE

asrs I, N, #2 // I = N / 4
ble dcopy_kernel_S1

dcopy_kernel_S4:

COPY_S4

subs I, I, #1
bne dcopy_kernel_S4

dcopy_kernel_S1:

ands I, N, #3 // I = N % 4
ble dcopy_kernel_L999

dcopy_kernel_S10:

COPY_S1

subs I, I, #1
bne dcopy_kernel_S10






dcopy_kernel_L999:

sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers

mov r0, #0 // set return value
sub sp, fp, #24 // back to the stack pointer right after the push
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 248
- 0
kernel/arm/ddot_vfp.S View File

@@ -0,0 +1,248 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved by the prologue (sub sp, sp, #STACKSIZE). */
#define STACKSIZE 256

/* Incoming arguments held in core registers. */
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

/* Stride of Y: read from the stack word just above the saved fp slot. */
#define OLD_INC_Y [fp, #4 ]

/* Working registers: loop counter, Y pointer and Y stride. */
#define I r5
#define Y r6
#define INC_Y r7

/* Prefetch distance in bytes used by the pld instructions in KERNEL_F4. */
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * KERNEL_F4: one unrolled step of the contiguous loop.
 * Reads four consecutive doubles from X and from Y (post-incrementing
 * both pointers) and accumulates the four products alternately into the
 * partial sums d0 and d1. Loads are interleaved with the multiply-adds.
 */
.macro KERNEL_F4

pld [ X, #X_PRE ] // prefetch X ahead of the current position
fldmiad X!, { d8 }
pld [ Y, #X_PRE ] // prefetch Y with the same distance
fldmiad Y!, { d4 }
fldmiad Y!, { d5 }
fmacd d0 , d4, d8 // d0 += y0 * x0
fldmiad X!, { d9 }
fldmiad Y!, { d6 }
fmacd d1 , d5, d9 // d1 += y1 * x1
fldmiad X!, { d10 }
fldmiad X!, { d11 }
fmacd d0 , d6, d10 // d0 += y2 * x2
fldmiad Y!, { d7 }
fmacd d1 , d7, d11 // d1 += y3 * x3

.endm

/*
 * KERNEL_F1: contiguous remainder step. Loads one double from X and one
 * from Y with post-increment and adds their product to d0.
 */
.macro KERNEL_F1

fldmiad X!, { d4 }
fldmiad Y!, { d8 }
fmacd d0 , d4, d8 // d0 += x * y

.endm


/*************************************************************************************************************************/

/*
 * KERNEL_S4: one unrolled step of the strided loop.
 * Processes four element pairs. After each load X and Y are advanced by
 * INC_X and INC_Y, which the caller has already scaled to bytes.
 * Products go alternately into the partial sums d0 and d1.
 */
.macro KERNEL_S4

nop
fldmiad X, { d4 }
fldmiad Y, { d8 }
add X, X, INC_X
add Y, Y, INC_Y
fmacd d0 , d4, d8 // d0 += x0 * y0

fldmiad X, { d5 }
fldmiad Y, { d9 }
add X, X, INC_X
add Y, Y, INC_Y
fmacd d1 , d5, d9 // d1 += x1 * y1

fldmiad X, { d6 }
fldmiad Y, { d10 }
add X, X, INC_X
add Y, Y, INC_Y
fmacd d0 , d6, d10 // d0 += x2 * y2

fldmiad X, { d7 }
fldmiad Y, { d11 }
add X, X, INC_X
add Y, Y, INC_Y
fmacd d1 , d7, d11 // d1 += x3 * y3

.endm


/*
 * KERNEL_S1: strided remainder step. Loads one double from X and one from
 * Y, adds their product to d0 and advances both pointers by their byte
 * strides.
 */
.macro KERNEL_S1

fldmiad X, { d4 }
fldmiad Y, { d8 }
add X, X, INC_X
fmacd d0 , d4, d8 // d0 += x * y
add Y, Y, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Double precision dot product kernel.
 *
 * Arguments (see the register defines of this file): N in r0, X in r1,
 * INC_X in r2, Y in r3 and INC_Y in the first stack word. The products
 * X[i] * Y[i] are accumulated in two partial sums (d0 and d1) which are
 * added into d0 before returning. N <= 0 or a zero stride skips all
 * loops, so the result is 0.0.
 *
 * Unit strides on both vectors select the contiguous path (KERNEL_F4 /
 * KERNEL_F1); anything else uses the strided path (KERNEL_S4 / KERNEL_S1).
 */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp = address of the saved fp slot
sub sp, sp, #STACKSIZE // reserve stack

sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers

mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y

// Clear both partial sums from an integer zero. Subtracting a register
// from itself is not safe: d0 and d1 hold whatever the caller left in
// them, and NaN - NaN or Inf - Inf gives NaN instead of 0.0.
mov r4, #0 // r4 is free again after the vstm above
vmov d0, r4, r4
vmov d1, r4, r4

cmp N, #0
ble ddot_kernel_L999 // nothing to do for N <= 0

cmp INC_X, #0
beq ddot_kernel_L999

cmp INC_Y, #0
beq ddot_kernel_L999

cmp INC_X, #1
bne ddot_kernel_S_BEGIN

cmp INC_Y, #1
bne ddot_kernel_S_BEGIN

ddot_kernel_F_BEGIN:

asrs I, N, #2 // I = N / 4
ble ddot_kernel_F1

// The loop body is unrolled twice: two KERNEL_F4 steps per branch back.
ddot_kernel_F4:

KERNEL_F4

subs I, I, #1
ble ddot_kernel_F1


KERNEL_F4

subs I, I, #1
bne ddot_kernel_F4

ddot_kernel_F1:

ands I, N, #3 // I = N % 4
ble ddot_kernel_L999

ddot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne ddot_kernel_F10

b ddot_kernel_L999

ddot_kernel_S_BEGIN:

lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE

asrs I, N, #2 // I = N / 4
ble ddot_kernel_S1

ddot_kernel_S4:

KERNEL_S4

subs I, I, #1
bne ddot_kernel_S4

ddot_kernel_S1:

ands I, N, #3 // I = N % 4
ble ddot_kernel_L999

ddot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne ddot_kernel_S10






ddot_kernel_L999:

sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers

vadd.f64 d0 , d0, d1 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 806
- 0
kernel/arm/dgemm_kernel_4x2_vfp.S View File

@@ -0,0 +1,806 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/27 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved by the prologue (sub sp, sp, #STACKSIZE). */
#define STACKSIZE 256

/* Incoming arguments held in registers. */
#define OLD_M r0
#define OLD_N r1
#define OLD_K r2
#define OLD_A r3
#define OLD_ALPHA d0

/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

/* Local copies kept in the scratch frame below fp. */
#define LDC [fp, #-252 ]
#define M [fp, #-256 ]
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#define A [fp, #-268 ]

#define ALPHA [fp, #-280]

/* Incoming stack arguments, located above the saved fp slot. */
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]

/* Loop counters. They reuse r0-r2 once the arguments are spilled. */
#define I r0
#define J r1
#define L r2

/* Running read pointers into A and B. */
#define AO r5
#define BO r6

/* Write pointers into C; CO2 is derived as CO1 + LDC in the SAVE macros. */
#define CO1 r8
#define CO2 r9

/* K held in a register, and the B pointer for the current column group. */
#define K1 r7
#define BC r12

/* Prefetch distances in bytes for the pld instructions. */
#define A_PRE 96
#define B_PRE 96
#define C_PRE 32

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * INIT4x2: clear the eight accumulators d8-d15 of a 4x2 tile.
 *
 * The zero is built from an integer register. Subtracting d8 from itself
 * would give NaN whenever d8 still holds NaN or Inf (left by the caller
 * or by the previous tile), which would poison the new tile.
 * Clobbers r3, which is scratch at every place this macro is expanded.
 */
.macro INIT4x2

mov r3, #0
vmov d8, r3, r3 // d8 = +0.0
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8
vmov.f64 d12, d8
vmov.f64 d13, d8
vmov.f64 d14, d8
vmov.f64 d15, d8

.endm



/*
 * KERNEL4x2_SUB: one k-step of the 4x2 tile.
 * Reads four doubles a0-a3 from AO and two doubles b0, b1 from BO, then
 *   d8..d11  += a0..a3 * b0
 *   d12..d15 += a0..a3 * b1
 * and advances AO by 32 bytes and BO by 16 bytes. Loads, pointer updates
 * and multiply-adds are interleaved.
 */
.macro KERNEL4x2_SUB

pld [ AO, #A_PRE ]
fldd d4 , [ BO ]

fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]

fmacd d8 , d0, d4
fldd d2 , [ AO, #16 ]
fmacd d9 , d1, d4
fldd d3 , [ AO, #24 ]
fmacd d10 , d2, d4
fldd d5 , [ BO, #8 ]
fmacd d11 , d3, d4

fmacd d12 , d0, d5
fmacd d13 , d1, d5
add AO , AO, #32
fmacd d14 , d2, d5
add BO , BO, #16
fmacd d15 , d3, d5


.endm

/*
 * SAVE4x2: write back a 4x2 tile.
 * Sets CO2 = CO1 + LDC (LDC is a byte offset), then updates four doubles
 * at CO1 with alpha * d8..d11 and four doubles at CO2 with
 * alpha * d12..d15 (C += alpha * acc). CO1 is advanced by 32 bytes.
 * Clobbers r3, d0 and d4-d7.
 */
.macro SAVE4x2

ldr r3 , LDC
add CO2 , CO1, r3

fldd d0, ALPHA


fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
pld [ CO1, #C_PRE ]
fmacd d4 , d0 , d8
fldd d6 , [CO1, #16 ]
fmacd d5 , d0 , d9
fldd d7 , [CO1, #24 ]
fmacd d6 , d0 , d10
fstd d4 , [CO1]
fmacd d7 , d0 , d11

fstd d5 , [CO1, #8 ]
fstd d6 , [CO1, #16 ]
fstd d7 , [CO1, #24 ]

fldd d4 , [CO2]
fldd d5 , [CO2, #8 ]

pld [ CO2, #C_PRE ]
fmacd d4 , d0 , d12
fldd d6 , [CO2, #16 ]
fmacd d5 , d0 , d13
fldd d7 , [CO2, #24 ]
fmacd d6 , d0 , d14
fstd d4 , [CO2]
fmacd d7 , d0 , d15
add CO1, CO1, #32

fstd d5 , [CO2, #8 ]
fstd d6 , [CO2, #16 ]
fstd d7 , [CO2, #24 ]


.endm


/******************************************************************************/

/*
 * INIT2x2: clear the four accumulators d8, d9, d12, d13 of a 2x2 tile.
 *
 * The zero is built from an integer register because d8 - d8 is NaN when
 * d8 still holds NaN or Inf from earlier work.
 * Clobbers r3, which is scratch at every place this macro is expanded.
 */
.macro INIT2x2

mov r3, #0
vmov d8, r3, r3 // d8 = +0.0
vmov.f64 d9, d8
vmov.f64 d12, d8
vmov.f64 d13, d8

.endm

/*
 * KERNEL2x2_SUB: one k-step of the 2x2 tile.
 * Reads a0, a1 from AO and b0, b1 from BO, then
 *   d8, d9   += a0, a1 * b0
 *   d12, d13 += a0, a1 * b1
 * and advances AO and BO by 16 bytes each.
 */
.macro KERNEL2x2_SUB

fldd d4 , [ BO ]
fldd d5 , [ BO, #8 ]

fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]

fmacd d8 , d0, d4
fmacd d9 , d1, d4

fmacd d12 , d0, d5
fmacd d13 , d1, d5

add AO , AO, #16
add BO , BO, #16

.endm

/*
 * SAVE2x2: write back a 2x2 tile.
 * Sets CO2 = CO1 + LDC, updates two doubles at CO1 with alpha * d8, d9
 * and two doubles at CO2 with alpha * d12, d13 (C += alpha * acc), then
 * advances CO1 by 16 bytes. Clobbers r3, d0, d4 and d5.
 */
.macro SAVE2x2

ldr r3 , LDC
add CO2 , CO1, r3

fldd d0, ALPHA

fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
fmacd d4 , d0 , d8
fmacd d5 , d0 , d9

fstd d4 , [CO1]
fstd d5 , [CO1, #8 ]

fldd d4 , [CO2]
fldd d5 , [CO2, #8 ]

fmacd d4 , d0 , d12
fmacd d5 , d0 , d13

fstd d4 , [CO2]
fstd d5 , [CO2, #8 ]

add CO1, CO1, #16

.endm


/******************************************************************************/

/*
 * INIT1x2: clear the two accumulators d8 and d12 of a 1x2 tile.
 *
 * The zero is built from an integer register because d8 - d8 is NaN when
 * d8 still holds NaN or Inf from earlier work.
 * Clobbers r3, which is scratch at every place this macro is expanded.
 */
.macro INIT1x2

mov r3, #0
vmov d8, r3, r3 // d8 = +0.0
vmov.f64 d12, d8

.endm

/*
 * KERNEL1x2_SUB: one k-step of the 1x2 tile.
 * Reads a0 from AO and b0, b1 from BO, then d8 += a0 * b0 and
 * d12 += a0 * b1. Advances AO by 8 bytes and BO by 16 bytes.
 */
.macro KERNEL1x2_SUB

fldd d4 , [ BO ]
fldd d5 , [ BO, #8 ]

fldd d0 , [ AO ]

fmacd d8 , d0, d4

fmacd d12 , d0, d5

add AO , AO, #8
add BO , BO, #16

.endm

/*
 * SAVE1x2: write back a 1x2 tile.
 * Sets CO2 = CO1 + LDC, updates one double at CO1 with alpha * d8 and one
 * double at CO2 with alpha * d12 (C += alpha * acc), then advances CO1 by
 * 8 bytes. Clobbers r3, d0 and d4.
 */
.macro SAVE1x2

ldr r3 , LDC
add CO2 , CO1, r3

fldd d0, ALPHA

fldd d4 , [CO1]
fmacd d4 , d0 , d8

fstd d4 , [CO1]

fldd d4 , [CO2]

fmacd d4 , d0 , d12

fstd d4 , [CO2]

add CO1, CO1, #8

.endm



/******************************************************************************/

/*
 * INIT4x1: clear the four accumulators d8-d11 of a 4x1 tile.
 *
 * The zero is built from an integer register because d8 - d8 is NaN when
 * d8 still holds NaN or Inf from earlier work.
 * Clobbers r3, which is scratch at every place this macro is expanded.
 */
.macro INIT4x1

mov r3, #0
vmov d8, r3, r3 // d8 = +0.0
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8

.endm



/*
 * KERNEL4x1_SUB: one k-step of the 4x1 tile.
 * Reads a0-a3 from AO and b0 from BO, then d8..d11 += a0..a3 * b0.
 * Advances AO by 32 bytes and BO by 8 bytes.
 */
.macro KERNEL4x1_SUB

fldd d4 , [ BO ]

fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]
fldd d2 , [ AO, #16 ]
fldd d3 , [ AO, #24 ]

fmacd d8 , d0, d4
fmacd d9 , d1, d4
fmacd d10 , d2, d4
fmacd d11 , d3, d4

add AO , AO, #32
add BO , BO, #8

.endm

/*
 * SAVE4x1: write back a 4x1 tile.
 * Updates four doubles at CO1 with alpha * d8..d11 (C += alpha * acc) and
 * advances CO1 by 32 bytes. Clobbers d0 and d4-d7.
 */
.macro SAVE4x1

fldd d0, ALPHA

fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
fldd d6 , [CO1, #16 ]
fldd d7 , [CO1, #24 ]
fmacd d4 , d0 , d8
fmacd d5 , d0 , d9
fmacd d6 , d0 , d10
fmacd d7 , d0 , d11

fstd d4 , [CO1]
fstd d5 , [CO1, #8 ]
fstd d6 , [CO1, #16 ]
fstd d7 , [CO1, #24 ]

add CO1, CO1, #32

.endm

/******************************************************************************/

/*
 * INIT2x1: clear the two accumulators d8 and d9 of a 2x1 tile.
 *
 * The zero is built from an integer register because d8 - d8 is NaN when
 * d8 still holds NaN or Inf from earlier work.
 * Clobbers r3, which is scratch at every place this macro is expanded.
 */
.macro INIT2x1

mov r3, #0
vmov d8, r3, r3 // d8 = +0.0
vmov.f64 d9 , d8

.endm

/*
 * KERNEL2x1_SUB: one k-step of the 2x1 tile.
 * Reads a0, a1 from AO and b0 from BO, then d8 += a0 * b0 and
 * d9 += a1 * b0. Advances AO by 16 bytes and BO by 8 bytes.
 */
.macro KERNEL2x1_SUB

fldd d4 , [ BO ]

fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]

fmacd d8 , d0, d4
fmacd d9 , d1, d4

add AO , AO, #16
add BO , BO, #8

.endm

/*
 * SAVE2x1: write back a 2x1 tile.
 * Updates two doubles at CO1 with alpha * d8, d9 (C += alpha * acc) and
 * advances CO1 by 16 bytes. Clobbers d0, d4 and d5.
 */
.macro SAVE2x1

fldd d0, ALPHA

fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
fmacd d4 , d0 , d8
fmacd d5 , d0 , d9

fstd d4 , [CO1]
fstd d5 , [CO1, #8 ]

add CO1, CO1, #16

.endm


/******************************************************************************/

/*
 * INIT1x1: clear the single accumulator d8 of a 1x1 tile.
 *
 * The zero is built from an integer register because d8 - d8 is NaN when
 * d8 still holds NaN or Inf from earlier work.
 * Clobbers r3, which is scratch at every place this macro is expanded.
 */
.macro INIT1x1

mov r3, #0
vmov d8, r3, r3 // d8 = +0.0

.endm

/*
 * KERNEL1x1_SUB: one k-step of the 1x1 tile.
 * Reads a0 from AO and b0 from BO, then d8 += a0 * b0. Advances AO and BO
 * by 8 bytes each.
 */
.macro KERNEL1x1_SUB

fldd d4 , [ BO ]

fldd d0 , [ AO ]

fmacd d8 , d0, d4

add AO , AO, #8
add BO , BO, #8

.endm

/*
 * SAVE1x1: write back a 1x1 tile.
 * Updates one double at CO1 with alpha * d8 (C += alpha * acc) and
 * advances CO1 by 8 bytes. Clobbers d0 and d4.
 */
.macro SAVE1x1

fldd d0, ALPHA

fldd d4 , [CO1]
fmacd d4 , d0 , d8

fstd d4 , [CO1]

add CO1, CO1, #8

.endm


/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Double precision GEMM micro kernel with a 4x2 register tile.
 *
 * Arguments (see the defines of this file): M in r0, N in r1, K in r2,
 * A in r3, alpha in d0, and B, C, LDC on the stack. For every tile the
 * accumulators are cleared (INIT*), K multiply-add steps are run
 * (KERNEL*_SUB, unrolled by 8 plus a remainder loop) and the tile is
 * added to C scaled by alpha (SAVE*).
 *
 * Loop structure:
 *   J loop over pairs of columns (N / 2), then one odd column if N & 1
 *     I loop over groups of 4 rows (M / 4), then 2 rows, then 1 row
 *
 * Returns 0 in r0.
 */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp = address of the saved fp slot
sub sp, sp, #STACKSIZE // reserve stack

// Spill the register arguments so r0-r3 can be reused as counters/scratch.
str OLD_M, M
str OLD_N, N
str OLD_K, K
str OLD_A, A
vstr OLD_ALPHA, ALPHA

sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers

ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 8
str r3, LDC

ldr K1, K
ldr BC, B

// First flag test of the routine. The condition flags are undefined on
// entry and asrs does not write V, so "asrs + ble" would evaluate LE
// (Z == 1 or N != V) with a stale V. cmp defines all four flags, and V
// stays clear afterwards because the only later writers of V are subs
// instructions on positive loop counters.
ldr J, N
asr J, J, #1 // J = N / 2
cmp J, #0
ble dgemm_kernel_L1_BEGIN


/*********************************************************************************************/

dgemm_kernel_L2_BEGIN:

ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
add r3 , r4, CO1
str r3 , C // store C

ldr AO, A // AO = A

dgemm_kernel_L2_M4_BEGIN:

ldr I, M
asrs I, I, #2 // I = I / 4
ble dgemm_kernel_L2_M2_BEGIN

dgemm_kernel_L2_M4_20:

INIT4x2

mov BO, BC
asrs L , K1, #3 // L = L / 8
ble dgemm_kernel_L2_M4_40
.align 5

dgemm_kernel_L2_M4_22:

pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB

pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB

subs L, L, #1
bgt dgemm_kernel_L2_M4_22

dgemm_kernel_L2_M4_40:
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L2_M4_100

dgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs L, L, #1
bgt dgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:

SAVE4x2

dgemm_kernel_L2_M4_END:

subs I, I, #1
bgt dgemm_kernel_L2_M4_20


dgemm_kernel_L2_M2_BEGIN:

ldr I, M
tst I , #3 // any rows left over after the groups of 4?
ble dgemm_kernel_L2_END

tst I, #2 // a group of 2 rows left over?
ble dgemm_kernel_L2_M1_BEGIN

dgemm_kernel_L2_M2_20:

INIT2x2

mov BO, BC
asrs L , K1, #3 // L = L / 8
ble dgemm_kernel_L2_M2_40

dgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB

KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB

subs L, L, #1
bgt dgemm_kernel_L2_M2_22

dgemm_kernel_L2_M2_40:
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L2_M2_100

dgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs L, L, #1
bgt dgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:

SAVE2x2

dgemm_kernel_L2_M2_END:


dgemm_kernel_L2_M1_BEGIN:

tst I, #1 // a single row left over?
ble dgemm_kernel_L2_END

dgemm_kernel_L2_M1_20:

INIT1x2

mov BO, BC
asrs L , K1, #3 // L = L / 8
ble dgemm_kernel_L2_M1_40

dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB

KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB

subs L, L, #1
bgt dgemm_kernel_L2_M1_22

dgemm_kernel_L2_M1_40:
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L2_M1_100

dgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs L, L, #1
bgt dgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:

SAVE1x2


dgemm_kernel_L2_END:

mov r3, BC
mov r4, K1
lsl r4, r4, #4 // k * 2 * 8
add r3, r3, r4 // B = B + K * 2 * 8
mov BC, r3

subs J , #1 // j--
bgt dgemm_kernel_L2_BEGIN

/*********************************************************************************************/

dgemm_kernel_L1_BEGIN:

ldr J , N
tst J , #1 // odd column left over?
ble dgemm_kernel_L999

ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
str r3 , C // store C

ldr AO, A // AO = A



dgemm_kernel_L1_M4_BEGIN:

ldr I, M
asrs I, I, #2 // I = I / 4
ble dgemm_kernel_L1_M2_BEGIN

dgemm_kernel_L1_M4_20:

INIT4x1

mov BO, BC
asrs L , K1, #3 // L = L / 8
ble dgemm_kernel_L1_M4_40
.align 5

dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB

KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB

subs L, L, #1
bgt dgemm_kernel_L1_M4_22

dgemm_kernel_L1_M4_40:
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L1_M4_100

dgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs L, L, #1
bgt dgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:

SAVE4x1

dgemm_kernel_L1_M4_END:

subs I, I, #1
bgt dgemm_kernel_L1_M4_20


dgemm_kernel_L1_M2_BEGIN:

ldr I, M
tst I , #3 // any rows left over after the groups of 4?
ble dgemm_kernel_L1_END

tst I, #2 // a group of 2 rows left over?
ble dgemm_kernel_L1_M1_BEGIN

dgemm_kernel_L1_M2_20:

INIT2x1

mov BO, BC
asrs L , K1, #3 // L = L / 8
ble dgemm_kernel_L1_M2_40

dgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB

KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB

subs L, L, #1
bgt dgemm_kernel_L1_M2_22

dgemm_kernel_L1_M2_40:
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L1_M2_100

dgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs L, L, #1
bgt dgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:

SAVE2x1

dgemm_kernel_L1_M2_END:


dgemm_kernel_L1_M1_BEGIN:

tst I, #1 // a single row left over?
ble dgemm_kernel_L1_END

dgemm_kernel_L1_M1_20:

INIT1x1

mov BO, BC
asrs L , K1, #3 // L = L / 8
ble dgemm_kernel_L1_M1_40

dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB

KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB

subs L, L, #1
bgt dgemm_kernel_L1_M1_22

dgemm_kernel_L1_M1_40:
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L1_M1_100

dgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs L, L, #1
bgt dgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:

SAVE1x1


dgemm_kernel_L1_END:


dgemm_kernel_L999:

sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers

movs r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 1483
- 0
kernel/arm/dgemm_kernel_4x4_vfpv3.S
File diff suppressed because it is too large
View File


+ 225
- 0
kernel/arm/dgemm_ncopy_2_vfp.S View File

@@ -0,0 +1,225 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/24 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Not referenced by the code of this file: no scratch frame is reserved. */
#define STACKSIZE 256

/* Incoming arguments held in core registers. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3

/* Destination pointer: read from the stack word above the saved fp slot. */
#define B [fp, #4 ]

/* M, N and A stay in their argument registers. */
#define M r0
#define N r1
#define A r2

/* Running write pointer into the destination buffer. */
#define BO r5

/* Read pointers for the two source vectors, and their spacing in bytes. */
#define AO1 r6
#define AO2 r7
#define LDA r8

/* Loop counters. I reuses r3 once LDA has been scaled into r8. */
#define I r3
#define J r12

/* Not referenced by the code of this file. */
#define A_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * COPY2x2: copy two doubles from AO1 and two from AO2 to BO, interleaved
 * as AO1[0], AO2[0], AO1[1], AO2[1]. Advances AO1 and AO2 by 16 bytes and
 * BO by 32 bytes.
 */
.macro COPY2x2

fldd d0 , [ AO1, #0 ]
fldd d2 , [ AO1, #8 ]

fldd d1 , [ AO2, #0 ]
fldd d3 , [ AO2, #8 ]

add AO1, AO1, #16
fstmiad BO!, { d0 - d3 }
add AO2, AO2, #16

.endm


/*
 * COPY1x2: copy one double from AO1 and one from AO2 to BO, in that
 * order. Advances AO1 and AO2 by 8 bytes and BO by 16 bytes.
 */
.macro COPY1x2

fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO2, #0 ]
add AO1, AO1, #8

fstmiad BO!, { d0 - d1 }
add AO2, AO2, #8

.endm

/*
 * COPY2x1: copy two consecutive doubles from AO1 to BO. Advances AO1 and
 * BO by 16 bytes.
 */
.macro COPY2x1

fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]

fstmiad BO!, { d0 - d1 }
add AO1, AO1, #16

.endm


/*
 * COPY1x1: copy one double from AO1 to BO. Advances AO1 and BO by 8 bytes.
 */
.macro COPY1x1

fldd d0 , [ AO1, #0 ]

fstmiad BO!, { d0 }
add AO1, AO1, #8

.endm





/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Copy routine with an unroll of 2.
 *
 * Arguments (see the defines of this file): M in r0, N in r1, A in r2,
 * LDA in r3 (in elements, scaled to bytes here) and the destination B on
 * the stack. The source is walked as N vectors of M doubles spaced LDA
 * bytes apart. Vectors are taken two at a time and their elements are
 * interleaved into B; an odd last vector is copied straight through.
 *
 * Returns 0 in r0.
 */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp = address of the saved fp slot

lsl LDA, OLD_LDA, #3 // lda = lda * 8

ldr BO, B


/*********************************************************************************************/

dgemm_ncopy_L2_BEGIN:

// First flag test of the routine. The condition flags are undefined on
// entry and asrs does not write V, so "asrs + ble" would evaluate LE
// (Z == 1 or N != V) with a stale V. cmp defines all four flags, and V
// stays clear afterwards because the only later writers of V are subs
// instructions on positive loop counters.
asr J, N, #1 // J = N / 2
cmp J, #0
ble dgemm_ncopy_L1_BEGIN

dgemm_ncopy_L2_M2_BEGIN:

mov AO1, A // AO1 = A
add AO2, AO1, LDA
add A , AO2, LDA // A = A + 2 * LDA

asrs I, M, #1 // I = M / 2
ble dgemm_ncopy_L2_M2_40

dgemm_ncopy_L2_M2_20:

COPY2x2

subs I , I , #1
bne dgemm_ncopy_L2_M2_20
dgemm_ncopy_L2_M2_40:

ands I, M , #1 // I = M % 2
ble dgemm_ncopy_L2_M2_END

dgemm_ncopy_L2_M2_60:

COPY1x2

subs I , I , #1
bne dgemm_ncopy_L2_M2_60

dgemm_ncopy_L2_M2_END:

subs J , J, #1 // j--
bne dgemm_ncopy_L2_M2_BEGIN

/*********************************************************************************************/

dgemm_ncopy_L1_BEGIN:

tst N, #1 // odd vector left over?
ble dgemm_ncopy_L999


dgemm_ncopy_L1_M2_BEGIN:

mov AO1, A // AO1 = A
add A , AO1, LDA // A = A + 1 * LDA

asrs I, M, #1 // I = M / 2
ble dgemm_ncopy_L1_M2_40

dgemm_ncopy_L1_M2_20:

COPY2x1

subs I , I , #1
bne dgemm_ncopy_L1_M2_20
dgemm_ncopy_L1_M2_40:

ands I, M , #1 // I = M % 2
ble dgemm_ncopy_L1_M2_END

dgemm_ncopy_L1_M2_60:

COPY1x1

subs I , I , #1
bne dgemm_ncopy_L1_M2_60

dgemm_ncopy_L1_M2_END:



dgemm_ncopy_L999:


movs r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 349
- 0
kernel/arm/dgemm_ncopy_4_vfp.S View File

@@ -0,0 +1,349 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/05 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved by the prologue (sub sp, sp, #STACKSIZE). */
#define STACKSIZE 256

/* Incoming arguments held in core registers. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

/* LDA scaled to bytes, kept in the scratch frame. */
#define LDA [fp, #-260 ]

/* Destination pointer: read from the stack word above the saved fp slot. */
#define B [fp, #4 ]

/* M, N and A stay in their argument registers. */
#define M r0
#define N r1
#define A r2

/* Running write pointer into the destination buffer. */
#define BO r5

/* Read pointers for up to four source vectors. */
#define AO1 r6
#define AO2 r7
#define AO3 r8
#define AO4 r9

/* Loop counters. I reuses r3 once LDA has been stored to the frame. */
#define I r3
#define J r12

/* Prefetch distance in bytes used by the pld instructions in COPY4x4. */
#define A_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * COPY4x4: copy four doubles from each of AO1..AO4 to BO, interleaved so
 * that element i of all four sources is stored together:
 *   d0-d3   = element 0 of AO1..AO4
 *   d4-d7   = element 1 of AO1..AO4
 *   d8-d11  = element 2 of AO1..AO4
 *   d12-d15 = element 3 of AO1..AO4
 * Advances every source pointer by 32 bytes and BO by 128 bytes.
 * Uses d8-d15, which the routine saves and restores around its loops.
 */
.macro COPY4x4

pld [ AO1, #A_PRE ]
pld [ AO2, #A_PRE ]
pld [ AO3, #A_PRE ]
pld [ AO4, #A_PRE ]

fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO2, #0 ]
fldd d2 , [ AO3, #0 ]
fldd d3 , [ AO4, #0 ]

fldd d4 , [ AO1, #8 ]
fldd d8 , [ AO1, #16 ]
fldd d12, [ AO1, #24 ]

fldd d5 , [ AO2, #8 ]
add AO1, AO1, #32
fldd d9 , [ AO2, #16 ]
fldd d13, [ AO2, #24 ]

fldd d6 , [ AO3, #8 ]
add AO2, AO2, #32
fldd d10, [ AO3, #16 ]
fldd d14, [ AO3, #24 ]

fldd d7 , [ AO4, #8 ]
add AO3, AO3, #32
fldd d11, [ AO4, #16 ]
fldd d15, [ AO4, #24 ]

fstmiad BO!, { d0 - d3 }
add AO4, AO4, #32
fstmiad BO!, { d4 - d7 }
fstmiad BO!, { d8 - d15 }

.endm

/*
 * COPY1x4: copy one double from each of AO1..AO4 to BO, in that order.
 * Advances every source pointer by 8 bytes and BO by 32 bytes.
 */
.macro COPY1x4

fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO2, #0 ]
add AO1, AO1, #8
fldd d2 , [ AO3, #0 ]
add AO2, AO2, #8
fldd d3 , [ AO4, #0 ]

add AO3, AO3, #8
fstmiad BO!, { d0 - d3 }
add AO4, AO4, #8

.endm

/*
 * COPY4x2: copy four doubles from AO1 and four from AO2 to BO,
 * interleaved as AO1[0], AO2[0], AO1[1], AO2[1], ... Advances AO1 and AO2
 * by 32 bytes and BO by 64 bytes.
 */
.macro COPY4x2

fldd d0 , [ AO1, #0 ]
fldd d2 , [ AO1, #8 ]
fldd d4 , [ AO1, #16 ]
fldd d6 , [ AO1, #24 ]

fldd d1 , [ AO2, #0 ]
fldd d3 , [ AO2, #8 ]
add AO1, AO1, #32
fldd d5 , [ AO2, #16 ]
fldd d7 , [ AO2, #24 ]

fstmiad BO!, { d0 - d7 }
add AO2, AO2, #32

.endm


/*
 * COPY1x2: copy one double from AO1 and one from AO2 to BO, in that
 * order. Advances AO1 and AO2 by 8 bytes and BO by 16 bytes.
 */
.macro COPY1x2

fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO2, #0 ]
add AO1, AO1, #8

fstmiad BO!, { d0 - d1 }
add AO2, AO2, #8

.endm

/*
 * COPY4x1: copy four consecutive doubles from AO1 to BO. Advances AO1 and
 * BO by 32 bytes.
 */
.macro COPY4x1

fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fldd d2 , [ AO1, #16 ]
fldd d3 , [ AO1, #24 ]

fstmiad BO!, { d0 - d3 }
add AO1, AO1, #32

.endm


/*
 * COPY1x1: copy one double from AO1 to BO. Advances AO1 and BO by 8 bytes.
 */
.macro COPY1x1

fldd d0 , [ AO1, #0 ]

fstmiad BO!, { d0 }
add AO1, AO1, #8

.endm





/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Copy routine with an unroll of 4.
 *
 * Arguments (see the defines of this file): M in r0, N in r1, A in r2,
 * LDA in r3 (in elements, scaled to bytes here) and the destination B on
 * the stack. The source is walked as N vectors of M doubles spaced LDA
 * bytes apart. Vectors are taken four at a time and interleaved into B,
 * then a remaining pair (N & 2) and a remaining single vector (N & 1).
 *
 * Returns 0 in r0.
 */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp = address of the saved fp slot
sub sp, sp, #STACKSIZE // reserve stack


lsl r3, r3, #3 // lda = lda * 8
str r3, LDA

sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers

ldr BO, B

dgemm_ncopy_L4_BEGIN:

// First flag test of the routine. The condition flags are undefined on
// entry and asrs does not write V, so "asrs + ble" would evaluate LE
// (Z == 1 or N != V) with a stale V. cmp defines all four flags, and V
// stays clear afterwards because the only later writers of V are subs
// instructions on positive loop counters.
asr J, N, #2 // J = N / 4
cmp J, #0
ble dgemm_ncopy_L2_BEGIN

dgemm_ncopy_L4_M4_BEGIN:

mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add AO3, AO2, r4
add AO4, AO3, r4
add A , AO4, r4 // A = A + 4 * LDA

asrs I, M, #2 // I = M / 4
ble dgemm_ncopy_L4_M4_40

dgemm_ncopy_L4_M4_20:

COPY4x4

subs I , I , #1
bne dgemm_ncopy_L4_M4_20
dgemm_ncopy_L4_M4_40:

ands I, M , #3 // I = M % 4
ble dgemm_ncopy_L4_M4_END

dgemm_ncopy_L4_M4_60:

COPY1x4

subs I , I , #1
bne dgemm_ncopy_L4_M4_60

dgemm_ncopy_L4_M4_END:

subs J , J, #1 // j--
bne dgemm_ncopy_L4_M4_BEGIN



/*********************************************************************************************/

dgemm_ncopy_L2_BEGIN:

tst N, #3 // any vectors left over after the groups of 4?
ble dgemm_ncopy_L999

tst N, #2 // a pair left over?
ble dgemm_ncopy_L1_BEGIN

dgemm_ncopy_L2_M4_BEGIN:

mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add A , AO2, r4 // A = A + 2 * LDA

asrs I, M, #2 // I = M / 4
ble dgemm_ncopy_L2_M4_40

dgemm_ncopy_L2_M4_20:

COPY4x2

subs I , I , #1
bne dgemm_ncopy_L2_M4_20
dgemm_ncopy_L2_M4_40:

ands I, M , #3 // I = M % 4
ble dgemm_ncopy_L2_M4_END

dgemm_ncopy_L2_M4_60:

COPY1x2

subs I , I , #1
bne dgemm_ncopy_L2_M4_60

dgemm_ncopy_L2_M4_END:


/*********************************************************************************************/

dgemm_ncopy_L1_BEGIN:

tst N, #1 // a single vector left over?
ble dgemm_ncopy_L999


dgemm_ncopy_L1_M4_BEGIN:

mov AO1, A // AO1 = A
ldr r4 , LDA
add A , AO1, r4 // A = A + 1 * LDA

asrs I, M, #2 // I = M / 4
ble dgemm_ncopy_L1_M4_40

dgemm_ncopy_L1_M4_20:

COPY4x1

subs I , I , #1
bne dgemm_ncopy_L1_M4_20
dgemm_ncopy_L1_M4_40:

ands I, M , #3 // I = M % 4
ble dgemm_ncopy_L1_M4_END

dgemm_ncopy_L1_M4_60:

COPY1x1

subs I , I , #1
bne dgemm_ncopy_L1_M4_60

dgemm_ncopy_L1_M4_END:



dgemm_ncopy_L999:

sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers

movs r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 408
- 0
kernel/arm/dgemm_tcopy_4_vfp.S View File

@@ -0,0 +1,408 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/06 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved by the prologue (sub sp, sp, #STACKSIZE). */
#define STACKSIZE 256

/* Incoming arguments held in core registers. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

/* B is the incoming stack argument; A is a local copy in the scratch frame. */
#define B [fp, #4 ]
#define A [fp, #-248 ]

/* M and N stay in their argument registers. M4 reuses r2 once A is spilled. */
#define M r0
#define N r1
#define M4 r2

/* Source spacing in bytes. */
#define LDA r5

/* Read pointer, and the three destination write pointers. */
#define AO1 r6
#define BO1 r7
#define BO2 r8
#define BO3 r9

/* Loop counters. */
#define I r4
#define J r12

/* Prefetch distance in bytes used by the pld instructions. */
#define A_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * COPY4x4: load four doubles from each of the four addresses AO1,
 * AO1 + LDA, AO1 + 2 * LDA and AO1 + 3 * LDA, and store the 16 values
 * contiguously at BO1. Advances AO1 by 32 bytes and BO1 by M4 bytes.
 * Uses r3 as a temporary source pointer.
 */
.macro COPY4x4

pld [ AO1, #A_PRE ]
fldmiad AO1, { d0 - d3 }

add r3, AO1, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d4 - d7 }

add r3, r3, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d8 - d11 }

add r3, r3, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d12 - d15 }

fstmiad BO1, { d0 - d15 }
add AO1, AO1, #32
add BO1, BO1, M4

.endm

/*
 * COPY2x4: load two doubles from each of the four addresses AO1,
 * AO1 + LDA, AO1 + 2 * LDA and AO1 + 3 * LDA, and store the 8 values
 * contiguously at BO2. Advances AO1 by 16 bytes and BO2 by 64 bytes.
 * Uses r3 as a temporary source pointer.
 */
.macro COPY2x4

fldmiad AO1, { d0 - d1 }

add r3, AO1, LDA
fldmiad r3, { d2 - d3 }

add r3, r3, LDA
fldmiad r3, { d4 - d5 }

add r3, r3, LDA
fldmiad r3, { d6 - d7 }

fstmiad BO2, { d0 - d7 }
add AO1, AO1, #16
add BO2, BO2, #64

.endm

/*
 * COPY1x4: load one double from each of the four addresses AO1,
 * AO1 + LDA, AO1 + 2 * LDA and AO1 + 3 * LDA, and store the 4 values
 * contiguously at BO3. Advances AO1 by 8 bytes and BO3 by 32 bytes.
 * Uses r3 as a temporary source pointer.
 */
.macro COPY1x4

fldmiad AO1, { d0 }

add r3, AO1, LDA
fldmiad r3, { d1 }

add r3, r3, LDA
fldmiad r3, { d2 }

add r3, r3, LDA
fldmiad r3, { d3 }

fstmiad BO3, { d0 - d3 }
add AO1, AO1, #8
add BO3, BO3, #32

.endm

/*************************************************************************************************************************/

/*
 * COPY4x2: load four doubles from AO1 and four from AO1 + LDA, and store
 * the 8 values contiguously at BO1. Advances AO1 by 32 bytes and BO1 by
 * M4 bytes. Uses r3 as a temporary source pointer.
 */
.macro COPY4x2

pld [ AO1, #A_PRE ]
fldmiad AO1, { d0 - d3 }

add r3, AO1, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d4 - d7 }

fstmiad BO1, { d0 - d7 }
add AO1, AO1, #32
add BO1, BO1, M4

.endm

/*
 * COPY2x2: load two doubles from AO1 and two from AO1 + LDA, and store
 * the 4 values contiguously at BO2. Advances AO1 by 16 bytes and BO2 by
 * 32 bytes. Uses r3 as a temporary source pointer.
 */
.macro COPY2x2

fldmiad AO1, { d0 - d1 }

add r3, AO1, LDA
fldmiad r3, { d2 - d3 }

fstmiad BO2, { d0 - d3 }
add AO1, AO1, #16
add BO2, BO2, #32

.endm

/*
 * COPY1x2: load one double from AO1 and one from AO1 + LDA, and store
 * both at BO3. Advances AO1 by 8 bytes and BO3 by 16 bytes. Uses r3 as a
 * temporary source pointer.
 */
.macro COPY1x2

fldmiad AO1, { d0 }

add r3, AO1, LDA
fldmiad r3, { d1 }

fstmiad BO3, { d0 - d1 }
add AO1, AO1, #8
add BO3, BO3, #16

.endm

/*************************************************************************************************************************/

/*
 * COPY4x1: copy four consecutive doubles from AO1 to BO1. Advances AO1 by
 * 32 bytes and BO1 by M4 bytes.
 */
.macro COPY4x1

pld [ AO1, #A_PRE ]
fldmiad AO1, { d0 - d3 }

fstmiad BO1, { d0 - d3 }
add AO1, AO1, #32
add BO1, BO1, M4

.endm

/*
 * COPY2x1: copy two consecutive doubles from AO1 to BO2. Advances AO1 and
 * BO2 by 16 bytes.
 */
.macro COPY2x1

fldmiad AO1, { d0 - d1 }

fstmiad BO2, { d0 - d1 }
add AO1, AO1, #16
add BO2, BO2, #16

.endm

/*
 * COPY1x1: copy one double from AO1 to BO3. Advances AO1 and BO3 by
 * 8 bytes.
 */
.macro COPY1x1

fldmiad AO1, { d0 }

fstmiad BO3, { d0 }
add AO1, AO1, #8
add BO3, BO3, #8

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Transposing copy routine with an unroll of 4.
 *
 * Arguments (see the defines of this file): M in r0, N in r1, A in r2,
 * LDA in r3 (in elements, scaled to bytes here) and the destination B on
 * the stack. The source is walked as M vectors of N doubles spaced LDA
 * bytes apart, taken four at a time, then a remaining pair (M & 2) and a
 * remaining single vector (M & 1).
 *
 * Three destination regions are filled in parallel:
 *   BO1 - groups of 4 elements per vector, starting at B
 *   BO2 - the leftover pair of elements (N & 2),
 *         starting at B + (N & ~3) * M * 8
 *   BO3 - the leftover single element (N & 1),
 *         starting at B + (N & ~1) * M * 8
 *
 * The A copy in the frame and the B stack slot are both used as running
 * pointers and are overwritten. Returns 0 in r0.
 */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp = address of the saved fp slot
sub sp, sp, #STACKSIZE // reserve stack

str OLD_A, A // store A

lsl LDA, OLD_LDA, #3 // lda = lda * SIZE

sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers

lsl r4 , M, #3 // M * SIZE

ldr r3, B

and BO2 , N , #-4 // N rounded down to a multiple of 4
and BO3 , N , #-2 // N rounded down to a multiple of 2

mul BO2, BO2, r4
mul BO3, BO3, r4

add BO2 , BO2, r3
add BO3 , BO3, r3

lsl M4, M, #5 // M4 = M * 4 * SIZE

dgemm_tcopy_L4_BEGIN:

// First flag test of the routine. The condition flags are undefined on
// entry and asrs does not write V, so "asrs + ble" would evaluate LE
// (Z == 1 or N != V) with a stale V. cmp defines all four flags, and V
// stays clear afterwards because the only later writers of V are subs
// instructions on positive loop counters.
asr J, M, #2 // J = M / 4
cmp J, #0
ble dgemm_tcopy_L2_BEGIN

dgemm_tcopy_L4_M4_BEGIN:

ldr AO1, A // AO1 = A
lsl r3, LDA, #2 // r3 = 4 * LDA
add r3, r3 , AO1 // A = A + 4 * LDA
str r3, A // store A

ldr BO1, B
add r3, BO1, #128 // B = B + 16 * SIZE
str r3, B

asrs I, N, #2 // I = N / 4
ble dgemm_tcopy_L4_M4_40

dgemm_tcopy_L4_M4_20:

COPY4x4

subs I , I , #1
bne dgemm_tcopy_L4_M4_20
dgemm_tcopy_L4_M4_40:

tst N , #2 // a pair of elements left over?
ble dgemm_tcopy_L4_M4_60

COPY2x4

dgemm_tcopy_L4_M4_60:

tst N, #1 // a single element left over?
ble dgemm_tcopy_L4_M4_END

COPY1x4

dgemm_tcopy_L4_M4_END:

subs J , J, #1 // j--
bne dgemm_tcopy_L4_M4_BEGIN



/*********************************************************************************************/

dgemm_tcopy_L2_BEGIN:

tst M, #3 // any vectors left over after the groups of 4?
ble dgemm_tcopy_L999

tst M, #2 // a pair of vectors left over?
ble dgemm_tcopy_L1_BEGIN

dgemm_tcopy_L2_M4_BEGIN:

ldr AO1, A // AO1 = A
lsl r3, LDA, #1 // r3 = 2 * LDA
add r3, r3 , AO1 // A = A + 2 * LDA
str r3, A // store A

ldr BO1, B
add r3, BO1, #64 // B = B + 8 * SIZE
str r3, B

asrs I, N, #2 // I = N / 4
ble dgemm_tcopy_L2_M4_40

dgemm_tcopy_L2_M4_20:

COPY4x2

subs I , I , #1
bne dgemm_tcopy_L2_M4_20
dgemm_tcopy_L2_M4_40:

tst N , #2 // a pair of elements left over?
ble dgemm_tcopy_L2_M4_60

COPY2x2

dgemm_tcopy_L2_M4_60:

tst N , #1 // a single element left over?
ble dgemm_tcopy_L2_M4_END

COPY1x2


dgemm_tcopy_L2_M4_END:


/*********************************************************************************************/

dgemm_tcopy_L1_BEGIN:

tst M, #1 // a single vector left over?
ble dgemm_tcopy_L999


dgemm_tcopy_L1_M4_BEGIN:

ldr AO1, A // AO1 = A
add r3, LDA , AO1 // A = A + 1 * LDA
str r3, A // store A

ldr BO1, B
add r3, BO1, #32 // B = B + 4 * SIZE
str r3, B

asrs I, N, #2 // I = N / 4
ble dgemm_tcopy_L1_M4_40

dgemm_tcopy_L1_M4_20:

COPY4x1

subs I , I , #1
bne dgemm_tcopy_L1_M4_20
dgemm_tcopy_L1_M4_40:

tst N , #2 // a pair of elements left over?
ble dgemm_tcopy_L1_M4_60

COPY2x1

dgemm_tcopy_L1_M4_60:

tst N , #1 // a single element left over?
ble dgemm_tcopy_L1_M4_END

COPY1x1


dgemm_tcopy_L1_M4_END:



dgemm_tcopy_L999:

sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers

mov r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 64
- 0
kernel/arm/dot.c View File

@@ -0,0 +1,64 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

/*
 * Reference dot product kernel.
 *
 * Returns the sum over i in [0, n) of x[i * inc_x] * y[i * inc_y].
 * The sum is always accumulated in double precision; a non-positive n
 * yields 0.
 *
 * When built with DSDOT the return type is double and each product is
 * formed in double precision as well, so that no precision is lost in a
 * single precision multiply before the accumulation.
 */
#if defined(DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
	BLASLONG i = 0;
	BLASLONG ix = 0, iy = 0;
	double dot = 0.0;

	if (n < 0) return(dot);

	while (i < n)
	{
#if defined(DSDOT)
		/* widen both operands first: the product itself must be double */
		dot += (double)y[iy] * (double)x[ix];
#else
		dot += y[iy] * x[ix];
#endif
		ix += inc_x;
		iy += inc_y;
		i++;
	}

	return(dot);
}


+ 1089
- 0
kernel/arm/dtrmm_kernel_4x2_vfp.S
File diff suppressed because it is too large
View File


+ 1953
- 0
kernel/arm/dtrmm_kernel_4x4_vfpv3.S
File diff suppressed because it is too large
View File


+ 67
- 0
kernel/arm/gemv_n.c View File

@@ -0,0 +1,67 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


/**************************************************************************************
* * 2013/09/14 Saar
* * BLASTEST float : OK
* * BLASTEST double : OK
* CTEST : OK
* TEST : OK
* *
* **************************************************************************************/


#include "common.h"

/*
 * Reference kernel for y += alpha * A * x (A not transposed).
 *
 * m, n     : number of rows and columns of A
 * dummy1   : unused
 * alpha    : scalar applied to every product
 * a, lda   : matrix stored column by column, lda elements between columns
 * x, inc_x : input vector of n elements and its stride
 * y, inc_y : vector of m elements updated in place and its stride
 * buffer   : unused
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG ix, iy;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT temp;

	ix = 0;
	a_ptr = a;

	/* one column of A per iteration: y += (alpha * x[j]) * A(:, j) */
	for (j = 0; j < n; j++)
	{
		temp = alpha * x[ix];
		iy = 0;
		for (i = 0; i < m; i++)
		{
			y[iy] += temp * a_ptr[i];
			iy += inc_y;
		}
		a_ptr += lda;
		ix += inc_x;
	}

	/* the function is declared int, so it must return a defined value */
	return(0);
}


+ 740
- 0
kernel/arm/gemv_n_vfp.S View File

@@ -0,0 +1,740 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved below the saved core registers. */
#define STACKSIZE 256

/*
 * Incoming arguments. The prologue pushes seven core registers and then sets
 * fp = sp + 28, so fp holds the stack pointer value at entry and the
 * stack-passed arguments start at offset 0 from fp. OLD_A and OLD_M arrive in
 * core registers and are spilled to the A and M slots before reuse.
 */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#define OLD_M r0

/* AO1: read cursor into A, N: column count, J: inner (column) loop counter. */
#define AO1 r0
#define N r1
#define J r2

/*
 * AO2: pointer used only as a prefetch address, XO / YO: cursors into x and y,
 * LDA / INC_X / INC_Y: strides, converted to bytes by the kernel before use.
 */
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9

/* I: outer (row tile) loop counter. */
#define I r12

/* Spill slots inside the reserved stack area. */
#define M [fp, #-252 ]
#define A [fp, #-256 ]


/* Byte offsets added to XO, YO and AO2 by the pld prefetch instructions. */
#define X_PRE 64
#define Y_PRE 0
#define A_PRE 0

/**************************************************************************************
* Macro definitions
**************************************************************************************/


#if defined(DOUBLE)

/*
 * INIT_F8 (double): prefetch the next results of y and clear the eight
 * accumulators d8-d15.
 *
 * The zero is moved in from a core register instead of computing d8 - d8,
 * because a subtraction returns NaN when the stale register content is NaN
 * or an infinity. r3 is used as scratch: every expansion site in this file
 * has stored r3 already and rewrites it before the next read.
 */
.macro INIT_F8

pld [ YO , #Y_PRE ]
pld [ YO , #Y_PRE+32 ]

mov r3 , #0
vmov d8 , r3 , r3 // d8 = +0.0 (all bits zero)
vmov.f64 d9 , d8
vmov.f64 d10 , d8
vmov.f64 d11 , d8
vmov.f64 d12 , d8
vmov.f64 d13 , d8
vmov.f64 d14 , d8
vmov.f64 d15 , d8

.endm

/*
 * KERNEL_F8X8 (double, unit strides): eight KERNEL_F8X1 steps, i.e. eight
 * columns of A applied to the current tile of 8 rows, with a prefetch of x
 * before each group of four.
 */
.macro KERNEL_F8X8

pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1

pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1

.endm


/*
 * KERNEL_F8X1 (double, unit strides): one column step for a tile of 8 rows.
 * Loads one element of x into d2 (XO is post-incremented), then accumulates
 * d2 * A[0..7] into d8-d15. The column is read in two halves of four
 * elements through d4-d7; r3 is scratch for the address of the second half.
 * AO1 and AO2 both advance by LDA; AO2 is only used as a prefetch address.
 */
.macro KERNEL_F8X1

pld [ AO2 , #A_PRE ]
fldmiad XO! , { d2 }
fldmiad AO1 , { d4 - d7 }

vmla.f64 d8 , d2 , d4
pld [ AO2 , #4*SIZE ]
vmla.f64 d9 , d2 , d5
add r3, AO1, #4*SIZE // r3 = address of rows 4..7 of this column
vmla.f64 d10 , d2 , d6
vmla.f64 d11 , d2 , d7


fldmiad r3 , { d4 - d7 }

vmla.f64 d12 , d2 , d4
vmla.f64 d13 , d2 , d5
add AO1, AO1, LDA // next column
vmla.f64 d14 , d2 , d6
add AO2, AO2, LDA
vmla.f64 d15 , d2 , d7


.endm

/*
 * SAVE_F8 (double, unit strides): y[0..7] += d0 * (d8..d15), done as two
 * groups of four; YO advances past the 8 updated elements.
 * NOTE(review): d0 is never written in this file; it presumably still holds
 * the alpha argument as passed by the caller - confirm against the ABI used.
 */
.macro SAVE_F8

fldmiad YO, { d4 - d7 }

vmla.f64 d4 , d0, d8
vmla.f64 d5 , d0, d9
vmla.f64 d6 , d0, d10
vmla.f64 d7 , d0, d11

fstmiad YO!, { d4 - d7 }

fldmiad YO, { d4 - d7 }

vmla.f64 d4 , d0, d12
vmla.f64 d5 , d0, d13
vmla.f64 d6 , d0, d14
vmla.f64 d7 , d0, d15

fstmiad YO!, { d4 - d7 }

.endm


/*
 * INIT_F1 (double): clear the single accumulator d12.
 * A zero is moved in through r3 instead of computing d12 - d12, which would
 * give NaN if d12 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_F1

mov r3 , #0
vmov d12 , r3 , r3 // d12 = +0.0

.endm



/*
 * KERNEL_F1X1 (double, unit strides): one column step for a single row.
 * d12 += x[j] * A[row, j]; XO is post-incremented, AO1 advances by LDA.
 */
.macro KERNEL_F1X1

fldmiad XO! , { d2 }
fldmiad AO1 , { d8 }
vmla.f64 d12 , d2 , d8
add AO1, AO1, LDA

.endm

/*
 * SAVE_F1 (double, unit strides): y[0] += d0 * d12; YO advances by one element.
 * NOTE(review): d0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_F1

fldmiad YO, { d4 }
vmla.f64 d4, d0, d12
fstmiad YO!, { d4 }

.endm

/*********************************************************************************************/

/*
 * INIT_S4 (double): clear the four accumulators d12-d15 of the strided path.
 * A zero is moved in through r3 instead of computing d12 - d12, which would
 * give NaN if d12 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_S4

mov r3 , #0
vmov d12 , r3 , r3 // d12 = +0.0
vmov.f64 d13 , d12
vmov.f64 d14 , d12
vmov.f64 d15 , d12

.endm

/*
 * KERNEL_S4X4 (double, strided): four KERNEL_S4X1 steps, i.e. four columns
 * of A applied to the current tile of 4 rows.
 */
.macro KERNEL_S4X4

KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1

.endm


/*
 * KERNEL_S4X1 (double, strided): one column step for a tile of 4 rows.
 * Loads x[j] into d2 and A[0..3, j] into d8-d11, accumulates the products
 * into d12-d15, then advances AO1 / AO2 by LDA and XO by INC_X (all byte
 * strides). AO2 is only used as a prefetch address.
 */
.macro KERNEL_S4X1

pld [ AO2 , #A_PRE ]
fldmiad XO , { d2 }
fldmiad AO1 , { d8 - d11 }

vmla.f64 d12 , d2 , d8
add AO1, AO1, LDA
vmla.f64 d13 , d2 , d9
add AO2, AO2, LDA
vmla.f64 d14 , d2 , d10
vmla.f64 d15 , d2 , d11
add XO, XO , INC_X

.endm

/*
 * SAVE_S4 (double, strided): for each of the four accumulators d12-d15,
 * y[k] += d0 * acc, stepping YO by INC_Y after every element.
 * NOTE(review): d0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_S4

fldmiad YO, { d4 }
vmla.f64 d4 , d0, d12
fstmiad YO, { d4 }
add YO, YO, INC_Y

fldmiad YO, { d5 }
vmla.f64 d5 , d0, d13
fstmiad YO, { d5 }
add YO, YO, INC_Y

fldmiad YO, { d4 }
vmla.f64 d4 , d0, d14
fstmiad YO, { d4 }
add YO, YO, INC_Y

fldmiad YO, { d5 }
vmla.f64 d5 , d0, d15
fstmiad YO, { d5 }
add YO, YO, INC_Y

.endm


/*
 * INIT_S1 (double): clear the single accumulator d12 of the strided path.
 * A zero is moved in through r3 instead of computing d12 - d12, which would
 * give NaN if d12 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_S1

mov r3 , #0
vmov d12 , r3 , r3 // d12 = +0.0

.endm



/*
 * KERNEL_S1X1 (double, strided): one column step for a single row.
 * d12 += x[j] * A[row, j]; AO1 advances by LDA and XO by INC_X.
 */
.macro KERNEL_S1X1

fldmiad XO , { d2 }
fldmiad AO1 , { d8 }
vmla.f64 d12 , d2 , d8
add AO1, AO1, LDA
add XO, XO , INC_X

.endm

/*
 * SAVE_S1 (double, strided): y[0] += d0 * d12; YO advances by INC_Y.
 * NOTE(review): d0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_S1

fldmiad YO, { d4 }
vmla.f64 d4, d0, d12
fstmiad YO , { d4 }
add YO, YO, INC_Y

.endm




#else /************************* SINGLE PRECISION *****************************************/

/*
 * INIT_F8 (single): prefetch the next results of y and clear the eight
 * accumulators s8-s15.
 *
 * The zero is moved in from a core register instead of computing s8 - s8,
 * because a subtraction returns NaN when the stale register content is NaN
 * or an infinity. r3 is used as scratch: every expansion site in this file
 * has stored r3 already and rewrites it before the next read.
 */
.macro INIT_F8

pld [ YO , #Y_PRE ]

mov r3 , #0
vmov s8 , r3 // s8 = +0.0 (all bits zero)
vmov.f32 s9 , s8
vmov.f32 s10 , s8
vmov.f32 s11 , s8
vmov.f32 s12 , s8
vmov.f32 s13 , s8
vmov.f32 s14 , s8
vmov.f32 s15 , s8

.endm

/*
 * KERNEL_F8X8 (single, unit strides): one prefetch of x followed by eight
 * KERNEL_F8X1 steps, i.e. eight columns of A applied to the tile of 8 rows.
 */
.macro KERNEL_F8X8

pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1

KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1

.endm


/*
 * KERNEL_F8X1 (single, unit strides): one column step for a tile of 8 rows.
 * Loads one element of x into s2 (XO is post-incremented), then accumulates
 * s2 * A[0..7] into s8-s15. The column is read in two halves of four
 * elements through s4-s7; r3 is scratch for the address of the second half.
 * AO1 and AO2 both advance by LDA; AO2 is only used as a prefetch address.
 */
.macro KERNEL_F8X1

pld [ AO2, #A_PRE ]
fldmias XO! , { s2 }
fldmias AO1 , { s4 - s7 }

vmla.f32 s8 , s2 , s4
vmla.f32 s9 , s2 , s5
vmla.f32 s10 , s2 , s6
vmla.f32 s11 , s2 , s7

add r3, AO1, #4*SIZE // r3 = address of rows 4..7 of this column

fldmias r3 , { s4 - s7 }

vmla.f32 s12 , s2 , s4
vmla.f32 s13 , s2 , s5
vmla.f32 s14 , s2 , s6
vmla.f32 s15 , s2 , s7

add AO1, AO1, LDA // next column
add AO2, AO2, LDA

.endm

/*
 * SAVE_F8 (single, unit strides): y[0..7] += s0 * (s8..s15), done as two
 * groups of four; YO advances past the 8 updated elements.
 * NOTE(review): s0 is never written in this file; it presumably still holds
 * the alpha argument as passed by the caller - confirm against the ABI used.
 */
.macro SAVE_F8

fldmias YO, { s4 - s7 }

vmla.f32 s4 , s0, s8
vmla.f32 s5 , s0, s9
vmla.f32 s6 , s0, s10
vmla.f32 s7 , s0, s11

fstmias YO!, { s4 - s7 }


fldmias YO, { s4 - s7 }

vmla.f32 s4 , s0, s12
vmla.f32 s5 , s0, s13
vmla.f32 s6 , s0, s14
vmla.f32 s7 , s0, s15

fstmias YO!, { s4 - s7 }

.endm


/*
 * INIT_F1 (single): clear the single accumulator s12.
 * A zero is moved in through r3 instead of computing s12 - s12, which would
 * give NaN if s12 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_F1

mov r3 , #0
vmov s12 , r3 // s12 = +0.0

.endm



/*
 * KERNEL_F1X1 (single, unit strides): one column step for a single row.
 * s12 += x[j] * A[row, j]; XO is post-incremented, AO1 advances by LDA.
 */
.macro KERNEL_F1X1

fldmias XO! , { s2 }
fldmias AO1 , { s8 }
vmla.f32 s12 , s2 , s8
add AO1, AO1, LDA

.endm

/*
 * SAVE_F1 (single, unit strides): y[0] += s0 * s12; YO advances by one element.
 * NOTE(review): s0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_F1

fldmias YO, { s4 }
vmla.f32 s4, s0, s12
fstmias YO!, { s4 }

.endm

/*********************************************************************************************/

/*
 * INIT_S4 (single): clear the four accumulators s12-s15 of the strided path.
 * A zero is moved in through r3 instead of computing s12 - s12, which would
 * give NaN if s12 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_S4

mov r3 , #0
vmov s12 , r3 // s12 = +0.0
vmov.f32 s13 , s12
vmov.f32 s14 , s12
vmov.f32 s15 , s12

.endm

/*
 * KERNEL_S4X4 (single, strided): four KERNEL_S4X1 steps, i.e. four columns
 * of A applied to the tile of 4 rows, with a prefetch through AO2 before
 * each pair of steps.
 */
.macro KERNEL_S4X4

pld [ AO2 , #A_PRE ]
KERNEL_S4X1
KERNEL_S4X1
pld [ AO2 , #A_PRE ]
KERNEL_S4X1
KERNEL_S4X1

.endm


/*
 * KERNEL_S4X1 (single, strided): one column step for a tile of 4 rows.
 * Loads x[j] into s2 and A[0..3, j] into s8-s11, accumulates the products
 * into s12-s15, then advances AO1 / AO2 by LDA and XO by INC_X (all byte
 * strides).
 */
.macro KERNEL_S4X1

fldmias XO , { s2 }
fldmias AO1 , { s8 - s11 }

vmla.f32 s12 , s2 , s8
vmla.f32 s13 , s2 , s9
vmla.f32 s14 , s2 , s10
vmla.f32 s15 , s2 , s11
add AO1, AO1, LDA
add AO2, AO2, LDA
add XO, XO , INC_X

.endm

/*
 * SAVE_S4 (single, strided): for each of the four accumulators s12-s15,
 * y[k] += s0 * acc, stepping YO by INC_Y after every element.
 * NOTE(review): s0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_S4

fldmias YO, { s4 }
vmla.f32 s4 , s0, s12
fstmias YO, { s4 }
add YO, YO, INC_Y

fldmias YO, { s5 }
vmla.f32 s5 , s0, s13
fstmias YO, { s5 }
add YO, YO, INC_Y

fldmias YO, { s4 }
vmla.f32 s4 , s0, s14
fstmias YO, { s4 }
add YO, YO, INC_Y

fldmias YO, { s5 }
vmla.f32 s5 , s0, s15
fstmias YO, { s5 }
add YO, YO, INC_Y

.endm


/*
 * INIT_S1 (single): clear the single accumulator s12 of the strided path.
 * A zero is moved in through r3 instead of computing s12 - s12, which would
 * give NaN if s12 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_S1

mov r3 , #0
vmov s12 , r3 // s12 = +0.0

.endm



/*
 * KERNEL_S1X1 (single, strided): one column step for a single row.
 * s12 += x[j] * A[row, j]; AO1 advances by LDA and XO by INC_X.
 */
.macro KERNEL_S1X1

fldmias XO , { s2 }
fldmias AO1 , { s8 }
vmla.f32 s12 , s2 , s8
add AO1, AO1, LDA
add XO, XO , INC_X

.endm

/*
 * SAVE_S1 (single, strided): y[0] += s0 * s12; YO advances by INC_Y.
 * NOTE(review): s0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_S1

fldmias YO, { s4 }
vmla.f32 s4, s0, s12
fstmias YO , { s4 }
add YO, YO, INC_Y

.endm




#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * gemv_n kernel (VFP): y += alpha * A * x, A stored column by column.
 *
 * Rows of y are processed in tiles. For every tile the accumulators are
 * cleared, all N columns are walked (AO1 steps by LDA, XO walks x), and the
 * scaled accumulators are added to y. Two code paths exist:
 *   F path: INC_X == 1 and INC_Y == 1, tiles of 8 rows then single rows
 *           (the labels say F4 but the tile is 8 rows wide);
 *   S path: any other non-zero strides, tiles of 4 rows then single rows.
 * Returns 0 in r0. Nothing is done when M <= 0, N <= 0 or a stride is 0.
 */
PROLOGUE

.align 5
push {r4 - r9 , fp}
add fp, sp, #28 // fp = sp at entry (7 pushed registers * 4 bytes)
sub sp, sp, #STACKSIZE // reserve stack

sub r12, fp, #192 // save area for the VFP registers

#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif

cmp OLD_M, #0
ble gemvn_kernel_L999

cmp N, #0
ble gemvn_kernel_L999

str OLD_A, A // spill A and M: r3 and r0 are reused below
str OLD_M, M

ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

cmp INC_X, #0
beq gemvn_kernel_L999

cmp INC_Y, #0
beq gemvn_kernel_L999

ldr LDA, OLD_LDA


#if defined(DOUBLE)
lsl LDA, LDA, #3 // LDA * SIZE
#else
lsl LDA, LDA, #2 // LDA * SIZE
#endif

cmp INC_X, #1 // any non-unit stride selects the S path
bne gemvn_kernel_S4_BEGIN

cmp INC_Y, #1
bne gemvn_kernel_S4_BEGIN


gemvn_kernel_F4_BEGIN:

ldr YO , Y

ldr I, M
asrs I, I, #3 // I = M / 8
ble gemvn_kernel_F1_BEGIN

gemvn_kernel_F4X4: // one tile of 8 rows per iteration

ldr AO1, A
add AO2, AO1, LDA
add r3 , AO1, #8*SIZE // A slot = start of the next tile of 8 rows
str r3 , A

add AO2, AO2, LDA // AO2 = AO1 + 3 * LDA, prefetch address only
add AO2, AO2, LDA

ldr XO , X

INIT_F8

asrs J, N, #3 // J = N / 8
ble gemvn_kernel_F4X1


gemvn_kernel_F4X4_10: // 8 columns per iteration

KERNEL_F8X8

subs J, J, #1
bne gemvn_kernel_F4X4_10


gemvn_kernel_F4X1:

ands J, N , #7 // J = remaining columns
ble gemvn_kernel_F4_END

gemvn_kernel_F4X1_10: // 1 column per iteration

KERNEL_F8X1

subs J, J, #1
bne gemvn_kernel_F4X1_10


gemvn_kernel_F4_END:

SAVE_F8

subs I , I , #1
bne gemvn_kernel_F4X4


gemvn_kernel_F1_BEGIN:

ldr I, M
ands I, I , #7 // I = remaining rows
ble gemvn_kernel_L999

gemvn_kernel_F1X1: // one row per iteration

ldr AO1, A
add r3, AO1, #SIZE // A slot = next row
str r3, A
ldr XO , X

INIT_F1

mov J, N


gemvn_kernel_F1X1_10:

KERNEL_F1X1

subs J, J, #1
bne gemvn_kernel_F1X1_10


gemvn_kernel_F1_END:

SAVE_F1

subs I , I , #1
bne gemvn_kernel_F1X1

b gemvn_kernel_L999



/*************************************************************************************************************/

gemvn_kernel_S4_BEGIN:

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif

ldr YO , Y

ldr I, M
asrs I, I, #2 // I = M / 4
ble gemvn_kernel_S1_BEGIN

gemvn_kernel_S4X4: // one tile of 4 rows per iteration

ldr AO1, A
add AO2, AO1, LDA // prefetch address only
add r3 , AO1, #4*SIZE // A slot = start of the next tile of 4 rows
str r3 , A

ldr XO , X

INIT_S4

asrs J, N, #2 // J = N / 4
ble gemvn_kernel_S4X1


gemvn_kernel_S4X4_10: // 4 columns per iteration

KERNEL_S4X4

subs J, J, #1
bne gemvn_kernel_S4X4_10


gemvn_kernel_S4X1:

ands J, N , #3 // J = remaining columns
ble gemvn_kernel_S4_END

gemvn_kernel_S4X1_10: // 1 column per iteration

KERNEL_S4X1

subs J, J, #1
bne gemvn_kernel_S4X1_10


gemvn_kernel_S4_END:

SAVE_S4

subs I , I , #1
bne gemvn_kernel_S4X4


gemvn_kernel_S1_BEGIN:

ldr I, M
ands I, I , #3 // I = remaining rows
ble gemvn_kernel_L999

gemvn_kernel_S1X1: // one row per iteration

ldr AO1, A
add r3, AO1, #SIZE // A slot = next row
str r3, A
ldr XO , X

INIT_S1

mov J, N


gemvn_kernel_S1X1_10:

KERNEL_S1X1

subs J, J, #1
bne gemvn_kernel_S1X1_10


gemvn_kernel_S1_END:

SAVE_S1

subs I , I , #1
bne gemvn_kernel_S1X1


/*************************************************************************************************************/

gemvn_kernel_L999: // common exit

sub r3, fp, #192

#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif

mov r0, #0 // set return value

sub sp, fp, #28 // back to the pushed core registers
pop {r4 -r9 ,fp}
bx lr

EPILOGUE


+ 781
- 0
kernel/arm/gemv_n_vfpv3.S View File

@@ -0,0 +1,781 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/19 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved below the saved core registers. */
#define STACKSIZE 256

/*
 * Incoming arguments. The prologue pushes seven core registers and then sets
 * fp = sp + 28, so fp holds the stack pointer value at entry and the
 * stack-passed arguments start at offset 0 from fp. OLD_A and OLD_M arrive in
 * core registers and are spilled to the A and M slots before reuse.
 */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#define OLD_M r0

/* AO1: read cursor into A, N: column count, J: inner (column) loop counter. */
#define AO1 r0
#define N r1
#define J r2

/*
 * AO2: pointer used only as a prefetch address, XO / YO: cursors into x and y,
 * LDA / INC_X / INC_Y: strides, converted to bytes by the kernel before use.
 */
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9

/* I: outer (row tile) loop counter. */
#define I r12

/* Spill slots inside the reserved stack area. */
#define M [fp, #-252 ]
#define A [fp, #-256 ]


/* Byte offsets added to XO, YO and AO2 by the pld prefetch instructions. */
#define X_PRE 64
#define Y_PRE 0
#define A_PRE 0

/**************************************************************************************
* Macro definitions
**************************************************************************************/


#if defined(DOUBLE)

/*
 * INIT_F8 (double): prefetch the next results of y and clear the eight
 * accumulators d24-d31.
 *
 * The zero is moved in from a core register instead of computing d24 - d24,
 * because a subtraction returns NaN when the stale register content is NaN
 * or an infinity. r3 is used as scratch: every expansion site in this file
 * has stored r3 already and rewrites it before the next read.
 */
.macro INIT_F8

pld [ YO , #Y_PRE ]
pld [ YO , #Y_PRE+32 ]

mov r3 , #0
vmov d24 , r3 , r3 // d24 = +0.0 (all bits zero)
vmov.f64 d25 , d24
vmov.f64 d26 , d24
vmov.f64 d27 , d24
vmov.f64 d28 , d24
vmov.f64 d29 , d24
vmov.f64 d30 , d24
vmov.f64 d31 , d24

.endm

/*
 * KERNEL_F8X8 (double, unit strides): eight KERNEL_F8X1 steps, i.e. eight
 * columns of A applied to the current tile of 8 rows, with a prefetch of x
 * before each group of four.
 */
.macro KERNEL_F8X8

pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1

pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1

.endm


/*
 * KERNEL_F8X1 (double, unit strides): one column step for a tile of 8 rows.
 * Loads one element of x into d4 (XO is post-incremented) and A[0..7, j]
 * into d8-d15, then accumulates the products into d24-d31. AO1 and AO2
 * both advance by LDA; AO2 is only used as a prefetch address.
 */
.macro KERNEL_F8X1

fldmiad XO! , { d4 }
fldmiad AO1 , { d8 - d15 }

vmla.f64 d24 , d4 , d8
pld [ AO2 , #A_PRE ]
vmla.f64 d25 , d4 , d9
pld [ AO2 , #A_PRE+32 ]
vmla.f64 d26 , d4 , d10
vmla.f64 d27 , d4 , d11
vmla.f64 d28 , d4 , d12
vmla.f64 d29 , d4 , d13
add AO1, AO1, LDA // next column
vmla.f64 d30 , d4 , d14
add AO2, AO2, LDA
vmla.f64 d31 , d4 , d15

.endm

/*
 * SAVE_F8 (double, unit strides): y[0..7] += d0 * (d24..d31); YO advances
 * past the 8 updated elements.
 * NOTE(review): d0 is never written in this file; it presumably still holds
 * the alpha argument as passed by the caller - confirm against the ABI used.
 */
.macro SAVE_F8

fldmiad YO, { d16 - d23 }

vmla.f64 d16, d0, d24
vmla.f64 d17, d0, d25
vmla.f64 d18, d0, d26
vmla.f64 d19, d0, d27
vmla.f64 d20, d0, d28
vmla.f64 d21, d0, d29
vmla.f64 d22, d0, d30
vmla.f64 d23, d0, d31

fstmiad YO!, { d16 - d23 }

.endm


/*
 * INIT_F1 (double): clear the single accumulator d24.
 * A zero is moved in through r3 instead of computing d24 - d24, which would
 * give NaN if d24 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_F1

mov r3 , #0
vmov d24 , r3 , r3 // d24 = +0.0

.endm



/*
 * KERNEL_F1X1 (double, unit strides): one column step for a single row.
 * d24 += x[j] * A[row, j]; XO is post-incremented, AO1 advances by LDA.
 */
.macro KERNEL_F1X1

fldmiad XO! , { d4 }
fldmiad AO1 , { d8 }
vmla.f64 d24 , d4 , d8
add AO1, AO1, LDA

.endm

/*
 * SAVE_F1 (double, unit strides): y[0] += d0 * d24; YO advances by one element.
 * NOTE(review): d0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_F1

fldmiad YO, { d16 }
vmla.f64 d16, d0, d24
fstmiad YO!, { d16 }

.endm

/*********************************************************************************************/


/*
 * INIT_S8 (double): clear the eight accumulators d24-d31 of the strided path.
 * A zero is moved in through r3 instead of computing d24 - d24, which would
 * give NaN if d24 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_S8

mov r3 , #0
vmov d24 , r3 , r3 // d24 = +0.0
vmov.f64 d25 , d24
vmov.f64 d26 , d24
vmov.f64 d27 , d24
vmov.f64 d28 , d24
vmov.f64 d29 , d24
vmov.f64 d30 , d24
vmov.f64 d31 , d24

.endm

/*
 * KERNEL_S8X8 (double, strided): eight KERNEL_S8X1 steps, i.e. eight columns
 * of A applied to the current tile of 8 rows.
 */
.macro KERNEL_S8X8

KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1

KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1

.endm


/*
 * KERNEL_S8X1 (double, strided): one column step for a tile of 8 rows.
 * Loads x[j] into d4 and A[0..7, j] into d8-d15, accumulates the products
 * into d24-d31, then advances AO1 / AO2 by LDA and XO by INC_X (all byte
 * strides). AO2 is only used as a prefetch address.
 */
.macro KERNEL_S8X1

pld [ AO2 , #A_PRE ]
pld [ AO2 , #A_PRE+32 ]
fldmiad XO , { d4 }
fldmiad AO1 , { d8 - d15 }

vmla.f64 d24 , d4 , d8
vmla.f64 d25 , d4 , d9
vmla.f64 d26 , d4 , d10
vmla.f64 d27 , d4 , d11
vmla.f64 d28 , d4 , d12
vmla.f64 d29 , d4 , d13
vmla.f64 d30 , d4 , d14
vmla.f64 d31 , d4 , d15
add AO1, AO1, LDA
add AO2, AO2, LDA
add XO, XO, INC_X

.endm

/*
 * SAVE_S8 (double, strided): for each of the eight accumulators d24-d31,
 * y[k] += d0 * acc, stepping YO by INC_Y after every element.
 * NOTE(review): d0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_S8

fldmiad YO, { d16 }
vmla.f64 d16, d0, d24
fstmiad YO, { d16 }
add YO, YO, INC_Y

fldmiad YO, { d17 }
vmla.f64 d17, d0, d25
fstmiad YO, { d17 }
add YO, YO, INC_Y

fldmiad YO, { d18 }
vmla.f64 d18, d0, d26
fstmiad YO, { d18 }
add YO, YO, INC_Y

fldmiad YO, { d19 }
vmla.f64 d19, d0, d27
fstmiad YO, { d19 }
add YO, YO, INC_Y

fldmiad YO, { d20 }
vmla.f64 d20, d0, d28
fstmiad YO, { d20 }
add YO, YO, INC_Y

fldmiad YO, { d21 }
vmla.f64 d21, d0, d29
fstmiad YO, { d21 }
add YO, YO, INC_Y

fldmiad YO, { d22 }
vmla.f64 d22, d0, d30
fstmiad YO, { d22 }
add YO, YO, INC_Y

fldmiad YO, { d23 }
vmla.f64 d23, d0, d31
fstmiad YO, { d23 }
add YO, YO, INC_Y

.endm


/*
 * INIT_S1 (double): clear the single accumulator d24 of the strided path.
 * A zero is moved in through r3 instead of computing d24 - d24, which would
 * give NaN if d24 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_S1

mov r3 , #0
vmov d24 , r3 , r3 // d24 = +0.0

.endm



/*
 * KERNEL_S1X1 (double, strided): one column step for a single row.
 * d24 += x[j] * A[row, j]; AO1 advances by LDA and XO by INC_X.
 */
.macro KERNEL_S1X1

fldmiad XO , { d4 }
fldmiad AO1 , { d8 }
vmla.f64 d24 , d4 , d8
add AO1, AO1, LDA
add XO, XO, INC_X

.endm

/*
 * SAVE_S1 (double, strided): y[0] += d0 * d24; YO advances by INC_Y.
 * NOTE(review): d0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_S1

fldmiad YO, { d16 }
vmla.f64 d16, d0, d24
fstmiad YO, { d16 }
add YO, YO, INC_Y

.endm



#else /************************* SINGLE PRECISION *****************************************/

/*
 * INIT_F8 (single): prefetch the next results of y and clear the eight
 * accumulators s24-s31.
 *
 * The zero is moved in from a core register instead of computing s24 - s24,
 * because a subtraction returns NaN when the stale register content is NaN
 * or an infinity. r3 is used as scratch: every expansion site in this file
 * has stored r3 already and rewrites it before the next read.
 */
.macro INIT_F8

pld [ YO , #Y_PRE ]

mov r3 , #0
vmov s24 , r3 // s24 = +0.0 (all bits zero)
vmov.f32 s25 , s24
vmov.f32 s26 , s24
vmov.f32 s27 , s24
vmov.f32 s28 , s24
vmov.f32 s29 , s24
vmov.f32 s30 , s24
vmov.f32 s31 , s24

.endm

/*
 * KERNEL_F8X8 (single, unit strides): one prefetch of x followed by eight
 * KERNEL_F8X1 steps, i.e. eight columns of A applied to the tile of 8 rows.
 */
.macro KERNEL_F8X8

pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1

KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1

.endm


/*
 * KERNEL_F8X1 (single, unit strides): one column step for a tile of 8 rows.
 * Loads one element of x into s4 (XO is post-incremented) and A[0..7, j]
 * into s8-s15, then accumulates the products into s24-s31. AO1 and AO2
 * both advance by LDA; AO2 is only used as a prefetch address.
 */
.macro KERNEL_F8X1

pld [ AO2 , #A_PRE ]
fldmias XO! , { s4 }
fldmias AO1 , { s8 - s15 }

vmla.f32 s24 , s4 , s8
vmla.f32 s25 , s4 , s9
vmla.f32 s26 , s4 , s10
vmla.f32 s27 , s4 , s11
vmla.f32 s28 , s4 , s12
vmla.f32 s29 , s4 , s13
vmla.f32 s30 , s4 , s14
vmla.f32 s31 , s4 , s15
add AO1, AO1, LDA // next column
add AO2, AO2, LDA

.endm

/*
 * SAVE_F8 (single, unit strides): y[0..7] += s0 * (s24..s31); YO advances
 * past the 8 updated elements.
 * NOTE(review): s0 is never written in this file; it presumably still holds
 * the alpha argument as passed by the caller - confirm against the ABI used.
 */
.macro SAVE_F8

fldmias YO, { s16 - s23 }

vmla.f32 s16, s0, s24
vmla.f32 s17, s0, s25
vmla.f32 s18, s0, s26
vmla.f32 s19, s0, s27
vmla.f32 s20, s0, s28
vmla.f32 s21, s0, s29
vmla.f32 s22, s0, s30
vmla.f32 s23, s0, s31

fstmias YO!, { s16 - s23 }

.endm


/*
 * INIT_F1 (single): clear the single accumulator s24.
 * A zero is moved in through r3 instead of computing s24 - s24, which would
 * give NaN if s24 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_F1

mov r3 , #0
vmov s24 , r3 // s24 = +0.0

.endm



/*
 * KERNEL_F1X1 (single, unit strides): one column step for a single row.
 * s24 += x[j] * A[row, j]; XO is post-incremented, AO1 advances by LDA.
 */
.macro KERNEL_F1X1

fldmias XO! , { s4 }
fldmias AO1 , { s8 }
vmla.f32 s24 , s4 , s8
add AO1, AO1, LDA

.endm

/*
 * SAVE_F1 (single, unit strides): y[0] += s0 * s24; YO advances by one element.
 * NOTE(review): s0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_F1

fldmias YO, { s16 }
vmla.f32 s16, s0, s24
fstmias YO!, { s16 }

.endm

/*********************************************************************************************/


/*
 * INIT_S8 (single): clear the eight accumulators s24-s31 of the strided path.
 * A zero is moved in through r3 instead of computing s24 - s24, which would
 * give NaN if s24 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_S8

mov r3 , #0
vmov s24 , r3 // s24 = +0.0
vmov.f32 s25 , s24
vmov.f32 s26 , s24
vmov.f32 s27 , s24
vmov.f32 s28 , s24
vmov.f32 s29 , s24
vmov.f32 s30 , s24
vmov.f32 s31 , s24

.endm

/*
 * KERNEL_S8X8 (single, strided): eight KERNEL_S8X1 steps, i.e. eight columns
 * of A applied to the current tile of 8 rows.
 */
.macro KERNEL_S8X8

KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1

KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1

.endm


/*
 * KERNEL_S8X1 (single, strided): one column step for a tile of 8 rows.
 * Loads x[j] into s4 and A[0..7, j] into s8-s15, accumulates the products
 * into s24-s31, then advances AO1 / AO2 by LDA and XO by INC_X (all byte
 * strides). AO2 is only used as a prefetch address.
 */
.macro KERNEL_S8X1

pld [ AO2 , #A_PRE ]
fldmias XO , { s4 }
fldmias AO1 , { s8 - s15 }

vmla.f32 s24 , s4 , s8
vmla.f32 s25 , s4 , s9
vmla.f32 s26 , s4 , s10
vmla.f32 s27 , s4 , s11
vmla.f32 s28 , s4 , s12
vmla.f32 s29 , s4 , s13
vmla.f32 s30 , s4 , s14
vmla.f32 s31 , s4 , s15
add AO1, AO1, LDA
add AO2, AO2, LDA
add XO, XO, INC_X

.endm

/*
 * SAVE_S8 (single, strided): for each of the eight accumulators s24-s31,
 * y[k] += s0 * acc, stepping YO by INC_Y after every element.
 * NOTE(review): s0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_S8

fldmias YO, { s16 }
vmla.f32 s16, s0, s24
fstmias YO, { s16 }
add YO, YO, INC_Y

fldmias YO, { s17 }
vmla.f32 s17, s0, s25
fstmias YO, { s17 }
add YO, YO, INC_Y

fldmias YO, { s18 }
vmla.f32 s18, s0, s26
fstmias YO, { s18 }
add YO, YO, INC_Y

fldmias YO, { s19 }
vmla.f32 s19, s0, s27
fstmias YO, { s19 }
add YO, YO, INC_Y

fldmias YO, { s20 }
vmla.f32 s20, s0, s28
fstmias YO, { s20 }
add YO, YO, INC_Y

fldmias YO, { s21 }
vmla.f32 s21, s0, s29
fstmias YO, { s21 }
add YO, YO, INC_Y

fldmias YO, { s22 }
vmla.f32 s22, s0, s30
fstmias YO, { s22 }
add YO, YO, INC_Y

fldmias YO, { s23 }
vmla.f32 s23, s0, s31
fstmias YO, { s23 }
add YO, YO, INC_Y

.endm


/*
 * INIT_S1 (single): clear the single accumulator s24 of the strided path.
 * A zero is moved in through r3 instead of computing s24 - s24, which would
 * give NaN if s24 held NaN or an infinity. r3 is dead at every expansion
 * site in this file.
 */
.macro INIT_S1

mov r3 , #0
vmov s24 , r3 // s24 = +0.0

.endm



/*
 * KERNEL_S1X1 (single, strided): one column step for a single row.
 * s24 += x[j] * A[row, j]; AO1 advances by LDA and XO by INC_X.
 */
.macro KERNEL_S1X1

fldmias XO , { s4 }
fldmias AO1 , { s8 }
vmla.f32 s24 , s4 , s8
add AO1, AO1, LDA
add XO, XO, INC_X

.endm

/*
 * SAVE_S1 (single, strided): y[0] += s0 * s24; YO advances by INC_Y.
 * NOTE(review): s0 is presumably alpha as passed by the caller - confirm.
 */
.macro SAVE_S1

fldmias YO, { s16 }
vmla.f32 s16, s0, s24
fstmias YO, { s16 }
add YO, YO, INC_Y

.endm




#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * gemv_n kernel (VFPv3): y += alpha * A * x, A stored column by column.
 *
 * Rows of y are processed in tiles. For every tile the accumulators are
 * cleared, all N columns are walked (AO1 steps by LDA, XO walks x), and the
 * scaled accumulators are added to y. Two code paths exist:
 *   F path: INC_X == 1 and INC_Y == 1, tiles of 8 rows then single rows;
 *   S path: any other non-zero strides, tiles of 8 rows then single rows.
 * Returns 0 in r0. Nothing is done when M <= 0, N <= 0 or a stride is 0.
 */
PROLOGUE

.align 5
push {r4 - r9 , fp}
add fp, sp, #28 // fp = sp at entry (7 pushed registers * 4 bytes)
sub sp, sp, #STACKSIZE // reserve stack

sub r12, fp, #192 // save area for the VFP registers

#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s31 } // store floating point registers
#endif

cmp OLD_M, #0
ble gemvn_kernel_L999

cmp N, #0
ble gemvn_kernel_L999

str OLD_A, A // spill A and M: r3 and r0 are reused below
str OLD_M, M

ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

cmp INC_X, #0
beq gemvn_kernel_L999

cmp INC_Y, #0
beq gemvn_kernel_L999

ldr LDA, OLD_LDA


#if defined(DOUBLE)
lsl LDA, LDA, #3 // LDA * SIZE
#else
lsl LDA, LDA, #2 // LDA * SIZE
#endif

cmp INC_X, #1 // any non-unit stride selects the S path
bne gemvn_kernel_S8_BEGIN

cmp INC_Y, #1
bne gemvn_kernel_S8_BEGIN


gemvn_kernel_F8_BEGIN:

ldr YO , Y

ldr I, M
asrs I, I, #3 // I = M / 8
ble gemvn_kernel_F1_BEGIN

gemvn_kernel_F8X8: // one tile of 8 rows per iteration

ldr AO1, A
add AO2, AO1, LDA // prefetch address only
add r3 , AO1, #8*SIZE // A slot = start of the next tile of 8 rows
str r3 , A

ldr XO , X

INIT_F8

asrs J, N, #3 // J = N / 8
ble gemvn_kernel_F8X1


gemvn_kernel_F8X8_10: // 8 columns per iteration

KERNEL_F8X8

subs J, J, #1
bne gemvn_kernel_F8X8_10


gemvn_kernel_F8X1:

ands J, N , #7 // J = remaining columns
ble gemvn_kernel_F8_END

gemvn_kernel_F8X1_10: // 1 column per iteration

KERNEL_F8X1

subs J, J, #1
bne gemvn_kernel_F8X1_10


gemvn_kernel_F8_END:

SAVE_F8

subs I , I , #1
bne gemvn_kernel_F8X8


gemvn_kernel_F1_BEGIN:

ldr I, M
ands I, I , #7 // I = remaining rows
ble gemvn_kernel_L999

gemvn_kernel_F1X1: // one row per iteration

ldr AO1, A
add r3, AO1, #SIZE // A slot = next row
str r3, A
ldr XO , X

INIT_F1

mov J, N


gemvn_kernel_F1X1_10:

KERNEL_F1X1

subs J, J, #1
bne gemvn_kernel_F1X1_10


gemvn_kernel_F1_END:

SAVE_F1

subs I , I , #1
bne gemvn_kernel_F1X1

b gemvn_kernel_L999



/*************************************************************************************************************/

gemvn_kernel_S8_BEGIN:

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif

ldr YO , Y

ldr I, M
asrs I, I, #3 // I = M / 8
ble gemvn_kernel_S1_BEGIN

gemvn_kernel_S8X8: // one tile of 8 rows per iteration

ldr AO1, A
add AO2, AO1, LDA // prefetch address only
add r3 , AO1, #8*SIZE // A slot = start of the next tile of 8 rows
str r3 , A

ldr XO , X

INIT_S8

asrs J, N, #3 // J = N / 8
ble gemvn_kernel_S8X1


gemvn_kernel_S8X8_10: // 8 columns per iteration

KERNEL_S8X8

subs J, J, #1
bne gemvn_kernel_S8X8_10


gemvn_kernel_S8X1:

ands J, N , #7 // J = remaining columns
ble gemvn_kernel_S8_END

gemvn_kernel_S8X1_10: // 1 column per iteration

KERNEL_S8X1

subs J, J, #1
bne gemvn_kernel_S8X1_10


gemvn_kernel_S8_END:

SAVE_S8

subs I , I , #1
bne gemvn_kernel_S8X8


gemvn_kernel_S1_BEGIN:

ldr I, M
ands I, I , #7 // I = remaining rows
ble gemvn_kernel_L999

gemvn_kernel_S1X1: // one row per iteration

ldr AO1, A
add r3, AO1, #SIZE // A slot = next row
str r3, A
ldr XO , X

INIT_S1

mov J, N


gemvn_kernel_S1X1_10:

KERNEL_S1X1

subs J, J, #1
bne gemvn_kernel_S1X1_10


gemvn_kernel_S1_END:

SAVE_S1

subs I , I , #1
bne gemvn_kernel_S1X1


/*************************************************************************************************************/

gemvn_kernel_L999: // common exit

sub r3, fp, #192

#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s31 } // restore floating point registers
#endif

mov r0, #0 // set return value

sub sp, fp, #28 // back to the pushed core registers
pop {r4 -r9 ,fp}
bx lr

EPILOGUE


+ 67
- 0
kernel/arm/gemv_t.c View File

@@ -0,0 +1,67 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* * 2013/09/14 Saar
* * BLASTEST float : OK
* * BLASTEST double : OK
* CTEST : OK
* TEST : OK
* *
* **************************************************************************************/


#include "common.h"

/*
 * Reference kernel for y += alpha * A^T * x (A transposed).
 *
 * m, n     : number of rows and columns of A
 * dummy1   : unused
 * alpha    : scalar applied to every dot product
 * a, lda   : matrix stored column by column, lda elements between columns
 * x, inc_x : input vector of m elements and its stride
 * y, inc_y : vector of n elements updated in place and its stride
 * buffer   : unused
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG ix, iy;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT temp;

	iy = 0;
	a_ptr = a;

	/* one column of A per iteration: y[j] += alpha * dot(A(:, j), x) */
	for (j = 0; j < n; j++)
	{
		temp = 0.0;
		ix = 0;
		for (i = 0; i < m; i++)
		{
			temp += a_ptr[i] * x[ix];
			ix += inc_x;
		}
		y[iy] += alpha * temp;
		iy += inc_y;
		a_ptr += lda;
	}

	/* the function is declared int, so it must return a defined value */
	return(0);
}


+ 750
- 0
kernel/arm/gemv_t_vfp.S View File

@@ -0,0 +1,750 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/25 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved below the saved integer registers. */
#define STACKSIZE 256

/* Incoming values: five words read at and above fp, plus r3 and r1. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#define OLD_N r1

/* Working registers. AO1 shares r1 with OLD_N, so OLD_N is spilled to N first. */
#define M r0
#define AO1 r1
#define J r2

#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9

#define I r12

/* Spill slots inside the reserved stack area (fp-256 is 28 bytes above the final sp). */
#define N [fp, #-252 ]
#define A [fp, #-256 ]


/* Byte offsets used by the pld prefetch hints in the kernel macros. */
#define X_PRE 512
#define A_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/


#if defined(DOUBLE)

/*
 * Clear the two accumulators d2 and d3 (double precision, unit stride).
 * The former "vsub d, d, d" idiom produces NaN instead of zero whenever the
 * register still holds NaN or Inf (stale caller data, or the sum left by the
 * previous column pair), so an explicit +0.0 bit pattern is loaded instead.
 * r12 (I) is used as scratch: every call site reloads I right after this macro.
 */
.macro INIT_F2

mov I , #0
vmov d2 , I , I
vmov d3 , I , I

.endm

/*
 * Unit-stride step over four rows for two matrix streams (double precision).
 * Loads four x values into d12-d15 and four values from each of AO1 (d8-d11)
 * and AO2 (d4-d7), post-incrementing all three pointers, then accumulates
 * x*AO1 into d2 and x*AO2 into d3. pld issues prefetch hints ahead of each stream.
 */
.macro KERNEL_F2X4

pld [ XO , #X_PRE ]
fldmiad XO! , { d12 - d15 }
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
pld [ AO2 , #A_PRE ]
fldmiad AO2!, { d4 - d5 }
fldmiad AO1!, { d10 - d11 }
fldmiad AO2!, { d6 - d7 }

vmla.f64 d2 , d12 , d8
vmla.f64 d3 , d12 , d4
vmla.f64 d2 , d13 , d9
vmla.f64 d3 , d13 , d5
vmla.f64 d2 , d14, d10
vmla.f64 d3 , d14, d6
vmla.f64 d2 , d15, d11
vmla.f64 d3 , d15, d7

.endm

/*
 * Single-row remainder step for two streams (double precision, unit stride):
 * d2 += x * AO1 value, d3 += x * AO2 value; XO, AO1 and AO2 each advance by one element.
 */
.macro KERNEL_F2X1

fldmiad XO! , { d1 }
fldmiad AO1!, { d8 }
fldmiad AO2!, { d4 }
vmla.f64 d2 , d1 , d8
vmla.f64 d3 , d1 , d4

.endm

/*
 * Adds d0*d2 and d0*d3 to two consecutive y elements and advances YO past them.
 * d0 is never written in this file; presumably it carries alpha from the caller.
 */
.macro SAVE_F2

fldmiad YO, { d4 - d5 }
vmla.f64 d4, d0, d2
vmla.f64 d5, d0, d3
fstmiad YO!, { d4 - d5 }

.endm

/*
 * Clear the single accumulator d2 (double precision, unit stride).
 * "vsub d2, d2, d2" yields NaN rather than zero when d2 holds NaN or Inf,
 * so +0.0 is loaded explicitly. r12 (I) is scratch here: the call site
 * reloads I immediately after this macro.
 */
.macro INIT_F1

mov I , #0
vmov d2 , I , I

.endm

/*
 * Unit-stride step over four rows for one stream (double precision):
 * loads x into d12-d15 and AO1 values into d8-d11 (both pointers post-increment)
 * and accumulates the four products into d2.
 */
.macro KERNEL_F1X4

pld [ XO , #X_PRE ]
fldmiad XO! , { d12 - d15 }
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
fldmiad AO1!, { d10 - d11 }
vmla.f64 d2 , d12 , d8
vmla.f64 d2 , d13 , d9
vmla.f64 d2 , d14, d10
vmla.f64 d2 , d15, d11

.endm

/* Single-row remainder step for one stream: d2 += x * AO1 value; XO and AO1 advance by one element. */
.macro KERNEL_F1X1

fldmiad XO! , { d1 }
fldmiad AO1!, { d8 }
vmla.f64 d2 , d1 , d8

.endm

/* Adds d0*d2 to the y element at YO and advances YO by one element (d0: presumably alpha). */
.macro SAVE_F1

fldmiad YO, { d4 }
vmla.f64 d4, d0, d2
fstmiad YO!, { d4 }

.endm


/*
 * Clear the two accumulators d2 and d3 (double precision, strided path).
 * "vsub d, d, d" gives NaN when the register holds NaN or Inf, so +0.0 is
 * loaded explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_S2

mov I , #0
vmov d2 , I , I
vmov d3 , I , I

.endm

/*
 * Strided-x step over four rows for two streams (double precision).
 * x values are loaded one at a time into d12-d15 with XO advanced by INC_X
 * after each load; AO1 (d8-d11) and AO2 (d4-d7) are read contiguously.
 * Products are accumulated into d2 (AO1) and d3 (AO2).
 */
.macro KERNEL_S2X4

fldmiad XO , { d12 }
add XO, XO, INC_X

pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
pld [ AO2 , #A_PRE ]
fldmiad AO2!, { d4 - d5 }

fldmiad XO , { d13 }
add XO, XO, INC_X
fldmiad AO1!, { d10 - d11 }
fldmiad AO2!, { d6 - d7 }

fldmiad XO , { d14 }
add XO, XO, INC_X

fldmiad XO , { d15 }
add XO, XO, INC_X

vmla.f64 d2 , d12 , d8
vmla.f64 d3 , d12 , d4
vmla.f64 d2 , d13 , d9
vmla.f64 d3 , d13 , d5
vmla.f64 d2 , d14, d10
vmla.f64 d3 , d14, d6
vmla.f64 d2 , d15, d11
vmla.f64 d3 , d15, d7

.endm

/*
 * Single-row remainder step for two streams with strided x:
 * d2 += x * AO1 value, d3 += x * AO2 value; XO advances by INC_X.
 */
.macro KERNEL_S2X1

fldmiad XO , { d1 }
fldmiad AO1!, { d8 }
fldmiad AO2!, { d4 }
vmla.f64 d2 , d1 , d8
add XO, XO, INC_X
vmla.f64 d3 , d1 , d4

.endm

/*
 * Adds d0*d2 to the y element at YO, steps YO by INC_Y, then adds d0*d3 to
 * the next y element and steps YO by INC_Y again (d0: presumably alpha).
 */
.macro SAVE_S2

fldmiad YO, { d4 }
vmla.f64 d4, d0, d2
fstmiad YO, { d4 }
add YO, YO, INC_Y

fldmiad YO, { d5 }
vmla.f64 d5, d0, d3
fstmiad YO, { d5 }
add YO, YO, INC_Y

.endm

/*
 * Clear the single accumulator d2 (double precision, strided path).
 * "vsub d2, d2, d2" gives NaN when d2 holds NaN or Inf, so +0.0 is loaded
 * explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_S1

mov I , #0
vmov d2 , I , I

.endm

/*
 * Strided-x step over four rows for one stream (double precision):
 * x is loaded element by element into d12-d15 (XO += INC_X each time),
 * AO1 is read contiguously into d8-d11, and the products accumulate into d2.
 */
.macro KERNEL_S1X4

fldmiad XO , { d12 }
add XO, XO, INC_X

pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }

fldmiad XO , { d13 }
add XO, XO, INC_X
fldmiad AO1!, { d10 - d11 }

fldmiad XO , { d14 }
add XO, XO, INC_X

fldmiad XO , { d15 }
add XO, XO, INC_X

vmla.f64 d2 , d12 , d8
vmla.f64 d2 , d13 , d9
vmla.f64 d2 , d14, d10
vmla.f64 d2 , d15, d11

.endm

/* Single-row remainder step for one stream with strided x: d2 += x * AO1 value; XO += INC_X. */
.macro KERNEL_S1X1

fldmiad XO , { d1 }
fldmiad AO1!, { d8 }
vmla.f64 d2 , d1 , d8
add XO, XO, INC_X

.endm

/* Adds d0*d2 to the y element at YO and steps YO by INC_Y (d0: presumably alpha). */
.macro SAVE_S1

fldmiad YO, { d4 }
vmla.f64 d4, d0, d2
fstmiad YO, { d4 }
add YO, YO, INC_Y

.endm


#else /************************* SINGLE PRECISION *****************************************/

/*
 * Clear the two accumulators s2 and s3 (single precision, unit stride).
 * "vsub s, s, s" gives NaN instead of zero when the register holds NaN or
 * Inf (stale caller data or the previous column sums), so the +0.0 bit
 * pattern is loaded explicitly. r12 (I) is scratch: every call site reloads
 * I right after this macro.
 */
.macro INIT_F2

mov I , #0
vmov s2 , I
vmov s3 , I

.endm

/*
 * Unit-stride step over four rows for two streams (single precision).
 * Loads x into s12-s15, AO1 values into s8-s11 and AO2 values into s4-s7
 * (all pointers post-increment), accumulating x*AO1 into s2 and x*AO2 into s3.
 */
.macro KERNEL_F2X4

fldmias XO! , { s12 - s15 }
fldmias AO1!, { s8 - s9 }
fldmias AO2!, { s4 - s5 }
fldmias AO1!, { s10 - s11 }
fldmias AO2!, { s6 - s7 }

vmla.f32 s2 , s12 , s8
vmla.f32 s3 , s12 , s4
vmla.f32 s2 , s13 , s9
vmla.f32 s3 , s13 , s5
vmla.f32 s2 , s14, s10
vmla.f32 s3 , s14, s6
vmla.f32 s2 , s15, s11
vmla.f32 s3 , s15, s7

.endm

/*
 * Single-row remainder step for two streams (single precision, unit stride):
 * s2 += x * AO1 value, s3 += x * AO2 value; all three pointers advance by one element.
 */
.macro KERNEL_F2X1

fldmias XO! , { s1 }
fldmias AO1!, { s8 }
fldmias AO2!, { s4 }
vmla.f32 s2 , s1 , s8
vmla.f32 s3 , s1 , s4

.endm

/*
 * Adds s0*s2 and s0*s3 to two consecutive y elements and advances YO past them.
 * s0 is never written in this file; presumably it carries alpha from the caller.
 */
.macro SAVE_F2

fldmias YO, { s4 - s5 }
vmla.f32 s4, s0, s2
vmla.f32 s5, s0, s3
fstmias YO!, { s4 - s5 }

.endm

/*
 * Clear the single accumulator s2 (single precision, unit stride).
 * "vsub s2, s2, s2" gives NaN when s2 holds NaN or Inf, so +0.0 is loaded
 * explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_F1

mov I , #0
vmov s2 , I

.endm

/*
 * Unit-stride step over four rows for one stream (single precision):
 * x in s12-s15, AO1 values in s8-s11, products accumulated into s2.
 */
.macro KERNEL_F1X4

fldmias XO! , { s12 - s15 }
fldmias AO1!, { s8 - s9 }
fldmias AO1!, { s10 - s11 }
vmla.f32 s2 , s12 , s8
vmla.f32 s2 , s13 , s9
vmla.f32 s2 , s14, s10
vmla.f32 s2 , s15, s11

.endm

/* Single-row remainder step for one stream: s2 += x * AO1 value; XO and AO1 advance by one element. */
.macro KERNEL_F1X1

fldmias XO! , { s1 }
fldmias AO1!, { s8 }
vmla.f32 s2 , s1 , s8

.endm

/* Adds s0*s2 to the y element at YO and advances YO by one element (s0: presumably alpha). */
.macro SAVE_F1

fldmias YO, { s4 }
vmla.f32 s4, s0, s2
fstmias YO!, { s4 }

.endm


/*
 * Clear the two accumulators s2 and s3 (single precision, strided path).
 * "vsub s, s, s" gives NaN when the register holds NaN or Inf, so +0.0 is
 * loaded explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_S2

mov I , #0
vmov s2 , I
vmov s3 , I

.endm

/*
 * Strided-x step over four rows for two streams (single precision).
 * x is loaded element by element into s12-s15 (XO += INC_X after each load);
 * AO1 (s8-s11) and AO2 (s4-s7) are read contiguously. Products accumulate
 * into s2 (AO1) and s3 (AO2).
 */
.macro KERNEL_S2X4

fldmias XO , { s12 }
add XO, XO, INC_X

fldmias AO1!, { s8 - s9 }
fldmias AO2!, { s4 - s5 }

fldmias XO , { s13 }
add XO, XO, INC_X
fldmias AO1!, { s10 - s11 }
fldmias AO2!, { s6 - s7 }

fldmias XO , { s14 }
add XO, XO, INC_X

fldmias XO , { s15 }
add XO, XO, INC_X

vmla.f32 s2 , s12 , s8
vmla.f32 s3 , s12 , s4
vmla.f32 s2 , s13 , s9
vmla.f32 s3 , s13 , s5
vmla.f32 s2 , s14, s10
vmla.f32 s3 , s14, s6
vmla.f32 s2 , s15, s11
vmla.f32 s3 , s15, s7

.endm

/*
 * Single-row remainder step for two streams with strided x:
 * s2 += x * AO1 value, s3 += x * AO2 value; XO advances by INC_X.
 */
.macro KERNEL_S2X1

fldmias XO , { s1 }
fldmias AO1!, { s8 }
fldmias AO2!, { s4 }
vmla.f32 s2 , s1 , s8
add XO, XO, INC_X
vmla.f32 s3 , s1 , s4

.endm

/*
 * Adds s0*s2 to the y element at YO, steps YO by INC_Y, then adds s0*s3 to
 * the next y element and steps YO by INC_Y again (s0: presumably alpha).
 */
.macro SAVE_S2

fldmias YO, { s4 }
vmla.f32 s4, s0, s2
fstmias YO, { s4 }
add YO, YO, INC_Y

fldmias YO, { s5 }
vmla.f32 s5, s0, s3
fstmias YO, { s5 }
add YO, YO, INC_Y

.endm

/*
 * Clear the single accumulator s2 (single precision, strided path).
 * "vsub s2, s2, s2" gives NaN when s2 holds NaN or Inf, so +0.0 is loaded
 * explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_S1

mov I , #0
vmov s2 , I

.endm

/*
 * Strided-x step over four rows for one stream (single precision):
 * x is loaded element by element into s12-s15 (XO += INC_X each time),
 * AO1 is read contiguously into s8-s11, and the products accumulate into s2.
 */
.macro KERNEL_S1X4

fldmias XO , { s12 }
add XO, XO, INC_X

pld [ AO1 , #A_PRE ]
fldmias AO1!, { s8 - s9 }

fldmias XO , { s13 }
add XO, XO, INC_X
fldmias AO1!, { s10 - s11 }

fldmias XO , { s14 }
add XO, XO, INC_X

fldmias XO , { s15 }
add XO, XO, INC_X

vmla.f32 s2 , s12 , s8
vmla.f32 s2 , s13 , s9
vmla.f32 s2 , s14, s10
vmla.f32 s2 , s15, s11

.endm

/* Single-row remainder step for one stream with strided x: s2 += x * AO1 value; XO += INC_X. */
.macro KERNEL_S1X1

fldmias XO , { s1 }
fldmias AO1!, { s8 }
vmla.f32 s2 , s1 , s8
add XO, XO, INC_X

.endm

/* Adds s0*s2 to the y element at YO and steps YO by INC_Y (s0: presumably alpha). */
.macro SAVE_S1

fldmias YO, { s4 }
vmla.f32 s4, s0, s2
fstmias YO, { s4 }
add YO, YO, INC_Y

.endm



#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Kernel entry point (PROLOGUE and EPILOGUE are macros from common.h).
 * Inputs as named by the defines in this file: M (r0), OLD_N (r1), OLD_A (r3),
 * plus OLD_LDA, X, OLD_INC_X, Y and OLD_INC_Y read from the stack through fp.
 * d0/s0 is the scale factor applied by the SAVE_* macros (presumably alpha;
 * it is never written here).
 *
 * The N streams of A (LDA elements apart) are processed two at a time,
 * followed by one odd stream if N is odd. For each stream the KERNEL_*
 * macros accumulate M products with x, four rows per step plus a remainder
 * of M & 3 rows, and the SAVE_* macros add scale * sum to the next y element.
 * The F path runs when both increments are 1, the S path otherwise.
 * Nothing is done when M <= 0, N <= 0 or either increment is 0.
 * Always returns 0 in r0.
 */
PROLOGUE

.align 5
/* Seven registers are pushed (28 bytes), so fp ends up at the entry value of sp. */
push {r4 - r9 , fp}
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack

/* VFP save area starts at fp-192, inside the reserved stack block. */
sub r12, fp, #192

#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif

cmp M, #0
ble gemvt_kernel_L999

cmp OLD_N, #0
ble gemvt_kernel_L999

/* r1 and r3 are reused below, so spill the incoming values first. */
str OLD_A, A
str OLD_N, N

ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

cmp INC_X, #0
beq gemvt_kernel_L999

cmp INC_Y, #0
beq gemvt_kernel_L999

ldr LDA, OLD_LDA


#if defined(DOUBLE)
lsl LDA, LDA, #3 // LDA * SIZE
#else
lsl LDA, LDA, #2 // LDA * SIZE
#endif

/* Any increment other than 1 selects the strided path. */
cmp INC_X, #1
bne gemvt_kernel_S2_BEGIN

cmp INC_Y, #1
bne gemvt_kernel_S2_BEGIN


/* ---- Unit-stride path: pairs of streams ---- */
gemvt_kernel_F2_BEGIN:

ldr YO , Y

ldr J, N
asrs J, J, #1 // J = N / 2
ble gemvt_kernel_F1_BEGIN

gemvt_kernel_F2X4:

/* AO1/AO2 address this pair; the saved A pointer moves on by 2*LDA bytes. */
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A

/* x is re-read from its start for every pair. */
ldr XO , X

INIT_F2

asrs I, M, #2 // I = M / 4
ble gemvt_kernel_F2X1


gemvt_kernel_F2X4_10:

KERNEL_F2X4

subs I, I, #1
bne gemvt_kernel_F2X4_10


gemvt_kernel_F2X1:

/* Remaining M & 3 rows, one at a time. */
ands I, M , #3
ble gemvt_kernel_F2_END

gemvt_kernel_F2X1_10:

KERNEL_F2X1

subs I, I, #1
bne gemvt_kernel_F2X1_10


gemvt_kernel_F2_END:

SAVE_F2

subs J , J , #1
bne gemvt_kernel_F2X4


/* ---- Unit-stride path: odd last stream ---- */
gemvt_kernel_F1_BEGIN:

ldr J, N
ands J, J, #1
ble gemvt_kernel_L999

gemvt_kernel_F1X4:

ldr AO1, A

ldr XO , X

INIT_F1

asrs I, M, #2 // I = M / 4
ble gemvt_kernel_F1X1


gemvt_kernel_F1X4_10:

KERNEL_F1X4

subs I, I, #1
bne gemvt_kernel_F1X4_10


gemvt_kernel_F1X1:

ands I, M , #3
ble gemvt_kernel_F1_END

gemvt_kernel_F1X1_10:

KERNEL_F1X1

subs I, I, #1
bne gemvt_kernel_F1X1_10


gemvt_kernel_F1_END:

SAVE_F1

b gemvt_kernel_L999



/*************************************************************************************************************/

/* ---- Strided path: increments are converted to byte strides first ---- */
gemvt_kernel_S2_BEGIN:

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif

ldr YO , Y

ldr J, N
asrs J, J, #1 // J = N / 2
ble gemvt_kernel_S1_BEGIN

gemvt_kernel_S2X4:

ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A

ldr XO , X

INIT_S2

asrs I, M, #2 // I = M / 4
ble gemvt_kernel_S2X1


gemvt_kernel_S2X4_10:

KERNEL_S2X4

subs I, I, #1
bne gemvt_kernel_S2X4_10


gemvt_kernel_S2X1:

ands I, M , #3
ble gemvt_kernel_S2_END

gemvt_kernel_S2X1_10:

KERNEL_S2X1

subs I, I, #1
bne gemvt_kernel_S2X1_10


gemvt_kernel_S2_END:

SAVE_S2

subs J , J , #1
bne gemvt_kernel_S2X4


/* ---- Strided path: odd last stream ---- */
gemvt_kernel_S1_BEGIN:

ldr J, N
ands J, J, #1
ble gemvt_kernel_L999

gemvt_kernel_S1X4:

ldr AO1, A

ldr XO , X

INIT_S1

asrs I, M, #2 // I = M / 4
ble gemvt_kernel_S1X1


gemvt_kernel_S1X4_10:

KERNEL_S1X4

subs I, I, #1
bne gemvt_kernel_S1X4_10


gemvt_kernel_S1X1:

ands I, M , #3
ble gemvt_kernel_S1_END

gemvt_kernel_S1X1_10:

KERNEL_S1X1

subs I, I, #1
bne gemvt_kernel_S1X1_10


gemvt_kernel_S1_END:

SAVE_S1



/*************************************************************************************************************/

/* Common exit: restore the VFP registers saved on entry, unwind the stack, return 0. */
gemvt_kernel_L999:

sub r3, fp, #192

#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif

mov r0, #0 // set return value

sub sp, fp, #28
pop {r4 -r9 ,fp}
bx lr

EPILOGUE


+ 732
- 0
kernel/arm/gemv_t_vfpv3.S View File

@@ -0,0 +1,732 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/18 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved below the saved integer registers. */
#define STACKSIZE 256

/* Incoming values: five words read at and above fp, plus r3 and r1. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#define OLD_N r1

/* Working registers. AO1 shares r1 with OLD_N, so OLD_N is spilled to N first. */
#define M r0
#define AO1 r1
#define J r2

#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9

#define I r12

/* Spill slots inside the reserved stack area (fp-256 is 28 bytes above the final sp). */
#define N [fp, #-252 ]
#define A [fp, #-256 ]


/* Byte offsets used by the pld prefetch hints in the kernel macros. */
#define X_PRE 512
#define A_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/


#if defined(DOUBLE)

/*
 * Clear the two accumulators d4 and d5 (double precision, unit stride).
 * The former "vsub d, d, d" idiom produces NaN instead of zero whenever the
 * register still holds NaN or Inf (stale caller data, or the sum left by the
 * previous column pair), so an explicit +0.0 bit pattern is loaded instead.
 * r12 (I) is used as scratch: every call site reloads I right after this macro.
 */
.macro INIT_F2

mov I , #0
vmov d4 , I , I
vmov d5 , I , I

.endm

/*
 * Unit-stride step over four rows for two matrix streams (double precision).
 * x goes to d28-d31, AO1 values to d8-d11 and AO2 values to d16-d19, all
 * pointers post-incrementing. Loads are interleaved with the multiply-
 * accumulates into d4 (AO1) and d5 (AO2).
 */
.macro KERNEL_F2X4

pld [ XO , #X_PRE ]
fldmiad XO! , { d28 - d31 }
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
pld [ AO2 , #A_PRE ]
fldmiad AO2!, { d16 - d17 }
vmla.f64 d4 , d28 , d8
vmla.f64 d5 , d28 , d16
fldmiad AO1!, { d10 - d11 }
vmla.f64 d4 , d29 , d9
vmla.f64 d5 , d29 , d17
fldmiad AO2!, { d18 - d19 }
vmla.f64 d4 , d30, d10
vmla.f64 d5 , d30, d18
vmla.f64 d4 , d31, d11
vmla.f64 d5 , d31, d19

.endm


/*
 * Single-row remainder step for two streams (double precision, unit stride):
 * d4 += x * AO1 value, d5 += x * AO2 value; all three pointers advance by one element.
 */
.macro KERNEL_F2X1

fldmiad XO! , { d2 }
fldmiad AO1!, { d8 }
fldmiad AO2!, { d16 }
vmla.f64 d4 , d2 , d8
vmla.f64 d5 , d2 , d16

.endm

/*
 * Adds d0*d4 and d0*d5 to two consecutive y elements and advances YO past them.
 * d0 is never written in this file; presumably it carries alpha from the caller.
 */
.macro SAVE_F2

fldmiad YO, { d24 - d25 }
vmla.f64 d24, d0, d4
vmla.f64 d25, d0, d5
fstmiad YO!, { d24 - d25 }

.endm

/*
 * Clear the two accumulators d4 and d5 (double precision, strided path).
 * "vsub d, d, d" gives NaN when the register holds NaN or Inf, so +0.0 is
 * loaded explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_S2

mov I , #0
vmov d4 , I , I
vmov d5 , I , I

.endm

/*
 * Strided-x step over four rows for two streams (double precision).
 * x is loaded element by element into d28-d31 (XO += INC_X after each load);
 * AO1 (d8-d11) and AO2 (d16-d19) are read contiguously. Loads are interleaved
 * with the multiply-accumulates into d4 (AO1) and d5 (AO2).
 */
.macro KERNEL_S2X4

pld [ AO1 , #A_PRE ]
fldmiad XO , { d28 }
add XO, XO, INC_X
fldmiad AO1!, { d8 - d9 }
pld [ AO2 , #A_PRE ]
fldmiad AO2!, { d16 - d17 }
vmla.f64 d4 , d28 , d8
fldmiad XO , { d29 }
add XO, XO, INC_X
vmla.f64 d5 , d28 , d16
fldmiad AO1!, { d10 - d11 }
vmla.f64 d4 , d29 , d9
fldmiad XO , { d30 }
add XO, XO, INC_X
vmla.f64 d5 , d29 , d17
fldmiad AO2!, { d18 - d19 }
vmla.f64 d4 , d30, d10
fldmiad XO , { d31 }
add XO, XO, INC_X
vmla.f64 d5 , d30, d18
vmla.f64 d4 , d31, d11
vmla.f64 d5 , d31, d19

.endm


/*
 * Single-row remainder step for two streams with strided x:
 * d4 += x * AO1 value, d5 += x * AO2 value; XO advances by INC_X.
 */
.macro KERNEL_S2X1

fldmiad XO , { d2 }
fldmiad AO1!, { d8 }
add XO, XO, INC_X
fldmiad AO2!, { d16 }
vmla.f64 d4 , d2 , d8
vmla.f64 d5 , d2 , d16

.endm

/*
 * Adds d0*d4 to the y element at YO, steps YO by INC_Y, then adds d0*d5 to
 * the next y element and steps YO by INC_Y again. d24 is reused as the
 * temporary for both updates (d0: presumably alpha).
 */
.macro SAVE_S2

fldmiad YO, { d24 }
vmla.f64 d24, d0, d4
fstmiad YO, { d24 }
add YO, YO, INC_Y

fldmiad YO, { d24 }
vmla.f64 d24, d0, d5
fstmiad YO, { d24 }
add YO, YO, INC_Y

.endm

/*
 * Clear the single accumulator d4 (double precision, unit stride).
 * "vsub d4, d4, d4" gives NaN when d4 holds NaN or Inf, so +0.0 is loaded
 * explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_F1

mov I , #0
vmov d4 , I , I

.endm

/*
 * Unit-stride step over four rows for one stream (double precision):
 * x in d28-d31, AO1 values in d8-d11, products accumulated into d4.
 */
.macro KERNEL_F1X4

pld [ XO , #X_PRE ]
fldmiad XO! , { d28 - d31 }
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
vmla.f64 d4 , d28 , d8
fldmiad AO1!, { d10 - d11 }
vmla.f64 d4 , d29 , d9
vmla.f64 d4 , d30, d10
vmla.f64 d4 , d31, d11

.endm


/* Single-row remainder step for one stream: d4 += x * AO1 value; XO and AO1 advance by one element. */
.macro KERNEL_F1X1

fldmiad XO! , { d2 }
fldmiad AO1!, { d8 }
vmla.f64 d4 , d2 , d8

.endm

/* Adds d0*d4 to the y element at YO and advances YO by one element (d0: presumably alpha). */
.macro SAVE_F1

fldmiad YO, { d24 }
vmla.f64 d24, d0, d4
fstmiad YO!, { d24 }

.endm

/*
 * Clear the single accumulator d4 (double precision, strided path).
 * "vsub d4, d4, d4" gives NaN when d4 holds NaN or Inf, so +0.0 is loaded
 * explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_S1

mov I , #0
vmov d4 , I , I

.endm

/*
 * Strided-x step over four rows for one stream (double precision):
 * x is loaded element by element into d28-d31 (XO += INC_X each time),
 * AO1 is read contiguously into d8-d11, and the products accumulate into d4.
 */
.macro KERNEL_S1X4

pld [ AO1 , #A_PRE ]
fldmiad XO , { d28 }
add XO, XO, INC_X
fldmiad AO1!, { d8 - d9 }
vmla.f64 d4 , d28 , d8
fldmiad XO , { d29 }
add XO, XO, INC_X
fldmiad AO1!, { d10 - d11 }
vmla.f64 d4 , d29 , d9
fldmiad XO , { d30 }
add XO, XO, INC_X
vmla.f64 d4 , d30, d10
fldmiad XO , { d31 }
add XO, XO, INC_X
vmla.f64 d4 , d31, d11

.endm


/* Single-row remainder step for one stream with strided x: d4 += x * AO1 value; XO += INC_X. */
.macro KERNEL_S1X1

fldmiad XO , { d2 }
fldmiad AO1!, { d8 }
add XO, XO, INC_X
vmla.f64 d4 , d2 , d8

.endm

/* Adds d0*d4 to the y element at YO and steps YO by INC_Y (d0: presumably alpha). */
.macro SAVE_S1

fldmiad YO, { d24 }
vmla.f64 d24, d0, d4
fstmiad YO, { d24 }
add YO, YO, INC_Y

.endm


#else /************************* SINGLE PRECISION *****************************************/

/*
 * Clear the two accumulators s4 and s5 (single precision, unit stride).
 * "vsub s, s, s" gives NaN instead of zero when the register holds NaN or
 * Inf (stale caller data or the previous column sums), so the +0.0 bit
 * pattern is loaded explicitly. r12 (I) is scratch: every call site reloads
 * I right after this macro.
 */
.macro INIT_F2

mov I , #0
vmov s4 , I
vmov s5 , I

.endm

/*
 * Unit-stride step over four rows for two streams (single precision).
 * x goes to s28-s31, AO1 values to s8-s11 and AO2 values to s16-s19, with
 * loads interleaved between the multiply-accumulates into s4 (AO1) and s5 (AO2).
 */
.macro KERNEL_F2X4

fldmias XO! , { s28 - s31 }
fldmias AO1!, { s8 - s9 }
fldmias AO2!, { s16 - s17 }
vmla.f32 s4 , s28 , s8
vmla.f32 s5 , s28 , s16
fldmias AO1!, { s10 - s11 }
vmla.f32 s4 , s29 , s9
vmla.f32 s5 , s29 , s17
fldmias AO2!, { s18 - s19 }
vmla.f32 s4 , s30, s10
vmla.f32 s5 , s30, s18
vmla.f32 s4 , s31, s11
vmla.f32 s5 , s31, s19

.endm


/*
 * Single-row remainder step for two streams (single precision, unit stride):
 * s4 += x * AO1 value, s5 += x * AO2 value; all three pointers advance by one element.
 */
.macro KERNEL_F2X1

fldmias XO! , { s2 }
fldmias AO1!, { s8 }
fldmias AO2!, { s16 }
vmla.f32 s4 , s2 , s8
vmla.f32 s5 , s2 , s16

.endm

/*
 * Adds s0*s4 and s0*s5 to two consecutive y elements and advances YO past them.
 * s0 is never written in this file; presumably it carries alpha from the caller.
 */
.macro SAVE_F2

fldmias YO, { s24 - s25 }
vmla.f32 s24, s0, s4
vmla.f32 s25, s0, s5
fstmias YO!, { s24 - s25 }

.endm

/*
 * Clear the two accumulators s4 and s5 (single precision, strided path).
 * "vsub s, s, s" gives NaN when the register holds NaN or Inf, so +0.0 is
 * loaded explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_S2

mov I , #0
vmov s4 , I
vmov s5 , I

.endm

/*
 * Strided-x step over four rows for two streams (single precision).
 * x is loaded element by element into s28-s31 (XO += INC_X after each load);
 * AO1 (s8-s11) and AO2 (s16-s19) are read contiguously. Loads are interleaved
 * with the multiply-accumulates into s4 (AO1) and s5 (AO2).
 */
.macro KERNEL_S2X4

fldmias XO , { s28 }
add XO, XO, INC_X
fldmias AO1!, { s8 - s9 }
fldmias AO2!, { s16 - s17 }
vmla.f32 s4 , s28 , s8
fldmias XO , { s29 }
add XO, XO, INC_X
vmla.f32 s5 , s28 , s16
fldmias AO1!, { s10 - s11 }
vmla.f32 s4 , s29 , s9
fldmias XO , { s30 }
add XO, XO, INC_X
vmla.f32 s5 , s29 , s17
fldmias AO2!, { s18 - s19 }
vmla.f32 s4 , s30, s10
fldmias XO , { s31 }
add XO, XO, INC_X
vmla.f32 s5 , s30, s18
vmla.f32 s4 , s31, s11
vmla.f32 s5 , s31, s19

.endm


/*
 * Single-row remainder step for two streams with strided x:
 * s4 += x * AO1 value, s5 += x * AO2 value; XO advances by INC_X.
 */
.macro KERNEL_S2X1

fldmias XO , { s2 }
fldmias AO1!, { s8 }
add XO, XO, INC_X
fldmias AO2!, { s16 }
vmla.f32 s4 , s2 , s8
vmla.f32 s5 , s2 , s16

.endm

/*
 * Adds s0*s4 to the y element at YO, steps YO by INC_Y, then adds s0*s5 to
 * the next y element and steps YO by INC_Y again. s24 is reused as the
 * temporary for both updates (s0: presumably alpha).
 */
.macro SAVE_S2

fldmias YO, { s24 }
vmla.f32 s24, s0, s4
fstmias YO, { s24 }
add YO, YO, INC_Y

fldmias YO, { s24 }
vmla.f32 s24, s0, s5
fstmias YO, { s24 }
add YO, YO, INC_Y

.endm

/*
 * Clear the single accumulator s4 (single precision, unit stride).
 * "vsub s4, s4, s4" gives NaN when s4 holds NaN or Inf, so +0.0 is loaded
 * explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_F1

mov I , #0
vmov s4 , I

.endm

/*
 * Unit-stride step over four rows for one stream (single precision):
 * x in s28-s31, AO1 values in s8-s11, products accumulated into s4.
 */
.macro KERNEL_F1X4

fldmias XO! , { s28 - s31 }
fldmias AO1!, { s8 - s9 }
vmla.f32 s4 , s28 , s8
fldmias AO1!, { s10 - s11 }
vmla.f32 s4 , s29 , s9
vmla.f32 s4 , s30, s10
vmla.f32 s4 , s31, s11

.endm


/* Single-row remainder step for one stream: s4 += x * AO1 value; XO and AO1 advance by one element. */
.macro KERNEL_F1X1

fldmias XO! , { s2 }
fldmias AO1!, { s8 }
vmla.f32 s4 , s2 , s8

.endm

/* Adds s0*s4 to the y element at YO and advances YO by one element (s0: presumably alpha). */
.macro SAVE_F1

fldmias YO, { s24 }
vmla.f32 s24, s0, s4
fstmias YO!, { s24 }

.endm

/*
 * Clear the single accumulator s4 (single precision, strided path).
 * "vsub s4, s4, s4" gives NaN when s4 holds NaN or Inf, so +0.0 is loaded
 * explicitly through r12 (I), which the call site reloads right after.
 */
.macro INIT_S1

mov I , #0
vmov s4 , I

.endm

/*
 * Strided-x step over four rows for one stream (single precision):
 * x is loaded element by element into s28-s31 (XO += INC_X each time),
 * AO1 is read contiguously into s8-s11, and the products accumulate into s4.
 */
.macro KERNEL_S1X4

fldmias XO , { s28 }
add XO, XO, INC_X
fldmias AO1!, { s8 - s9 }
vmla.f32 s4 , s28 , s8
fldmias XO , { s29 }
add XO, XO, INC_X
fldmias AO1!, { s10 - s11 }
vmla.f32 s4 , s29 , s9
fldmias XO , { s30 }
add XO, XO, INC_X
vmla.f32 s4 , s30, s10
fldmias XO , { s31 }
add XO, XO, INC_X
vmla.f32 s4 , s31, s11

.endm


/* Single-row remainder step for one stream with strided x: s4 += x * AO1 value; XO += INC_X. */
.macro KERNEL_S1X1

fldmias XO , { s2 }
fldmias AO1!, { s8 }
add XO, XO, INC_X
vmla.f32 s4 , s2 , s8

.endm

/* Adds s0*s4 to the y element at YO and steps YO by INC_Y (s0: presumably alpha). */
.macro SAVE_S1

fldmias YO, { s24 }
vmla.f32 s24, s0, s4
fstmias YO, { s24 }
add YO, YO, INC_Y

.endm


#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Kernel entry point (PROLOGUE and EPILOGUE are macros from common.h).
 * Inputs as named by the defines in this file: M (r0), OLD_N (r1), OLD_A (r3),
 * plus OLD_LDA, X, OLD_INC_X, Y and OLD_INC_Y read from the stack through fp.
 * d0/s0 is the scale factor applied by the SAVE_* macros (presumably alpha;
 * it is never written here).
 *
 * The N streams of A (LDA elements apart) are processed two at a time,
 * followed by one odd stream if N is odd. For each stream the KERNEL_*
 * macros accumulate M products with x, four rows per step plus a remainder
 * of M & 3 rows, and the SAVE_* macros add scale * sum to the next y element.
 * The F path runs when both increments are 1, the S path otherwise.
 * Nothing is done when M <= 0, N <= 0 or either increment is 0.
 * Always returns 0 in r0.
 */
PROLOGUE

.align 5
/* Seven registers are pushed (28 bytes), so fp ends up at the entry value of sp. */
push {r4 - r9 , fp}
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack

/* VFP save area starts at fp-192, inside the reserved stack block. */
sub r12, fp, #192

#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s31 } // store floating point registers
#endif

cmp M, #0
ble gemvt_kernel_L999

cmp OLD_N, #0
ble gemvt_kernel_L999

/* r1 and r3 are reused below, so spill the incoming values first. */
str OLD_A, A
str OLD_N, N

ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

cmp INC_X, #0
beq gemvt_kernel_L999

cmp INC_Y, #0
beq gemvt_kernel_L999

ldr LDA, OLD_LDA


#if defined(DOUBLE)
lsl LDA, LDA, #3 // LDA * SIZE
#else
lsl LDA, LDA, #2 // LDA * SIZE
#endif

/* Any increment other than 1 selects the strided path. */
cmp INC_X, #1
bne gemvt_kernel_S2_BEGIN

cmp INC_Y, #1
bne gemvt_kernel_S2_BEGIN


/* ---- Unit-stride path: pairs of streams ---- */
gemvt_kernel_F2_BEGIN:

ldr YO , Y

ldr J, N
asrs J, J, #1 // J = N / 2
ble gemvt_kernel_F1_BEGIN

gemvt_kernel_F2X4:

/* AO1/AO2 address this pair; the saved A pointer moves on by 2*LDA bytes. */
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A

/* x is re-read from its start for every pair. */
ldr XO , X

INIT_F2

asrs I, M, #2 // I = M / 4
ble gemvt_kernel_F2X1


gemvt_kernel_F2X4_10:

KERNEL_F2X4

subs I, I, #1
bne gemvt_kernel_F2X4_10


gemvt_kernel_F2X1:

/* Remaining M & 3 rows, one at a time. */
ands I, M , #3
ble gemvt_kernel_F2_END

gemvt_kernel_F2X1_10:

KERNEL_F2X1

subs I, I, #1
bne gemvt_kernel_F2X1_10


gemvt_kernel_F2_END:

SAVE_F2

subs J , J , #1
bne gemvt_kernel_F2X4


/* ---- Unit-stride path: odd last stream ---- */
gemvt_kernel_F1_BEGIN:

ldr J, N
ands J, J, #1
ble gemvt_kernel_L999

gemvt_kernel_F1X4:

ldr AO1, A

ldr XO , X

INIT_F1

asrs I, M, #2 // I = M / 4
ble gemvt_kernel_F1X1


gemvt_kernel_F1X4_10:

KERNEL_F1X4

subs I, I, #1
bne gemvt_kernel_F1X4_10


gemvt_kernel_F1X1:

ands I, M , #3
ble gemvt_kernel_F1_END

gemvt_kernel_F1X1_10:

KERNEL_F1X1

subs I, I, #1
bne gemvt_kernel_F1X1_10


gemvt_kernel_F1_END:

SAVE_F1

b gemvt_kernel_L999



/*************************************************************************************************************/

/* ---- Strided path: increments are converted to byte strides first ---- */
gemvt_kernel_S2_BEGIN:

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif

ldr YO , Y

ldr J, N
asrs J, J, #1 // J = N / 2
ble gemvt_kernel_S1_BEGIN

gemvt_kernel_S2X4:

ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A

ldr XO , X

INIT_S2

asrs I, M, #2 // I = M / 4
ble gemvt_kernel_S2X1


gemvt_kernel_S2X4_10:

KERNEL_S2X4

subs I, I, #1
bne gemvt_kernel_S2X4_10


gemvt_kernel_S2X1:

ands I, M , #3
ble gemvt_kernel_S2_END

gemvt_kernel_S2X1_10:

KERNEL_S2X1

subs I, I, #1
bne gemvt_kernel_S2X1_10


gemvt_kernel_S2_END:

SAVE_S2

subs J , J , #1
bne gemvt_kernel_S2X4


/* ---- Strided path: odd last stream ---- */
gemvt_kernel_S1_BEGIN:

ldr J, N
ands J, J, #1
ble gemvt_kernel_L999

gemvt_kernel_S1X4:

ldr AO1, A

ldr XO , X

INIT_S1

asrs I, M, #2 // I = M / 4
ble gemvt_kernel_S1X1


gemvt_kernel_S1X4_10:

KERNEL_S1X4

subs I, I, #1
bne gemvt_kernel_S1X4_10


gemvt_kernel_S1X1:

ands I, M , #3
ble gemvt_kernel_S1_END

gemvt_kernel_S1X1_10:

KERNEL_S1X1

subs I, I, #1
bne gemvt_kernel_S1X1_10


gemvt_kernel_S1_END:

SAVE_S1



/*************************************************************************************************************/

/* Common exit: restore the VFP registers saved on entry, unwind the stack, return 0. */
gemvt_kernel_L999:

sub r3, fp, #192

#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s31 } // restore floating point registers
#endif

mov r0, #0 // set return value

sub sp, fp, #28
pop {r4 -r9 ,fp}
bx lr

EPILOGUE


+ 75
- 0
kernel/arm/iamax.c View File

@@ -0,0 +1,75 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif


/*
 * Returns the 1-based position of the first element with the largest
 * absolute value among the n elements x[0], x[inc_x], x[2*inc_x], ...
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  stride between examined elements; must be at least 1
 *
 * Returns 0 when n < 1 or inc_x < 1. The original guard tested n < 0 only,
 * so n == 0 read x[0] from an empty vector and reported position 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix = 0;
	BLASLONG max = 0;
	FLOAT maxf;

	if (n <= 0 || inc_x < 1) return(0);

	maxf = ABS(x[0]);

	for (i = 0; i < n; i++)
	{
		FLOAT cur = ABS(x[ix]);

		/* Strict comparison keeps the earliest position on ties. */
		if (cur > maxf)
		{
			max = i;
			maxf = cur;
		}
		ix += inc_x;
	}
	return(max+1);
}


+ 478
- 0
kernel/arm/iamax_vfp.S View File

@@ -0,0 +1,478 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/14 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* STACKSIZE is defined but not referenced anywhere in this file. */
#define STACKSIZE 256

/* Incoming values in r0-r2; INDEX (r3) holds the result, Z (r4) the running 1-based position. */
#define N r0
#define X r1
#define INC_X r2
#define INDEX r3
#define Z r4

#define I r12

/* Byte offset used by the pld prefetch hints in the unrolled loop. */
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * VABS(x0,x1): absolute value in the selected precision when USE_ABS is
 * defined, otherwise a nop so that signed values are compared.
 */
#if defined(USE_ABS)

#if defined(DOUBLE)

#define VABS(x0,x1) vabs.f64 x0, x1

#else

#define VABS(x0,x1) vabs.f32 x0, x1

#endif

#else

#define VABS(x0,x1) nop

#endif

/*****************************************************************************************/

/*
 * MOVCOND / VMOVCOND: conditional integer and VFP moves executed after the
 * compare in the KERNEL_* macros. USE_MIN selects the lt condition (keep the
 * smaller value), otherwise gt (keep the larger value).
 */
#if defined(USE_MIN)

#define MOVCOND movlt

#if defined(DOUBLE)

#define VMOVCOND vmovlt.f64

#else

#define VMOVCOND vmovlt.f32

#endif

#else

#define MOVCOND movgt

#if defined(DOUBLE)

#define VMOVCOND vmovgt.f64

#else

#define VMOVCOND vmovgt.f32

#endif


#endif


/*****************************************************************************************/



#if !defined(COMPLEX)

#if defined(DOUBLE)

/*
 * Real double, unit stride: takes the first element as the initial best
 * value in d0 (after VABS), advances X, and sets both Z and INDEX to 1.
 */
.macro INIT_F

fldmiad X!, { d0 }
VABS( d0, d0 )
mov Z, #1
mov INDEX, Z

.endm

/*
 * Real double, unit stride: loads the next element into d4, bumps the
 * position counter Z, compares d4 with the current best d0 and, when the
 * MOVCOND condition holds, replaces d0 with d4 and INDEX with Z.
 */
.macro KERNEL_F1

fldmiad X!, { d4 }
add Z, Z, #1
VABS( d4, d4 )
vcmpe.f64 d4, d0
/* Copy the VFP compare flags into the integer status flags. */
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
MOVCOND INDEX, Z

.endm

/*
 * Real double, strided: takes the first element as the initial best value
 * in d0 (after VABS), sets Z and INDEX to 1 and steps X by INC_X.
 */
.macro INIT_S

fldmiad X, { d0 }
VABS( d0, d0 )
mov Z, #1
mov INDEX, Z
add X, X, INC_X

.endm


/*
 * Real double, strided: same compare-and-select step as KERNEL_F1, but X is
 * stepped by INC_X after the element has been processed.
 */
.macro KERNEL_S1

fldmiad X, { d4 }
add Z, Z, #1
VABS( d4, d4 )
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
MOVCOND INDEX, Z
add X, X, INC_X

.endm

#else

/*
 * Real single, unit stride: takes the first element as the initial best
 * value in s0 (after VABS), advances X, and sets both Z and INDEX to 1.
 */
.macro INIT_F

fldmias X!, { s0 }
VABS( s0, s0 )
mov Z, #1
mov INDEX, Z

.endm

/*
 * Real single, unit stride: loads the next element into s4, bumps Z,
 * compares s4 with the current best s0 and, when the MOVCOND condition
 * holds, replaces s0 with s4 and INDEX with Z.
 */
.macro KERNEL_F1

fldmias X!, { s4 }
add Z, Z, #1
VABS( s4, s4 )
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
MOVCOND INDEX, Z

.endm

/*
 * Real single, strided: takes the first element as the initial best value
 * in s0 (after VABS), sets Z and INDEX to 1 and steps X by INC_X.
 */
.macro INIT_S

fldmias X, { s0 }
VABS( s0, s0 )
mov Z, #1
mov INDEX, Z
add X, X, INC_X

.endm


/*
 * Real single, strided: same compare-and-select step as KERNEL_F1, but X is
 * stepped by INC_X after the element has been processed.
 */
.macro KERNEL_S1

fldmias X, { s4 }
add Z, Z, #1
VABS( s4, s4 )
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
MOVCOND INDEX, Z
add X, X, INC_X

.endm




#endif

#else

#if defined(DOUBLE)

/*
 * Complex double, unit stride: loads the first two-value element, forms the
 * sum of the absolute values of its two parts in d0 as the initial best
 * value, advances X, and sets Z and INDEX to 1.
 */
.macro INIT_F

fldmiad X!, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
mov Z, #1
mov INDEX, Z

.endm


/*
 * Complex double, unit stride: forms |part0| + |part1| of the next element
 * in d4, bumps Z, compares d4 with the current best d0 and, when the
 * MOVCOND condition holds, replaces d0 with d4 and INDEX with Z.
 */
.macro KERNEL_F1

fldmiad X!, { d4 - d5 }
add Z, Z, #1
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
MOVCOND INDEX, Z

.endm

/*
 * Complex double, strided: as the unit-stride INIT_F, but X is left in place
 * by the load and then stepped by INC_X.
 */
.macro INIT_S

fldmiad X, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
mov Z, #1
mov INDEX, Z
add X, X, INC_X

.endm



/*
 * Complex double, strided: same compare-and-select step as KERNEL_F1, but X
 * is stepped by INC_X after the element has been processed.
 */
.macro KERNEL_S1

fldmiad X, { d4 - d5 }
add Z, Z, #1
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
MOVCOND INDEX, Z
add X, X, INC_X

.endm

#else

/*
 * Complex single, unit stride: loads the first two-value element, forms the
 * sum of the absolute values of its two parts in s0 as the initial best
 * value, advances X, and sets Z and INDEX to 1.
 */
.macro INIT_F

fldmias X!, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
mov Z, #1
mov INDEX, Z

.endm


/*
 * Complex single, unit stride: forms |part0| + |part1| of the next element
 * in s4, bumps Z, compares s4 with the current best s0 and, when the
 * MOVCOND condition holds, replaces s0 with s4 and INDEX with Z.
 */
.macro KERNEL_F1

fldmias X!, { s4 - s5 }
add Z, Z, #1
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
MOVCOND INDEX, Z

.endm

/*
 * Complex single, strided: as the unit-stride INIT_F, but X is left in place
 * by the load and then stepped by INC_X.
 */
.macro INIT_S

fldmias X, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
mov Z, #1
mov INDEX, Z
add X, X, INC_X

.endm



/*
 * Complex single, strided: same compare-and-select step as KERNEL_F1, but X
 * is stepped by INC_X after the element has been processed.
 */
.macro KERNEL_S1

fldmias X, { s4 - s5 }
add Z, Z, #1
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
MOVCOND INDEX, Z
add X, X, INC_X

.endm




#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * Entry point (PROLOGUE and EPILOGUE are macros from common.h).
 * Inputs as named by the defines in this file: N (r0), X (r1), INC_X (r2).
 * Returns INDEX in r0: 0 when N <= 0 or INC_X == 0, otherwise the 1-based
 * position chosen by the INIT_* and KERNEL_* macros (the first element, replaced
 * whenever the MOVCOND condition holds for a later one).
 * The F path handles INC_X == 1, the S path every other non-zero increment.
 * r4 (Z) is the only callee-saved register used; it is pushed and popped here.
 */
PROLOGUE

.align 5
push {r4}

#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0
#else
vsub.f32 s0 , s0 , s0
#endif
/* Result for the early-exit cases below. */
mov INDEX, #0

cmp N, #0
ble iamax_kernel_L999

cmp INC_X, #0
beq iamax_kernel_L999

cmp INC_X, #1
bne iamax_kernel_S_BEGIN


iamax_kernel_F_BEGIN:

INIT_F

/* The first element was consumed by INIT_F. */
subs N, N , #1
ble iamax_kernel_L999

asrs I, N, #2 // I = N / 4
ble iamax_kernel_F1

.align 5

/*
 * Unrolled loop: four elements, an exit check, then four more elements.
 * I counts groups of four, so it is decremented twice per full pass.
 */
iamax_kernel_F4:

pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1

subs I, I, #1
ble iamax_kernel_F1


#if defined(COMPLEX) || defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1

subs I, I, #1
bne iamax_kernel_F4

iamax_kernel_F1:

/* Remaining N & 3 elements, one at a time. */
ands I, N, #3
ble iamax_kernel_L999

iamax_kernel_F10:

KERNEL_F1

subs I, I, #1
bne iamax_kernel_F10

b iamax_kernel_L999

/* Strided path: INC_X is converted from elements to bytes first. */
iamax_kernel_S_BEGIN:

#if defined(COMPLEX)

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif

#endif

INIT_S

/* The first element was consumed by INIT_S. */
subs N, N , #1
ble iamax_kernel_L999

asrs I, N, #2 // I = N / 4
ble iamax_kernel_S1

.align 5

iamax_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne iamax_kernel_S4

iamax_kernel_S1:

ands I, N, #3
ble iamax_kernel_L999

iamax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne iamax_kernel_S10


iamax_kernel_L999:

mov r0, INDEX // set return value

pop {r4}
bx lr

EPILOGUE


+ 75
- 0
kernel/arm/iamin.c View File

@@ -0,0 +1,75 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif


/*
 * i?amin: 1-based position of the element with the smallest absolute value.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  stride between consecutive elements, in units of FLOAT
 *
 * Returns 0 when n <= 0 or inc_x < 1. Otherwise returns the 1-based position
 * of the first element whose absolute value is minimal (the comparison is
 * strict, so the earliest occurrence wins on ties).
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	BLASLONG min = 0;
	FLOAT minf;

	/* An empty vector has no element: do not read x[0], report 0. */
	if (n <= 0 || inc_x < 1) return(0);

	minf = ABS(x[0]);

	/* Element 0 is already the running minimum, so start at element 1. */
	for (i = 1, ix = inc_x; i < n; i++, ix += inc_x)
	{
		FLOAT absxi = ABS(x[ix]);

		if (absxi < minf)
		{
			min = i;
			minf = absxi;
		}
	}
	return(min + 1);
}


+ 67
- 0
kernel/arm/imax.c View File

@@ -0,0 +1,67 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/

#include "common.h"
#include <math.h>



/*
 * i?max: 1-based position of the largest element (signed comparison, no
 * absolute value is taken).
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  stride between consecutive elements, in units of FLOAT
 *
 * Returns 0 when n <= 0 or inc_x < 1. Otherwise returns the 1-based position
 * of the first maximal element (strict comparison, earliest occurrence wins).
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	BLASLONG max = 0;
	FLOAT maxf;

	/* An empty vector has no element: do not read x[0], report 0. */
	if (n <= 0 || inc_x < 1) return(0);

	maxf = x[0];

	/* Element 0 is already the running maximum, so start at element 1. */
	for (i = 1, ix = inc_x; i < n; i++, ix += inc_x)
	{
		if (x[ix] > maxf)
		{
			max = i;
			maxf = x[ix];
		}
	}
	return(max + 1);
}


+ 65
- 0
kernel/arm/imin.c View File

@@ -0,0 +1,65 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


/**************************************************************************************
* 2013/08/19 Saar
* BLASTEST float
* BLASTEST double
*
**************************************************************************************/

#include "common.h"
#include <math.h>



/*
 * i?min: 1-based position of the smallest element (signed comparison, no
 * absolute value is taken).
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  stride between consecutive elements, in units of FLOAT
 *
 * Returns 0 when n <= 0 or inc_x < 1. Otherwise returns the 1-based position
 * of the first minimal element (strict comparison, earliest occurrence wins).
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	BLASLONG min = 0;
	FLOAT minf;

	/* An empty vector has no element: do not read x[0], report 0. */
	if (n <= 0 || inc_x < 1) return(0);

	minf = x[0];

	/* Element 0 is already the running minimum, so start at element 1. */
	for (i = 1, ix = inc_x; i < n; i++, ix += inc_x)
	{
		/* Must be '<': a '>' test here would track the maximum instead. */
		if (x[ix] < minf)
		{
			min = i;
			minf = x[ix];
		}
	}
	return(min + 1);
}


+ 81
- 0
kernel/arm/izamax.c View File

@@ -0,0 +1,81 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif

/* |re| + |im| of the complex number stored at x[i], x[i+1]. Fully
 * parenthesized so the macro is safe inside any larger expression. */
#define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1]))

/*
 * i?amax for complex vectors: 1-based position of the element with the
 * largest |re| + |im|.
 *
 * n      number of complex elements to examine
 * x      input vector, stored as interleaved (re, im) pairs of FLOAT
 * inc_x  stride between consecutive complex elements, in complex units
 *
 * Returns 0 when n <= 0 or inc_x < 1. Otherwise returns the 1-based position
 * of the first maximal element (strict comparison, earliest occurrence wins).
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	BLASLONG max = 0;
	BLASLONG inc_x2;
	FLOAT maxf;

	/* An empty vector has no element: do not read x[0], report 0. */
	if (n <= 0 || inc_x < 1) return(0);

	/* Each complex element occupies two FLOATs. */
	inc_x2 = 2 * inc_x;

	maxf = CABS1(x, 0);

	/* Element 0 is already the running maximum, so start at element 1. */
	for (i = 1, ix = inc_x2; i < n; i++, ix += inc_x2)
	{
		FLOAT cabs1 = CABS1(x, ix);

		if (cabs1 > maxf)
		{
			max = i;
			maxf = cabs1;
		}
	}
	return(max + 1);
}


+ 81
- 0
kernel/arm/izamin.c View File

@@ -0,0 +1,81 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif

/* |re| + |im| of the complex number stored at x[i], x[i+1]. Fully
 * parenthesized so the macro is safe inside any larger expression. */
#define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1]))

/*
 * i?amin for complex vectors: 1-based position of the element with the
 * smallest |re| + |im|.
 *
 * n      number of complex elements to examine
 * x      input vector, stored as interleaved (re, im) pairs of FLOAT
 * inc_x  stride between consecutive complex elements, in complex units
 *
 * Returns 0 when n <= 0 or inc_x < 1. Otherwise returns the 1-based position
 * of the first minimal element (strict comparison, earliest occurrence wins).
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	BLASLONG min = 0;
	BLASLONG inc_x2;
	FLOAT minf;

	/* An empty vector has no element: do not read x[0], report 0. */
	if (n <= 0 || inc_x < 1) return(0);

	/* Each complex element occupies two FLOATs. */
	inc_x2 = 2 * inc_x;

	minf = CABS1(x, 0);

	/* Element 0 is already the running minimum, so start at element 1. */
	for (i = 1, ix = inc_x2; i < n; i++, ix += inc_x2)
	{
		FLOAT cabs1 = CABS1(x, ix);

		if (cabs1 < minf)
		{
			min = i;
			minf = cabs1;
		}
	}
	return(min + 1);
}


+ 63
- 0
kernel/arm/max.c View File

@@ -0,0 +1,63 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/

#include "common.h"
#include <math.h>


/*
 * ?max: value of the largest element (signed comparison, no absolute value
 * is taken).
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  stride between consecutive elements, in units of FLOAT
 *
 * Returns 0.0 when n <= 0 or inc_x < 1, otherwise the maximum element.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	FLOAT maxf = 0.0;

	/* An empty vector has no element: do not read x[0], report 0.0. */
	if (n <= 0 || inc_x < 1) return(maxf);

	maxf = x[0];

	/* Element 0 is already the running maximum, so start at element 1. */
	for (i = 1, ix = inc_x; i < n; i++, ix += inc_x)
	{
		if (x[ix] > maxf)
		{
			maxf = x[ix];
		}
	}
	return(maxf);
}


+ 63
- 0
kernel/arm/min.c View File

@@ -0,0 +1,63 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/

#include "common.h"
#include <math.h>


/*
 * ?min: value of the smallest element (signed comparison, no absolute value
 * is taken).
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  stride between consecutive elements, in units of FLOAT
 *
 * Returns 0.0 when n <= 0 or inc_x < 1, otherwise the minimum element.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	FLOAT minf = 0.0;

	/* An empty vector has no element: do not read x[0], report 0.0. */
	if (n <= 0 || inc_x < 1) return(minf);

	minf = x[0];

	/* Element 0 is already the running minimum, so start at element 1. */
	for (i = 1, ix = inc_x; i < n; i++, ix += inc_x)
	{
		if (x[ix] < minf)
		{
			minf = x[ix];
		}
	}
	return(minf);
}


+ 88
- 0
kernel/arm/nrm2.c View File

@@ -0,0 +1,88 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/13 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif



/*
 * ?nrm2: Euclidean norm of a real vector, accumulated as a (scale, ssq)
 * pair so that the running sum of squares is always relative to the
 * largest magnitude seen so far: norm = scale * sqrt(ssq).
 *
 * n      number of elements
 * x      input vector
 * inc_x  stride between consecutive elements, in units of FLOAT
 *
 * Returns 0.0 when n < 0 or inc_x < 1, and |x[0]| when n == 1.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT scale = 0.0;
	FLOAT ssq = 1.0;
	FLOAT absxi = 0.0;
	BLASLONG pos;
	BLASLONG end;

	if (n < 0 || inc_x < 1)
	{
		return(0.0);
	}
	if (n == 1)
	{
		return( ABS(x[0]) );
	}

	/* One past the last index visited when stepping by inc_x. */
	end = n * inc_x;

	for (pos = 0; pos < end; pos += inc_x)
	{
		/* Exact zeros contribute nothing and would divide by zero below. */
		if ( x[pos] == 0.0 ) continue;

		absxi = ABS( x[pos] );
		if ( scale < absxi )
		{
			/* New largest magnitude: rescale the accumulated sum to it. */
			ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
			scale = absxi ;
		}
		else
		{
			ssq += ( absxi/scale ) * ( absxi/scale );
		}
	}

	scale = scale * sqrt( ssq );
	return(scale);
}


+ 565
- 0
kernel/arm/nrm2_vfp.S View File

@@ -0,0 +1,565 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/22 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* STACKSIZE is defined here but not referenced anywhere else in this file. */
#define STACKSIZE 256

/* Core registers holding the arguments: element count, vector pointer and
 * stride. INC_X is converted to a byte stride before the strided loop. */
#define N r0
#define X r1
#define INC_X r2

/* Loop counter. */
#define I r12

/* Offset added to X in the pld prefetch hints of the KERNEL_F8 macros. */
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/


#if !defined(COMPLEX)

#if defined(DOUBLE)


/*
 * KERNEL_F1 (real, double): fold one element into the running (scale, ssq)
 * pair and advance X past it (the load uses write-back).
 *
 * Register roles, as set up at nrm2_begin:
 *   d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0; d2, d3, d4 are scratch.
 *
 * An element equal to 0.0 is skipped. Otherwise, with a = |element|:
 *   scale >= a : ssq += (a/scale)^2
 *   scale <  a : ssq = 1 + ssq * (scale/a)^2 ; scale = a
 * The local label carries the macro expansion counter so the macro can be
 * expanded several times in one file.
 */
.macro KERNEL_F1

fldmiad X!, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x

KERNEL_F1_NEXT_\@:

.endm

/*
 * KERNEL_F8 (real, double): eight back-to-back KERNEL_F1 expansions, i.e.
 * eight contiguous elements, with a prefetch hint at X + X_PRE before each
 * group of four.
 */
.macro KERNEL_F8

pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

.endm

/*
 * KERNEL_S1 (real, double): strided variant of KERNEL_F1. The element is
 * loaded from X without write-back, folded into (scale = d0, ssq = d1)
 * with the same update rule, and X is then advanced by INC_X, which the
 * caller has already converted to a byte stride.
 *
 * NOTE: the label KERNEL_S1_NEXT has no per-expansion suffix, so this macro
 * can be expanded only once per assembled file. It is expanded exactly once
 * in this file, at nrm2_kernel_S10.
 */
.macro KERNEL_S1

fldmiad X, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x

KERNEL_S1_NEXT:

add X, X, INC_X

.endm

#else

/*
 * KERNEL_F1 (real, single): fold one element into the running (scale, ssq)
 * pair and advance X past it (the load uses write-back).
 *
 * Register roles, as set up at nrm2_begin:
 *   s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0; s2, s3, s4 are scratch.
 *
 * An element equal to 0.0 is skipped. Otherwise, with a = |element|:
 *   scale >= a : ssq += (a/scale)^2
 *   scale <  a : ssq = 1 + ssq * (scale/a)^2 ; scale = a
 * The local label carries the macro expansion counter so the macro can be
 * expanded several times in one file.
 */
.macro KERNEL_F1

fldmias X!, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x

KERNEL_F1_NEXT_\@:

.endm

/*
 * KERNEL_F8 (real, single): eight back-to-back KERNEL_F1 expansions, i.e.
 * eight contiguous elements, preceded by a single prefetch hint at
 * X + X_PRE.
 */
.macro KERNEL_F8

pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

.endm

/*
 * KERNEL_S1 (real, single): strided variant of KERNEL_F1. The element is
 * loaded from X without write-back, folded into (scale = s0, ssq = s1)
 * with the same update rule, and X is then advanced by INC_X, which the
 * caller has already converted to a byte stride.
 *
 * NOTE: the label KERNEL_S1_NEXT has no per-expansion suffix, so this macro
 * can be expanded only once per assembled file. It is expanded exactly once
 * in this file, at nrm2_kernel_S10.
 */
.macro KERNEL_S1

fldmias X, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x

KERNEL_S1_NEXT:

add X, X, INC_X

.endm




#endif

#else

#if defined(DOUBLE)

/*
 * KERNEL_F1 (complex, double): load one complex element as two doubles
 * (d4, d5) with write-back on X, then fold each of the two components into
 * the running (scale = d0, ssq = d1) pair, d4 first and d5 second.
 *
 * d6 = 0.0 and d7 = 1.0 as set up at nrm2_begin; d2, d3 are scratch.
 * A component equal to 0.0 is skipped. Otherwise, with a = |component|:
 *   scale >= a : ssq += (a/scale)^2
 *   scale <  a : ssq = 1 + ssq * (scale/a)^2 ; scale = a
 * Both local labels carry the macro expansion counter so the macro can be
 * expanded several times in one file.
 */
.macro KERNEL_F1

fldmiad X!, { d4 - d5 }

vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x

KERNEL_F1_NEXT_\@:

vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x

KERNEL_F1_END_\@:


.endm

/*
 * KERNEL_F8 (complex, double): eight back-to-back KERNEL_F1 expansions,
 * i.e. eight contiguous complex elements, with a prefetch hint at
 * X + X_PRE before each pair of elements.
 */
.macro KERNEL_F8

pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1

.endm

/*
 * KERNEL_S1 (complex, double): strided variant of KERNEL_F1. Both
 * components (d4, d5) are loaded from X without write-back and folded into
 * (scale = d0, ssq = d1) with the same update rule, d4 first and d5 second.
 * X is then advanced by INC_X, which the caller has already converted to a
 * byte stride.
 */
.macro KERNEL_S1

fldmiad X, { d4 - d5 }

vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x

KERNEL_S1_NEXT_\@:

vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x

KERNEL_S1_END_\@:

add X, X, INC_X

.endm


#else

/*
 * KERNEL_F1 (complex, single): load one complex element as two floats
 * (s4, s5) with write-back on X, then fold each of the two components into
 * the running (scale = s0, ssq = s1) pair, s4 first and s5 second.
 *
 * s6 = 0.0 and s7 = 1.0 as set up at nrm2_begin; s2, s3 are scratch.
 * A component equal to 0.0 is skipped. Otherwise, with a = |component|:
 *   scale >= a : ssq += (a/scale)^2
 *   scale <  a : ssq = 1 + ssq * (scale/a)^2 ; scale = a
 * Both local labels carry the macro expansion counter so the macro can be
 * expanded several times in one file.
 */
.macro KERNEL_F1

fldmias X!, { s4 - s5 }

vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x

KERNEL_F1_NEXT_\@:

vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x

KERNEL_F1_END_\@:


.endm

/*
 * KERNEL_F8 (complex, single): eight back-to-back KERNEL_F1 expansions,
 * i.e. eight contiguous complex elements, with a prefetch hint at
 * X + X_PRE before each group of four.
 */
.macro KERNEL_F8

pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

.endm

/*
 * KERNEL_S1 (complex, single): strided variant of KERNEL_F1. Both
 * components (s4, s5) are loaded from X without write-back and folded into
 * (scale = s0, ssq = s1) with the same update rule, s4 first and s5 second.
 * X is then advanced by INC_X, which the caller has already converted to a
 * byte stride.
 */
.macro KERNEL_S1

fldmias X, { s4 - s5 }

vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x

KERNEL_S1_NEXT_\@:

vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x

KERNEL_S1_END_\@:

add X, X, INC_X

.endm



#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * nrm2 driver. Arguments arrive in N (r0), X (r1) and INC_X (r2).
 *
 * The routine keeps scale in d0/s0 and ssq in d1/s1, runs either the
 * contiguous loops (INC_X == 1) or the strided loop (any other non-zero
 * INC_X), and finishes by leaving scale * sqrt(ssq) in d0 (double) or
 * s0 (single) before returning with bx lr.
 *
 * N <= 0 or INC_X == 0 branch straight to the final step with scale = 0.0
 * and ssq = 1.0, which produces 0.0.
 *
 * NOTE(review): only INC_X == 0 is rejected; a negative INC_X falls into
 * the strided loop. Confirm callers never pass a negative stride.
 */
PROLOGUE

b nrm2_begin

/* Literal 1.0 in the active precision, placed inline in the code stream
 * and jumped over by the branch above. Loaded PC-relative at nrm2_begin. */

#if defined(COMPLEX)

#if defined(DOUBLE)

znrm2_one:
.word 0x00000000
.word 0x3ff00000

#else

cnrm2_one:
.word 0x3f800000

#endif

#else

#if defined(DOUBLE)

dnrm2_one:
.word 0x00000000
.word 0x3ff00000

#else

snrm2_one:
.word 0x3f800000

#endif

#endif


.align 5


nrm2_begin:

/* Initialise the accumulators and the two constants used by the kernels:
 * register 0 = scale = 0.0, register 1 = ssq = 1.0,
 * register 6 = 0.0, register 7 = 1.0 (d- or s-registers by precision). */

#if defined(COMPLEX)

#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0 // scale=0.0
vldr.64 d1 , znrm2_one // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vsub.f32 s0 , s0 , s0 // scale=0.0
vldr.32 s1 , cnrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0
#endif

#else

#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0 // scale=0.0
vldr.64 d1 , dnrm2_one // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vsub.f32 s0 , s0 , s0 // scale=0.0
vldr.32 s1 , snrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0
#endif


#endif


/* Nothing to do for an empty vector or a zero stride. */
cmp N, #0
ble nrm2_kernel_L999

cmp INC_X, #0
beq nrm2_kernel_L999

/* Unit stride uses the contiguous kernels, anything else the strided one. */
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN


nrm2_kernel_F_BEGIN:

asrs I, N, #3 // I = N / 8
ble nrm2_kernel_F1

/* Contiguous main loop: eight elements per iteration. */
nrm2_kernel_F8:

KERNEL_F8

subs I, I, #1
bne nrm2_kernel_F8

nrm2_kernel_F1:

/* Remainder: the N mod 8 elements left over, one per iteration. */
ands I, N, #7
ble nrm2_kernel_L999


nrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10

b nrm2_kernel_L999

nrm2_kernel_S_BEGIN:

/* Convert INC_X from elements to bytes for the active element type. */

#if defined(COMPLEX)

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif

#endif



nrm2_kernel_S1:

mov I, N

.align 5

/* Strided loop: one element per iteration, N iterations. */
nrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10


nrm2_kernel_L999:

/* Result = scale * sqrt(ssq), left in d0 (double) or s0 (single). */

#if defined(DOUBLE)
vsqrt.f64 d1, d1
vmul.f64 d0, d0, d1
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#endif

bx lr

EPILOGUE


+ 508
- 0
kernel/arm/nrm2_vfpv3.S View File

@@ -0,0 +1,508 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/16 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* NOTE(review): no use of STACKSIZE is visible in this part of the file;
 * confirm whether it is referenced further down. */
#define STACKSIZE 256

/* Core registers used by the kernels below: N presumably the element
 * count, X the vector pointer, INC_X the stride added to X in KERNEL_S1. */
#define N r0
#define X r1
#define INC_X r2

/* Presumably the loop counter - its use is not visible in this part of
 * the file; TODO confirm. */
#define I r12

/* Offset added to X in the pld prefetch hints of the KERNEL_F8 macros. */
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/


#if !defined(COMPLEX)

#if defined(DOUBLE)


// KERNEL_F1 (real, double precision): fold one contiguous element into the
// running scaled sum of squares.
//   d0 = scale, d1 = ssq, d6 = constant 0.0, d7 = constant 1.0
//   d2, d3, d4 are scratch.
// X is advanced by one element by the post-incrementing load.
// Elements equal to zero are skipped. Otherwise, with a = |x|:
//   scale >= a : ssq += (a / scale)^2
//   scale <  a : ssq = 1 + ssq * (scale / a)^2 ; scale = a
// The labels carry the macro expansion counter so the macro can be
// expanded several times.
.macro KERNEL_F1

fldmiad X!, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x

KERNEL_F1_NEXT_\@:

.endm

// KERNEL_F8 (real, double precision): process eight contiguous elements by
// expanding KERNEL_F1 eight times. A prefetch hint for X + X_PRE is issued
// before each group of four elements.
.macro KERNEL_F8

pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

.endm

// KERNEL_S1 (real, double precision): fold one strided element into the
// running scaled sum of squares.
//   d0 = scale, d1 = ssq, d6 = constant 0.0, d7 = constant 1.0
//   d2, d3, d4 are scratch.
// The load does not modify X; X is advanced by INC_X (a byte stride) at the
// end, also when the element was zero and skipped.
// With a = |x|:
//   scale >= a : ssq += (a / scale)^2
//   scale <  a : ssq = 1 + ssq * (scale / a)^2 ; scale = a
// The local label carries the macro expansion counter, like the other
// kernels in this file, so the macro may be expanded more than once
// without defining the same symbol twice.
.macro KERNEL_S1

fldmiad X, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x

KERNEL_S1_NEXT_\@:

add X, X, INC_X

.endm

#else

// KERNEL_F1 (real, single precision): fold one contiguous element into the
// running scaled sum of squares.
//   s0 = scale, s1 = ssq, s6 = constant 0.0, s7 = constant 1.0
//   s2, s3, s4 are scratch.
// X is advanced by one element by the post-incrementing load.
// Elements equal to zero are skipped. Otherwise, with a = |x|:
//   scale >= a : ssq += (a / scale)^2
//   scale <  a : ssq = 1 + ssq * (scale / a)^2 ; scale = a
.macro KERNEL_F1

fldmias X!, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x

KERNEL_F1_NEXT_\@:

.endm

// KERNEL_F8 (real, single precision): process eight contiguous elements by
// expanding KERNEL_F1 eight times. A single prefetch hint for X + X_PRE is
// issued for the whole group.
.macro KERNEL_F8

pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

.endm

// KERNEL_S1 (real, single precision): fold one strided element into the
// running scaled sum of squares.
//   s0 = scale, s1 = ssq, s6 = constant 0.0, s7 = constant 1.0
//   s2, s3, s4 are scratch.
// The load does not modify X; X is advanced by INC_X (a byte stride) at the
// end, also when the element was zero and skipped.
// With a = |x|:
//   scale >= a : ssq += (a / scale)^2
//   scale <  a : ssq = 1 + ssq * (scale / a)^2 ; scale = a
// The local label carries the macro expansion counter, like the other
// kernels in this file, so the macro may be expanded more than once
// without defining the same symbol twice.
.macro KERNEL_S1

fldmias X, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x

KERNEL_S1_NEXT_\@:

add X, X, INC_X

.endm




#endif

#else

#if defined(DOUBLE)

// KERNEL_F1 (complex, double precision): fold one contiguous complex element
// into the running scaled sum of squares.
//   d0 = scale, d1 = ssq, d6 = constant 0.0, d7 = constant 1.0
//   d2, d3 are scratch; d4 and d5 receive the two components of the element.
// X is advanced by two doubles by the post-incrementing load.
// Each component is handled on its own with the same update, where a is the
// absolute value of the component and zero components are skipped:
//   scale >= a : ssq += (a / scale)^2
//   scale <  a : ssq = 1 + ssq * (scale / a)^2 ; scale = a
.macro KERNEL_F1

fldmiad X!, { d4 - d5 }

// first component (d4)
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x

KERNEL_F1_NEXT_\@:

// second component (d5)
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x

KERNEL_F1_END_\@:


.endm

// KERNEL_F8 (complex, double precision): process eight contiguous complex
// elements by expanding KERNEL_F1 eight times. A prefetch hint for
// X + X_PRE is issued before every pair of elements.
.macro KERNEL_F8

pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1

.endm

// KERNEL_S1 (complex, double precision): fold one strided complex element
// into the running scaled sum of squares.
//   d0 = scale, d1 = ssq, d6 = constant 0.0, d7 = constant 1.0
//   d2, d3 are scratch; d4 and d5 receive the two components of the element.
// The load does not modify X; X is advanced by INC_X (a byte stride) at the
// end. Each component is handled on its own, where a is the absolute value
// of the component and zero components are skipped:
//   scale >= a : ssq += (a / scale)^2
//   scale <  a : ssq = 1 + ssq * (scale / a)^2 ; scale = a
.macro KERNEL_S1

fldmiad X, { d4 - d5 }

// first component (d4)
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x

KERNEL_S1_NEXT_\@:

// second component (d5)
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x

KERNEL_S1_END_\@:

add X, X, INC_X

.endm


#else

// KERNEL_F1 (complex, single precision): fold one contiguous complex element
// into the running scaled sum of squares.
//   s0 = scale, s1 = ssq, s6 = constant 0.0, s7 = constant 1.0
//   s2, s3 are scratch; s4 and s5 receive the two components of the element.
// X is advanced by two floats by the post-incrementing load.
// Each component is handled on its own with the same update, where a is the
// absolute value of the component and zero components are skipped:
//   scale >= a : ssq += (a / scale)^2
//   scale <  a : ssq = 1 + ssq * (scale / a)^2 ; scale = a
.macro KERNEL_F1

fldmias X!, { s4 - s5 }

// first component (s4)
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x

KERNEL_F1_NEXT_\@:

// second component (s5)
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x

KERNEL_F1_END_\@:


.endm

// KERNEL_F8 (complex, single precision): process eight contiguous complex
// elements by expanding KERNEL_F1 eight times. A prefetch hint for
// X + X_PRE is issued before each group of four elements.
.macro KERNEL_F8

pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

.endm

// KERNEL_S1 (complex, single precision): fold one strided complex element
// into the running scaled sum of squares.
//   s0 = scale, s1 = ssq, s6 = constant 0.0, s7 = constant 1.0
//   s2, s3 are scratch; s4 and s5 receive the two components of the element.
// The load does not modify X; X is advanced by INC_X (a byte stride) at the
// end. Each component is handled on its own, where a is the absolute value
// of the component and zero components are skipped:
//   scale >= a : ssq += (a / scale)^2
//   scale <  a : ssq = 1 + ssq * (scale / a)^2 ; scale = a
.macro KERNEL_S1

fldmias X, { s4 - s5 }

// first component (s4)
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x

KERNEL_S1_NEXT_\@:

// second component (s5)
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x

KERNEL_S1_END_\@:

add X, X, INC_X

.endm



#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/**************************************************************************************
* nrm2 kernel entry point.
*
* Inputs : N (r0) element count, X (r1) vector pointer, INC_X (r2) stride in
*          elements.
* Output : scale * sqrt(ssq), left in d0 (DOUBLE) or s0 (single).
*
* The norm is accumulated as a pair (scale, ssq) by the KERNEL_* macros.
* Register roles: d0/s0 = scale, d1/s1 = ssq, d6/s6 = 0.0, d7/s7 = 1.0.
*
* N <= 0 or INC_X == 0 skips all loops, so the result is 0.0 * sqrt(1.0).
* INC_X == 1 uses the contiguous path (blocks of 8, then the remainder);
* any other stride uses the one-element-at-a-time strided path.
**************************************************************************************/
PROLOGUE

.align 5

// The zero used for scale and for the "value 0.0" constant is built from an
// integer zero. Subtracting a register from itself is not a safe way to
// clear it: if the register holds Inf or NaN on entry the difference is
// NaN, which would then poison scale, the zero constant and the result.
// I is free to use here because it is reloaded before every loop.
mov I, #0
#if defined(DOUBLE)
vmov s0 , I // +0.0 as single precision
vcvt.f64.f32 d0 , s0 // scale=0.0
vmov.f64 d1 , #1.0 // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vmov s0 , I // scale=0.0
vmov.f32 s1 , #1.0 // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0
#endif



cmp N, #0
ble nrm2_kernel_L999 // nothing to do for N <= 0

cmp INC_X, #0
beq nrm2_kernel_L999 // a zero stride is rejected

cmp INC_X, #1
bne nrm2_kernel_S_BEGIN // non-unit stride: strided path


nrm2_kernel_F_BEGIN:

asrs I, N, #3 // I = N / 8
ble nrm2_kernel_F1 // fewer than 8 elements: remainder loop only

nrm2_kernel_F8:

KERNEL_F8

subs I, I, #1
bne nrm2_kernel_F8

nrm2_kernel_F1:

ands I, N, #7 // I = N % 8
ble nrm2_kernel_L999 // no remainder

nrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10

b nrm2_kernel_L999

nrm2_kernel_S_BEGIN:

// Convert the stride from elements to bytes.
#if defined(COMPLEX)

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif

#endif



nrm2_kernel_S1:

mov I, N // N > 0 is guaranteed by the check above

.align 5

nrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10


nrm2_kernel_L999:

// result = scale * sqrt( ssq )
#if defined(DOUBLE)
vsqrt.f64 d1, d1
vmul.f64 d0, d0, d1
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#endif

bx lr

EPILOGUE


+ 62
- 0
kernel/arm/rot.c View File

@@ -0,0 +1,62 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

/*
 * Apply a plane rotation to n element pairs taken from x and y:
 *
 *     x' = c*x + s*y
 *     y' = c*y - s*x
 *
 * Elements are visited at x[k*inc_x] and y[k*inc_y] for k = 0 .. n-1.
 * Nothing is touched when n <= 0. Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
	BLASLONG k;
	BLASLONG xpos = 0;
	BLASLONG ypos = 0;

	for (k = 0; k < n; k++) {
		/* Read both inputs before writing either output. */
		const FLOAT xv = x[xpos];
		const FLOAT yv = y[ypos];

		/* y is written first and x second, as in the original sequence. */
		y[ypos] = c*yv - s*xv;
		x[xpos] = c*xv + s*yv;

		xpos += inc_x;
		ypos += inc_y;
	}

	return 0;
}


+ 584
- 0
kernel/arm/rot_vfp.S View File

@@ -0,0 +1,584 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/15 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Symbolic names for the registers and constants used by the rot kernel.

// Defined here but not referenced anywhere in this routine.
#define STACKSIZE 256

// Stack slot holding the Y stride. The prologue sets fp to the value sp had
// on entry, so this is the first stack word seen by the routine.
#define OLD_INC_Y [fp, #0 ]


// Element count, the two vector pointers and their strides. The strides are
// converted from elements to bytes before the strided loops run.
#define N r0
#define X r1
#define INC_X r2
#define Y r3
#define INC_Y r4

// Loop counter.
#define I r12

// Byte offset ahead of X and Y used by the pld prefetch hints.
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*****************************************************************************************/



#if !defined(COMPLEX)

#if defined(DOUBLE)

// KERNEL_F4 (real, double precision): rotate four contiguous element pairs.
// For each pair, with d0 and d1 acting as the c and s coefficients
// (NOTE(review): presumably passed in d0/d1 by the hard-float calling
// convention - confirm):
//   d2 = d0*x + d1*y   -> stored back to X
//   d3 = d0*y - d1*x   -> stored back to Y
// The loads leave X and Y unchanged; the post-incrementing stores advance
// both pointers by one element. One prefetch hint per vector is issued at
// the start.
.macro KERNEL_F4

pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]

// pair 1
fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }

// pair 2
fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }

// pair 3
fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }

// pair 4
fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }

.endm


// KERNEL_F1 (real, double precision): rotate one contiguous element pair.
//   d2 = d0*x + d1*y   -> stored back to X
//   d3 = d0*y - d1*x   -> stored back to Y
// The post-incrementing stores advance X and Y by one element each.
.macro KERNEL_F1

fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }

.endm

// KERNEL_S1 (real, double precision): rotate one strided element pair.
//   d2 = d0*x + d1*y   -> stored back to X
//   d3 = d0*y - d1*x   -> stored back to Y
// Loads and stores leave the pointers unchanged; X and Y are then advanced
// by the byte strides INC_X and INC_Y.
.macro KERNEL_S1

fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X, { d2 }
fstmiad Y, { d3 }

add X, X, INC_X
add Y, Y, INC_Y

.endm

#else

// KERNEL_F4 (real, single precision): rotate four contiguous element pairs.
// For each pair, with s0 and s1 acting as the c and s coefficients
// (NOTE(review): presumably passed in s0/s1 by the hard-float calling
// convention - confirm):
//   s2 = s0*x + s1*y   -> stored back to X
//   s3 = s0*y - s1*x   -> stored back to Y
// The post-incrementing stores advance X and Y by one element each.
// No prefetch hint is issued inside this macro.
.macro KERNEL_F4

// pair 1
fldmias X, { s4 }
fldmias Y, { s5 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }

// pair 2
fldmias X, { s4 }
fldmias Y, { s5 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }

// pair 3
fldmias X, { s4 }
fldmias Y, { s5 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }

// pair 4
fldmias X, { s4 }
fldmias Y, { s5 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }

.endm


// KERNEL_F1 (real, single precision): rotate one contiguous element pair.
//   s2 = s0*x + s1*y   -> stored back to X
//   s3 = s0*y - s1*x   -> stored back to Y
// The post-incrementing stores advance X and Y by one element each.
.macro KERNEL_F1

fldmias X, { s4 }
fldmias Y, { s5 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }

.endm

// KERNEL_S1 (real, single precision): rotate one strided element pair.
//   s2 = s0*x + s1*y   -> stored back to X
//   s3 = s0*y - s1*x   -> stored back to Y
// Loads and stores leave the pointers unchanged; X and Y are then advanced
// by the byte strides INC_X and INC_Y.
.macro KERNEL_S1

fldmias X, { s4 }
fldmias Y, { s5 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4
fstmias X, { s2 }
fstmias Y, { s3 }

add X, X, INC_X
add Y, Y, INC_Y

.endm



#endif

#else

#if defined(DOUBLE)

// KERNEL_F4 (complex, double precision): rotate four contiguous complex
// element pairs. d0 and d1 act as the two real coefficients.
// For each pair the two components of x are loaded into d4/d5 and the two
// components of y into d6/d7; each component is rotated on its own:
//   d2 = d0*x + d1*y   -> stored back to X
//   d3 = d0*y - d1*x   -> stored back to Y
// Every store writes one double and post-increments, so X and Y advance by
// one complex element per pair. Prefetch hints are issued before the first
// and before the third pair.
.macro KERNEL_F4

pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]

// pair 1
fldmiad X, { d4 - d5 }
fldmiad Y, { d6 - d7 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
// pair 2
fldmiad X, { d4 - d5 }
fldmiad Y, { d6 - d7 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]

// pair 3
fldmiad X, { d4 - d5 }
fldmiad Y, { d6 - d7 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
// pair 4
fldmiad X, { d4 - d5 }
fldmiad Y, { d6 - d7 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
.endm


// KERNEL_F1 (complex, double precision): rotate one contiguous complex
// element pair. The components of x are in d4/d5 and those of y in d6/d7;
// each component is rotated on its own:
//   d2 = d0*x + d1*y   -> stored back to X
//   d3 = d0*y - d1*x   -> stored back to Y
// The four post-incrementing stores advance X and Y by one complex element.
.macro KERNEL_F1

fldmiad X, { d4 - d5 }
fldmiad Y, { d6 - d7 }
// first component
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
// second component
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }

.endm

// KERNEL_S1 (complex, double precision): rotate one strided complex element
// pair. The components of x are in d4/d5 and those of y in d6/d7; each
// component is rotated on its own:
//   d2 = d0*x + d1*y   -> stored back to X
//   d3 = d0*y - d1*x   -> stored back to Y
// Results are written with fixed offsets (0 and 8 bytes) so the pointers
// stay put; X and Y are then advanced by the byte strides INC_X and INC_Y.
.macro KERNEL_S1

fldmiad X, { d4 - d5 }
fldmiad Y, { d6 - d7 }
// first component
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4
vstr d2 , [ X, #0 ]
vstr d3 , [ Y, #0 ]
// second component
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5
vstr d2 , [ X, #8 ]
vstr d3 , [ Y, #8 ]

add X, X, INC_X
add Y, Y, INC_Y

.endm



#else

// KERNEL_F4 (complex, single precision): rotate four contiguous complex
// element pairs. s0 and s1 act as the two real coefficients.
// For each pair the two components of x are loaded into s4/s5 and the two
// components of y into s6/s7; each component is rotated on its own:
//   s2 = s0*x + s1*y   -> stored back to X
//   s3 = s0*y - s1*x   -> stored back to Y
// Every store writes one float and post-increments, so X and Y advance by
// one complex element per pair. Prefetch hints are issued before the first
// and before the third pair.
.macro KERNEL_F4

pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]

// pair 1
fldmias X, { s4 - s5 }
fldmias Y, { s6 - s7 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
// pair 2
fldmias X, { s4 - s5 }
fldmias Y, { s6 - s7 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]

// pair 3
fldmias X, { s4 - s5 }
fldmias Y, { s6 - s7 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
// pair 4
fldmias X, { s4 - s5 }
fldmias Y, { s6 - s7 }
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
.endm


// KERNEL_F1 (complex, single precision): rotate one contiguous complex
// element pair. The components of x are in s4/s5 and those of y in s6/s7;
// each component is rotated on its own:
//   s2 = s0*x + s1*y   -> stored back to X
//   s3 = s0*y - s1*x   -> stored back to Y
// The four post-incrementing stores advance X and Y by one complex element.
.macro KERNEL_F1

fldmias X, { s4 - s5 }
fldmias Y, { s6 - s7 }
// first component
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
// second component
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }

.endm

// KERNEL_S1 (complex, single precision): rotate one strided complex element
// pair. The components of x are in s4/s5 and those of y in s6/s7; each
// component is rotated on its own:
//   s2 = s0*x + s1*y   -> stored back to X
//   s3 = s0*y - s1*x   -> stored back to Y
// Results are written with fixed offsets (0 and 4 bytes) so the pointers
// stay put; X and Y are then advanced by the byte strides INC_X and INC_Y.
.macro KERNEL_S1

fldmias X, { s4 - s5 }
fldmias Y, { s6 - s7 }
// first component
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4
vstr s2 , [ X, #0 ]
vstr s3 , [ Y, #0 ]
// second component
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5
vstr s2 , [ X, #4 ]
vstr s3 , [ Y, #4 ]

add X, X, INC_X
add Y, Y, INC_Y

.endm


#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/**************************************************************************************
* rot kernel entry point.
*
* Inputs : N (r0) element count, X (r1) and Y (r3) vector pointers, INC_X (r2)
*          stride of X in elements, stride of Y read from the stack into r4.
*          The two coefficients are taken from d0/d1 (DOUBLE) or s0/s1; this
*          routine never loads them.
*          NOTE(review): that matches a hard-float calling convention; a
*          soft-float caller would pass them elsewhere - confirm build flags.
* Output : r0 = 0. X and Y are updated in place by the KERNEL_* macros.
*
* N <= 0 or a zero stride skips all work. Unit strides on both vectors use
* the contiguous path; anything else uses the strided path.
**************************************************************************************/
PROLOGUE

.align 5
// Save r4 (used as INC_Y) and fp, then point fp at the incoming stack
// arguments: two words were pushed, so sp + 8 is the value sp had on entry.
push {r4 , fp}
add fp, sp, #8

ldr INC_Y , OLD_INC_Y


cmp N, #0
ble rot_kernel_L999

cmp INC_X, #0
beq rot_kernel_L999

cmp INC_Y, #0
beq rot_kernel_L999

cmp INC_X, #1
bne rot_kernel_S_BEGIN

cmp INC_Y, #1
bne rot_kernel_S_BEGIN


rot_kernel_F_BEGIN:


asrs I, N, #2 // I = N / 4
ble rot_kernel_F1

.align 5

// The loop body holds two KERNEL_F4 expansions, with an exit test after
// each one so an odd block count leaves after the first expansion.
rot_kernel_F4:

// Only the real single-precision KERNEL_F4 has no prefetch of its own.
#if !defined(COMPLEX) && !defined(DOUBLE)
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
#endif

KERNEL_F4

subs I, I, #1
ble rot_kernel_F1

KERNEL_F4

subs I, I, #1
bne rot_kernel_F4

rot_kernel_F1:

ands I, N, #3 // I = N % 4
ble rot_kernel_L999

rot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne rot_kernel_F10

b rot_kernel_L999

rot_kernel_S_BEGIN:

// Convert both strides from elements to bytes.
#if defined(COMPLEX)

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif

#else

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif

#endif


asrs I, N, #2 // I = N / 4
ble rot_kernel_S1

.align 5

rot_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne rot_kernel_S4

rot_kernel_S1:

ands I, N, #3 // I = N % 4
ble rot_kernel_L999

rot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne rot_kernel_S10


rot_kernel_L999:

mov r0, #0 // set return value

// Undo the prologue: restore sp to where the two saved words sit, pop them.
sub sp, fp, #8
pop {r4,fp}
bx lr

EPILOGUE


+ 58
- 0
kernel/arm/scal.c View File

@@ -0,0 +1,58 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

/*
 * Scale a strided vector in place: x[k*inc_x] = da * x[k*inc_x] for
 * k = 0 .. n-1.
 *
 * Returns without touching x when n is negative, when inc_x is smaller
 * than 1, or when da compares equal to 1.0. The remaining parameters
 * (dummy0, dummy1, y, inc_y, dummy, dummy2) are not used.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG pos;
	BLASLONG limit;

	if (n < 0 || inc_x < 1 || da == 1.0) {
		return 0;
	}

	/* Exclusive upper bound of the index walk. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x) {
		x[pos] = da * x[pos];
	}

	return 0;
}


+ 376
- 0
kernel/arm/scal_vfp.S View File

@@ -0,0 +1,376 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/15 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Symbolic names for the registers and constants used by the scal kernel.

// Defined here but not referenced anywhere in this routine.
#define STACKSIZE 256

// Stack slot holding the stride; read relative to sp as it is on entry,
// because this routine pushes nothing.
#define OLD_INC_X [sp, #0 ]


// Element count, stride (loaded from the stack) and vector pointer.
// INC_X is converted from elements to bytes before the strided loops run.
#define N r0
#define INC_X r1
#define X r3

// Loop counter.
#define I r12

// Byte offset ahead of X used by the pld prefetch hints.
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*****************************************************************************************/



#if !defined(COMPLEX)

#if defined(DOUBLE)

// KERNEL_F4 (real, double precision): multiply four contiguous elements by
// the factor held in d0 and write them back in place.
// NOTE(review): d0 is presumably the scalar passed by the hard-float calling
// convention - confirm.
// The load leaves X unchanged; the two post-incrementing stores advance X by
// four elements in total. The last multiply is placed between the stores.
.macro KERNEL_F4

pld [ X, #X_PRE ]
fldmiad X, { d4 - d7 }
vmul.f64 d4, d4, d0
vmul.f64 d5, d5, d0
vmul.f64 d6, d6, d0
fstmiad X!, { d4 - d5 }
vmul.f64 d7, d7, d0
fstmiad X!, { d6 - d7 }

.endm


// KERNEL_F1 (real, double precision): multiply one contiguous element by d0
// in place; the post-incrementing store advances X by one element.
.macro KERNEL_F1

fldmiad X, { d4 }
vmul.f64 d4, d4, d0
fstmiad X!, { d4 }

.endm

// KERNEL_S1 (real, double precision): multiply one strided element by d0 in
// place, then advance X by the byte stride INC_X.
.macro KERNEL_S1

fldmiad X, { d4 }
vmul.f64 d4, d4, d0
fstmiad X, { d4 }
add X, X, INC_X

.endm

#else

// KERNEL_F4 (real, single precision): multiply four contiguous elements by
// the factor held in s0 and write them back in place.
// The load leaves X unchanged; the two post-incrementing stores advance X by
// four elements in total. No prefetch hint is issued inside this macro.
.macro KERNEL_F4

fldmias X, { s4 - s7 }
vmul.f32 s4, s4, s0
vmul.f32 s5, s5, s0
vmul.f32 s6, s6, s0
fstmias X!, { s4 - s5 }
vmul.f32 s7, s7, s0
fstmias X!, { s6 - s7 }

.endm


// KERNEL_F1 (real, single precision): multiply one contiguous element by s0
// in place; the post-incrementing store advances X by one element.
.macro KERNEL_F1

fldmias X, { s4 }
vmul.f32 s4, s4, s0
fstmias X!, { s4 }

.endm

// KERNEL_S1 (real, single precision): multiply one strided element by s0 in
// place, then advance X by the byte stride INC_X.
.macro KERNEL_S1

fldmias X, { s4 }
vmul.f32 s4, s4, s0
fstmias X, { s4 }
add X, X, INC_X

.endm



#endif

#else

#if defined(DOUBLE)

// KERNEL_F4 (complex, double precision): multiply four contiguous complex
// elements by the complex factor (d0, d1) in place.
// With the element loaded as (d4, d5):
//   d2 = d0*d4 - d1*d5
//   d3 = d0*d5 + d1*d4
// which is the complex product when the first value of each pair is the
// real part. Each post-incrementing store advances X by one complex
// element. Prefetch hints are issued before the first and third elements.
.macro KERNEL_F4

pld [ X, #X_PRE ]

// element 1
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }

// element 2
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }

pld [ X, #X_PRE ]

// element 3
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }

// element 4
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }

.endm


// KERNEL_F1 (complex, double precision): multiply one contiguous complex
// element (d4, d5) by the complex factor (d0, d1) in place:
//   d2 = d0*d4 - d1*d5
//   d3 = d0*d5 + d1*d4
// The post-incrementing store advances X by one complex element.
.macro KERNEL_F1

fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }

.endm

// KERNEL_S1 (complex, double precision): multiply one strided complex
// element (d4, d5) by the complex factor (d0, d1) in place:
//   d2 = d0*d4 - d1*d5
//   d3 = d0*d5 + d1*d4
// The store leaves X unchanged; X is then advanced by the byte stride INC_X.
.macro KERNEL_S1

fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X, { d2 - d3 }
add X, X, INC_X

.endm


#else

// KERNEL_F4 (complex, single precision): multiply four contiguous complex
// elements by the complex factor (s0, s1) in place.
// With the element loaded as (s4, s5):
//   s2 = s0*s4 - s1*s5
//   s3 = s0*s5 + s1*s4
// which is the complex product when the first value of each pair is the
// real part. Each post-incrementing store advances X by one complex
// element. A single prefetch hint is issued at the start.
.macro KERNEL_F4

pld [ X, #X_PRE ]

// element 1
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }

// element 2
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }

// element 3
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }

// element 4
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }

.endm


// KERNEL_F1 (complex, single precision): multiply one contiguous complex
// element (s4, s5) by the complex factor (s0, s1) in place:
//   s2 = s0*s4 - s1*s5
//   s3 = s0*s5 + s1*s4
// The post-incrementing store advances X by one complex element.
.macro KERNEL_F1

fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }

.endm

// KERNEL_S1 (complex, single precision): multiply one strided complex
// element (s4, s5) by the complex factor (s0, s1) in place:
//   s2 = s0*s4 - s1*s5
//   s3 = s0*s5 + s1*s4
// The store leaves X unchanged; X is then advanced by the byte stride INC_X.
.macro KERNEL_S1

fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X, { s2 - s3 }
add X, X, INC_X

.endm



#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/**************************************************************************************
* scal kernel entry point.
*
* Inputs : N (r0) element count, X (r3) vector pointer, stride in elements
*          read from the first stack word into r1. The scale factor is taken
*          from d0/s0 (plus d1/s1 for COMPLEX); this routine never loads it.
*          NOTE(review): that matches a hard-float calling convention; a
*          soft-float caller would pass it elsewhere - confirm build flags.
* Output : r0 = 0. X is scaled in place by the KERNEL_* macros.
*
* N <= 0 or a stride <= 0 skips all work. A unit stride uses the contiguous
* path; any other positive stride uses the strided path.
**************************************************************************************/
PROLOGUE

.align 5

// Nothing has been pushed, so sp still points at the incoming stack word.
ldr INC_X , OLD_INC_X

cmp N, #0
ble scal_kernel_L999

cmp INC_X, #0
ble scal_kernel_L999

cmp INC_X, #1
bne scal_kernel_S_BEGIN


scal_kernel_F_BEGIN:


asrs I, N, #2 // I = N / 4
ble scal_kernel_F1

.align 5

// The loop body holds two KERNEL_F4 expansions, with an exit test after
// each one so an odd block count leaves after the first expansion.
scal_kernel_F4:

// Only the real single-precision KERNEL_F4 has no prefetch of its own.
#if !defined(COMPLEX) && !defined(DOUBLE)
pld [ X, #X_PRE ]
#endif

KERNEL_F4

subs I, I, #1
ble scal_kernel_F1

KERNEL_F4

subs I, I, #1
bne scal_kernel_F4

scal_kernel_F1:

ands I, N, #3 // I = N % 4
ble scal_kernel_L999

scal_kernel_F10:

KERNEL_F1

subs I, I, #1
bne scal_kernel_F10

b scal_kernel_L999

scal_kernel_S_BEGIN:

// Convert the stride from elements to bytes.
#if defined(COMPLEX)

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif

#endif


asrs I, N, #2 // I = N / 4
ble scal_kernel_S1

.align 5

scal_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne scal_kernel_S4

scal_kernel_S1:

ands I, N, #3 // I = N % 4
ble scal_kernel_L999

scal_kernel_S10:

KERNEL_S1

subs I, I, #1
bne scal_kernel_S10


scal_kernel_L999:

mov r0, #0 // set return value

bx lr

EPILOGUE


+ 224
- 0
kernel/arm/scopy_vfp.S View File

@@ -0,0 +1,224 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved below the saved core registers. */
#define STACKSIZE 256

/* Values that are live in r0 - r3 when the routine is entered. */
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
* (the prologue stores s8 - s15 starting at fp - 128)
*******************************************************/

/* First stack-passed value: fp is set to sp + 24 after seven registers
 * (28 bytes) are pushed, so fp + 4 is the stack pointer seen at entry. */
#define OLD_INC_Y [fp, #4 ]

/* Working registers: I is the loop counter; Y and INC_Y receive copies of
 * OLD_Y and OLD_INC_Y in the prologue. */
#define I r5
#define Y r6
#define INC_Y r7

/* Prefetch distance in bytes used by the pld in COPY_F8. */
#define X_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/* COPY_F8: copy eight consecutive single-precision values from X to Y.
 * X and Y are each post-incremented by 32 bytes; s0 - s7 are overwritten. */
.macro COPY_F8

pld [ X, #X_PRE ] // prefetch X_PRE bytes ahead of the read pointer
fldmias X!, { s0 - s3 }
fldmias X!, { s4 - s7 }
fstmias Y!, { s0 - s3 }
fstmias Y!, { s4 - s7 }

.endm

/* COPY_F1: copy one single-precision value from X to Y.
 * X and Y are each post-incremented by 4 bytes; s0 is overwritten. */
.macro COPY_F1

fldmias X!, { s0 }
fstmias Y!, { s0 }

.endm


/*************************************************************************************************************************/

/* COPY_S4: copy four single-precision values with explicit strides.
 * After each element X advances by INC_X and Y by INC_Y; the caller has
 * already scaled both increments to bytes. s0 and s1 are used alternately. */
.macro COPY_S4

nop // NOTE(review): purpose not visible here, presumably padding - confirm
fldmias X, { s0 }
fstmias Y, { s0 }
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s1 }
fstmias Y, { s1 }
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s0 }
fstmias Y, { s0 }
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s1 }
fstmias Y, { s1 }
add X, X, INC_X
add Y, Y, INC_Y

.endm


/* COPY_S1: copy one single-precision value, then advance X by INC_X and
 * Y by INC_Y (both already in bytes). s0 is overwritten. */
.macro COPY_S1

fldmias X, { s0 }
fstmias Y, { s0 }
add X, X, INC_X
add Y, Y, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/* Single-precision vector copy: Y[i * INC_Y] = X[i * INC_X] for N elements.
 * Returns 0 in r0. Nothing is copied when N <= 0 or either increment is 0.
 * A unit-stride fast path (8 elements per iteration) is taken when both
 * increments are 1; every other case uses the strided path (4 per iteration). */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot; stack-passed values start at fp + 4
sub sp, sp, #STACKSIZE // reserve stack

sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers

mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
cmp N, #0
ble scopy_kernel_L999 // nothing to copy

cmp INC_X, #0
beq scopy_kernel_L999

cmp INC_Y, #0
beq scopy_kernel_L999

cmp INC_X, #1
bne scopy_kernel_S_BEGIN

cmp INC_Y, #1
bne scopy_kernel_S_BEGIN

/* Unit stride on both vectors. */
scopy_kernel_F_BEGIN:

asrs I, N, #3 // I = N / 8
ble scopy_kernel_F1

scopy_kernel_F8:

COPY_F8

subs I, I, #1
bne scopy_kernel_F8

scopy_kernel_F1:

ands I, N, #7 // I = N % 8, leftover elements
ble scopy_kernel_L999

scopy_kernel_F10:

COPY_F1

subs I, I, #1
bne scopy_kernel_F10

b scopy_kernel_L999

/* General strides. */
scopy_kernel_S_BEGIN:

lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE

asrs I, N, #2 // I = N / 4
ble scopy_kernel_S1

scopy_kernel_S4:

COPY_S4

subs I, I, #1
bne scopy_kernel_S4

scopy_kernel_S1:

ands I, N, #3 // I = N % 4, leftover elements
ble scopy_kernel_L999

scopy_kernel_S10:

COPY_S1

subs I, I, #1
bne scopy_kernel_S10






scopy_kernel_L999:

sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers

mov r0, #0 // set return value
sub sp, fp, #24 // drop the scratch area; sp -> saved r4
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 347
- 0
kernel/arm/sdot_vfp.S View File

@@ -0,0 +1,347 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK (no test for dsdot)
* TEST : OK (no test for dsdot)
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved below the saved core registers. */
#define STACKSIZE 256

/* Values that are live in r0 - r3 when the routine is entered. */
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
* (the prologue stores s8 - s15 starting at fp - 128)
*******************************************************/

/* First stack-passed value: fp is set to sp + 24 after seven registers
 * (28 bytes) are pushed, so fp + 4 is the stack pointer seen at entry. */
#define OLD_INC_Y [fp, #4 ]

/* Working registers: I is the loop counter; Y and INC_Y receive copies of
 * OLD_Y and OLD_INC_Y in the prologue. */
#define I r5
#define Y r6
#define INC_Y r7

/* NOTE(review): no instruction in this file references X_PRE. */
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

#if defined(DSDOT)

/* KERNEL_F4 (DSDOT): accumulate four unit-stride products into d0.
 * Each pair of single-precision inputs is widened to double BEFORE the
 * multiply, so the product is formed in double precision rather than
 * being rounded to single first. X and Y are post-incremented by 4 bytes
 * per element. Overwrites s14, s15, d4 (s8:s9) and d5 (s10:s11). */
.macro KERNEL_F4

fldmias X!, { s14 }
fldmias Y!, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5

fldmias X!, { s14 }
fldmias Y!, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5

fldmias X!, { s14 }
fldmias Y!, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5

fldmias X!, { s14 }
fldmias Y!, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5

.endm

/* KERNEL_F1 (DSDOT): accumulate one unit-stride product into d0.
 * Both inputs are widened to double before the multiply so the product is
 * not rounded to single precision first. X and Y are post-incremented by
 * 4 bytes. Overwrites s14, s15, d4 (s8:s9) and d5 (s10:s11). */
.macro KERNEL_F1

fldmias X!, { s14 }
fldmias Y!, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5

.endm


/* KERNEL_S4 (DSDOT): accumulate four strided products into d0.
 * Both inputs are widened to double before the multiply so the product is
 * not rounded to single precision first. After each element X advances by
 * INC_X and Y by INC_Y (already scaled to bytes by the caller).
 * Overwrites s14, s15, d4 (s8:s9) and d5 (s10:s11). */
.macro KERNEL_S4

nop

fldmias X, { s14 }
fldmias Y, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s14 }
fldmias Y, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s14 }
fldmias Y, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5
add X, X, INC_X
add Y, Y, INC_Y

fldmias X, { s14 }
fldmias Y, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5
add X, X, INC_X
add Y, Y, INC_Y

.endm


/* KERNEL_S1 (DSDOT): accumulate one strided product into d0.
 * Both inputs are widened to double before the multiply so the product is
 * not rounded to single precision first. X then advances by INC_X and Y by
 * INC_Y (already in bytes). Overwrites s14, s15, d4 (s8:s9), d5 (s10:s11). */
.macro KERNEL_S1

fldmias X, { s14 }
fldmias Y, { s15 }
vcvt.f64.f32 d4, s14
vcvt.f64.f32 d5, s15
vmla.f64 d0 , d4, d5
add X, X, INC_X
add Y, Y, INC_Y

.endm



#else

/* KERNEL_F4 (single precision): four unit-stride multiply-accumulates.
 * Elements 0 and 2 accumulate into s0, elements 1 and 3 into s1; loads are
 * interleaved with the arithmetic. X and Y each advance by 16 bytes.
 * Overwrites s4 - s11. */
.macro KERNEL_F4

fldmias X!, { s8 - s9 }
fldmias Y!, { s4 - s5}
fmacs s0 , s4, s8
fldmias X!, { s10 - s11 }
fmacs s1 , s5, s9
fldmias Y!, { s6 - s7 }
fmacs s0 , s6, s10
fmacs s1 , s7, s11

.endm

/* KERNEL_F1 (single precision): s0 += X[0] * Y[0]; X and Y each advance
 * by 4 bytes. Overwrites s4 and s8. */
.macro KERNEL_F1

fldmias X!, { s4 }
fldmias Y!, { s8 }
fmacs s0 , s4, s8

.endm


/* KERNEL_S4 (single precision): four strided multiply-accumulates.
 * Elements 0 and 2 accumulate into s0, elements 1 and 3 into s1. After each
 * element X advances by INC_X and Y by INC_Y (already in bytes).
 * Overwrites s4 - s11. */
.macro KERNEL_S4

nop // NOTE(review): purpose not visible here, presumably padding - confirm
fldmias X, { s4 }
fldmias Y, { s8 }
add X, X, INC_X
add Y, Y, INC_Y
fmacs s0 , s4, s8

fldmias X, { s5 }
fldmias Y, { s9 }
add X, X, INC_X
add Y, Y, INC_Y
fmacs s1 , s5, s9

fldmias X, { s6 }
fldmias Y, { s10 }
add X, X, INC_X
add Y, Y, INC_Y
fmacs s0 , s6, s10

fldmias X, { s7 }
fldmias Y, { s11 }
add X, X, INC_X
add Y, Y, INC_Y
fmacs s1 , s7, s11

.endm


/* KERNEL_S1 (single precision): s0 += X[0] * Y[0], then X advances by INC_X
 * and Y by INC_Y (already in bytes). Overwrites s4 and s8. */
.macro KERNEL_S1

fldmias X, { s4 }
fldmias Y, { s8 }
add X, X, INC_X
fmacs s0 , s4, s8
add Y, Y, INC_Y

.endm

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/* Dot product of two single-precision vectors of N elements with strides
 * INC_X and INC_Y.
 *   default build : result accumulated in single precision, returned in s0
 *   DSDOT build   : result accumulated in double precision, returned in d0
 * The result is zero when N <= 0 or either increment is 0.
 *
 * The accumulators are cleared by moving an integer zero into them. The
 * previous "vsub x, x, x" idiom produces NaN whenever the register happens
 * to hold NaN or Inf on entry, which then propagates into the result. */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot; stack-passed values start at fp + 4
sub sp, sp, #STACKSIZE // reserve stack

sub r4, fp, #128
vstm r4, { s8 - s15 } // store floating point registers

mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y

mov r4, #0 // r4 is free again; all-zero bits are +0.0 in both formats
#if defined(DSDOT)

vmov s0 , r4 // d0 = s0:s1 = 0.0
vmov s1 , r4
vmov s2 , r4 // d1 = s2:s3 = 0.0
vmov s3 , r4

#else

vmov s0 , r4
vmov s1 , r4

#endif

cmp N, #0
ble sdot_kernel_L999

cmp INC_X, #0
beq sdot_kernel_L999

cmp INC_Y, #0
beq sdot_kernel_L999

cmp INC_X, #1
bne sdot_kernel_S_BEGIN

cmp INC_Y, #1
bne sdot_kernel_S_BEGIN

/* Unit stride on both vectors. */
sdot_kernel_F_BEGIN:

asrs I, N, #2 // I = N / 4
ble sdot_kernel_F1

sdot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne sdot_kernel_F4

sdot_kernel_F1:

ands I, N, #3 // I = N % 4, leftover elements
ble sdot_kernel_L999

sdot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne sdot_kernel_F10

b sdot_kernel_L999

/* General strides. */
sdot_kernel_S_BEGIN:

lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE

asrs I, N, #2 // I = N / 4
ble sdot_kernel_S1

sdot_kernel_S4:

KERNEL_S4

subs I, I, #1
bne sdot_kernel_S4

sdot_kernel_S1:

ands I, N, #3 // I = N % 4, leftover elements
ble sdot_kernel_L999

sdot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne sdot_kernel_S10






sdot_kernel_L999:

sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers

#if defined(DSDOT)

vadd.f64 d0 , d0, d1 // set return value

#else

vadd.f32 s0 , s0, s1 // set return value (sum of the two partial sums)

#endif
sub sp, fp, #24 // drop the scratch area; sp -> saved r4
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 797
- 0
kernel/arm/sgemm_kernel_4x2_vfp.S View File

@@ -0,0 +1,797 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved below the saved core registers. */
#define STACKSIZE 256

/* Values that are live in registers when the routine is entered. */
#define OLD_M r0
#define OLD_N r1
#define OLD_K r2
#define OLD_A r3
#define OLD_ALPHA s0

/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
* (the prologue stores s8 - s15 starting at fp - 128)
*******************************************************/

/* Spill slots inside the reserved scratch area. The prologue stores the
 * incoming register values here so r0 - r3 can be reused as I / J / L and
 * as scratch. LDC holds the leading dimension already scaled to bytes. */
#define LDC [fp, #-252 ]
#define M [fp, #-256 ]
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#define A [fp, #-268 ]

/* fp - 280 equals sp after the prologue (fp - 24 - STACKSIZE). */
#define ALPHA [fp, #-280]

/* Stack-passed values; fp + 4 is the stack pointer seen at entry. The C
 * slot is rewritten by the routine as it advances through the output. */
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]

/* Loop counters (these reuse r0 - r2 once the inputs are spilled). */
#define I r0
#define J r1
#define L r2

/* Running read pointers into the A and B panels. */
#define AO r5
#define BO r6

/* Output pointers; CO2 is CO1 + LDC (computed in the SAVE macros). */
#define CO1 r8
#define CO2 r9

/* K1 caches K; BC is the start of the current B panel. */
#define K1 r7
#define BC r12

/* Prefetch distances in bytes. C_PRE is not referenced in this file. */
#define A_PRE 96
#define B_PRE 96
#define C_PRE 64

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/* INIT4x2: clear the eight accumulators s8 - s15 for a 4x2 tile.
 * Zero is moved in from a core register; "vsub s8, s8, s8" yields NaN when
 * s8 holds NaN or Inf (left by the caller or by an earlier tile), which
 * would contaminate this tile. Overwrites r3, which the SAVE macros
 * already use as scratch and which is not live at any INIT call site. */
.macro INIT4x2

mov r3 , #0
vmov s8 , r3
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
vmov.f32 s14, s8
vmov.f32 s15, s8

.endm



/* KERNEL4x2_SUB: one K step of the 4x2 tile.
 * Loads four values from AO and two from BO (both post-incremented), then
 *   s8  .. s11 += a[0..3] * b[0]
 *   s12 .. s15 += a[0..3] * b[1]
 * Overwrites s0 - s5. */
.macro KERNEL4x2_SUB

fldmias AO! , { s0 - s3 }
fldmias BO! , { s4 - s5 }

fmacs s8 , s0, s4
fmacs s9 , s1, s4
fmacs s10 , s2, s4
fmacs s11 , s3, s4

fmacs s12 , s0, s5
fmacs s13 , s1, s5
fmacs s14 , s2, s5
fmacs s15 , s3, s5

.endm

/* SAVE4x2: fold the 4x2 accumulators into the output.
 *   CO1[0..3] += alpha * s8  .. s11
 *   CO2[0..3] += alpha * s12 .. s15      with CO2 = CO1 + LDC
 * CO1 is then advanced by 16 bytes. Overwrites r3, s0 and s4 - s7. */
.macro SAVE4x2

ldr r3 , LDC
add CO2 , CO1, r3

flds s0, ALPHA

flds s4 , [CO1]
flds s5 , [CO1, #4 ]
flds s6 , [CO1, #8 ]
flds s7 , [CO1, #12 ]
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
fmacs s6 , s0 , s10
fmacs s7 , s0 , s11

fsts s4 , [CO1]
fsts s5 , [CO1, #4 ]
fsts s6 , [CO1, #8 ]
fsts s7 , [CO1, #12 ]

flds s4 , [CO2]
flds s5 , [CO2, #4 ]
flds s6 , [CO2, #8 ]
flds s7 , [CO2, #12 ]

fmacs s4 , s0 , s12
fmacs s5 , s0 , s13
fmacs s6 , s0 , s14
fmacs s7 , s0 , s15

fsts s4 , [CO2]
fsts s5 , [CO2, #4 ]
fsts s6 , [CO2, #8 ]
fsts s7 , [CO2, #12 ]

add CO1, CO1, #16

.endm


/******************************************************************************/

/* INIT2x2: clear the accumulators s8, s9, s12, s13 for a 2x2 tile.
 * Zero is moved in from a core register; "vsub s8, s8, s8" yields NaN when
 * s8 holds NaN or Inf. Overwrites r3, which the SAVE macros already use
 * as scratch and which is not live at any INIT call site. */
.macro INIT2x2

mov r3 , #0
vmov s8 , r3
vmov.f32 s9, s8
vmov.f32 s12, s8
vmov.f32 s13, s8

.endm

/* KERNEL2x2_SUB: one K step of the 2x2 tile.
 *   s8 , s9  += a[0..1] * b[0]
 *   s12, s13 += a[0..1] * b[1]
 * AO and BO each advance by 8 bytes. Overwrites s0, s1, s4, s5. */
.macro KERNEL2x2_SUB

flds s4 , [ BO ]
flds s5 , [ BO, #4 ]

flds s0 , [ AO ]
flds s1 , [ AO, #4 ]

fmacs s8 , s0, s4
fmacs s9 , s1, s4

fmacs s12 , s0, s5
fmacs s13 , s1, s5

add AO , AO, #8
add BO , BO, #8

.endm

/* SAVE2x2: fold the 2x2 accumulators into the output.
 *   CO1[0..1] += alpha * s8 , s9
 *   CO2[0..1] += alpha * s12, s13        with CO2 = CO1 + LDC
 * CO1 is then advanced by 8 bytes. Overwrites r3, s0, s4, s5. */
.macro SAVE2x2

ldr r3 , LDC
add CO2 , CO1, r3

flds s0, ALPHA

flds s4 , [CO1]
flds s5 , [CO1, #4 ]
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9

fsts s4 , [CO1]
fsts s5 , [CO1, #4 ]

flds s4 , [CO2]
flds s5 , [CO2, #4 ]

fmacs s4 , s0 , s12
fmacs s5 , s0 , s13

fsts s4 , [CO2]
fsts s5 , [CO2, #4 ]

add CO1, CO1, #8

.endm


/******************************************************************************/

/* INIT1x2: clear the accumulators s8 and s12 for a 1x2 tile.
 * Zero is moved in from a core register; "vsub s8, s8, s8" yields NaN when
 * s8 holds NaN or Inf. Overwrites r3, which the SAVE macros already use
 * as scratch and which is not live at any INIT call site. */
.macro INIT1x2

mov r3 , #0
vmov s8 , r3
vmov.f32 s12, s8

.endm

/* KERNEL1x2_SUB: one K step of the 1x2 tile.
 *   s8  += a[0] * b[0]
 *   s12 += a[0] * b[1]
 * AO advances by 4 bytes, BO by 8. Overwrites s0, s4, s5. */
.macro KERNEL1x2_SUB

flds s4 , [ BO ]
flds s5 , [ BO, #4 ]

flds s0 , [ AO ]

fmacs s8 , s0, s4

fmacs s12 , s0, s5

add AO , AO, #4
add BO , BO, #8

.endm

/* SAVE1x2: fold the 1x2 accumulators into the output.
 *   CO1[0] += alpha * s8
 *   CO2[0] += alpha * s12                with CO2 = CO1 + LDC
 * CO1 is then advanced by 4 bytes. Overwrites r3, s0, s4. */
.macro SAVE1x2

ldr r3 , LDC
add CO2 , CO1, r3

flds s0, ALPHA

flds s4 , [CO1]
fmacs s4 , s0 , s8

fsts s4 , [CO1]

flds s4 , [CO2]

fmacs s4 , s0 , s12

fsts s4 , [CO2]

add CO1, CO1, #4

.endm



/******************************************************************************/

/* INIT4x1: clear the accumulators s8 - s11 for a 4x1 tile.
 * Zero is moved in from a core register; "vsub s8, s8, s8" yields NaN when
 * s8 holds NaN or Inf. Overwrites r3, which the SAVE macros of the
 * two-column tiles already use as scratch and which is not live at any
 * INIT call site. */
.macro INIT4x1

mov r3 , #0
vmov s8 , r3
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8

.endm



/* KERNEL4x1_SUB: one K step of the 4x1 tile.
 *   s8 .. s11 += a[0..3] * b[0]
 * AO advances by 16 bytes, BO by 4. Overwrites s0 - s4. */
.macro KERNEL4x1_SUB

flds s4 , [ BO ]

flds s0 , [ AO ]
flds s1 , [ AO, #4 ]
flds s2 , [ AO, #8 ]
flds s3 , [ AO, #12 ]

fmacs s8 , s0, s4
fmacs s9 , s1, s4
fmacs s10 , s2, s4
fmacs s11 , s3, s4

add AO , AO, #16
add BO , BO, #4

.endm

/* SAVE4x1: CO1[0..3] += alpha * s8 .. s11, then CO1 advances by 16 bytes.
 * Overwrites s0 and s4 - s7. */
.macro SAVE4x1

flds s0, ALPHA

flds s4 , [CO1]
flds s5 , [CO1, #4 ]
flds s6 , [CO1, #8 ]
flds s7 , [CO1, #12 ]
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
fmacs s6 , s0 , s10
fmacs s7 , s0 , s11

fsts s4 , [CO1]
fsts s5 , [CO1, #4 ]
fsts s6 , [CO1, #8 ]
fsts s7 , [CO1, #12 ]

add CO1, CO1, #16

.endm

/******************************************************************************/

/* INIT2x1: clear the accumulators s8 and s9 for a 2x1 tile.
 * Zero is moved in from a core register; "vsub s8, s8, s8" yields NaN when
 * s8 holds NaN or Inf. Overwrites r3, which is not live at any INIT call
 * site. */
.macro INIT2x1

mov r3 , #0
vmov s8 , r3
vmov.f32 s9 , s8

.endm

/* KERNEL2x1_SUB: one K step of the 2x1 tile.
 *   s8, s9 += a[0..1] * b[0]
 * AO advances by 8 bytes, BO by 4. Overwrites s0, s1, s4. */
.macro KERNEL2x1_SUB

flds s4 , [ BO ]

flds s0 , [ AO ]
flds s1 , [ AO, #4 ]

fmacs s8 , s0, s4
fmacs s9 , s1, s4

add AO , AO, #8
add BO , BO, #4

.endm

/* SAVE2x1: CO1[0..1] += alpha * s8, s9, then CO1 advances by 8 bytes.
 * Overwrites s0, s4, s5. */
.macro SAVE2x1

flds s0, ALPHA

flds s4 , [CO1]
flds s5 , [CO1, #4 ]
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9

fsts s4 , [CO1]
fsts s5 , [CO1, #4 ]

add CO1, CO1, #8

.endm


/******************************************************************************/

/* INIT1x1: clear the accumulator s8 for a 1x1 tile.
 * Zero is moved in from a core register; "vsub s8, s8, s8" yields NaN when
 * s8 holds NaN or Inf. Overwrites r3, which is not live at any INIT call
 * site. */
.macro INIT1x1

mov r3 , #0
vmov s8 , r3

.endm

/* KERNEL1x1_SUB: one K step of the 1x1 tile: s8 += a[0] * b[0].
 * AO and BO each advance by 4 bytes. Overwrites s0 and s4. */
.macro KERNEL1x1_SUB

flds s4 , [ BO ]

flds s0 , [ AO ]

fmacs s8 , s0, s4

add AO , AO, #4
add BO , BO, #4

.endm

/* SAVE1x1: CO1[0] += alpha * s8, then CO1 advances by 4 bytes.
 * Overwrites s0 and s4. */
.macro SAVE1x1

flds s0, ALPHA

flds s4 , [CO1]
fmacs s4 , s0 , s8

fsts s4 , [CO1]

add CO1, CO1, #4

.endm


/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/* Single-precision GEMM micro-kernel: C += alpha * A * B on packed panels.
 *
 * Inputs: M, N, K, A in r0 - r3, alpha in s0, B / C / ldc on the stack.
 * Returns 0 in r0.
 *
 * Loop structure:
 *   J loop over pairs of output vectors (N / 2), then one pass if N is odd
 *     I loop over tiles of 4 in the M dimension, then a 2 tile, then a 1 tile
 *       L loop over K, unrolled by 8, followed by the K % 8 remainder
 *
 * Condition-flag note: the "ble" branches that follow asrs / ands / tst
 * read the V flag, which none of those instructions write. The explicit
 * "cmp J, #0" at the top gives V a defined value (0) instead of whatever
 * the caller left behind; every later V-writing instruction in this routine
 * is a "subs" on a small positive counter, which keeps V at 0. */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot; stack-passed values start at fp + 4
sub sp, sp, #STACKSIZE // reserve stack

str OLD_M, M // spill the register inputs so r0 - r3 can be reused
str OLD_N, N
str OLD_K, K
str OLD_A, A
vstr OLD_ALPHA, ALPHA

sub r3, fp, #128
vstm r3, { s8 - s15} // store floating point registers

ldr r3, OLD_LDC
lsl r3, r3, #2 // ldc = ldc * 4
str r3, LDC

ldr K1, K
ldr BC, B

ldr J, N
asr J, J, #1 // J = N / 2
cmp J, #0 // sets N, Z and V from J alone (V was caller-defined before)
ble sgemm_kernel_L1_BEGIN


/*********************************************************************************************/

sgemm_kernel_L2_BEGIN:

ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
add r3 , r4, CO1
str r3 , C // store C

ldr AO, A // AO = A

sgemm_kernel_L2_M4_BEGIN:

ldr I, M
asrs I, I, #2 // I = M / 4
ble sgemm_kernel_L2_M2_BEGIN

sgemm_kernel_L2_M4_20:

INIT4x2

mov BO, BC
asrs L , K1, #3 // L = K / 8
ble sgemm_kernel_L2_M4_40
.align 5

sgemm_kernel_L2_M4_22:

pld [ AO, #A_PRE ]
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ AO, #A_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB

pld [ AO, #A_PRE ]
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ AO, #A_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB

subs L, L, #1
bgt sgemm_kernel_L2_M4_22

sgemm_kernel_L2_M4_40:
ands L , K1, #7 // L = K % 8
ble sgemm_kernel_L2_M4_100

sgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs L, L, #1
bgt sgemm_kernel_L2_M4_42
sgemm_kernel_L2_M4_100:

SAVE4x2

sgemm_kernel_L2_M4_END:

subs I, I, #1
bgt sgemm_kernel_L2_M4_20


sgemm_kernel_L2_M2_BEGIN:

ldr I, M
tst I , #3 // M % 4 == 0: no partial tile
ble sgemm_kernel_L2_END

tst I, #2 // bit 1 of M clear: no 2-wide tile
ble sgemm_kernel_L2_M1_BEGIN

sgemm_kernel_L2_M2_20:

INIT2x2

mov BO, BC
asrs L , K1, #3 // L = K / 8
ble sgemm_kernel_L2_M2_40

sgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB

KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB

subs L, L, #1
bgt sgemm_kernel_L2_M2_22

sgemm_kernel_L2_M2_40:
ands L , K1, #7 // L = K % 8
ble sgemm_kernel_L2_M2_100

sgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs L, L, #1
bgt sgemm_kernel_L2_M2_42
sgemm_kernel_L2_M2_100:

SAVE2x2

sgemm_kernel_L2_M2_END:


sgemm_kernel_L2_M1_BEGIN:

tst I, #1 // bit 0 of M clear: no 1-wide tile (I still holds M)
ble sgemm_kernel_L2_END

sgemm_kernel_L2_M1_20:

INIT1x2

mov BO, BC
asrs L , K1, #3 // L = K / 8
ble sgemm_kernel_L2_M1_40

sgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB

KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB

subs L, L, #1
bgt sgemm_kernel_L2_M1_22

sgemm_kernel_L2_M1_40:
ands L , K1, #7 // L = K % 8
ble sgemm_kernel_L2_M1_100

sgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs L, L, #1
bgt sgemm_kernel_L2_M1_42
sgemm_kernel_L2_M1_100:

SAVE1x2


sgemm_kernel_L2_END:

mov r3, BC
mov r4, K1
lsl r4, r4, #3 // k * 2 * 4
add r3, r3, r4 // B = B + K * 2 * 4
mov BC, r3

subs J , #1 // j--
bgt sgemm_kernel_L2_BEGIN

/*********************************************************************************************/

sgemm_kernel_L1_BEGIN:

ldr J , N
tst J , #1 // N even: nothing left to do
ble sgemm_kernel_L999

ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
str r3 , C // store C

ldr AO, A // AO = A



sgemm_kernel_L1_M4_BEGIN:

ldr I, M
asrs I, I, #2 // I = M / 4
ble sgemm_kernel_L1_M2_BEGIN

sgemm_kernel_L1_M4_20:

INIT4x1

mov BO, BC
asrs L , K1, #3 // L = K / 8
ble sgemm_kernel_L1_M4_40
.align 5

sgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB

KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB

subs L, L, #1
bgt sgemm_kernel_L1_M4_22

sgemm_kernel_L1_M4_40:
ands L , K1, #7 // L = K % 8
ble sgemm_kernel_L1_M4_100

sgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs L, L, #1
bgt sgemm_kernel_L1_M4_42
sgemm_kernel_L1_M4_100:

SAVE4x1

sgemm_kernel_L1_M4_END:

subs I, I, #1
bgt sgemm_kernel_L1_M4_20


sgemm_kernel_L1_M2_BEGIN:

ldr I, M
tst I , #3 // M % 4 == 0: no partial tile
ble sgemm_kernel_L1_END

tst I, #2 // bit 1 of M clear: no 2-wide tile
ble sgemm_kernel_L1_M1_BEGIN

sgemm_kernel_L1_M2_20:

INIT2x1

mov BO, BC
asrs L , K1, #3 // L = K / 8
ble sgemm_kernel_L1_M2_40

sgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB

KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB

subs L, L, #1
bgt sgemm_kernel_L1_M2_22

sgemm_kernel_L1_M2_40:
ands L , K1, #7 // L = K % 8
ble sgemm_kernel_L1_M2_100

sgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs L, L, #1
bgt sgemm_kernel_L1_M2_42
sgemm_kernel_L1_M2_100:

SAVE2x1

sgemm_kernel_L1_M2_END:


sgemm_kernel_L1_M1_BEGIN:

tst I, #1 // bit 0 of M clear: no 1-wide tile (I still holds M)
ble sgemm_kernel_L1_END

sgemm_kernel_L1_M1_20:

INIT1x1

mov BO, BC
asrs L , K1, #3 // L = K / 8
ble sgemm_kernel_L1_M1_40

sgemm_kernel_L1_M1_22:

KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB

KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB

subs L, L, #1
bgt sgemm_kernel_L1_M1_22

sgemm_kernel_L1_M1_40:
ands L , K1, #7 // L = K % 8
ble sgemm_kernel_L1_M1_100

sgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs L, L, #1
bgt sgemm_kernel_L1_M1_42
sgemm_kernel_L1_M1_100:

SAVE1x1


sgemm_kernel_L1_END:


sgemm_kernel_L999:

sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers

movs r0, #0 // set return value
sub sp, fp, #24 // drop the scratch area; sp -> saved r4
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 1436
- 0
kernel/arm/sgemm_kernel_4x4_vfpv3.S
File diff suppressed because it is too large
View File


+ 225
- 0
kernel/arm/sgemm_ncopy_2_vfp.S View File

@@ -0,0 +1,225 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/24 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* NOTE(review): STACKSIZE is not referenced in this file; the routine
 * reserves no scratch stack. */
#define STACKSIZE 256

/* Values that are live in r0 - r3 when the routine is entered. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3

/* First stack-passed value: fp is set to sp + 24 after seven registers
 * (28 bytes) are pushed, so fp + 4 is the stack pointer seen at entry. */
#define B [fp, #4 ]

/* M, N and A stay in their incoming registers. */
#define M r0
#define N r1
#define A r2

/* Write pointer into the packed destination. */
#define BO r5

/* Read pointers for the two source streams, and the stride between them
 * in bytes (the prologue sets LDA = OLD_LDA * 4). */
#define AO1 r6
#define AO2 r7
#define LDA r8

/* Loop counters. I reuses r3 once OLD_LDA has been consumed. */
#define I r3
#define J r12

/* NOTE(review): A_PRE is not referenced in this file. */
#define A_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/* COPY2x2: read two values from each of AO1 and AO2 and store them
 * interleaved: AO1[0], AO2[0], AO1[1], AO2[1]. AO1 and AO2 advance by
 * 8 bytes, BO by 16. Overwrites s0 - s3. */
.macro COPY2x2

flds s0 , [ AO1, #0 ]
flds s2 , [ AO1, #4 ]

flds s1 , [ AO2, #0 ]
flds s3 , [ AO2, #4 ]

add AO1, AO1, #8
fstmias BO!, { s0 - s3 }
add AO2, AO2, #8

.endm


/* COPY1x2: store AO1[0], AO2[0] to BO. AO1 and AO2 advance by 4 bytes,
 * BO by 8. Overwrites s0 and s1. */
.macro COPY1x2

flds s0 , [ AO1, #0 ]
flds s1 , [ AO2, #0 ]
add AO1, AO1, #4

fstmias BO!, { s0 - s1 }
add AO2, AO2, #4

.endm

/* COPY2x1: store AO1[0], AO1[1] to BO. AO1 and BO each advance by 8 bytes.
 * Overwrites s0 and s1. */
.macro COPY2x1

flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]

fstmias BO!, { s0 - s1 }
add AO1, AO1, #8

.endm


/* COPY1x1: store AO1[0] to BO. AO1 and BO each advance by 4 bytes.
 * Overwrites s0. */
.macro COPY1x1

flds s0 , [ AO1, #0 ]

fstmias BO!, { s0 }
add AO1, AO1, #4

.endm





/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/* Pack a single-precision matrix into the destination buffer B, two source
 * streams at a time.
 *
 * For each pair of streams (A and A + LDA) the M elements are written
 * interleaved; if N is odd, the final stream is copied straight through.
 * Returns 0 in r0.
 *
 * Condition-flag note: the "ble" branches that follow asrs / ands / tst
 * read the V flag, which none of those instructions write. The explicit
 * "cmp J, #0" at the top gives V a defined value (0) instead of whatever
 * the caller left behind; every later V-writing instruction in this routine
 * is a "subs" on a small positive counter, which keeps V at 0. */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot; stack-passed values start at fp + 4

lsl LDA, OLD_LDA, #2 // lda = lda * 4

ldr BO, B


/*********************************************************************************************/

sgemm_ncopy_L2_BEGIN:

asr J, N, #1 // J = N / 2
cmp J, #0 // sets N, Z and V from J alone (V was caller-defined before)
ble sgemm_ncopy_L1_BEGIN

sgemm_ncopy_L2_M2_BEGIN:

mov AO1, A // AO1 = A
add AO2, AO1, LDA
add A , AO2, LDA // A = A + 2 * LDA

asrs I, M, #1 // I = M / 2
ble sgemm_ncopy_L2_M2_40

sgemm_ncopy_L2_M2_20:

COPY2x2

subs I , I , #1
bne sgemm_ncopy_L2_M2_20
sgemm_ncopy_L2_M2_40:

ands I, M , #1 // I = M % 2
ble sgemm_ncopy_L2_M2_END

sgemm_ncopy_L2_M2_60:

COPY1x2

subs I , I , #1
bne sgemm_ncopy_L2_M2_60

sgemm_ncopy_L2_M2_END:

subs J , J, #1 // j--
bne sgemm_ncopy_L2_M2_BEGIN

/*********************************************************************************************/

sgemm_ncopy_L1_BEGIN:

tst N, #1 // N even: nothing left to do
ble sgemm_ncopy_L999


sgemm_ncopy_L1_M2_BEGIN:

mov AO1, A // AO1 = A
add A , AO1, LDA // A = A + 1 * LDA

asrs I, M, #1 // I = M / 2
ble sgemm_ncopy_L1_M2_40

sgemm_ncopy_L1_M2_20:

COPY2x1

subs I , I , #1
bne sgemm_ncopy_L1_M2_20
sgemm_ncopy_L1_M2_40:

ands I, M , #1 // I = M % 2
ble sgemm_ncopy_L1_M2_END

sgemm_ncopy_L1_M2_60:

COPY1x1

subs I , I , #1
bne sgemm_ncopy_L1_M2_60

sgemm_ncopy_L1_M2_END:



sgemm_ncopy_L999:


movs r0, #0 // set return value
sub sp, fp, #24 // sp -> saved r4
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 353
- 0
kernel/arm/sgemm_ncopy_4_vfp.S View File

@@ -0,0 +1,353 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/05 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved below the saved core registers. */
#define STACKSIZE 256

/* Values that are live in r0 - r3 when the routine is entered. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
* (the prologue stores s8 - s15 starting at fp - 128)
*******************************************************/

/* Spill slot for the byte stride (OLD_LDA * 4); all four AO registers are
 * needed as read pointers, so the stride lives on the stack. */
#define LDA [fp, #-260 ]

/* First stack-passed value: fp is set to sp + 24 after seven registers
 * (28 bytes) are pushed, so fp + 4 is the stack pointer seen at entry. */
#define B [fp, #4 ]

/* M, N and A stay in their incoming registers. */
#define M r0
#define N r1
#define A r2

/* Write pointer into the packed destination. */
#define BO r5

/* Read pointers for up to four source streams, each LDA bytes apart. */
#define AO1 r6
#define AO2 r7
#define AO3 r8
#define AO4 r9

/* Loop counters. I reuses r3 once OLD_LDA has been spilled. */
#define I r3
#define J r12

/* Prefetch distance in bytes for the pld instructions in the 4-stream loop. */
#define A_PRE 192

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/* COPY4x4: read four values from each of AO1 - AO4 and store 16 values to
 * BO, grouped by element index:
 *   s0  - s3  = element 0 of AO1, AO2, AO3, AO4
 *   s4  - s7  = element 1 of AO1 .. AO4
 *   s8  - s11 = element 2 of AO1 .. AO4
 *   s12 - s15 = element 3 of AO1 .. AO4
 * Each AO pointer advances by 16 bytes, BO by 64. Pointer updates are
 * interleaved with the loads. Overwrites s0 - s15. */
.macro COPY4x4

flds s0 , [ AO1, #0 ]
flds s1 , [ AO2, #0 ]
flds s2 , [ AO3, #0 ]
flds s3 , [ AO4, #0 ]

flds s4 , [ AO1, #4 ]
flds s8 , [ AO1, #8 ]
flds s12, [ AO1, #12 ]

flds s5 , [ AO2, #4 ]
add AO1, AO1, #16
flds s9 , [ AO2, #8 ]
flds s13, [ AO2, #12 ]

flds s6 , [ AO3, #4 ]
add AO2, AO2, #16
flds s10, [ AO3, #8 ]
flds s14, [ AO3, #12 ]

flds s7 , [ AO4, #4 ]
add AO3, AO3, #16
flds s11, [ AO4, #8 ]
flds s15, [ AO4, #12 ]

fstmias BO!, { s0 - s3 }
add AO4, AO4, #16
fstmias BO!, { s4 - s7 }
fstmias BO!, { s8 - s15 }

.endm

/* COPY1x4: store AO1[0], AO2[0], AO3[0], AO4[0] to BO. Each AO pointer
 * advances by 4 bytes, BO by 16. Overwrites s0 - s3. */
.macro COPY1x4

flds s0 , [ AO1, #0 ]
flds s1 , [ AO2, #0 ]
add AO1, AO1, #4
flds s2 , [ AO3, #0 ]
add AO2, AO2, #4
flds s3 , [ AO4, #0 ]

add AO3, AO3, #4
fstmias BO!, { s0 - s3 }
add AO4, AO4, #4

.endm

/* COPY4x2: read four values from each of AO1 and AO2 and store them
 * interleaved: AO1[0], AO2[0], AO1[1], AO2[1], ... AO1[3], AO2[3].
 * AO1 and AO2 advance by 16 bytes, BO by 32. Overwrites s0 - s7. */
.macro COPY4x2

flds s0 , [ AO1, #0 ]
flds s2 , [ AO1, #4 ]
flds s4 , [ AO1, #8 ]
flds s6 , [ AO1, #12 ]

flds s1 , [ AO2, #0 ]
flds s3 , [ AO2, #4 ]
add AO1, AO1, #16
flds s5 , [ AO2, #8 ]
flds s7 , [ AO2, #12 ]

fstmias BO!, { s0 - s7 }
add AO2, AO2, #16

.endm


/* COPY1x2: store AO1[0], AO2[0] to BO. AO1 and AO2 advance by 4 bytes,
 * BO by 8. Overwrites s0 and s1. */
.macro COPY1x2

flds s0 , [ AO1, #0 ]
flds s1 , [ AO2, #0 ]
add AO1, AO1, #4

fstmias BO!, { s0 - s1 }
add AO2, AO2, #4

.endm

/* COPY4x1: store AO1[0..3] to BO. AO1 and BO each advance by 16 bytes.
 * Overwrites s0 - s3. */
.macro COPY4x1

flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
flds s2 , [ AO1, #8 ]
flds s3 , [ AO1, #12 ]

fstmias BO!, { s0 - s3 }
add AO1, AO1, #16

.endm


/* COPY1x1: store AO1[0] to BO. AO1 and BO each advance by 4 bytes.
 * Overwrites s0. */
.macro COPY1x1

flds s0 , [ AO1, #0 ]

fstmias BO!, { s0 }
add AO1, AO1, #4

.endm





/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/* Pack a single-precision matrix into the destination buffer B, four source
 * streams at a time.
 *
 * Groups of four streams (A, A + LDA, A + 2 LDA, A + 3 LDA) are written
 * interleaved; a leftover pair (N & 2) and a leftover single stream (N & 1)
 * are handled afterwards. Returns 0 in r0.
 *
 * Condition-flag note: the "ble" branches that follow asrs / ands / tst
 * read the V flag, which none of those instructions write. The explicit
 * "cmp J, #0" at the top gives V a defined value (0) instead of whatever
 * the caller left behind; every later V-writing instruction in this routine
 * is a "subs" on a small positive counter, which keeps V at 0. */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot; stack-passed values start at fp + 4
sub sp, sp, #STACKSIZE // reserve stack


lsl r3, r3, #2 // lda = lda * 4
str r3, LDA

sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers

ldr BO, B

sgemm_ncopy_L4_BEGIN:

asr J, N, #2 // J = N / 4
cmp J, #0 // sets N, Z and V from J alone (V was caller-defined before)
ble sgemm_ncopy_L2_BEGIN

sgemm_ncopy_L4_M4_BEGIN:

mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add AO3, AO2, r4
add AO4, AO3, r4
add A , AO4, r4 // A = A + 4 * LDA

asrs I, M, #2 // I = M / 4
ble sgemm_ncopy_L4_M4_40

/* Two COPY4x4 per iteration with one round of prefetches; the middle
 * exit handles an odd block count. */
sgemm_ncopy_L4_M4_20:

pld [ AO1, #A_PRE ]
pld [ AO2, #A_PRE ]
pld [ AO3, #A_PRE ]
pld [ AO4, #A_PRE ]
COPY4x4

subs I , I , #1
ble sgemm_ncopy_L4_M4_40

COPY4x4

subs I , I , #1
bne sgemm_ncopy_L4_M4_20
sgemm_ncopy_L4_M4_40:

ands I, M , #3 // I = M % 4
ble sgemm_ncopy_L4_M4_END

sgemm_ncopy_L4_M4_60:

COPY1x4

subs I , I , #1
bne sgemm_ncopy_L4_M4_60

sgemm_ncopy_L4_M4_END:

subs J , J, #1 // j--
bne sgemm_ncopy_L4_M4_BEGIN



/*********************************************************************************************/

sgemm_ncopy_L2_BEGIN:

tst N, #3 // N % 4 == 0: nothing left to do
ble sgemm_ncopy_L999

tst N, #2 // bit 1 of N clear: no leftover pair
ble sgemm_ncopy_L1_BEGIN

sgemm_ncopy_L2_M4_BEGIN:

mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add A , AO2, r4 // A = A + 2 * LDA

asrs I, M, #2 // I = M / 4
ble sgemm_ncopy_L2_M4_40

sgemm_ncopy_L2_M4_20:

COPY4x2

subs I , I , #1
bne sgemm_ncopy_L2_M4_20
sgemm_ncopy_L2_M4_40:

ands I, M , #3 // I = M % 4
ble sgemm_ncopy_L2_M4_END

sgemm_ncopy_L2_M4_60:

COPY1x2

subs I , I , #1
bne sgemm_ncopy_L2_M4_60

sgemm_ncopy_L2_M4_END:


/*********************************************************************************************/

sgemm_ncopy_L1_BEGIN:

tst N, #1 // bit 0 of N clear: no leftover single stream
ble sgemm_ncopy_L999


sgemm_ncopy_L1_M4_BEGIN:

mov AO1, A // AO1 = A
ldr r4 , LDA
add A , AO1, r4 // A = A + 1 * LDA

asrs I, M, #2 // I = M / 4
ble sgemm_ncopy_L1_M4_40

sgemm_ncopy_L1_M4_20:

COPY4x1

subs I , I , #1
bne sgemm_ncopy_L1_M4_20
sgemm_ncopy_L1_M4_40:

ands I, M , #3 // I = M % 4
ble sgemm_ncopy_L1_M4_END

sgemm_ncopy_L1_M4_60:

COPY1x1

subs I , I , #1
bne sgemm_ncopy_L1_M4_60

sgemm_ncopy_L1_M4_END:



sgemm_ncopy_L999:

sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers

movs r0, #0 // set return value
sub sp, fp, #24 // drop the scratch area; sp -> saved r4
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 430
- 0
kernel/arm/sgemm_tcopy_4_vfp.S View File

@@ -0,0 +1,430 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/06 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256

#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

#define B [fp, #4 ]
#define A [fp, #-248 ]

#define M r0
#define N r1
#define M4 r2

#define LDA r5

#define AO1 r6
#define BO1 r7
#define BO2 r8
#define BO3 r9

#define I r4
#define J r12

#define A_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/* COPY4x4_1: gather a 4x4 tile of single-precision values and store it
 * contiguously at BO1, prefetching each source line first.
 * Reads 4 floats from AO1 and from AO1 + LDA, + 2*LDA, + 3*LDA (LDA is in bytes).
 * Clobbers r3 and s0-s15. On exit AO1 has advanced by 16 bytes and BO1 by M4 bytes.
 */
.macro COPY4x4_1

pld [ AO1, #A_PRE ]
fldmias AO1, { s0 - s3 } // no writeback: AO1 is advanced explicitly below

add r3, AO1, LDA // r3 walks the remaining three source lines
pld [ r3, #A_PRE ]
fldmias r3, { s4 - s7 }

add r3, r3, LDA
pld [ r3, #A_PRE ]
fldmias r3, { s8 - s11 }

add r3, r3, LDA
pld [ r3, #A_PRE ]
fldmias r3, { s12 - s15 }

fstmias BO1, { s0 - s15 } // 16 floats written back to back
add AO1, AO1, #16 // next 4 floats of the same source lines
add BO1, BO1, M4 // next destination panel

.endm

/* COPY4x4_2: same 4x4 tile copy as COPY4x4_1 but without the pld prefetches.
 * Clobbers r3 and s0-s15. On exit AO1 has advanced by 16 bytes and BO1 by M4 bytes.
 */
.macro COPY4x4_2

fldmias AO1, { s0 - s3 }

add r3, AO1, LDA // r3 walks the remaining three source lines
fldmias r3, { s4 - s7 }

add r3, r3, LDA
fldmias r3, { s8 - s11 }

add r3, r3, LDA
fldmias r3, { s12 - s15 }

fstmias BO1, { s0 - s15 } // 16 floats written back to back
add AO1, AO1, #16
add BO1, BO1, M4

.endm


/* COPY2x4: copy 2 floats from each of four source lines (AO1, then three
 * further lines LDA bytes apart) and store the 8 values contiguously at BO2.
 * Clobbers r3 and s0-s7. On exit AO1 has advanced by 8 bytes and BO2 by 32 bytes.
 */
.macro COPY2x4

fldmias AO1, { s0 - s1 }

add r3, AO1, LDA
fldmias r3, { s2 - s3 }

add r3, r3, LDA
fldmias r3, { s4 - s5 }

add r3, r3, LDA
fldmias r3, { s6 - s7 }

fstmias BO2, { s0 - s7 }
add AO1, AO1, #8
add BO2, BO2, #32 // BO2 is a dense stream: advance by exactly what was stored

.endm

/* COPY1x4: copy 1 float from each of four source lines (AO1, then three
 * further lines LDA bytes apart) and store the 4 values contiguously at BO3.
 * Clobbers r3 and s0-s3. On exit AO1 has advanced by 4 bytes and BO3 by 16 bytes.
 */
.macro COPY1x4

fldmias AO1, { s0 }

add r3, AO1, LDA
fldmias r3, { s1 }

add r3, r3, LDA
fldmias r3, { s2 }

add r3, r3, LDA
fldmias r3, { s3 }

fstmias BO3, { s0 - s3 }
add AO1, AO1, #4
add BO3, BO3, #16 // BO3 is a dense stream: advance by exactly what was stored

.endm

/*************************************************************************************************************************/

/* COPY4x2: copy 4 floats from each of two source lines (AO1 and AO1 + LDA)
 * and store the 8 values contiguously at BO1.
 * Clobbers r3 and s0-s7. On exit AO1 has advanced by 16 bytes and BO1 by M4 bytes.
 */
.macro COPY4x2

fldmias AO1, { s0 - s3 }

add r3, AO1, LDA
fldmias r3, { s4 - s7 }

fstmias BO1, { s0 - s7 }
add AO1, AO1, #16
add BO1, BO1, M4 // same panel stride as the 4-line case

.endm

/* COPY2x2: copy 2 floats from each of two source lines (AO1 and AO1 + LDA)
 * and store the 4 values contiguously at BO2.
 * Clobbers r3 and s0-s3. On exit AO1 has advanced by 8 bytes and BO2 by 16 bytes.
 */
.macro COPY2x2

fldmias AO1, { s0 - s1 }

add r3, AO1, LDA
fldmias r3, { s2 - s3 }

fstmias BO2, { s0 - s3 }
add AO1, AO1, #8
add BO2, BO2, #16

.endm

/* COPY1x2: copy 1 float from each of two source lines (AO1 and AO1 + LDA)
 * and store the 2 values contiguously at BO3.
 * Clobbers r3 and s0-s1. On exit AO1 has advanced by 4 bytes and BO3 by 8 bytes.
 */
.macro COPY1x2

fldmias AO1, { s0 }

add r3, AO1, LDA
fldmias r3, { s1 }

fstmias BO3, { s0 - s1 }
add AO1, AO1, #4
add BO3, BO3, #8

.endm

/*************************************************************************************************************************/

/* COPY4x1: copy 4 consecutive floats from AO1 to BO1.
 * Clobbers s0-s3. On exit AO1 has advanced by 16 bytes and BO1 by M4 bytes.
 */
.macro COPY4x1

fldmias AO1, { s0 - s3 }

fstmias BO1, { s0 - s3 }
add AO1, AO1, #16
add BO1, BO1, M4

.endm

/* COPY2x1: copy 2 consecutive floats from AO1 to BO2.
 * Clobbers s0-s1. On exit AO1 and BO2 have both advanced by 8 bytes.
 */
.macro COPY2x1

fldmias AO1, { s0 - s1 }

fstmias BO2, { s0 - s1 }
add AO1, AO1, #8
add BO2, BO2, #8

.endm

/* COPY1x1: copy a single float from AO1 to BO3.
 * Clobbers s0. On exit AO1 and BO3 have both advanced by 4 bytes.
 */
.macro COPY1x1

fldmias AO1, { s0 }

fstmias BO3, { s0 }
add AO1, AO1, #4
add BO3, BO3, #4

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/* Entry point of the single-precision tcopy kernel.
 *
 * Inputs: r0 = M, r1 = N, r2 = A, r3 = lda (in elements), first stack word = B.
 * The outer loops consume source lines of A in groups of 4, then 2, then 1
 * (driven by M); inside each group the line is consumed 4, 2, then 1 floats
 * at a time (driven by N).  The three widths are written to three separate
 * destination streams: BO1 (4-wide, starting at B), BO2 (2-wide) and
 * BO3 (1-wide), whose start addresses are computed below.
 * Always returns 0 in r0.
 *
 * NOTE(review): every `asrs`/`tst` + `ble` pair below depends on the V flag,
 * which neither `asrs` nor `tst` writes; presumably V is clear when this
 * code is entered - confirm.
 */
PROLOGUE

.align 5

push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot; [fp, #4] is the first stack argument (B)
sub sp, sp, #STACKSIZE // reserve stack

str OLD_A, A // spill A: r2 is reused as M4 further down

lsl LDA, OLD_LDA, #2 // LDA = lda * SIZE (bytes between source lines)

sub r4, fp, #128
vstm r4, { s8 - s15} // save s8-s15, which the 4x4 macros overwrite

lsl r4 , M, #2 // r4 = M * SIZE

ldr r3, B // r3 = start of the destination buffer

and BO2 , N , #-4 // N rounded down to a multiple of 4
and BO3 , N , #-2 // N rounded down to a multiple of 2

mul BO2, BO2, r4 // byte offset (N & ~3) * M * SIZE
mul BO3, BO3, r4 // byte offset (N & ~1) * M * SIZE

add BO2 , BO2, r3 // BO2 = start of the 2-wide stream
add BO3 , BO3, r3 // BO3 = start of the 1-wide stream

lsl M4, M, #4 // M4 = M * 4 * SIZE (BO1 stride between 4-wide panels)

sgemm_tcopy_L4_BEGIN:

asrs J, M, #2 // J = M / 4
ble sgemm_tcopy_L2_BEGIN

sgemm_tcopy_L4_M4_BEGIN:

ldr AO1, A // AO1 = A
lsl r3, LDA, #2 // r3 = 4 * LDA
add r3, r3 , AO1 // A = A + 4 * LDA
str r3, A // store A

ldr BO1, B
add r3, BO1, #64 // B = B + 16 * SIZE
str r3, B // written back into the incoming stack-argument slot

asrs I, N, #2 // I = N / 4
ble sgemm_tcopy_L4_M4_40

sgemm_tcopy_L4_M4_20: // unrolled by two; only the first copy prefetches

COPY4x4_1

subs I , I , #1
ble sgemm_tcopy_L4_M4_40
COPY4x4_2

subs I , I , #1
bne sgemm_tcopy_L4_M4_20
sgemm_tcopy_L4_M4_40:

tst N , #2 // two leftover floats per line?
ble sgemm_tcopy_L4_M4_60

COPY2x4

sgemm_tcopy_L4_M4_60:

tst N, #1 // one leftover float per line?
ble sgemm_tcopy_L4_M4_END

COPY1x4

sgemm_tcopy_L4_M4_END:

subs J , J, #1 // j--
bne sgemm_tcopy_L4_M4_BEGIN



/*********************************************************************************************/

sgemm_tcopy_L2_BEGIN:

tst M, #3 // no leftover source lines: done
ble sgemm_tcopy_L999

tst M, #2 // a pair of leftover source lines?
ble sgemm_tcopy_L1_BEGIN

sgemm_tcopy_L2_M4_BEGIN:

ldr AO1, A // AO1 = A
lsl r3, LDA, #1 // r3 = 2 * LDA
add r3, r3 , AO1 // A = A + 2 * LDA
str r3, A // store A

ldr BO1, B
add r3, BO1, #32 // B = B + 8 * SIZE
str r3, B

asrs I, N, #2 // I = N / 4
ble sgemm_tcopy_L2_M4_40

sgemm_tcopy_L2_M4_20:

COPY4x2

subs I , I , #1
bne sgemm_tcopy_L2_M4_20
sgemm_tcopy_L2_M4_40:

tst N , #2 // two leftover floats per line?
ble sgemm_tcopy_L2_M4_60

COPY2x2

sgemm_tcopy_L2_M4_60:

tst N , #1 // one leftover float per line?
ble sgemm_tcopy_L2_M4_END

COPY1x2


sgemm_tcopy_L2_M4_END:


/*********************************************************************************************/

sgemm_tcopy_L1_BEGIN:

tst M, #1 // a single leftover source line?
ble sgemm_tcopy_L999


sgemm_tcopy_L1_M4_BEGIN:

ldr AO1, A // AO1 = A
add r3, LDA , AO1 // A = A + 1 * LDA
str r3, A // store A

ldr BO1, B
add r3, BO1, #16 // B = B + 4 * SIZE
str r3, B

asrs I, N, #2 // I = N / 4
ble sgemm_tcopy_L1_M4_40

sgemm_tcopy_L1_M4_20:

COPY4x1

subs I , I , #1
bne sgemm_tcopy_L1_M4_20
sgemm_tcopy_L1_M4_40:

tst N , #2 // two leftover floats?
ble sgemm_tcopy_L1_M4_60

COPY2x1

sgemm_tcopy_L1_M4_60:

tst N , #1 // one leftover float?
ble sgemm_tcopy_L1_M4_END

COPY1x1


sgemm_tcopy_L1_M4_END:



sgemm_tcopy_L999:

sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers

mov r0, #0 // set return value
sub sp, fp, #24 // drop the local frame; sp -> saved r4
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 1081
- 0
kernel/arm/strmm_kernel_4x2_vfp.S
File diff suppressed because it is too large
View File


+ 1884
- 0
kernel/arm/strmm_kernel_4x4_vfpv3.S
File diff suppressed because it is too large
View File


+ 62
- 0
kernel/arm/swap.c View File

@@ -0,0 +1,62 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/08/20 Saar
* BLASTEST float OK
* BLASTEST double OK
*
**************************************************************************************/

#include "common.h"
#include <stdio.h>

/*
 * Exchange the contents of two strided vectors element by element.
 *
 * n       number of elements to exchange; nothing happens when n <= 0
 * x, y    the two vectors
 * inc_x   index step between consecutive elements of x
 * inc_y   index step between consecutive elements of y
 *
 * dummy0, dummy1, dummy3, dummy and dummy2 are not read.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG xpos = 0;
	BLASLONG ypos = 0;

	if ( n < 0 ) return(0);

	for (k = 0; k < n; k++)
	{
		/* classic three-step exchange through a scratch value */
		FLOAT held = x[xpos];

		x[xpos] = y[ypos];
		y[ypos] = held;

		xpos += inc_x;
		ypos += inc_y;
	}

	return(0);
}


+ 354
- 0
kernel/arm/swap_vfp.S View File

@@ -0,0 +1,354 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/14 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256

#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]


#define N r0
#define Y r1
#define INC_X r2
#define X r3
#define INC_Y r4

#define I r12

#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*****************************************************************************************/



#if !defined(COMPLEX)

#if defined(DOUBLE)

/* Real double-precision variants.
 * KERNEL_F4: exchange 4 contiguous doubles between X and Y, prefetching both.
 * Clobbers d0-d7; X and Y are advanced past the 4 elements by the writeback stores.
 */
.macro KERNEL_F4

pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmiad X, { d0 - d3 } // loads leave X and Y untouched ...
fldmiad Y, { d4 - d7 }
fstmiad Y!, { d0 - d3 } // ... the stores with writeback advance them
fstmiad X!, { d4 - d7}

.endm


/* KERNEL_F1: exchange one contiguous double; X and Y advance by one element. */
.macro KERNEL_F1

fldmiad X, { d0 }
fldmiad Y, { d4 }
fstmiad Y!, { d0 }
fstmiad X!, { d4 }

.endm

/* KERNEL_S1: exchange one double on the strided path.
 * X and Y are stepped by INC_X and INC_Y, which must already be byte offsets.
 */
.macro KERNEL_S1

fldmiad X, { d0 }
fldmiad Y, { d4 }
fstmiad Y, { d0 }
fstmiad X, { d4 }
add X, X, INC_X
add Y, Y, INC_Y

.endm

#else

/* Real single-precision variants.
 * KERNEL_F4: exchange 4 contiguous floats between X and Y.
 * This variant issues no pld itself; the caller loop prefetches once per
 * pair of KERNEL_F4 invocations for this configuration.
 * Clobbers s0-s7; X and Y are advanced past the 4 elements by the writeback stores.
 */
.macro KERNEL_F4

fldmias X, { s0 - s3 }
fldmias Y, { s4 - s7 }
fstmias Y!, { s0 - s3 }
fstmias X!, { s4 - s7}

.endm


/* KERNEL_F1: exchange one contiguous float; X and Y advance by one element. */
.macro KERNEL_F1

fldmias X, { s0 }
fldmias Y, { s4 }
fstmias Y!, { s0 }
fstmias X!, { s4 }

.endm

/* KERNEL_S1: exchange one float on the strided path.
 * X and Y are stepped by INC_X and INC_Y, which must already be byte offsets.
 */
.macro KERNEL_S1

fldmias X, { s0 }
fldmias Y, { s4 }
fstmias Y, { s0 }
fstmias X, { s4 }
add X, X, INC_X
add Y, Y, INC_Y

.endm


#endif

#else

#if defined(DOUBLE)

/* Complex double-precision variants (one element = 2 doubles).
 * KERNEL_F4: exchange 8 contiguous doubles (4 complex elements) between X and Y
 * in two rounds of 4 doubles, prefetching before each round.
 * Clobbers d0-d7; X and Y are advanced by the writeback stores.
 */
.macro KERNEL_F4

pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmiad X, { d0 - d3 }
fldmiad Y, { d4 - d7 }
fstmiad Y!, { d0 - d3 }
fstmiad X!, { d4 - d7}

pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmiad X, { d0 - d3 }
fldmiad Y, { d4 - d7 }
fstmiad Y!, { d0 - d3 }
fstmiad X!, { d4 - d7}

.endm

/* KERNEL_F1: exchange one contiguous complex element (2 doubles). */
.macro KERNEL_F1

fldmiad X, { d0 - d1 }
fldmiad Y, { d4 - d5 }
fstmiad Y!, { d0 - d1 }
fstmiad X!, { d4 - d5 }

.endm

/* KERNEL_S1: exchange one complex element on the strided path.
 * X and Y are stepped by INC_X and INC_Y, which must already be byte offsets.
 */
.macro KERNEL_S1

fldmiad X, { d0 - d1 }
fldmiad Y, { d4 - d5 }
fstmiad Y, { d0 - d1 }
fstmiad X, { d4 - d5 }
add X, X, INC_X
add Y, Y, INC_Y

.endm


#else

/* Complex single-precision variants (one element = 2 floats).
 * KERNEL_F4: exchange 8 contiguous floats (4 complex elements) between X and Y
 * in two rounds of 4 floats; only the first round is preceded by a prefetch.
 * Clobbers s0-s7; X and Y are advanced by the writeback stores.
 */
.macro KERNEL_F4

pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmias X, { s0 - s3 }
fldmias Y, { s4 - s7 }
fstmias Y!, { s0 - s3 }
fstmias X!, { s4 - s7}

fldmias X, { s0 - s3 }
fldmias Y, { s4 - s7 }
fstmias Y!, { s0 - s3 }
fstmias X!, { s4 - s7}

.endm

/* KERNEL_F1: exchange one contiguous complex element (2 floats). */
.macro KERNEL_F1

fldmias X, { s0 - s1 }
fldmias Y, { s4 - s5 }
fstmias Y!, { s0 - s1 }
fstmias X!, { s4 - s5 }

.endm

/* KERNEL_S1: exchange one complex element on the strided path.
 * X and Y are stepped by INC_X and INC_Y, which must already be byte offsets.
 */
.macro KERNEL_S1

fldmias X, { s0 - s1 }
fldmias Y, { s4 - s5 }
fstmias Y, { s0 - s1 }
fstmias X, { s4 - s5 }
add X, X, INC_X
add Y, Y, INC_Y

.endm



#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/* Entry point of the VFP swap kernel: exchanges N elements between X and Y.
 *
 * N arrives in r0 and X in r3; INC_X, Y and INC_Y are read from the first
 * three stack words.  Returns immediately when N <= 0 or either increment
 * is 0.  When both increments are 1 the contiguous path (KERNEL_F4 /
 * KERNEL_F1) is used, otherwise the strided path (KERNEL_S1) after the
 * increments have been scaled to byte offsets.  Always returns 0 in r0.
 */
PROLOGUE

.align 5
push {r4 , fp}
add fp, sp, #8 // fp -> first stack argument, just above the two saved registers

ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y


cmp N, #0
ble swap_kernel_L999

cmp INC_X, #0
beq swap_kernel_L999

cmp INC_Y, #0
beq swap_kernel_L999

cmp INC_X, #1
bne swap_kernel_S_BEGIN

cmp INC_Y, #1
bne swap_kernel_S_BEGIN


swap_kernel_F_BEGIN: // contiguous path: both increments are 1


asrs I, N, #2 // I = N / 4
ble swap_kernel_F1

.align 5

swap_kernel_F4: // unrolled by two: up to 8 elements per pass

#if !defined(COMPLEX) && !defined(DOUBLE)
// the real single-precision KERNEL_F4 contains no prefetch, so issue it here
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
#endif

KERNEL_F4

subs I, I, #1
ble swap_kernel_F1

KERNEL_F4

subs I, I, #1
bne swap_kernel_F4

swap_kernel_F1:

ands I, N, #3 // I = N % 4 leftover elements
ble swap_kernel_L999

swap_kernel_F10:

KERNEL_F1

subs I, I, #1
bne swap_kernel_F10

b swap_kernel_L999

swap_kernel_S_BEGIN: // strided path: convert the increments to byte offsets

#if defined(COMPLEX)

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif

#else

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif

#endif


asrs I, N, #2 // I = N / 4
ble swap_kernel_S1

.align 5

swap_kernel_S4: // four strided elements per pass

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne swap_kernel_S4

swap_kernel_S1:

ands I, N, #3 // I = N % 4 leftover elements
ble swap_kernel_L999

swap_kernel_S10:

KERNEL_S1

subs I, I, #1
bne swap_kernel_S10


swap_kernel_L999:

mov r0, #0 // set return value

sub sp, fp, #8 // sp -> saved r4
pop {r4,fp}
bx lr

EPILOGUE


+ 81
- 0
kernel/arm/zamax.c View File

@@ -0,0 +1,81 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif

#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])

/*
 * Return the largest value of |x[k]| + |x[k+1]| over n strided pairs of x.
 *
 * n      number of pairs (complex elements) to examine
 * x      vector stored as consecutive pairs of FLOATs
 * inc_x  step between consecutive pairs, counted in pairs
 *
 * Returns 0.0 when n <= 0 or inc_x < 1.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	BLASLONG inc_x2;
	FLOAT maxf;
	FLOAT cur;

	/* An empty vector has no first pair to seed the maximum with, so
	   n == 0 must return before x[0] and x[1] are read. */
	if (n <= 0 || inc_x < 1 ) return(0.0);

	inc_x2 = 2 * inc_x;	/* each pair occupies two FLOATs */

	maxf = ABS(x[0]) + ABS(x[1]);

	/* the first pair is already the running maximum; scan the rest */
	for (i = 1, ix = inc_x2; i < n; i++, ix += inc_x2)
	{
		cur = ABS(x[ix]) + ABS(x[ix+1]);
		if (cur > maxf) maxf = cur;
	}
	return(maxf);
}


+ 81
- 0
kernel/arm/zamin.c View File

@@ -0,0 +1,81 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif

#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])

/*
 * Return the smallest value of |x[k]| + |x[k+1]| over n strided pairs of x.
 *
 * n      number of pairs (complex elements) to examine
 * x      vector stored as consecutive pairs of FLOATs
 * inc_x  step between consecutive pairs, counted in pairs
 *
 * Returns 0.0 when n <= 0 or inc_x < 1.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	BLASLONG inc_x2;
	FLOAT minf;
	FLOAT cur;

	/* An empty vector has no first pair to seed the minimum with, so
	   n == 0 must return before x[0] and x[1] are read. */
	if (n <= 0 || inc_x < 1 ) return(0.0);

	inc_x2 = 2 * inc_x;	/* each pair occupies two FLOATs */

	minf = ABS(x[0]) + ABS(x[1]);

	/* the first pair is already the running minimum; scan the rest */
	for (i = 1, ix = inc_x2; i < n; i++, ix += inc_x2)
	{
		cur = ABS(x[ix]) + ABS(x[ix+1]);
		if (cur < minf) minf = cur;
	}
	return(minf);
}


+ 71
- 0
kernel/arm/zasum.c View File

@@ -0,0 +1,71 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/


#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif

#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])

/*
 * Sum |x[k]| + |x[k+1]| over n strided pairs of x.
 *
 * n      number of pairs (complex elements) to accumulate
 * x      vector stored as consecutive pairs of FLOATs
 * inc_x  step between consecutive pairs, counted in pairs
 *
 * Returns 0.0 when n < 0 or inc_x < 1 (and for n == 0, since the loop
 * body never runs).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG step;
	BLASLONG limit;

	if (n < 0 || inc_x < 1 ) return(total);

	step  = 2 * inc_x;	/* each pair occupies two FLOATs */
	limit = n * step;	/* index one past the last pair visited */

	for (pos = 0; pos < limit; pos += step)
	{
		total += ABS(x[pos])+ABS(x[pos+1]);
	}

	return(total);
}


+ 72
- 0
kernel/arm/zaxpy.c View File

@@ -0,0 +1,72 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/15 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/


#include "common.h"

/*
 * Complex axpy: y += alpha * x, or y += alpha * conj(x) when CONJ is
 * defined, with alpha = da_r + i*da_i.
 *
 * n           number of complex elements to process
 * da_r, da_i  real and imaginary parts of alpha
 * x, y        vectors stored as (real, imaginary) pairs
 * inc_x/y     step between consecutive elements, counted in complex elements
 *
 * Does nothing when n < 0 or alpha is exactly zero.
 * dummy0, dummy1, dummy and dummy2 are not read.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG ix = 0;
	BLASLONG iy = 0;
	BLASLONG step_x;
	BLASLONG step_y;

	if ( n < 0 || ( da_r == 0.0 && da_i == 0.0 ) ) return(0);

	/* each complex element occupies two FLOATs */
	step_x = 2 * inc_x;
	step_y = 2 * inc_y;

	for (k = 0; k < n; k++)
	{
		/* x is re-read after the first store on purpose: the result
		   stays the same as before if x and y overlap */
#if !defined(CONJ)
		y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
#else
		y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
#endif
		ix += step_x;
		iy += step_y;
	}

	return(0);
}


+ 63
- 0
kernel/arm/zcopy.c View File

@@ -0,0 +1,63 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

/*
 * Copy n complex elements from x to y.
 *
 * n        number of complex elements to copy; nothing happens when n <= 0
 * x        source vector stored as pairs of FLOATs
 * y        destination vector, same layout
 * inc_x/y  step between consecutive elements, counted in complex elements
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG k;
	BLASLONG src = 0;
	BLASLONG dst = 0;
	BLASLONG src_step;
	BLASLONG dst_step;

	if ( n < 0 ) return(0);

	/* each complex element occupies two FLOATs */
	src_step = 2 * inc_x;
	dst_step = 2 * inc_y;

	for (k = 0; k < n; k++)
	{
		y[dst] = x[src] ;
		y[dst+1] = x[src+1] ;

		src += src_step;
		dst += dst_step;
	}

	return(0);
}


+ 223
- 0
kernel/arm/zcopy_vfp.S View File

@@ -0,0 +1,223 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the pushed core registers.
#define STACKSIZE 256

// Values read from core registers r0-r3 on entry.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

// The prologue sets fp = sp + 24 after pushing seven words, so [fp, #4]
// is the first word above the pushed registers.
#define OLD_INC_Y [fp, #4 ]

// Working registers: I is the loop counter, Y/INC_Y hold the copies of
// OLD_Y / OLD_INC_Y made in the prologue.
#define I r5
#define Y r6
#define INC_Y r7

// Byte offset ahead of X used by the pld prefetch hints.
#define X_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * Unit-stride copy of 64 bytes: load d0-d7 from X and store them to Y,
 * post-incrementing both pointers (the "!" writeback). Two prefetch hints
 * are issued X_PRE and X_PRE+32 bytes ahead of X.
 */
.macro COPY_F4

pld [ X, #X_PRE ]
pld [ X, #X_PRE+32 ]
fldmiad X!, { d0 - d7 }
fstmiad Y!, { d0 - d7 }

.endm

/*
 * Unit-stride copy of 16 bytes (two doubles, d0-d1) from X to Y with
 * pointer writeback on both sides.
 */
.macro COPY_F1

fldmiad X!, { d0 - d1 }
fstmiad Y!, { d0 - d1 }

.endm


/*************************************************************************************************************************/

/*
 * Strided copy, unrolled four times. Each step moves 16 bytes (two doubles)
 * from X to Y without writeback and then advances X by INC_X and Y by INC_Y.
 * INC_X / INC_Y are already byte strides here (the caller shifts them left
 * by 4 before entering the strided path). The register pairs d0-d1 and
 * d2-d3 are used alternately.
 */
.macro COPY_S4

nop
fldmiad X, { d0 - d1 }
fstmiad Y, { d0 - d1 }
add X, X, INC_X
add Y, Y, INC_Y

fldmiad X, { d2 - d3 }
fstmiad Y, { d2 - d3 }
add X, X, INC_X
add Y, Y, INC_Y

fldmiad X, { d0 - d1 }
fstmiad Y, { d0 - d1 }
add X, X, INC_X
add Y, Y, INC_Y

fldmiad X, { d2 - d3 }
fstmiad Y, { d2 - d3 }
add X, X, INC_X
add Y, Y, INC_Y

.endm


/*
 * Strided copy of a single 16-byte element: load d0-d1 from X, store to Y,
 * then step X by INC_X and Y by INC_Y (byte strides).
 */
.macro COPY_S1

fldmiad X, { d0 - d1 }
fstmiad Y, { d0 - d1 }
add X, X, INC_X
add Y, Y, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * zcopy kernel: copies N 16-byte elements from X (stride INC_X) to Y
 * (stride INC_Y). Returns 0 in r0.
 *
 * Nothing is copied when N <= 0 or when either increment is 0.
 * When both increments are 1 the contiguous macros COPY_F4 / COPY_F1 are
 * used, otherwise the strided macros COPY_S4 / COPY_S1.
 */
PROLOGUE

.align 5

// Save r4-r9 and fp (seven words); fp then points at the saved-fp slot,
// so [fp, #4] is the first word above the pushed registers.
push {r4 - r9, fp}
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack

// d8-d15 are saved at [fp-128, fp-64). The copy macros of this file only
// touch d0-d7.
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers

mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
cmp N, #0
ble zcopy_kernel_L999

cmp INC_X, #0
beq zcopy_kernel_L999

cmp INC_Y, #0
beq zcopy_kernel_L999

// Any non-unit stride selects the strided path.
cmp INC_X, #1
bne zcopy_kernel_S_BEGIN

cmp INC_Y, #1
bne zcopy_kernel_S_BEGIN

zcopy_kernel_F_BEGIN:

// asrs/ands update N and Z but leave V as set by the preceding cmp/subs,
// so the "ble" tests below rely on that earlier V value.
asrs I, N, #2 // I = N / 4
ble zcopy_kernel_F1

zcopy_kernel_F4:

COPY_F4

subs I, I, #1
bne zcopy_kernel_F4

zcopy_kernel_F1:

// Remaining N mod 4 elements, one at a time.
ands I, N, #3
ble zcopy_kernel_L999

zcopy_kernel_F10:

COPY_F1

subs I, I, #1
bne zcopy_kernel_F10

b zcopy_kernel_L999

zcopy_kernel_S_BEGIN:

// Convert element strides to byte strides (16 bytes per element).
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2

asrs I, N, #2 // I = N / 4
ble zcopy_kernel_S1

zcopy_kernel_S4:

COPY_S4

subs I, I, #1
bne zcopy_kernel_S4

zcopy_kernel_S1:

// Remaining N mod 4 elements, one at a time.
ands I, N, #3
ble zcopy_kernel_L999

zcopy_kernel_S10:

COPY_S1

subs I, I, #1
bne zcopy_kernel_S10






zcopy_kernel_L999:

// Common exit: restore d8-d15, unwind the stack frame and return 0.
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers

mov r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr

EPILOGUE


+ 78
- 0
kernel/arm/zdot.c View File

@@ -0,0 +1,78 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : FAIL
* BLASTEST double : FAIL
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"
#include <complex.h>

/*
 * Complex dot product kernel.
 *
 * x and y are arrays of interleaved (real, imaginary) FLOAT pairs; inc_x and
 * inc_y are strides counted in complex elements. Without CONJ the sum of
 * x[k] * y[k] is returned; with CONJ the sum of conj(x[k]) * y[k].
 * For n < 1 the result is 0 + 0i.
 */
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT _Complex result;
	FLOAT acc_re = 0.0;
	FLOAT acc_im = 0.0;
	/* Strides expressed in FLOATs: two per complex element. */
	BLASLONG stride_x = 2 * inc_x;
	BLASLONG stride_y = 2 * inc_y;
	BLASLONG px = 0;
	BLASLONG py = 0;
	BLASLONG k;

	/* The loop body never runs for n < 1, leaving both sums at zero. */
	for (k = 0; k < n; k++)
	{
#if !defined(CONJ)
		acc_re += ( x[px] * y[py] - x[px+1] * y[py+1] ) ;
		acc_im += ( x[px+1] * y[py] + x[px] * y[py+1] ) ;
#else
		acc_re += ( x[px] * y[py] + x[px+1] * y[py+1] ) ;
		acc_im -= ( x[px+1] * y[py] - x[px] * y[py+1] ) ;
#endif
		px += stride_x ;
		py += stride_y ;
	}

	__real__ result = acc_re;
	__imag__ result = acc_im;
	return(result);

}


+ 286
- 0
kernel/arm/zdot_vfp.S View File

@@ -0,0 +1,286 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the pushed core registers.
#define STACKSIZE 256

// Values read from core registers r0-r3 on entry.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

// The prologue sets fp = sp + 24 after pushing seven words, so [fp, #4]
// is the first word above the pushed registers.
#define OLD_INC_Y [fp, #4 ]

// Working registers: I is the loop counter, Y/INC_Y hold the copies of
// OLD_Y / OLD_INC_Y made in the prologue.
#define I r5
#define Y r6
#define INC_Y r7

// Byte offset used by the pld prefetch hints (applied to both X and Y).
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * Unit-stride step over four elements of X and Y.
 *
 * For every element the pair loaded from X (d4,d5 or d6,d7) and the pair
 * loaded from Y (d8,d9 or d10,d11) update four running sums:
 *   d0 += x0*y0    d1 += x0*y1    d2 += x1*y1    d3 += x1*y0
 * where x0/x1 (y0/y1) are the first/second double of the pair; presumably
 * the real and imaginary part of an interleaved complex number.
 * Loads, prefetches and multiply-accumulates are interleaved, and both
 * pointers advance through load writeback.
 */
.macro KERNEL_F4

pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]

fldmiad X!, { d4 - d5 }
fldmiad Y!, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fldmiad X!, { d6 - d7 }
fmacd d2 , d5, d9
fmacd d3 , d5, d8

fldmiad Y!, { d10 - d11 }
fmacd d0 , d6, d10
fmacd d1 , d6, d11
pld [ X, #X_PRE ]
fmacd d2 , d7, d11
fmacd d3 , d7, d10

pld [ Y, #X_PRE ]

fldmiad X!, { d4 - d5 }
fldmiad Y!, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fldmiad X!, { d6 - d7 }
fmacd d2 , d5, d9
fmacd d3 , d5, d8

fldmiad Y!, { d10 - d11 }
fmacd d0 , d6, d10
fmacd d1 , d6, d11
fmacd d2 , d7, d11
fmacd d3 , d7, d10

.endm

/*
 * Unit-stride step over one element: load the pair (d4,d5) from X and
 * (d8,d9) from Y with writeback, then
 *   d0 += d4*d8    d1 += d4*d9    d2 += d5*d9    d3 += d5*d8
 */
.macro KERNEL_F1

fldmiad X!, { d4 - d5 }
fldmiad Y!, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8

.endm


/*************************************************************************************************************************/

/*
 * Strided step over four elements. Each of the four identical sections
 * loads one pair from X (d4,d5) and one from Y (d8,d9) without writeback,
 * accumulates
 *   d0 += d4*d8    d1 += d4*d9    d2 += d5*d9    d3 += d5*d8
 * and then advances X by INC_X and Y by INC_Y (byte strides).
 */
.macro KERNEL_S4

nop

fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y

fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y

fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y

fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y

.endm


/*
 * Strided step over one element: load (d4,d5) from X and (d8,d9) from Y,
 * accumulate into d0-d3 exactly as KERNEL_F1 does, then advance X by INC_X
 * and Y by INC_Y (byte strides).
 */
.macro KERNEL_S1

fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * zdot kernel: accumulates the four partial sums maintained by the KERNEL_*
 * macros over N elements of X (stride INC_X) and Y (stride INC_Y) and
 * combines them into the result left in d0 (first component) and d1
 * (second component):
 *   without CONJ:  d0 = d0 - d2,  d1 = d1 + d3
 *   with CONJ:     d0 = d0 + d2,  d1 = d1 - d3
 *
 * The result is zero when N <= 0 or when either increment is 0.
 * When both increments are 1 the contiguous macros KERNEL_F4 / KERNEL_F1
 * are used, otherwise the strided macros KERNEL_S4 / KERNEL_S1.
 */
PROLOGUE

.align 5

	// Save r4-r9 and fp (seven words); fp then points at the saved-fp slot,
	// so [fp, #4] is the first word above the pushed registers.
	push {r4 - r9, fp}
	add fp, sp, #24
	sub sp, sp, #STACKSIZE // reserve stack

	// The kernel macros use d8-d11, so d8-d15 are preserved at
	// [fp-128, fp-64).
	sub r4, fp, #128
	vstm r4, { d8 - d15} // store floating point registers

	mov Y, OLD_Y
	ldr INC_Y, OLD_INC_Y

	// Clear the accumulators with an all-zero bit pattern taken from a core
	// register. Subtracting a register from itself is not a safe way to
	// obtain 0.0: if the stale content is NaN or +/-Inf the difference is
	// NaN and poisons the whole result. r4 is free again at this point.
	mov r4, #0
	vmov d0 , r4, r4
	vmov.f64 d1 , d0
	vmov.f64 d2 , d0
	vmov.f64 d3 , d0

	cmp N, #0
	ble zdot_kernel_L999

	cmp INC_X, #0
	beq zdot_kernel_L999

	cmp INC_Y, #0
	beq zdot_kernel_L999

	// Any non-unit stride selects the strided path.
	cmp INC_X, #1
	bne zdot_kernel_S_BEGIN

	cmp INC_Y, #1
	bne zdot_kernel_S_BEGIN

zdot_kernel_F_BEGIN:

	// asrs/ands leave V as set by the preceding cmp/subs; the "ble" tests
	// below rely on that earlier V value.
	asrs I, N, #2 // I = N / 4
	ble zdot_kernel_F1

zdot_kernel_F4:

	KERNEL_F4

	subs I, I, #1
	bne zdot_kernel_F4

zdot_kernel_F1:

	// Remaining N mod 4 elements, one at a time.
	ands I, N, #3
	ble zdot_kernel_L999

zdot_kernel_F10:

	KERNEL_F1

	subs I, I, #1
	bne zdot_kernel_F10

	b zdot_kernel_L999

zdot_kernel_S_BEGIN:

	// Convert element strides to byte strides (16 bytes per element).
	lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
	lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2

	asrs I, N, #2 // I = N / 4
	ble zdot_kernel_S1

zdot_kernel_S4:

	KERNEL_S4

	subs I, I, #1
	bne zdot_kernel_S4

zdot_kernel_S1:

	// Remaining N mod 4 elements, one at a time.
	ands I, N, #3
	ble zdot_kernel_L999

zdot_kernel_S10:

	KERNEL_S1

	subs I, I, #1
	bne zdot_kernel_S10



zdot_kernel_L999:

	sub r3, fp, #128
	vldm r3, { d8 - d15} // restore floating point registers

	// Fold the four partial sums into the two result components.
#if !defined(CONJ)
	vsub.f64 d0 , d0, d2
	vadd.f64 d1 , d1, d3
#else
	vadd.f64 d0 , d0, d2
	vsub.f64 d1 , d1, d3
#endif

	sub sp, fp, #24
	pop {r4 - r9, fp}
	bx lr

EPILOGUE


+ 1299
- 0
kernel/arm/zgemm_kernel_2x2_vfp.S
File diff suppressed because it is too large
View File


+ 1345
- 0
kernel/arm/zgemm_kernel_2x2_vfpv3.S
File diff suppressed because it is too large
View File


+ 254
- 0
kernel/arm/zgemm_ncopy_2_vfp.S View File

@@ -0,0 +1,254 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/05 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the pushed core registers.
#define STACKSIZE 256

// Values read from core registers r0-r3 on entry.
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

// Stack slot holding lda after it has been scaled to bytes.
#define LDA [fp, #-260 ]

// The prologue sets fp = sp + 24 after pushing seven words, so [fp, #4]
// is the first word above the pushed registers.
#define B [fp, #4 ]

// M, N and A stay in the registers they arrive in.
#define M r0
#define N r1
#define A r2

// Output pointer.
#define BO r5

// Read pointers for the two source columns handled per outer iteration.
#define AO1 r6
#define AO2 r7

// Loop counters. I reuses r3 once lda has been stored to its stack slot.
#define I r3
#define J r12

// Byte offset used by the pld prefetch hints.
#define A_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * Packs 32 bytes from each of the two columns AO1 and AO2 into 64 bytes
 * at BO. The store order is AO1[0..1], AO2[0..1], AO1[2..3], AO2[2..3]
 * (indices in doubles), i.e. the two columns are interleaved in 16-byte
 * units. AO1 and AO2 advance by 32 bytes, BO by 64 via writeback.
 */
.macro COPY2x2

pld [ AO1, #A_PRE ]
pld [ AO2, #A_PRE ]
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fldd d4 , [ AO1, #16 ]
fldd d5 , [ AO1, #24 ]

fldd d2 , [ AO2, #0 ]
fldd d3 , [ AO2, #8 ]
add AO1, AO1, #32
fldd d6 , [ AO2, #16 ]
fldd d7 , [ AO2, #24 ]

fstmiad BO!, { d0 - d7 }
add AO2, AO2, #32

.endm


/*
 * Packs 16 bytes from AO1 followed by 16 bytes from AO2 into 32 bytes at
 * BO. AO1 and AO2 advance by 16 bytes, BO by 32 via writeback.
 */
.macro COPY1x2

fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fldd d2 , [ AO2, #0 ]
fldd d3 , [ AO2, #8 ]

add AO1, AO1, #16
fstmiad BO!, { d0 - d3 }
add AO2, AO2, #16

.endm

/*
 * Copies 32 contiguous bytes from the single column AO1 to BO.
 * AO1 advances by 32 bytes, BO by 32 via writeback.
 */
.macro COPY2x1

fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fldd d2 , [ AO1, #16 ]
fldd d3 , [ AO1, #24 ]

fstmiad BO!, { d0 - d3 }
add AO1, AO1, #32

.endm


/*
 * Copies 16 bytes from AO1 to BO. AO1 advances by 16 bytes, BO by 16 via
 * writeback.
 */
.macro COPY1x1

fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]

fstmiad BO!, { d0 - d1 }
add AO1, AO1, #16

.endm





/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * zgemm ncopy (unroll 2): packs the matrix at A into the buffer B.
 *
 * Columns are lda elements (16 bytes each) apart. They are processed two
 * at a time; within a column pair, rows are taken two at a time (COPY2x2)
 * with a single-row tail (COPY1x2). If N is odd the last column is copied
 * on its own with COPY2x1 / COPY1x1. Returns 0 in r0.
 *
 * Flag handling: asr/and/tst do not write the V flag and nothing earlier in
 * this routine sets it, so a signed "ble" directly after them would depend
 * on whatever V the caller left behind. Loop counts are therefore tested
 * with an explicit cmp, and single-bit tests use beq.
 */
PROLOGUE

.align 5

	// Save r4-r9 and fp (seven words); fp then points at the saved-fp slot,
	// so [fp, #4] is the first word above the pushed registers.
	push {r4 - r9, fp}
	add fp, sp, #24
	sub sp, sp, #STACKSIZE // reserve stack


	// Scale lda to bytes and park it on the stack; r3 becomes counter I.
	lsl r3, r3, #4 // lda = lda * 8 * 2
	str r3, LDA

	sub r4, fp, #128
	vstm r4, { d8 - d15} // store floating point registers

	ldr BO, B

/*********************************************************************************************/

zgemm_ncopy_L2_BEGIN:

	asr J, N, #1 // J = N / 2
	cmp J, #0
	ble zgemm_ncopy_L1_BEGIN

zgemm_ncopy_L2_M2_BEGIN:

	mov AO1, A // AO1 = A
	ldr r4 , LDA
	add AO2, AO1, r4
	add A , AO2, r4 // A = A + 2 * LDA

	asr I, M, #1 // I = M / 2
	cmp I, #0
	ble zgemm_ncopy_L2_M2_40

zgemm_ncopy_L2_M2_20:

	COPY2x2

	subs I , I , #1
	bne zgemm_ncopy_L2_M2_20
zgemm_ncopy_L2_M2_40:

	// Odd M: one remaining row of the column pair.
	ands I, M , #1
	beq zgemm_ncopy_L2_M2_END

zgemm_ncopy_L2_M2_60:

	COPY1x2

	subs I , I , #1
	bne zgemm_ncopy_L2_M2_60

zgemm_ncopy_L2_M2_END:

	subs J , J, #1 // j--
	bne zgemm_ncopy_L2_M2_BEGIN


/*********************************************************************************************/

zgemm_ncopy_L1_BEGIN:

	// Odd N: one remaining column.
	tst N, #1
	beq zgemm_ncopy_L999


zgemm_ncopy_L1_M2_BEGIN:

	mov AO1, A // AO1 = A
	ldr r4 , LDA
	add A , AO1, r4 // A = A + 1 * LDA

	asr I, M, #1 // I = M / 2
	cmp I, #0
	ble zgemm_ncopy_L1_M2_40

zgemm_ncopy_L1_M2_20:

	COPY2x1

	subs I , I , #1
	bne zgemm_ncopy_L1_M2_20
zgemm_ncopy_L1_M2_40:

	// Odd M: one remaining row of the last column.
	ands I, M , #1
	beq zgemm_ncopy_L1_M2_END

zgemm_ncopy_L1_M2_60:

	COPY1x1

	subs I , I , #1
	bne zgemm_ncopy_L1_M2_60

zgemm_ncopy_L1_M2_END:



zgemm_ncopy_L999:

	sub r3, fp, #128
	vldm r3, { d8 - d15} // restore floating point registers

	movs r0, #0 // set return value
	sub sp, fp, #24
	pop {r4 - r9, fp}
	bx lr

EPILOGUE


+ 245
- 0
kernel/arm/zgemm_tcopy_2_vfp.S View File

@@ -0,0 +1,245 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the pushed core registers.
#define STACKSIZE 256

// Values read from core registers r0-r3 on entry.
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

// The prologue sets fp = sp + 24 after pushing seven words, so [fp, #4]
// is the first word above the pushed registers. The routine also writes
// its running output position back into this slot.
#define B [fp, #4 ]
// Stack slot holding the running source position.
#define A [fp, #-248 ]

// M and N stay in the registers they arrive in. M4 reuses r2 once the
// incoming A pointer has been stored to its stack slot.
#define M r0
#define N r1
#define M4 r2

// lda scaled to bytes.
#define LDA r5

// AO1: read pointer. BO1: write pointer for the 2-wide blocks.
// BO2: write pointer for the tail written by COPY1x2 / COPY1x1.
#define AO1 r6
#define BO1 r7
#define BO2 r8

// Loop counters.
#define I r4
#define J r12

// Byte offset used by the pld prefetch hints.
#define A_PRE 256

/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*
 * Copies 32 bytes from AO1 and 32 bytes from AO1 + LDA (r3 is used as a
 * scratch pointer) into 64 contiguous bytes at BO1. AO1 then advances by
 * 32 bytes and BO1 by M4 bytes.
 */
.macro COPY2x2

pld [ AO1, #A_PRE ]
fldmiad AO1, { d0 - d3 }

add r3, AO1, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d4 - d7 }

fstmiad BO1, { d0 - d7 }
add AO1, AO1, #32
add BO1, BO1, M4

.endm

/*
 * Copies 16 bytes from AO1 and 16 bytes from AO1 + LDA (r3 is used as a
 * scratch pointer) into 32 contiguous bytes at BO2. AO1 advances by 16
 * bytes and BO2 by 32 bytes.
 */
.macro COPY1x2

fldmiad AO1, { d0 -d1 }

add r3, AO1, LDA
fldmiad r3, { d2 - d3 }

fstmiad BO2, { d0 - d3 }
add AO1, AO1, #16
add BO2, BO2, #32

.endm

/*************************************************************************************************************************/
/*
 * Copies 32 bytes from AO1 to BO1. AO1 advances by 32 bytes and BO1 by
 * M4 bytes.
 */
.macro COPY2x1

fldmiad AO1, { d0 - d3 }

fstmiad BO1, { d0 - d3 }
add AO1, AO1, #32
add BO1, BO1, M4

.endm

/*
 * Copies 16 bytes from AO1 to BO2. Both pointers advance by 16 bytes.
 */
.macro COPY1x1

fldmiad AO1, { d0 - d1 }

fstmiad BO2, { d0 - d1 }
add AO1, AO1, #16
add BO2, BO2, #16

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/*
 * zgemm tcopy (unroll 2): packs the matrix at A into the buffer B.
 *
 * The outer loop runs M / 2 times; each pass starts at the current A,
 * moves A forward by 2 * lda elements and B forward by 64 bytes. The inner
 * loop runs N / 2 times with COPY2x2, writing blocks M4 = M * 32 bytes
 * apart. If N is odd, COPY1x2 appends the leftover to the tail region at
 * BO2 = B + (N & ~1) * M * 16. If M is odd, a final pass handles the last
 * line with COPY2x1 / COPY1x1. Returns 0 in r0.
 *
 * Flag handling: asr/tst do not write the V flag and nothing earlier in
 * this routine sets it, so a signed "ble" directly after them would depend
 * on whatever V the caller left behind. Loop counts are therefore tested
 * with an explicit cmp, and single-bit tests use beq.
 */
PROLOGUE

.align 5

	// Save r4-r9 and fp (seven words); fp then points at the saved-fp slot,
	// so [fp, #4] is the first word above the pushed registers.
	push {r4 - r9, fp}
	add fp, sp, #24
	sub sp, sp, #STACKSIZE // reserve stack

	str OLD_A, A // store A

	lsl LDA, OLD_LDA, #4 // lda = lda * SIZE * 2

	sub r4, fp, #128
	vstm r4, { d8 - d15} // store floating point registers

	// BO2 = B + (N & ~1) * M * 16: start of the tail region.
	lsl r4 , M, #4 // M * SIZE * 2

	ldr r3, B

	and BO2 , N , #-2

	mul BO2, BO2, r4

	add BO2 , BO2, r3

	lsl M4, M, #5 // M4 = M * 2 * SIZE * 2

zgemm_tcopy_L2_BEGIN:

	asr J, M, #1 // J = M / 2
	cmp J, #0
	ble zgemm_tcopy_L1_BEGIN

zgemm_tcopy_L2_M2_BEGIN:

	ldr AO1, A // AO1 = A
	lsl r3, LDA, #1 // r3 = 2 * LDA
	add r3, r3 , AO1 // A = A + 2 * LDA
	str r3, A // store A

	ldr BO1, B
	add r3, BO1, #64 // B = B + 4 * SIZE *2
	str r3, B

	asr I, N, #1 // I = N / 2
	cmp I, #0
	ble zgemm_tcopy_L2_M2_60

zgemm_tcopy_L2_M2_40:

	COPY2x2
	subs I, I, #1
	bne zgemm_tcopy_L2_M2_40

zgemm_tcopy_L2_M2_60:

	// Odd N: one leftover element per line goes to the tail region.
	tst N , #1
	beq zgemm_tcopy_L2_M2_END

	COPY1x2


zgemm_tcopy_L2_M2_END:

	subs J , J, #1 // j--
	bne zgemm_tcopy_L2_M2_BEGIN

/*********************************************************************************************/

zgemm_tcopy_L1_BEGIN:

	// Odd M: one remaining line.
	tst M, #1
	beq zgemm_tcopy_L999


zgemm_tcopy_L1_M2_BEGIN:

	ldr AO1, A // AO1 = A
	add r3, LDA , AO1 // A = A + 1 * LDA
	str r3, A // store A

	ldr BO1, B
	add r3, BO1, #32 // B = B + 2 * SIZE *2
	str r3, B

	asr I, N, #1 // I = N / 2
	cmp I, #0
	ble zgemm_tcopy_L1_M2_60


zgemm_tcopy_L1_M2_40:

	COPY2x1
	subs I, I, #1
	bne zgemm_tcopy_L1_M2_40

zgemm_tcopy_L1_M2_60:

	// Odd N: the single leftover element goes to the tail region.
	tst N , #1
	beq zgemm_tcopy_L1_M2_END

	COPY1x1


zgemm_tcopy_L1_M2_END:



zgemm_tcopy_L999:

	sub r3, fp, #128
	vldm r3, { d8 - d15} // restore floating point registers

	mov r0, #0 // set return value
	sub sp, fp, #24
	pop {r4 - r9, fp}
	bx lr

EPILOGUE


+ 157
- 0
kernel/arm/zgemv_n.c View File

@@ -0,0 +1,157 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/23 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/


#include "common.h"

/*
 * Complex GEMV kernel, non-transposed case.
 *
 * a is walked column by column (consecutive columns are 2 * lda FLOATs
 * apart, each entry an interleaved real/imaginary pair). For every column
 * a complex scalar (sr, si) is formed from alpha and the matching x entry,
 * then scalar * column is accumulated into y. XCONJ selects the sign pattern
 * used when forming the scalar; CONJ and XCONJ together select the sign
 * pattern of the accumulation.
 *
 * inc_x and inc_y are strides in complex elements. dummy1 and buffer are
 * not used. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	const BLASLONG col_stride = 2 * lda;	/* FLOATs between columns of a */
	FLOAT *col = a;				/* start of the current column */
	FLOAT sr, si;				/* per-column complex scalar */
	BLASLONG xpos = 0;			/* FLOAT index into x */
	BLASLONG ypos, apos;			/* FLOAT indices into y and col */
	BLASLONG x_stride, y_stride;
	BLASLONG row, c;

	if ( inc_x == 1 && inc_y == 1 )
	{
		/* Contiguous vectors: y and the column share the same index. */
		for (c = 0; c < n; c++, col += col_stride, xpos += 2)
		{
#if !defined(XCONJ)
			sr = alpha_r * x[xpos] - alpha_i * x[xpos+1];
			si = alpha_r * x[xpos+1] + alpha_i * x[xpos];
#else
			sr = alpha_r * x[xpos] + alpha_i * x[xpos+1];
			si = alpha_r * x[xpos+1] - alpha_i * x[xpos];
#endif

			for (row = 0, apos = 0; row < m; row++, apos += 2)
			{
#if !defined(CONJ) && !defined(XCONJ)
				y[apos] += sr * col[apos] - si * col[apos+1];
				y[apos+1] += sr * col[apos+1] + si * col[apos];
#elif !defined(CONJ)
				y[apos] += sr * col[apos] + si * col[apos+1];
				y[apos+1] += sr * col[apos+1] - si * col[apos];
#elif !defined(XCONJ)
				y[apos] += sr * col[apos] + si * col[apos+1];
				y[apos+1] -= sr * col[apos+1] - si * col[apos];
#else
				y[apos] += sr * col[apos] - si * col[apos+1];
				y[apos+1] -= sr * col[apos+1] + si * col[apos];
#endif
			}
		}

		return(0);
	}

	/* General strides, expressed in FLOATs (two per complex element). */
	x_stride = 2 * inc_x;
	y_stride = 2 * inc_y;

	for (c = 0; c < n; c++, col += col_stride, xpos += x_stride)
	{
#if !defined(XCONJ)
		sr = alpha_r * x[xpos] - alpha_i * x[xpos+1];
		si = alpha_r * x[xpos+1] + alpha_i * x[xpos];
#else
		sr = alpha_r * x[xpos] + alpha_i * x[xpos+1];
		si = alpha_r * x[xpos+1] - alpha_i * x[xpos];
#endif

		for (row = 0, apos = 0, ypos = 0; row < m; row++, apos += 2, ypos += y_stride)
		{
#if !defined(CONJ) && !defined(XCONJ)
			y[ypos] += sr * col[apos] - si * col[apos+1];
			y[ypos+1] += sr * col[apos+1] + si * col[apos];
#elif !defined(CONJ)
			y[ypos] += sr * col[apos] + si * col[apos+1];
			y[ypos+1] += sr * col[apos+1] - si * col[apos];
#elif !defined(XCONJ)
			y[ypos] += sr * col[apos] + si * col[apos+1];
			y[ypos+1] -= sr * col[apos+1] - si * col[apos];
#else
			y[ypos] += sr * col[apos] - si * col[apos+1];
			y[ypos+1] -= sr * col[apos+1] + si * col[apos];
#endif
		}
	}

	return(0);
}


+ 699
- 0
kernel/arm/zgemv_n_vfp.S View File

@@ -0,0 +1,699 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Bytes of scratch stack reserved below the pushed core registers.
#define STACKSIZE 256

// The prologue sets fp = sp + 28 after pushing seven words, so [fp, #0]
// is the first word above the pushed registers and the following slots
// are the words after it.
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
// Values read from core registers on entry; both are spilled to the stack
// slots A and M below.
#define OLD_A r3
#define OLD_M r0

// AO1: read pointer into the matrix (reuses r0 once M has been spilled).
// N: column count, J: inner loop counter.
#define AO1 r0
#define N r1
#define J r2

// AO2 is only used as a prefetch address by the kernels in this file.
// XO / YO walk the x and y vectors; LDA, INC_X, INC_Y are byte strides
// once the prologue has scaled them.
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9

// Outer loop counter.
#define I r12

// Stack slots receiving d1 and d0 in the prologue.
#define ALPHA_I [fp, #-236]
#define ALPHA_R [fp, #-244]

// Stack slots for the spilled row count and the running matrix position.
#define M [fp, #-252 ]
#define A [fp, #-256 ]


// Byte offsets used by the pld prefetch hints.
#define X_PRE 64
#define Y_PRE 0
#define A_PRE 0

/**************************************************************************************/

/*
 * Opcode selection for the four CONJ / XCONJ combinations.
 * fmacd accumulates a product (d += a * b), fnmacd subtracts it
 * (d -= a * b).
 *
 * KMAC_R / KMAC_I are used by the KERNEL_* macros for the two cross terms
 * of each accumulated product; FMAC_R1, FMAC_R2, FMAC_I1, FMAC_I2 are used
 * by the SAVE_* macros when the accumulators are scaled by alpha and added
 * to y.
 */
#if !defined(CONJ) && !defined(XCONJ)

#define KMAC_R fnmacd
#define KMAC_I fmacd

#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd

#elif defined(CONJ) && !defined(XCONJ)

#define KMAC_R fmacd
#define KMAC_I fnmacd

#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd

#elif !defined(CONJ) && defined(XCONJ)

#define KMAC_R fmacd
#define KMAC_I fnmacd

#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd

#else

#define KMAC_R fnmacd
#define KMAC_I fmacd

#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd

#endif

/*
 * Clears the eight accumulators d8-d15 used by KERNEL_F4X1 / SAVE_F4 and
 * issues a prefetch hint for YO + Y_PRE.
 *
 * The zero comes from an all-zero bit pattern moved in from a core
 * register. Subtracting d8 from itself is not a safe way to obtain 0.0:
 * if d8 still holds NaN or +/-Inf (from the caller or from a previous row
 * block) the difference is NaN and every later block is contaminated.
 * r3 is saved and restored around its use, so the macro has no effect on
 * core registers or on sp.
 */
.macro INIT_F4

	pld [ YO, #Y_PRE ]
	push {r3}
	mov r3, #0
	vmov d8 , r3, r3
	pop {r3}
	vmov.f64 d9 , d8
	vmov.f64 d10, d8
	vmov.f64 d11, d8
	vmov.f64 d12, d8
	vmov.f64 d13, d8
	vmov.f64 d14, d8
	vmov.f64 d15, d8

.endm

/*
 * Four consecutive KERNEL_F4X1 steps (four columns), with a prefetch hint
 * for XO + X_PRE before the first and before the third step.
 */
.macro KERNEL_F4X4

pld [ XO, #X_PRE ]
KERNEL_F4X1
KERNEL_F4X1
pld [ XO, #X_PRE ]
KERNEL_F4X1
KERNEL_F4X1

.endm

/*
 * One column step for a block of four rows, unit stride in x.
 *
 * Loads one x pair (d4,d5) from XO and four matrix pairs (64 bytes) from
 * AO1. For row r = 0..3 with matrix pair (a0,a1) and accumulator pair
 * (d(8+2r), d(9+2r)):
 *   first  += a0*d4, then KMAC_R first,  a1, d5
 *   second += a0*d5, then KMAC_I second, a1, d4
 * Loads and arithmetic are interleaved. Afterwards XO advances by 16
 * bytes and AO1 and AO2 by LDA. AO2 is only used as a prefetch address.
 */
.macro KERNEL_F4X1

fldd d0 , [ AO1 ]

fldd d4 , [ XO ]
fldd d5 , [ XO, #8 ]

pld [ AO2, #A_PRE ]

fldd d1 , [ AO1, #8 ]
fmacd d8 , d0, d4
fldd d2 , [ AO1, #16 ]
fmacd d9 , d0, d5
fldd d3 , [ AO1, #24 ]
fmacd d10 , d2, d4
fldd d0 , [ AO1, #32 ]
fmacd d11 , d2, d5

KMAC_R d8 , d1, d5
KMAC_I d9 , d1, d4
KMAC_R d10 , d3, d5
fldd d1 , [ AO1, #40 ]
KMAC_I d11 , d3, d4

fldd d2 , [ AO1, #48 ]

fmacd d12 , d0, d4
fldd d3 , [ AO1, #56 ]
fmacd d13 , d0, d5
pld [ AO2, #A_PRE+32 ]
fmacd d14 , d2, d4
fmacd d15 , d2, d5

KMAC_R d12 , d1, d5
add XO , XO, #16
KMAC_I d13 , d1, d4
add AO1 , AO1, LDA
KMAC_R d14 , d3, d5
add AO2 , AO2, LDA
KMAC_I d15 , d3, d4

.endm

/*
 * Adds the four accumulated pairs d8-d15, scaled by alpha, to four
 * consecutive y elements (unit stride).
 *
 * d0 / d1 are loaded from ALPHA_R / ALPHA_I. y is handled in two halves
 * of 32 bytes: each half is loaded into d4-d7, updated through the
 * FMAC_R1 / FMAC_I1 / FMAC_R2 / FMAC_I2 opcodes selected above, and stored
 * back with writeback, so YO ends 64 bytes further on.
 */
.macro SAVE_F4

fldd d0, ALPHA_R
fldd d1, ALPHA_I

fldmiad YO, { d4 - d7 }

FMAC_R1 d4 , d0 , d8
FMAC_I1 d5 , d0 , d9
FMAC_R2 d4 , d1 , d9
FMAC_I2 d5 , d1 , d8

FMAC_R1 d6 , d0 , d10
FMAC_I1 d7 , d0 , d11
FMAC_R2 d6 , d1 , d11
FMAC_I2 d7 , d1 , d10

fstmiad YO!, { d4 - d7 }

fldmiad YO, { d4 - d7 }

FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12

FMAC_R1 d6 , d0 , d14
FMAC_I1 d7 , d0 , d15
FMAC_R2 d6 , d1 , d15
FMAC_I2 d7 , d1 , d14

fstmiad YO!, { d4 - d7 }

.endm




/*
 * Clears the accumulator pair d8/d9 used by KERNEL_F1X1 / SAVE_F1.
 *
 * The zero comes from an all-zero bit pattern moved in from a core
 * register. Subtracting d8 from itself is not a safe way to obtain 0.0:
 * if d8 still holds NaN or +/-Inf the difference is NaN. r3 is saved and
 * restored around its use, so the macro has no effect on core registers
 * or on sp.
 */
.macro INIT_F1

	push {r3}
	mov r3, #0
	vmov d8 , r3, r3
	pop {r3}
	vmov.f64 d9 , d8

.endm

/*
 * One column step for a single row, unit stride in x.
 *
 * Loads the matrix pair (d0,d1) from AO1 and the x pair (d4,d5) from XO:
 *   d8 += d0*d4, then KMAC_R d8, d1, d5
 *   d9 += d0*d5, then KMAC_I d9, d1, d4
 * XO then advances by 16 bytes and AO1 by LDA.
 */
.macro KERNEL_F1X1

fldd d0 , [ AO1 ]
fldd d1 , [ AO1, #8 ]

fldd d4 , [ XO ]
fldd d5 , [ XO, #8 ]

fmacd d8 , d0, d4
fmacd d9 , d0, d5

KMAC_R d8 , d1, d5
KMAC_I d9 , d1, d4

add XO , XO, #16
add AO1 , AO1, LDA


.endm

/*
 * Adds the accumulated pair d8/d9, scaled by alpha (d0/d1 loaded from
 * ALPHA_R / ALPHA_I), to one y element through the FMAC_* opcodes, stores
 * it back and advances YO by 16 bytes.
 */
.macro SAVE_F1

fldd d0, ALPHA_R
fldd d1, ALPHA_I

fldmiad YO, { d4 - d5 }

FMAC_R1 d4 , d0 , d8
FMAC_I1 d5 , d0 , d9
FMAC_R2 d4 , d1 , d9
FMAC_I2 d5 , d1 , d8

fstmiad YO, { d4 - d5 }

add YO, YO, #16

.endm

/****************************************************************************************/

/*
 * Clears the eight accumulators d8-d15 used by KERNEL_S4X1 / SAVE_S4.
 *
 * The zero comes from an all-zero bit pattern moved in from a core
 * register. Subtracting d8 from itself is not a safe way to obtain 0.0:
 * if d8 still holds NaN or +/-Inf (from the caller or from a previous row
 * block) the difference is NaN and every later block is contaminated.
 * r3 is saved and restored around its use, so the macro has no effect on
 * core registers or on sp.
 */
.macro INIT_S4

	push {r3}
	mov r3, #0
	vmov d8 , r3, r3
	pop {r3}
	vmov.f64 d9 , d8
	vmov.f64 d10, d8
	vmov.f64 d11, d8
	vmov.f64 d12, d8
	vmov.f64 d13, d8
	vmov.f64 d14, d8
	vmov.f64 d15, d8

.endm

/*
 * Four consecutive KERNEL_S4X1 steps (four columns).
 */
.macro KERNEL_S4X4

KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1

.endm

/*
 * One column step for a block of four rows, strided x.
 *
 * Same arithmetic as KERNEL_F4X1 without the load/compute interleaving
 * and without prefetch: for row r = 0..3 with matrix pair (a0,a1) from AO1
 * and x pair (d4,d5) from XO,
 *   d(8+2r) += a0*d4, then KMAC_R d(8+2r), a1, d5
 *   d(9+2r) += a0*d5, then KMAC_I d(9+2r), a1, d4
 * XO then advances by INC_X, AO1 and AO2 by LDA.
 */
.macro KERNEL_S4X1

fldd d0 , [ AO1 ]
fldd d1 , [ AO1, #8 ]
fldd d2 , [ AO1, #16 ]
fldd d3 , [ AO1, #24 ]

fldd d4 , [ XO ]
fldd d5 , [ XO, #8 ]

fmacd d8 , d0, d4
fmacd d9 , d0, d5
fmacd d10 , d2, d4
fmacd d11 , d2, d5

KMAC_R d8 , d1, d5
KMAC_I d9 , d1, d4
KMAC_R d10 , d3, d5
KMAC_I d11 , d3, d4

fldd d0 , [ AO1, #32 ]
fldd d1 , [ AO1, #40 ]
fldd d2 , [ AO1, #48 ]
fldd d3 , [ AO1, #56 ]

fmacd d12 , d0, d4
fmacd d13 , d0, d5
fmacd d14 , d2, d4
fmacd d15 , d2, d5

KMAC_R d12 , d1, d5
KMAC_I d13 , d1, d4
KMAC_R d14 , d3, d5
KMAC_I d15 , d3, d4

add XO , XO, INC_X
add AO1 , AO1, LDA
add AO2 , AO2, LDA

.endm

/*
 * Adds the four accumulated pairs d8-d15, scaled by alpha (d0/d1 loaded
 * from ALPHA_R / ALPHA_I), to four y elements that are INC_Y bytes apart.
 * Each element is loaded, updated through the FMAC_* opcodes, stored back,
 * and YO is then advanced by INC_Y. The register pairs d4-d5 and d6-d7 are
 * used alternately.
 */
.macro SAVE_S4

fldd d0, ALPHA_R
fldd d1, ALPHA_I

fldmiad YO, { d4 - d5 }

FMAC_R1 d4 , d0 , d8
FMAC_I1 d5 , d0 , d9
FMAC_R2 d4 , d1 , d9
FMAC_I2 d5 , d1 , d8

fstmiad YO, { d4 - d5 }

add YO, YO, INC_Y

fldmiad YO, { d6 - d7 }

FMAC_R1 d6 , d0 , d10
FMAC_I1 d7 , d0 , d11
FMAC_R2 d6 , d1 , d11
FMAC_I2 d7 , d1 , d10

fstmiad YO, { d6 - d7 }

add YO, YO, INC_Y

fldmiad YO, { d4 - d5 }

FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12

fstmiad YO, { d4 - d5 }

add YO, YO, INC_Y

fldmiad YO, { d6 - d7 }

FMAC_R1 d6 , d0 , d14
FMAC_I1 d7 , d0 , d15
FMAC_R2 d6 , d1 , d15
FMAC_I2 d7 , d1 , d14

fstmiad YO, { d6 - d7 }

add YO, YO, INC_Y

.endm




/*
 * Clears the accumulator pair d8/d9 used by KERNEL_S1X1 / SAVE_S1.
 *
 * The zero comes from an all-zero bit pattern moved in from a core
 * register. Subtracting d8 from itself is not a safe way to obtain 0.0:
 * if d8 still holds NaN or +/-Inf the difference is NaN. r3 is saved and
 * restored around its use, so the macro has no effect on core registers
 * or on sp.
 */
.macro INIT_S1

	push {r3}
	mov r3, #0
	vmov d8 , r3, r3
	pop {r3}
	vmov.f64 d9 , d8

.endm

/*
 * One column step for a single row, strided x.
 *
 * Loads the matrix pair (d0,d1) from AO1 and the x pair (d4,d5) from XO:
 *   d8 += d0*d4, then KMAC_R d8, d1, d5
 *   d9 += d0*d5, then KMAC_I d9, d1, d4
 * XO then advances by INC_X and AO1 by LDA.
 */
.macro KERNEL_S1X1

fldd d0 , [ AO1 ]
fldd d1 , [ AO1, #8 ]

fldd d4 , [ XO ]
fldd d5 , [ XO, #8 ]

fmacd d8 , d0, d4
fmacd d9 , d0, d5

KMAC_R d8 , d1, d5
KMAC_I d9 , d1, d4

add XO , XO, INC_X
add AO1 , AO1, LDA


.endm

/*
 * Adds the accumulated pair d8/d9, scaled by alpha (d0/d1 loaded from
 * ALPHA_R / ALPHA_I), to one y element through the FMAC_* opcodes, stores
 * it back and advances YO by INC_Y.
 */
.macro SAVE_S1

fldd d0, ALPHA_R
fldd d1, ALPHA_I

fldmiad YO, { d4 - d5 }

FMAC_R1 d4 , d0 , d8
FMAC_I1 d5 , d0 , d9
FMAC_R2 d4 , d1 , d9
FMAC_I2 d5 , d1 , d8

fstmiad YO, { d4 - d5 }

add YO, YO, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// Routine entry: save registers, validate the arguments, spill the values the
// loops re-read from the stack, and choose between the unit-stride ("F") and
// the strided ("S") code paths.
PROLOGUE

.align 5
push {r4 - r9 , fp} // 7 core registers = 28 bytes
add fp, sp, #28 // fp = value of sp at entry
sub sp, sp, #STACKSIZE // reserve stack

sub r12, fp, #192 // address of the FP save area inside the reserved frame

// NOTE(review): the kernel macros in this file use d8-d15; the non-DOUBLE branch
// saves only s8-s15 — presumably this file is only ever built with DOUBLE, confirm.
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif

// Nothing to do for non-positive dimensions.
cmp OLD_M, #0
ble zgemvn_kernel_L999

cmp N, #0
ble zgemvn_kernel_L999

// Spill values that are reloaded inside the loops.
str OLD_A, A
str OLD_M, M
vstr d0 , ALPHA_R // the kernel macros overwrite d0/d1, so keep them on the stack
vstr d1 , ALPHA_I


ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

// A zero increment is treated as "nothing to do".
cmp INC_X, #0
beq zgemvn_kernel_L999

cmp INC_Y, #0
beq zgemvn_kernel_L999

ldr LDA, OLD_LDA


// Convert LDA from elements to bytes.
#if defined(DOUBLE)
lsl LDA, LDA, #4 // LDA * SIZE * 2
#else
lsl LDA, LDA, #3 // LDA * SIZE * 2
#endif

// Any increment other than 1 selects the strided path.
cmp INC_X, #1
bne zgemvn_kernel_S4_BEGIN

cmp INC_Y, #1
bne zgemvn_kernel_S4_BEGIN


// Unit-stride path, blocks of four rows. I counts row blocks (M / 4); J counts
// column steps inside one block. INIT_F4, KERNEL_F4X4, KERNEL_F4X1 and SAVE_F4
// are defined earlier in this file (outside this excerpt).
zgemvn_kernel_F4_BEGIN:

ldr YO , Y

ldr I, M
asrs I, I, #2 // I = M / 4
ble zgemvn_kernel_F1_BEGIN

zgemvn_kernel_F4X4:

ldr AO1, A // start of the current row block
add AO2, AO1, LDA
add r3 , AO1, #64 // next row block starts 64 bytes (four 16-byte elements) further
str r3 , A

add AO2, AO2, LDA
add AO2, AO2, LDA // AO2 = AO1 + 3 * LDA

ldr XO , X // restart x for this row block

INIT_F4

asrs J, N, #2 // J = N / 4
ble zgemvn_kernel_F4X1


zgemvn_kernel_F4X4_10:

KERNEL_F4X4

subs J, J, #1
bne zgemvn_kernel_F4X4_10


zgemvn_kernel_F4X1:

ands J, N , #3 // remaining columns (N mod 4)
ble zgemvn_kernel_F4_END

zgemvn_kernel_F4X1_10:

KERNEL_F4X1

subs J, J, #1
bne zgemvn_kernel_F4X1_10


zgemvn_kernel_F4_END:

SAVE_F4

subs I , I , #1
bne zgemvn_kernel_F4X4


// Unit-stride path, leftover rows (M mod 4), handled one row at a time.
// INIT_F1, KERNEL_F1X1 and SAVE_F1 are defined earlier in this file.
zgemvn_kernel_F1_BEGIN:

ldr I, M
ands I, I , #3 // rows left after the four-row blocks
ble zgemvn_kernel_L999

zgemvn_kernel_F1X1:

ldr AO1, A
add r3, AO1, #16 // next row starts one 16-byte element further
str r3, A
ldr XO , X // restart x for this row

INIT_F1

mov J, N // walk all N columns (N > 0 was checked in the prologue)


zgemvn_kernel_F1X1_10:

KERNEL_F1X1

subs J, J, #1
bne zgemvn_kernel_F1X1_10


zgemvn_kernel_F1_END:

SAVE_F1

subs I , I , #1
bne zgemvn_kernel_F1X1

b zgemvn_kernel_L999 // skip the strided path



/*************************************************************************************************************/

// Strided path, blocks of four rows. Same loop structure as the unit-stride path,
// but x and y are advanced by the byte increments computed here.
// INIT_S4 and KERNEL_S4X4 are defined earlier in this file (outside this excerpt).
zgemvn_kernel_S4_BEGIN:

// Convert the increments from elements to bytes.
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif

ldr YO , Y

ldr I, M
asrs I, I, #2 // I = M / 4
ble zgemvn_kernel_S1_BEGIN

zgemvn_kernel_S4X4:

ldr AO1, A // start of the current row block
add AO2, AO1, LDA
add r3 , AO1, #64 // next row block starts 64 bytes further
str r3 , A

ldr XO , X // restart x for this row block

INIT_S4

asrs J, N, #2 // J = N / 4
ble zgemvn_kernel_S4X1


zgemvn_kernel_S4X4_10:

KERNEL_S4X4

subs J, J, #1
bne zgemvn_kernel_S4X4_10


zgemvn_kernel_S4X1:

ands J, N , #3 // remaining columns (N mod 4)
ble zgemvn_kernel_S4_END

zgemvn_kernel_S4X1_10:

KERNEL_S4X1

subs J, J, #1
bne zgemvn_kernel_S4X1_10


zgemvn_kernel_S4_END:

SAVE_S4

subs I , I , #1
bne zgemvn_kernel_S4X4


// Strided path, leftover rows (M mod 4), handled one row at a time.
zgemvn_kernel_S1_BEGIN:

ldr I, M
ands I, I , #3 // rows left after the four-row blocks
ble zgemvn_kernel_L999

zgemvn_kernel_S1X1:

ldr AO1, A
add r3, AO1, #16 // next row starts one 16-byte element further
str r3, A
ldr XO , X // restart x for this row

INIT_S1

mov J, N // walk all N columns


zgemvn_kernel_S1X1_10:

KERNEL_S1X1 // one column step; advances XO by INC_X and AO1 by LDA

subs J, J, #1
bne zgemvn_kernel_S1X1_10


zgemvn_kernel_S1_END:

SAVE_S1 // updates one y element and steps YO by INC_Y

subs I , I , #1
bne zgemvn_kernel_S1X1


/*************************************************************************************************************/

// Common exit: restore the saved floating point and core registers, return 0.
zgemvn_kernel_L999:

sub r3, fp, #192 // same save-area address the prologue used

#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif

mov r0, #0 // set return value

sub sp, fp, #28 // drop the local frame; sp now points at the pushed registers
pop {r4 -r9 ,fp}
bx lr

EPILOGUE


+ 140
- 0
kernel/arm/zgemv_t.c View File

@@ -0,0 +1,140 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/23 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/


#include "common.h"

/*
 * Shared column loop of the complex transposed GEMV kernel.
 *
 * For each of the n columns of a (columns are col_step FLOATs apart) the m
 * interleaved (re, im) entries of the column are multiplied with the entries
 * of x (x_step FLOATs apart) and summed; the sum, scaled by
 * (alpha_r, alpha_i), is added to the matching entry of y (y_step FLOATs
 * apart). CONJ and XCONJ select the sign pattern at compile time.
 */
static void zgemv_t_columns(BLASLONG m, BLASLONG n, FLOAT alpha_r, FLOAT alpha_i,
			    FLOAT *a, BLASLONG col_step,
			    FLOAT *x, BLASLONG x_step,
			    FLOAT *y, BLASLONG y_step)
{
	FLOAT *col = a;
	BLASLONG y_pos = 0;
	BLASLONG col_no, row_no;

	for (col_no = 0; col_no < n; col_no++)
	{
		FLOAT dot_r = 0.0;
		FLOAT dot_i = 0.0;
		BLASLONG a_pos = 0;
		BLASLONG x_pos = 0;

		for (row_no = 0; row_no < m; row_no++)
		{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			dot_r += col[a_pos] * x[x_pos] - col[a_pos+1] * x[x_pos+1];
			dot_i += col[a_pos] * x[x_pos+1] + col[a_pos+1] * x[x_pos];
#else
			dot_r += col[a_pos] * x[x_pos] + col[a_pos+1] * x[x_pos+1];
			dot_i += col[a_pos] * x[x_pos+1] - col[a_pos+1] * x[x_pos];
#endif
			a_pos += 2;
			x_pos += x_step;
		}

		/* Fold alpha * (column sum) into y; XCONJ uses the conjugated sum. */
#if !defined(XCONJ)
		y[y_pos] += alpha_r * dot_r - alpha_i * dot_i;
		y[y_pos+1] += alpha_r * dot_i + alpha_i * dot_r;
#else
		y[y_pos] += alpha_r * dot_r + alpha_i * dot_i;
		y[y_pos+1] -= alpha_r * dot_i - alpha_i * dot_r;
#endif

		col += col_step;
		y_pos += y_step;
	}
}

/*
 * Generic C kernel for the transposed complex matrix-vector update.
 *
 * m, n          rows and columns of a that are visited
 * alpha_r/_i    real and imaginary part of the scaling factor
 * a, lda        matrix data and the distance between columns, in complex elements
 * x, inc_x      input vector and its increment, in complex elements
 * y, inc_y      vector that is updated in place and its increment
 * dummy1, buffer are not used by this implementation.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	/* One complex element occupies two FLOATs. */
	const BLASLONG col_step = 2 * lda;

	if ( inc_x == 1 && inc_y == 1 )
	{
		/* Contiguous vectors: pass the strides as literals. */
		zgemv_t_columns(m, n, alpha_r, alpha_i, a, col_step, x, 2, y, 2);
	}
	else
	{
		zgemv_t_columns(m, n, alpha_r, alpha_i, a, col_step, x, 2 * inc_x, y, 2 * inc_y);
	}

	return(0);
}




+ 608
- 0
kernel/arm/zgemv_t_vfp.S View File

@@ -0,0 +1,608 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes reserved below the pushed registers for the FP save area and the N / A slots. */
#define STACKSIZE 256

/* Arguments read relative to fp, which the prologue sets to the value sp had on
   entry; OLD_A and OLD_N name registers that are only read in the prologue. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#define OLD_N r1

/* Working registers. AO1 shares r1 with OLD_N, which is why N is spilled to the
   stack slot below before the loops start. */
#define M r0
#define AO1 r1
#define J r2

#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9

#define I r12

/* Stack slots inside the reserved frame: column count and the address of the
   next column(s) of the matrix. */
#define N [fp, #-252 ]
#define A [fp, #-256 ]


/* Prefetch distances; no instruction in this file references them. */
#define X_PRE 512
#define A_PRE 512
#define Y_PRE 32

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * Opcode selection for the four CONJ / XCONJ combinations.
 * fmacd adds the product to the destination, fnmacd subtracts it.
 *   KMAC_R / KMAC_I : second product of the real / imaginary part of the
 *                     element-by-element accumulation in the KERNEL_* macros.
 *   FMAC_R1 .. I2   : the four products that fold the scaled sum into y in the
 *                     SAVE_* macros (R = first double of the pair, I = second).
 */
#if !defined(CONJ) && !defined(XCONJ)

#define KMAC_R fnmacd
#define KMAC_I fmacd

#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd

#elif defined(CONJ) && !defined(XCONJ)

#define KMAC_R fmacd
#define KMAC_I fnmacd

#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd

#elif !defined(CONJ) && defined(XCONJ)

#define KMAC_R fmacd
#define KMAC_I fnmacd

#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd

#else

/* CONJ and XCONJ both defined */
#define KMAC_R fnmacd
#define KMAC_I fmacd

#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd

#endif



// INIT_F2: clear the accumulators d12/d13 (first column) and d14/d15 (second column).
// The registers are zeroed from a core register rather than with "vsub x, x, x":
// a register that holds Inf or NaN (caller garbage on entry, or an overflowed sum
// of the previous column pair) would give NaN under x - x and poison every later
// column. Clobbers r3, which this file only uses as a scratch register once the
// prologue has stored OLD_A; flags are left untouched.
.macro INIT_F2

mov r3 , #0
vmov d12, r3 , r3 // all 64 bits zero = +0.0
vmov.f64 d13, d12
vmov.f64 d14, d12
vmov.f64 d15, d12

.endm

// KERNEL_F2X4: four consecutive KERNEL_F2X1 steps (four rows of two columns).
.macro KERNEL_F2X4

KERNEL_F2X1
KERNEL_F2X1
KERNEL_F2X1
KERNEL_F2X1

.endm

// KERNEL_F2X1: one row step for two columns on the unit-stride path.
// Loads one x element and one element from each of the two columns (all with
// post-increment) and accumulates into d12/d13 (column AO1) and d14/d15 (column AO2).
.macro KERNEL_F2X1

fldmiad XO! , { d2 - d3 } // x element, XO += 16
fldmiad AO1!, { d4 - d5 } // element of the first column, AO1 += 16

fmacd d12 , d4 , d2 // d12 += d4 * d2
fmacd d13 , d4 , d3 // d13 += d4 * d3
fldmiad AO2!, { d8 - d9 } // element of the second column, AO2 += 16
KMAC_R d12 , d5 , d3 // add or subtract d5 * d3, per the CONJ/XCONJ selection
KMAC_I d13 , d5 , d2

fmacd d14 , d8 , d2
fmacd d15 , d8 , d3
KMAC_R d14 , d9 , d3
KMAC_I d15 , d9 , d2

.endm

// SAVE_F2: fold the two accumulated sums into two consecutive y elements.
// d0/d1 are used as the scaling pair; no instruction in this file writes them,
// so presumably they still hold alpha_r/alpha_i from the call — confirm against the ABI in use.
.macro SAVE_F2

fldmiad YO, { d4 - d7 } // load two y elements (four doubles), no writeback

FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12

FMAC_R1 d6 , d0 , d14
FMAC_I1 d7 , d0 , d15
FMAC_R2 d6 , d1 , d15
FMAC_I2 d7 , d1 , d14

fstmiad YO!, { d4 - d7 } // store both elements, YO += 32

.endm

/************************************************************************************************/

// INIT_F1: clear the accumulator pair d12/d13 for a single column.
// Zeroed from a core register rather than with "vsub x, x, x": x - x is NaN when
// the register holds Inf or NaN (for example an overflowed sum left by the
// preceding column pair). Clobbers r3, which this file only uses as a scratch
// register once the prologue has stored OLD_A; flags are left untouched.
.macro INIT_F1

mov r3 , #0
vmov d12, r3 , r3 // all 64 bits zero = +0.0
vmov.f64 d13, d12

.endm

// KERNEL_F1X4: four consecutive KERNEL_F1X1 steps (four rows of one column).
.macro KERNEL_F1X4

KERNEL_F1X1
KERNEL_F1X1
KERNEL_F1X1
KERNEL_F1X1

.endm

// KERNEL_F1X1: one row step for a single column on the unit-stride path;
// accumulates into d12/d13.
.macro KERNEL_F1X1

fldmiad XO! , { d2 - d3 } // x element, XO += 16
fldmiad AO1!, { d4 - d5 } // column element, AO1 += 16

fmacd d12 , d4 , d2 // d12 += d4 * d2
fmacd d13 , d4 , d3 // d13 += d4 * d3
KMAC_R d12 , d5 , d3 // add or subtract d5 * d3, per the CONJ/XCONJ selection
KMAC_I d13 , d5 , d2

.endm

// SAVE_F1: fold the accumulated sum d12/d13 into one y element.
// d0/d1 are used as the scaling pair; no instruction in this file writes them,
// so presumably they still hold alpha_r/alpha_i from the call — confirm against the ABI in use.
.macro SAVE_F1

fldmiad YO, { d4 - d5 } // load the y element, no writeback

FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12

fstmiad YO!, { d4 - d5 } // store it back, YO += 16

.endm

/************************************************************************************************/

// INIT_S2: clear the accumulators d12/d13 (first column) and d14/d15 (second column)
// for the strided path.
// Zeroed from a core register rather than with "vsub x, x, x": x - x is NaN when
// the register holds Inf or NaN (caller garbage on entry, or an overflowed sum of
// the previous column pair), which would poison every later column. Clobbers r3,
// which this file only uses as a scratch register once the prologue has stored
// OLD_A; flags are left untouched.
.macro INIT_S2

mov r3 , #0
vmov d12, r3 , r3 // all 64 bits zero = +0.0
vmov.f64 d13, d12
vmov.f64 d14, d12
vmov.f64 d15, d12

.endm

// KERNEL_S2X4: four consecutive KERNEL_S2X1 steps (four rows of two columns).
.macro KERNEL_S2X4

KERNEL_S2X1
KERNEL_S2X1
KERNEL_S2X1
KERNEL_S2X1

.endm

// KERNEL_S2X1: one row step for two columns on the strided path.
// Same arithmetic as KERNEL_F2X1, but x is advanced by INC_X instead of by
// post-increment; the matrix columns are still read contiguously.
.macro KERNEL_S2X1

fldmiad XO , { d2 - d3 } // x element, no writeback
fldmiad AO1!, { d4 - d5 } // element of the first column, AO1 += 16
fldmiad AO2!, { d8 - d9 } // element of the second column, AO2 += 16

fmacd d12 , d4 , d2
fmacd d13 , d4 , d3
KMAC_R d12 , d5 , d3 // add or subtract, per the CONJ/XCONJ selection
KMAC_I d13 , d5 , d2

fmacd d14 , d8 , d2
fmacd d15 , d8 , d3
KMAC_R d14 , d9 , d3
KMAC_I d15 , d9 , d2

add XO, XO, INC_X // step to the next x element

.endm

// SAVE_S2: fold the two accumulated sums into two y elements that are INC_Y apart.
// d0/d1 are used as the scaling pair; no instruction in this file writes them,
// so presumably they still hold alpha_r/alpha_i from the call — confirm against the ABI in use.
.macro SAVE_S2

fldmiad YO, { d4 - d5 } // first y element

FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12

fstmiad YO, { d4 - d5 }

add YO, YO, INC_Y

fldmiad YO, { d6 - d7 } // second y element

FMAC_R1 d6 , d0 , d14
FMAC_I1 d7 , d0 , d15
FMAC_R2 d6 , d1 , d15
FMAC_I2 d7 , d1 , d14

fstmiad YO, { d6 - d7 }

add YO, YO, INC_Y

.endm

/************************************************************************************************/

// INIT_S1: clear the accumulator pair d12/d13 for a single column on the strided path.
// Zeroed from a core register rather than with "vsub x, x, x": x - x is NaN when
// the register holds Inf or NaN (for example an overflowed sum left by the
// preceding column pair). Clobbers r3, which this file only uses as a scratch
// register once the prologue has stored OLD_A; flags are left untouched.
.macro INIT_S1

mov r3 , #0
vmov d12, r3 , r3 // all 64 bits zero = +0.0
vmov.f64 d13, d12

.endm

// KERNEL_S1X4: four consecutive KERNEL_S1X1 steps (four rows of one column).
.macro KERNEL_S1X4

KERNEL_S1X1
KERNEL_S1X1
KERNEL_S1X1
KERNEL_S1X1

.endm

// KERNEL_S1X1: one row step for a single column on the strided path;
// accumulates into d12/d13 and advances XO by INC_X.
.macro KERNEL_S1X1

fldmiad XO , { d2 - d3 } // x element, no writeback
fldmiad AO1!, { d4 - d5 } // column element, AO1 += 16

fmacd d12 , d4 , d2
fmacd d13 , d4 , d3
KMAC_R d12 , d5 , d3 // add or subtract, per the CONJ/XCONJ selection
KMAC_I d13 , d5 , d2

add XO, XO, INC_X

.endm

// SAVE_S1: fold the accumulated sum d12/d13 into one y element and step YO by INC_Y.
// d0/d1 are used as the scaling pair; no instruction in this file writes them,
// so presumably they still hold alpha_r/alpha_i from the call — confirm against the ABI in use.
.macro SAVE_S1

fldmiad YO, { d4 - d5 }

FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12

fstmiad YO, { d4 - d5 }

add YO, YO, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// Routine entry: save registers, validate the arguments, spill A and N, and
// choose between the unit-stride ("F") and the strided ("S") code paths.
PROLOGUE

.align 5
push {r4 - r9 , fp} // 7 core registers = 28 bytes
add fp, sp, #28 // fp = value of sp at entry, so [fp, #0] is the first stack argument
sub sp, sp, #STACKSIZE // reserve stack

sub r12, fp, #192 // address of the FP save area inside the reserved frame

// NOTE(review): the macros in this file use d8, d9 and d12-d15; the non-DOUBLE
// branch saves only s8-s15 — presumably this file is only ever built with DOUBLE, confirm.
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif

// Nothing to do for non-positive dimensions.
cmp M, #0
ble zgemvt_kernel_L999

cmp OLD_N, #0
ble zgemvt_kernel_L999

// r3 and r1 are reused below (scratch and AO1), so keep A and N on the stack.
str OLD_A, A
str OLD_N, N

ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y

// A zero increment is treated as "nothing to do".
cmp INC_X, #0
beq zgemvt_kernel_L999

cmp INC_Y, #0
beq zgemvt_kernel_L999

ldr LDA, OLD_LDA


// Convert LDA from elements to bytes; the DOUBLE branch multiplies by 16,
// the size of the two-double element the kernel macros load.
#if defined(DOUBLE)
lsl LDA, LDA, #4 // LDA * 16 bytes (two doubles per element)
#else
lsl LDA, LDA, #3 // LDA * 8 bytes
#endif

// Any increment other than 1 selects the strided path.
cmp INC_X, #1
bne zgemvt_kernel_S2_BEGIN

cmp INC_Y, #1
bne zgemvt_kernel_S2_BEGIN


// Unit-stride path, two columns at a time. J counts column pairs (N / 2);
// I counts row steps inside one column pair.
zgemvt_kernel_F2_BEGIN:

ldr YO , Y

ldr J, N
asrs J, J, #1 // J = N / 2
ble zgemvt_kernel_F1_BEGIN

zgemvt_kernel_F2X4:

ldr AO1, A // first column of the pair
add AO2, AO1, LDA // second column of the pair
add r3 , AO2, LDA // start of the next pair
str r3 , A

ldr XO , X // restart x for this column pair

INIT_F2

asrs I, M, #2 // I = M / 4
ble zgemvt_kernel_F2X1


zgemvt_kernel_F2X4_10:

KERNEL_F2X4 // four rows per iteration

subs I, I, #1
bne zgemvt_kernel_F2X4_10


zgemvt_kernel_F2X1:

ands I, M , #3 // remaining rows (M mod 4)
ble zgemvt_kernel_F2_END

zgemvt_kernel_F2X1_10:

KERNEL_F2X1

subs I, I, #1
bne zgemvt_kernel_F2X1_10


zgemvt_kernel_F2_END:

SAVE_F2 // update two y elements, YO += 32

subs J , J , #1
bne zgemvt_kernel_F2X4


// Unit-stride path, last column when N is odd.
zgemvt_kernel_F1_BEGIN:

ldr J, N
ands J, J, #1 // is there a leftover column?
ble zgemvt_kernel_L999

zgemvt_kernel_F1X4:

ldr AO1, A // the A slot already points at the leftover column

ldr XO , X

INIT_F1

asrs I, M, #2 // I = M / 4
ble zgemvt_kernel_F1X1


zgemvt_kernel_F1X4_10:

KERNEL_F1X4 // four rows per iteration

subs I, I, #1
bne zgemvt_kernel_F1X4_10


zgemvt_kernel_F1X1:

ands I, M , #3 // remaining rows (M mod 4)
ble zgemvt_kernel_F1_END

zgemvt_kernel_F1X1_10:

KERNEL_F1X1

subs I, I, #1
bne zgemvt_kernel_F1X1_10


zgemvt_kernel_F1_END:

SAVE_F1

b zgemvt_kernel_L999 // skip the strided path



/*************************************************************************************************************/

// Strided path, two columns at a time. Same loop structure as the unit-stride
// path, but x and y are advanced by the byte increments computed here.
zgemvt_kernel_S2_BEGIN:

// Convert the increments from elements to bytes (16 bytes per element in the
// DOUBLE build, matching the two-double loads in the kernel macros).
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * 16 bytes (two doubles per element)
lsl INC_Y, INC_Y, #4 // INC_Y * 16 bytes (two doubles per element)
#else
lsl INC_X, INC_X, #3 // INC_X * 8 bytes
lsl INC_Y, INC_Y, #3 // INC_Y * 8 bytes
#endif

ldr YO , Y

ldr J, N
asrs J, J, #1 // J = N / 2
ble zgemvt_kernel_S1_BEGIN

zgemvt_kernel_S2X4:

ldr AO1, A // first column of the pair
add AO2, AO1, LDA // second column of the pair
add r3 , AO2, LDA // start of the next pair
str r3 , A

ldr XO , X // restart x for this column pair

INIT_S2

asrs I, M, #2 // I = M / 4
ble zgemvt_kernel_S2X1


zgemvt_kernel_S2X4_10:

KERNEL_S2X4 // four rows per iteration

subs I, I, #1
bne zgemvt_kernel_S2X4_10


zgemvt_kernel_S2X1:

ands I, M , #3 // remaining rows (M mod 4)
ble zgemvt_kernel_S2_END

zgemvt_kernel_S2X1_10:

KERNEL_S2X1

subs I, I, #1
bne zgemvt_kernel_S2X1_10


zgemvt_kernel_S2_END:

SAVE_S2 // update two y elements, each followed by YO += INC_Y

subs J , J , #1
bne zgemvt_kernel_S2X4


// Strided path, last column when N is odd. Falls through to the common exit.
zgemvt_kernel_S1_BEGIN:

ldr J, N
ands J, J, #1 // is there a leftover column?
ble zgemvt_kernel_L999

zgemvt_kernel_S1X4:

ldr AO1, A // the A slot already points at the leftover column

ldr XO , X

INIT_S1

asrs I, M, #2 // I = M / 4
ble zgemvt_kernel_S1X1


zgemvt_kernel_S1X4_10:

KERNEL_S1X4 // four rows per iteration

subs I, I, #1
bne zgemvt_kernel_S1X4_10


zgemvt_kernel_S1X1:

ands I, M , #3 // remaining rows (M mod 4)
ble zgemvt_kernel_S1_END

zgemvt_kernel_S1X1_10:

KERNEL_S1X1

subs I, I, #1
bne zgemvt_kernel_S1X1_10


zgemvt_kernel_S1_END:

SAVE_S1



/*************************************************************************************************************/

// Common exit: restore the saved floating point and core registers, return 0.
zgemvt_kernel_L999:

sub r3, fp, #192 // same save-area address the prologue used

#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif

mov r0, #0 // set return value

sub sp, fp, #28 // drop the local frame; sp now points at the pushed registers
pop {r4 -r9 ,fp}
bx lr

EPILOGUE


+ 106
- 0
kernel/arm/znrm2.c View File

@@ -0,0 +1,106 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/13 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif



/*
 * Euclidean norm of a complex vector stored as interleaved (re, im) pairs.
 *
 * The sum of squares is kept in scaled form: `scale` is the largest magnitude
 * seen so far and `ssq` the sum of squares of all visited components divided
 * by scale^2, so the squares themselves are never formed directly.
 *
 * n      number of complex elements
 * x      vector data
 * inc_x  distance between elements, in complex elements
 *
 * Returns 0.0 when n < 0 or inc_x < 1, otherwise scale * sqrt(ssq).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT scale = 0.0;
	FLOAT ssq = 1.0;
	BLASLONG step, limit, pos, part;

	if (n < 0 || inc_x < 1 ) return(0.0);

	step = 2 * inc_x;	/* two FLOATs per complex element */
	limit = n * step;

	for (pos = 0; pos < limit; pos += step)
	{
		/* part 0 is the real component, part 1 the imaginary one */
		for (part = 0; part < 2; part++)
		{
			FLOAT magnitude;

			if ( x[pos + part] == 0.0 ) continue;

			magnitude = ABS( x[pos + part] );
			if ( scale < magnitude )
			{
				/* new maximum: rescale the running sum to it */
				ssq = 1 + ssq * ( scale / magnitude ) * ( scale / magnitude );
				scale = magnitude ;
			}
			else
			{
				ssq += ( magnitude / scale ) * ( magnitude / scale );
			}
		}
	}

	scale = scale * sqrt( ssq );
	return(scale);
}


+ 68
- 0
kernel/arm/zrot.c View File

@@ -0,0 +1,68 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

/*
 * Apply a plane rotation with real coefficients c and s to two complex vectors:
 *     x' = c*x + s*y
 *     y' = c*y - s*x
 * applied separately to the real and imaginary components.
 *
 * n             number of complex elements
 * x, inc_x      first vector and its increment (in complex elements)
 * y, inc_y      second vector and its increment
 *
 * Does nothing when n <= 0. Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
	BLASLONG x_step, y_step;
	BLASLONG x_pos, y_pos;
	BLASLONG count;
	FLOAT new_x_re, new_x_im;

	if ( n <= 0 ) return(0);

	/* two FLOATs per complex element */
	x_step = 2 * inc_x ;
	y_step = 2 * inc_y ;

	for (count = 0, x_pos = 0, y_pos = 0; count < n; count++, x_pos += x_step, y_pos += y_step)
	{
		/* x' is held back until y' has been computed from the old x */
		new_x_re = c*x[x_pos] + s*y[y_pos] ;
		new_x_im = c*x[x_pos+1] + s*y[y_pos+1] ;
		y[y_pos] = c*y[y_pos] - s*x[x_pos] ;
		y[y_pos+1] = c*y[y_pos+1] - s*x[x_pos+1] ;
		x[x_pos] = new_x_re ;
		x[x_pos+1] = new_x_im ;
	}

	return(0);
}


+ 64
- 0
kernel/arm/zscal.c View File

@@ -0,0 +1,64 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

/*
 * Scale a complex vector in place by the complex factor (da_r, da_i).
 *
 * n         number of complex elements
 * x, inc_x  vector data and its increment, in complex elements
 * All other parameters are unused by this implementation.
 *
 * Does nothing when n < 0 or inc_x < 1. Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG step;
	BLASLONG pos = 0;
	BLASLONG count = 0;

	if ( n < 0 || inc_x < 1 ) return(0);

	step = 2 * inc_x;	/* two FLOATs per complex element */

	while ( count < n )
	{
		/* read both components before either one is overwritten */
		const FLOAT re = x[pos];
		const FLOAT im = x[pos+1];

		x[pos] = da_r * re - da_i * im ;
		x[pos+1] = da_r * im + da_i * re ;

		pos += step;
		count++;
	}

	return(0);
}


+ 70
- 0
kernel/arm/zswap.c View File

@@ -0,0 +1,70 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"
#include <stdio.h>

/*
 * Exchange the contents of two complex vectors element by element.
 *
 * n             number of complex elements
 * x, inc_x      first vector and its increment (in complex elements)
 * y, inc_y      second vector and its increment
 * All other parameters are unused by this implementation.
 *
 * Does nothing when n < 0. Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG x_step, y_step;
	BLASLONG x_pos, y_pos;
	BLASLONG count;
	FLOAT saved_re, saved_im;

	if ( n < 0 ) return(0);

	/* two FLOATs per complex element */
	x_step = 2 * inc_x;
	y_step = 2 * inc_y;

	for (count = 0, x_pos = 0, y_pos = 0; count < n; count++, x_pos += x_step, y_pos += y_step)
	{
		saved_re = x[x_pos] ;
		saved_im = x[x_pos+1] ;
		x[x_pos] = y[y_pos] ;
		x[x_pos+1] = y[y_pos+1] ;
		y[y_pos] = saved_re ;
		y[y_pos+1] = saved_im ;
	}

	return(0);
}


+ 1537
- 0
kernel/arm/ztrmm_kernel_2x2_vfp.S
File diff suppressed because it is too large
View File


+ 1538
- 0
kernel/arm/ztrmm_kernel_2x2_vfpv3.S
File diff suppressed because it is too large
View File


+ 46
- 0
kernel/arm64/KERNEL View File

@@ -0,0 +1,46 @@
# Fallback kernel sources: each variable is only assigned here when nothing has
# set it before this file is read.

# 2-norm kernels (real and complex variants)
ifndef SNRM2KERNEL
SNRM2KERNEL = nrm2.c
endif

ifndef DNRM2KERNEL
DNRM2KERNEL = nrm2.c
endif

ifndef CNRM2KERNEL
CNRM2KERNEL = znrm2.c
endif

ifndef ZNRM2KERNEL
ZNRM2KERNEL = znrm2.c
endif

# Complex absolute value helpers, taken from the generic directory
ifndef SCABS_KERNEL
SCABS_KERNEL = ../generic/cabs.c
endif

ifndef DCABS_KERNEL
DCABS_KERNEL = ../generic/cabs.c
endif

ifndef QCABS_KERNEL
QCABS_KERNEL = ../generic/cabs.c
endif

ifndef LSAME_KERNEL
LSAME_KERNEL = ../generic/lsame.c
endif

# GEMM beta scaling, taken from the generic directory
ifndef SGEMM_BETA
SGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = ../generic/zgemm_beta.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif



+ 134
- 0
kernel/arm64/KERNEL.ARMV8 View File

@@ -0,0 +1,134 @@
# Kernel selection for ARMV8. Every entry points at a C source: level 1 and
# GEMV kernels come from ../arm, GEMM/TRMM/TRSM kernels from ../generic.
# Prefix S/D/C/Z = single, double, single complex, double complex.

# --- absolute maximum / minimum values ---
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c

SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c

SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c

SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c

# --- index-returning variants of the above ---
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c

ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c

ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c

ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c

# --- remaining level 1 kernels ---
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c

SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c

SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c

SDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c

SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c

SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c

SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c

SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c

# --- matrix-vector kernels (N = not transposed, T = transposed) ---
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c

SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c

# --- TRMM kernels (generic 2x2) ---
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c

# --- GEMM kernels (generic 2x2) with their copy routines and object names ---
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

# --- TRSM kernels: all four precisions name the same generic sources ---
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c





+ 2
- 0
kernel/arm64/Makefile View File

@@ -0,0 +1,2 @@
clean ::


+ 33
- 0
lapack/laswp/arm/Makefile View File

@@ -0,0 +1,33 @@
# Selects the LASWP / ZLASWP source files for this architecture directory and
# then hands over to the shared generic Makefile.
TOPDIR = ../../..
include ../../../Makefile.system

# Per-core overrides.
# NOTE(review): CORE2, OPTERON and PRESCOTT look like x86 core names carried over
# from another architecture's Makefile — confirm whether they can ever match here.
ifeq ($(CORE), CORE2)
LASWP = ../generic/laswp_k_2.c
ZLASWP = ../generic/zlaswp_k_2.c
endif

ifeq ($(CORE), OPTERON)
LASWP = ../generic/laswp_k_1.c
ZLASWP = ../generic/zlaswp_k_1.c
endif

ifeq ($(CORE), PRESCOTT)
LASWP = ../generic/laswp_k_1.c
ZLASWP = ../generic/zlaswp_k_1.c
endif

# Evaluated after the per-core blocks, so it overrides them when DYNAMIC_ARCH is 1.
ifeq ($(DYNAMIC_ARCH), 1)
LASWP = ../generic/laswp_k_4.c
ZLASWP = ../generic/zlaswp_k_4.c
endif

# Defaults when nothing above made a choice.
ifndef LASWP
LASWP = ../generic/laswp_k.c
endif

ifndef ZLASWP
ZLASWP = ../generic/zlaswp_k.c
endif

include ../generic/Makefile


+ 33
- 0
lapack/laswp/arm64/Makefile View File

@@ -0,0 +1,33 @@
# Selects the LASWP / ZLASWP source files for this architecture directory and
# then hands over to the shared generic Makefile.
TOPDIR = ../../..
include ../../../Makefile.system

# Per-core overrides.
# NOTE(review): CORE2, OPTERON and PRESCOTT look like x86 core names carried over
# from another architecture's Makefile — confirm whether they can ever match here.
ifeq ($(CORE), CORE2)
LASWP = ../generic/laswp_k_2.c
ZLASWP = ../generic/zlaswp_k_2.c
endif

ifeq ($(CORE), OPTERON)
LASWP = ../generic/laswp_k_1.c
ZLASWP = ../generic/zlaswp_k_1.c
endif

ifeq ($(CORE), PRESCOTT)
LASWP = ../generic/laswp_k_1.c
ZLASWP = ../generic/zlaswp_k_1.c
endif

# Evaluated after the per-core blocks, so it overrides them when DYNAMIC_ARCH is 1.
ifeq ($(DYNAMIC_ARCH), 1)
LASWP = ../generic/laswp_k_4.c
ZLASWP = ../generic/zlaswp_k_4.c
endif

# Defaults when nothing above made a choice.
ifndef LASWP
LASWP = ../generic/laswp_k.c
endif

ifndef ZLASWP
ZLASWP = ../generic/zlaswp_k.c
endif

include ../generic/Makefile


Loading…
Cancel
Save