You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_t_loongson3a.c 6.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. #include "common.h"
  2. #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
  3. #define likely(x) __builtin_expect(!!(x), 1)
  4. #define unlikely(x) __builtin_expect(!!(x), 0)
  5. #if !defined(CONJ) && !defined(XCONJ)
  6. #define spec_loop_alpha1 spec_loop_alpha1_0
  7. #define spec_loop spec_loop_0
  8. #define norm_loop_alpha1 norm_loop_alpha1_0
  9. #define norm_loop norm_loop_0
  10. #endif
  11. #if defined(CONJ) && !defined(XCONJ)
  12. #define spec_loop_alpha1 spec_loop_alpha1_1
  13. #define spec_loop spec_loop_1
  14. #define norm_loop_alpha1 norm_loop_alpha1_1
  15. #define norm_loop norm_loop_1
  16. #endif
  17. #if !defined(CONJ) && defined(XCONJ)
  18. #define spec_loop_alpha1 spec_loop_alpha1_2
  19. #define spec_loop spec_loop_2
  20. #define norm_loop_alpha1 norm_loop_alpha1_2
  21. #define norm_loop norm_loop_2
  22. #endif
  23. #if defined(CONJ) && defined(XCONJ)
  24. #define spec_loop_alpha1 spec_loop_alpha1_3
  25. #define spec_loop spec_loop_3
  26. #define norm_loop_alpha1 norm_loop_alpha1_3
  27. #define norm_loop norm_loop_3
  28. #endif
  /*
   * Per-element kernels. Each expansion handles ONE complex element:
   * it reads the matrix entry (A[jj + ii], A[jj + ii + 1]) and the vector
   * entry as (real, imaginary) pairs, accumulates into (Y[k], Y[k + 1]),
   * and then advances the running offsets.
   *
   * The macros are not self-contained: they use A, X, Y, jj, ii, k (and,
   * depending on the family, iii, INCX, rTmp, iTmp, rALPHA, iALPHA) from
   * the scope in which they are expanded.
   *
   * Families:
   *   spec_*   index X with ii (the same offset as A) and advance ii by 2.
   *   norm_*   index X with iii and advance it by INCX * 2 per element.
   *   *_alpha1 add the product to Y directly, with no alpha scaling.
   *   others   form the product in (rTmp, iTmp), then add
   *            (rTmp + i*iTmp) * (rALPHA + i*iALPHA) to Y.
   *
   * Numeric suffix (sign pattern of the complex product):
   *   _0  A * X
   *   _1  conj(A) * X
   *   _2  A * conj(X)
   *   _3  conj(A) * conj(X)
   */
  /* spec, alpha == 1: X read at ii. */
  29. #define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
  30. #define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
  31. #define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
  32. #define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
  /* spec, general alpha: product goes through rTmp/iTmp, then is scaled. */
  33. #define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
  34. #define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
  35. #define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
  36. #define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
  /* norm, alpha == 1: X read at iii, which steps by INCX * 2. */
  37. #define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
  38. #define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
  39. #define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
  40. #define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
  /* norm, general alpha: strided X plus alpha scaling. */
  41. #define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
  42. #define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
  43. #define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
  44. #define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
  45. int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
  46. if(!rALPHA && iALPHA)
  47. return 0;
  48. BLASLONG fahead = 30;
  49. BLASLONG spec_unroll = 2;
  50. BLASLONG tMQ = M - M % spec_unroll;
  51. BLASLONG j = 0, k = 0, jj = 0;
  52. if(rALPHA == 1 && iALPHA == 0) {
  53. if(INCX == 1) {
  54. for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
  55. BLASLONG i = 0, ii = 0;
  56. for(; likely(i < tMQ); i += spec_unroll) {
  57. prefetch(A[jj + ii + fahead]);
  58. prefetch(X[ii + fahead]);
  59. /*loop_mark*/ spec_loop_alpha1;
  60. /*loop_mark*/ spec_loop_alpha1;
  61. }
  62. for(; likely(i < M); i++) {
  63. spec_loop_alpha1;
  64. }
  65. }
  66. } else {
  67. for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
  68. BLASLONG i = 0, ii = 0, iii = 0;
  69. for(; likely(i < tMQ); i += spec_unroll) {
  70. prefetch(A[jj + ii + fahead]);
  71. prefetch(X[iii + fahead]);
  72. /*loop_mark*/ norm_loop_alpha1;
  73. /*loop_mark*/ norm_loop_alpha1;
  74. }
  75. for(; likely(i < M); i++) {
  76. norm_loop_alpha1;
  77. }
  78. }
  79. }
  80. } else {
  81. FLOAT rTmp, iTmp;
  82. if(INCX == 1) {
  83. for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
  84. BLASLONG i = 0, ii = 0;
  85. for(; likely(i < tMQ); i += spec_unroll) {
  86. prefetch(A[jj + ii + fahead]);
  87. prefetch(X[ii + fahead]);
  88. /*loop_mark*/ spec_loop;
  89. /*loop_mark*/ spec_loop;
  90. }
  91. for(; likely(i < M); i++) {
  92. spec_loop;
  93. }
  94. }
  95. } else {
  96. for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
  97. BLASLONG i = 0, ii = 0, iii = 0;
  98. for(; likely(i < tMQ); i += spec_unroll) {
  99. prefetch(A[jj + ii + fahead]);
  100. prefetch(X[iii + fahead]);
  101. /*loop_mark*/ norm_loop;
  102. /*loop_mark*/ norm_loop;
  103. }
  104. for(; likely(i < M); i++) {
  105. norm_loop;
  106. }
  107. }
  108. }
  109. }
  110. return 0;
  111. }