You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

gemv_n_loongson3a.c 2.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. #include "common.h"
  2. //These are auto-tuning codes on Loongson-3A platform.
  3. //#define prefetch(x) __builtin_prefetch(x)
  4. //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
  5. #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
  6. #define likely(x) __builtin_expect(!!(x), 1)
  7. #define unlikely(x) __builtin_expect(!!(x), 0)
  8. #define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
  9. #define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
  10. #define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
  11. #define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
  12. int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
  13. {
  14. BLASLONG kx=0, ky=0;
  15. if(!ALPHA)
  16. return 0;
  17. //if(INCX < 0)
  18. // kx = (1-N) * INCX;
  19. // INCX = -INCX;
  20. //if(INCY < 0)
  21. // ky = (1-M) * INCY;
  22. // INCY = -INCY;
  23. BLASLONG fahead = 30;
  24. BLASLONG spec_unroll = 4;
  25. BLASLONG tMQ = M - M % spec_unroll;
  26. BLASLONG j = 0, k = 0;
  27. if(ALPHA == 1) {
  28. if(INCY == 1) {
  29. for(k=kx; likely(j < N); j++, k += INCX) {
  30. BLASLONG i = 0;
  31. for(; likely(i < tMQ);) {
  32. prefetch(A[LDA * j + i + fahead]);
  33. prefetch(Y[i + fahead]);
  34. /*loop_mark*/ spec_loop_alpha1;
  35. /*loop_mark*/ spec_loop_alpha1;
  36. /*loop_mark*/ spec_loop_alpha1;
  37. /*loop_mark*/ spec_loop_alpha1;
  38. }
  39. for(; likely(i < M);) {
  40. spec_loop_alpha1;
  41. }
  42. }
  43. } else {
  44. for(k=kx; likely(j < N); j++, k += INCX) {
  45. BLASLONG i = 0, h = ky;
  46. for(; likely(i < tMQ);) {
  47. prefetch(A[LDA * j + i + fahead]);
  48. prefetch(Y[h + fahead]);
  49. /*loop_mark*/ norm_loop_alpha1;
  50. /*loop_mark*/ norm_loop_alpha1;
  51. /*loop_mark*/ norm_loop_alpha1;
  52. /*loop_mark*/ norm_loop_alpha1;
  53. }
  54. for(; likely(i < M);) {
  55. norm_loop_alpha1;
  56. }
  57. }
  58. }
  59. } else {
  60. if(INCY == 1) {
  61. for(k=kx; likely(j < N); j++, k += INCX) {
  62. BLASLONG i = 0;
  63. for(; likely(i < tMQ);) {
  64. prefetch(A[LDA * j + i + fahead]);
  65. prefetch(Y[i + fahead]);
  66. /*loop_mark*/ spec_loop;
  67. /*loop_mark*/ spec_loop;
  68. /*loop_mark*/ spec_loop;
  69. /*loop_mark*/ spec_loop;
  70. }
  71. for(; likely(i < M);) {
  72. spec_loop;
  73. }
  74. }
  75. } else {
  76. for(k=kx; likely(j < N); j++, k += INCX) {
  77. BLASLONG i = 0, h = ky;
  78. for(; likely(i < tMQ);) {
  79. prefetch(A[LDA * j + i + fahead]);
  80. prefetch(Y[h + fahead]);
  81. /*loop_mark*/ norm_loop;
  82. /*loop_mark*/ norm_loop;
  83. /*loop_mark*/ norm_loop;
  84. /*loop_mark*/ norm_loop;
  85. }
  86. for(; likely(i < M);) {
  87. norm_loop;
  88. }
  89. }
  90. }
  91. }
  92. return 0;
  93. }