You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

dgemm_ncopy_8_power10.c 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. #include <altivec.h>
  41. #define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
/* Packing (copy) kernel for the non-transposed ("N") operand of DGEMM on
 * POWER10.  Copies an m x n block of the column-major matrix `a` (leading
 * dimension `lda`) into the contiguous buffer `b`, reordered for the GEMM
 * micro-kernel: columns are consumed in panels of 8 (then 4, 2, 1 for the
 * remainder of n), and within a panel elements are emitted row by row,
 * i.e. each group of 8 values in b is a[i][j..j+7] for consecutive i.
 *
 * BLASLONG and IFLOAT come from common.h (project types).  The vector
 * paths load pairs of values as __vector double, so this build assumes
 * IFLOAT is double — consistent with the dgemm file name.
 *
 *   m   : number of rows to copy
 *   n   : number of columns to copy
 *   a   : source matrix, column-major with leading dimension lda
 *   lda : leading dimension of a
 *   b   : destination packing buffer, written contiguously
 *
 * Always returns 0 (return value is unused by convention).
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  BLASLONG i, j;
  IFLOAT *aoffset;                                    /* start of the current column panel */
  IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;  /* per-column read cursors           */
  IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
  IFLOAT *boffset;                                    /* write cursor into b               */
  IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;          /* scalar temporaries used only in   */
  IFLOAT ctemp09, ctemp17, ctemp33;                   /* the remainder (non-vector) loops  */
  IFLOAT ctemp25, ctemp41;
  IFLOAT ctemp49, ctemp57;

  aoffset = a;
  boffset = b;

  /* ---- panels of 8 columns ---- */
  j = (n >> 3);
  if (j > 0){
    do{
      /* Eight read cursors, one per column of the panel. */
      aoffset1 = aoffset;
      aoffset2 = aoffset1 + lda;
      aoffset3 = aoffset2 + lda;
      aoffset4 = aoffset3 + lda;
      aoffset5 = aoffset4 + lda;
      aoffset6 = aoffset5 + lda;
      aoffset7 = aoffset6 + lda;
      aoffset8 = aoffset7 + lda;
      aoffset += 8 * lda;

      /* 8x8 tiles: transpose 8 rows x 8 columns per iteration with VSX. */
      i = (m >> 3);
      if (i > 0){
        do{
          /* Prefetch 384 bytes ahead in each column (streaming reads). */
          PREFETCHA (aoffset1, 384);
          PREFETCHA (aoffset2, 384);
          PREFETCHA (aoffset3, 384);
          PREFETCHA (aoffset4, 384);
          PREFETCHA (aoffset5, 384);
          PREFETCHA (aoffset6, 384);
          PREFETCHA (aoffset7, 384);
          PREFETCHA (aoffset8, 384);

          /* va(4k)..va(4k+3) hold rows 0..7 of column k+1, two rows per
           * vector register. */
          __vector double va0 = *(__vector double*)(aoffset1 + 0);
          __vector double va1 = *(__vector double*)(aoffset1 + 2);
          __vector double va2 = *(__vector double*)(aoffset1 + 4);
          __vector double va3 = *(__vector double*)(aoffset1 + 6);
          __vector double va4 = *(__vector double*)(aoffset2 + 0);
          __vector double va5 = *(__vector double*)(aoffset2 + 2);
          __vector double va6 = *(__vector double*)(aoffset2 + 4);
          __vector double va7 = *(__vector double*)(aoffset2 + 6);
          __vector double va8 = *(__vector double*)(aoffset3 + 0);
          __vector double va9 = *(__vector double*)(aoffset3 + 2);
          __vector double va10 = *(__vector double*)(aoffset3 + 4);
          __vector double va11 = *(__vector double*)(aoffset3 + 6);
          __vector double va12 = *(__vector double*)(aoffset4 + 0);
          __vector double va13 = *(__vector double*)(aoffset4 + 2);
          __vector double va14 = *(__vector double*)(aoffset4 + 4);
          __vector double va15 = *(__vector double*)(aoffset4 + 6);
          __vector double va16 = *(__vector double*)(aoffset5 + 0);
          __vector double va17 = *(__vector double*)(aoffset5 + 2);
          __vector double va18 = *(__vector double*)(aoffset5 + 4);
          __vector double va19 = *(__vector double*)(aoffset5 + 6);
          __vector double va20 = *(__vector double*)(aoffset6 + 0);
          __vector double va21 = *(__vector double*)(aoffset6 + 2);
          __vector double va22 = *(__vector double*)(aoffset6 + 4);
          __vector double va23 = *(__vector double*)(aoffset6 + 6);
          __vector double va24 = *(__vector double*)(aoffset7 + 0);
          __vector double va25 = *(__vector double*)(aoffset7 + 2);
          __vector double va26 = *(__vector double*)(aoffset7 + 4);
          __vector double va27 = *(__vector double*)(aoffset7 + 6);
          __vector double va28 = *(__vector double*)(aoffset8 + 0);
          __vector double va29 = *(__vector double*)(aoffset8 + 2);
          __vector double va30 = *(__vector double*)(aoffset8 + 4);
          __vector double va31 = *(__vector double*)(aoffset8 + 6);

          /* vec_xxpermdi selector 0 merges doubleword 0 of each operand
           * (the even row held in the pair), selector 3 merges doubleword 1
           * (the odd row).  Each group of 8 doubles written to b is thus one
           * row across the 8 columns of the panel. */
          *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va4, 0);    /* row 0, cols 1-2 */
          *(__vector double*)(boffset + 2) = vec_xxpermdi(va8, va12, 0);   /* row 0, cols 3-4 */
          *(__vector double*)(boffset + 4) = vec_xxpermdi(va16, va20, 0);  /* row 0, cols 5-6 */
          *(__vector double*)(boffset + 6) = vec_xxpermdi(va24, va28, 0);  /* row 0, cols 7-8 */
          *(__vector double*)(boffset + 8) = vec_xxpermdi(va0, va4, 3);    /* row 1 */
          *(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3);
          *(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3);
          *(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3);
          *(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0);   /* row 2 */
          *(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0);
          *(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0);
          *(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0);
          *(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3);   /* row 3 */
          *(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3);
          *(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3);
          *(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3);
          *(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0);   /* row 4 */
          *(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0);
          *(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0);
          *(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0);
          *(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3);   /* row 5 */
          *(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3);
          *(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3);
          *(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3);
          *(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0);   /* row 6 */
          *(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0);
          *(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0);
          *(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0);
          *(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3);   /* row 7 */
          *(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3);
          *(__vector double*)(boffset + 60) = vec_xxpermdi(va19, va23, 3);
          *(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3);

          aoffset1 += 8;
          aoffset2 += 8;
          aoffset3 += 8;
          aoffset4 += 8;
          aoffset5 += 8;
          aoffset6 += 8;
          aoffset7 += 8;
          aoffset8 += 8;
          boffset += 64;
          i --;
        }while(i > 0);
      }

      /* Remainder rows (m % 8): scalar copy, one row across 8 columns. */
      i = (m & 7);
      if (i > 0){
        do{
          ctemp01 = *(aoffset1 + 0);
          ctemp09 = *(aoffset2 + 0);
          ctemp17 = *(aoffset3 + 0);
          ctemp25 = *(aoffset4 + 0);
          ctemp33 = *(aoffset5 + 0);
          ctemp41 = *(aoffset6 + 0);
          ctemp49 = *(aoffset7 + 0);
          ctemp57 = *(aoffset8 + 0);
          *(boffset + 0) = ctemp01;
          *(boffset + 1) = ctemp09;
          *(boffset + 2) = ctemp17;
          *(boffset + 3) = ctemp25;
          *(boffset + 4) = ctemp33;
          *(boffset + 5) = ctemp41;
          *(boffset + 6) = ctemp49;
          *(boffset + 7) = ctemp57;
          aoffset1 ++;
          aoffset2 ++;
          aoffset3 ++;
          aoffset4 ++;
          aoffset5 ++;
          aoffset6 ++;
          aoffset7 ++;
          aoffset8 ++;
          boffset += 8;
          i --;
        }while(i > 0);
      }
      j--;
    }while(j > 0);
  } /* end of if(j > 0) */

  /* ---- panel of 4 columns (n & 4) ---- */
  if (n & 4){
    aoffset1 = aoffset;
    aoffset2 = aoffset1 + lda;
    aoffset3 = aoffset2 + lda;
    aoffset4 = aoffset3 + lda;
    aoffset += 4 * lda;

    /* 4x4 tiles via the same xxpermdi interleave (selector 0 = even row,
     * selector 3 = odd row). */
    i = (m >> 2);
    if (i > 0){
      do{
        PREFETCHA (aoffset1, 384);
        PREFETCHA (aoffset2, 384);
        PREFETCHA (aoffset3, 384);
        PREFETCHA (aoffset4, 384);
        __vector double va0 = *(__vector double*)(aoffset1 + 0);
        __vector double va1 = *(__vector double*)(aoffset1 + 2);
        __vector double va2 = *(__vector double*)(aoffset2 + 0);
        __vector double va3 = *(__vector double*)(aoffset2 + 2);
        __vector double va4 = *(__vector double*)(aoffset3 + 0);
        __vector double va5 = *(__vector double*)(aoffset3 + 2);
        __vector double va6 = *(__vector double*)(aoffset4 + 0);
        __vector double va7 = *(__vector double*)(aoffset4 + 2);
        *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0);   /* row 0, cols 1-2 */
        *(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0);   /* row 0, cols 3-4 */
        *(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3);   /* row 1 */
        *(__vector double*)(boffset + 6) = vec_xxpermdi(va4, va6, 3);
        *(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0);   /* row 2 */
        *(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0);
        *(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3);  /* row 3 */
        *(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3);
        aoffset1 += 4;
        aoffset2 += 4;
        aoffset3 += 4;
        aoffset4 += 4;
        boffset += 16;
        i --;
      }while(i > 0);
    }

    /* Remainder rows (m % 4): scalar copy. */
    i = (m & 3);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset2 + 0);
        ctemp03 = *(aoffset3 + 0);
        ctemp04 = *(aoffset4 + 0);
        *(boffset + 0) = ctemp01;
        *(boffset + 1) = ctemp02;
        *(boffset + 2) = ctemp03;
        *(boffset + 3) = ctemp04;
        aoffset1 ++;
        aoffset2 ++;
        aoffset3 ++;
        aoffset4 ++;
        boffset += 4;
        i --;
      }while(i > 0);
    }
  } /* end of if(n & 4) */

  /* ---- panel of 2 columns (n & 2) ---- */
  if (n & 2){
    aoffset1 = aoffset;
    aoffset2 = aoffset1 + lda;
    aoffset += 2 * lda;

    /* 2x2 tiles: interleave two rows of the two columns. */
    i = (m >> 1);
    if (i > 0){
      do{
        __vector double va0 = *(__vector double*)(aoffset1 + 0);
        __vector double va1 = *(__vector double*)(aoffset2 + 0);
        *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0);  /* row 0 */
        *(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3);  /* row 1 */
        aoffset1 += 2;
        aoffset2 += 2;
        boffset += 4;
        i --;
      }while(i > 0);
    }

    /* Odd final row. */
    if (m & 1){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset2 + 0);
      *(boffset + 0) = ctemp01;
      *(boffset + 1) = ctemp02;
      aoffset1 ++;
      aoffset2 ++;
      boffset += 2;
    }
  } /* end of if(n & 2) */

  /* ---- final single column (n & 1): straight copy ---- */
  if (n & 1){
    aoffset1 = aoffset;
    i = m;
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        *(boffset + 0) = ctemp01;
        aoffset1 ++;
        boffset ++;
        i --;
      }while(i > 0);
    }
  } /* end of if(n & 1) */

  return 0;
}