You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_ncopy_4.S 7.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Incoming arguments, named after the roles they play in the copy routine below. */
#define M	r3	/* elements copied from each source vector          */
#define N	r4	/* number of source vectors                         */
#define A	r5	/* current source base; advanced by LDA per vector  */
#define LDA	r6	/* stride between source vectors (scaled to bytes)  */
#define B	r7	/* destination cursor, written sequentially         */

/* Per-vector source cursors and the outer loop counter. */
#define AO1	r8
#define AO2	r9
#define AO3	r10
#define AO4	r11
#define J	r12

/* Byte offsets handed to the cache-touch instructions (source / destination). */
#define PREA	r14
#define PREB1	r15

/* Staging registers: c01-c04 hold data read via AO1, c05-c08 via AO2, */
/* c09-c12 via AO3 and c13-c16 via AO4.                                 */
#define c01	f0
#define c02	f1
#define c03	f2
#define c04	f3

#define c05	f4
#define c06	f5
#define c07	f6
#define c08	f7

#define c09	f8
#define c10	f9
#define c11	f10
#define c12	f11

#define c13	f12
#define c14	f13
#define c15	f14	/* f14 is spilled on entry and reloaded on exit */
#define c16	f15	/* f15 is spilled on entry and reloaded on exit */

/* Bytes reserved on the stack: two 8-byte FPR slots (f14, f15) at offsets 0 */
/* and 8, followed by the r14/r15 save slots starting at offset 16.          */
#define STACKSIZE	32
  /*
   * Prefetch distances, in elements (the routine multiplies them by SIZE):
   *   PREFETCHSIZE  - look-ahead applied to the source cursors (PREA)
   *   PREFETCHWSIZE - look-ahead applied to the destination cursor (PREB1)
   *
   * Every target that has an explicit entry uses the same pair of values, so
   * the per-CPU list is folded into a single conditional.
   */
#if defined(CELL)   || defined(PPC970) || defined(PPC440) || defined(POWER4) || \
    defined(POWER5) || defined(POWER6) || defined(PPCG4)  || defined(POWER8)
#define PREFETCHSIZE	16
#define PREFETCHWSIZE	72
#endif

/*
 * Fallback for targets that are not listed above.  Without it the macros stay
 * undefined on such a target and the "li PREA, PREFETCHSIZE * SIZE" setup in
 * the routine cannot be assembled.  The values mirror the listed targets;
 * NOTE(review): they are untuned defaults for any other CPU - confirm or
 * override by defining the macros before this point.
 */
#ifndef PREFETCHSIZE
#define PREFETCHSIZE	16
#endif
#ifndef PREFETCHWSIZE
#define PREFETCHWSIZE	72
#endif
  /*
   * Packing copy, four source vectors at a time.
   *
   * Register roles (aliases are defined earlier in this file):
   *   M   - number of elements taken from each source vector
   *   N   - number of source vectors
   *   A   - address of the first source vector
   *   LDA - distance between consecutive source vectors, in elements;
   *         shifted left by BASE_SHIFT below to obtain a byte stride
   *   B   - destination, filled strictly front to back
   *
   * Output layout in B:
   *   - for every full group of four vectors v0..v3 (N >> 2 groups):
   *       v0[0] v1[0] v2[0] v3[0]  v0[1] v1[1] v2[1] v3[1]  ...  (M rows)
   *   - if N & 2: the next two vectors, interleaved pairwise the same way
   *   - if N & 1: the last vector, copied as is
   *
   * r3 is set to 0 on exit.  Nothing is copied when M <= 0 or N <= 0.
   *
   * PROLOGUE, PROFCODE, EPILOGUE, LL(), LFD, STFD, SIZE, BASE_SHIFT, SP and
   * CTR are not defined in this file; presumably they come from common.h.
   */
	PROLOGUE
	PROFCODE

	/* Open a STACKSIZE-byte scratch area; r14/r15 and f14/f15 are saved */
	/* here and reloaded at LL(999).                                     */
	addi	SP, SP, -STACKSIZE
	li	r0, 0

#ifdef __64BIT__
	std	r14, 16(SP)
	std	r15, 24(SP)
#else
	stw	r14, 16(SP)
	stw	r15, 20(SP)
#endif
	stfd	f14, 0(SP)
	stfd	f15, 8(SP)

	/* Loop-invariant setup: prefetch offsets in bytes, LDA in bytes. */
	li	PREA, PREFETCHSIZE * SIZE
	li	PREB1, (PREFETCHWSIZE + 0) * SIZE
	slwi	LDA, LDA, BASE_SHIFT

	/* Empty problem: go straight to the exit path. */
	cmpwi	cr0, M, 0
	ble-	LL(999)
	cmpwi	cr0, N, 0
	ble-	LL(999)

	/* J = number of complete four-vector groups. */
	srawi.	J, N, 2
	ble	LL(20)
	.align 4

/* ---- four vectors per pass ------------------------------------------ */
LL(10):
	/* Point AO1..AO4 at the four vectors and step A past the group. */
	mr	AO1, A
	add	AO2, A, LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	A, AO4, LDA

	/* CTR = M / 4: iterations of the 4x4 tile loop. */
	srawi.	r0, M, 2
	mtspr	CTR, r0
	ble	LL(15)
	.align 4

LL(12):
	/* Read a 4x4 tile: four consecutive elements from each vector. */
	LFD	c01, 0 * SIZE(AO1)
	LFD	c02, 1 * SIZE(AO1)
	LFD	c03, 2 * SIZE(AO1)
	LFD	c04, 3 * SIZE(AO1)

	LFD	c05, 0 * SIZE(AO2)
	LFD	c06, 1 * SIZE(AO2)
	LFD	c07, 2 * SIZE(AO2)
	LFD	c08, 3 * SIZE(AO2)

	LFD	c09, 0 * SIZE(AO3)
	LFD	c10, 1 * SIZE(AO3)
	LFD	c11, 2 * SIZE(AO3)
	LFD	c12, 3 * SIZE(AO3)

	LFD	c13, 0 * SIZE(AO4)
	LFD	c14, 1 * SIZE(AO4)
	LFD	c15, 2 * SIZE(AO4)
	LFD	c16, 3 * SIZE(AO4)

	/* Write the tile transposed: element k of all four vectors is */
	/* stored contiguously at B[4k .. 4k+3].                       */
	STFD	c01,  0 * SIZE(B)
	STFD	c05,  1 * SIZE(B)
	STFD	c09,  2 * SIZE(B)
	STFD	c13,  3 * SIZE(B)

	STFD	c02,  4 * SIZE(B)
	STFD	c06,  5 * SIZE(B)
	STFD	c10,  6 * SIZE(B)
	STFD	c14,  7 * SIZE(B)

	STFD	c03,  8 * SIZE(B)
	STFD	c07,  9 * SIZE(B)
	STFD	c11, 10 * SIZE(B)
	STFD	c15, 11 * SIZE(B)

	STFD	c04, 12 * SIZE(B)
	STFD	c08, 13 * SIZE(B)
	STFD	c12, 14 * SIZE(B)
	STFD	c16, 15 * SIZE(B)

	/* Touch the data PREA bytes ahead of each source cursor; POWER6 and */
	/* POWER8 builds use the store-form touch for the sources as well.   */
#if defined(POWER6) || defined(POWER8)
	dcbtst	PREA, AO1
	dcbtst	PREA, AO2
	dcbtst	PREA, AO3
	dcbtst	PREA, AO4
#else
	dcbt	PREA, AO1
	dcbt	PREA, AO2
	dcbt	PREA, AO3
	dcbt	PREA, AO4
#endif
	/* Touch the destination PREB1 bytes ahead of B. */
	dcbtst	PREB1, B

	/* Advance: 4 elements per source cursor, 16 elements of output. */
	addi	AO1, AO1, 4 * SIZE
	addi	AO2, AO2, 4 * SIZE
	addi	AO3, AO3, 4 * SIZE
	addi	AO4, AO4, 4 * SIZE
	addi	B, B, 16 * SIZE
	bdnz	LL(12)
	.align 4

LL(15):
	/* CTR = M % 4: leftover rows of this group, one row per iteration. */
	andi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(17)
	.align 4

LL(16):
	LFD	c01, 0 * SIZE(AO1)
	LFD	c05, 0 * SIZE(AO2)
	LFD	c09, 0 * SIZE(AO3)
	LFD	c13, 0 * SIZE(AO4)

	STFD	c01, 0 * SIZE(B)
	STFD	c05, 1 * SIZE(B)
	STFD	c09, 2 * SIZE(B)
	STFD	c13, 3 * SIZE(B)

	addi	AO1, AO1, 1 * SIZE
	addi	AO2, AO2, 1 * SIZE
	addi	AO3, AO3, 1 * SIZE
	addi	AO4, AO4, 1 * SIZE
	addi	B, B, 4 * SIZE
	bdnz	LL(16)
	.align 4

LL(17):
	/* Next group of four vectors, if any remain. */
	addic.	J, J, -1
	bgt	LL(10)
	.align 4

/* ---- two remaining vectors (bit 1 of N) ----------------------------- */
LL(20):
	andi.	J, N, 2
	ble	LL(30)

	mr	AO1, A
	add	AO2, A, LDA
	add	A, AO2, LDA

	/* CTR = M / 4: four rows of both vectors per iteration. */
	srawi.	r0, M, 2
	mtspr	CTR, r0
	ble	LL(25)
	.align 4

LL(22):
	LFD	c01, 0 * SIZE(AO1)
	LFD	c02, 1 * SIZE(AO1)
	LFD	c03, 2 * SIZE(AO1)
	LFD	c04, 3 * SIZE(AO1)

	LFD	c05, 0 * SIZE(AO2)
	LFD	c06, 1 * SIZE(AO2)
	LFD	c07, 2 * SIZE(AO2)
	LFD	c08, 3 * SIZE(AO2)

	/* Interleave pairwise: element k of both vectors lands at B[2k], B[2k+1]. */
	STFD	c01, 0 * SIZE(B)
	STFD	c05, 1 * SIZE(B)
	STFD	c02, 2 * SIZE(B)
	STFD	c06, 3 * SIZE(B)

	STFD	c03, 4 * SIZE(B)
	STFD	c07, 5 * SIZE(B)
	STFD	c04, 6 * SIZE(B)
	STFD	c08, 7 * SIZE(B)

	addi	AO1, AO1, 4 * SIZE
	addi	AO2, AO2, 4 * SIZE
	addi	B, B, 8 * SIZE
	bdnz	LL(22)
	.align 4

LL(25):
	/* CTR = M % 4: leftover rows of the two-vector case. */
	andi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(30)
	.align 4

LL(26):
	LFD	c01, 0 * SIZE(AO1)
	LFD	c05, 0 * SIZE(AO2)

	STFD	c01, 0 * SIZE(B)
	STFD	c05, 1 * SIZE(B)

	addi	AO1, AO1, 1 * SIZE
	addi	AO2, AO2, 1 * SIZE
	addi	B, B, 2 * SIZE
	bdnz	LL(26)
	.align 4

/* ---- one remaining vector (bit 0 of N) ------------------------------ */
LL(30):
	andi.	J, N, 1
	ble	LL(999)

	mr	AO1, A

	/* CTR = M / 4: straight copy, four elements per iteration. */
	srawi.	r0, M, 2
	mtspr	CTR, r0
	ble	LL(35)
	.align 4

LL(32):
	LFD	c01, 0 * SIZE(AO1)
	LFD	c02, 1 * SIZE(AO1)
	LFD	c03, 2 * SIZE(AO1)
	LFD	c04, 3 * SIZE(AO1)

	STFD	c01, 0 * SIZE(B)
	STFD	c02, 1 * SIZE(B)
	STFD	c03, 2 * SIZE(B)
	STFD	c04, 3 * SIZE(B)

	addi	AO1, AO1, 4 * SIZE
	addi	B, B, 4 * SIZE
	bdnz	LL(32)
	.align 4

LL(35):
	/* CTR = M % 4: last few elements of the single vector. */
	andi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(999)
	.align 4

LL(36):
	LFD	c01, 0 * SIZE(AO1)
	STFD	c01, 0 * SIZE(B)

	addi	AO1, AO1, 1 * SIZE
	addi	B, B, 1 * SIZE
	bdnz	LL(36)
	.align 4

/* ---- common exit ---------------------------------------------------- */
LL(999):
	/* Reload the registers spilled on entry, release the scratch area */
	/* and leave with r3 = 0.                                          */
#ifdef __64BIT__
	ld	r14, 16(SP)
	ld	r15, 24(SP)
#else
	lwz	r14, 16(SP)
	lwz	r15, 20(SP)
#endif
	lfd	f14, 0(SP)
	lfd	f15, 8(SP)

	li	r3, 0
	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE