You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_tcopy.S 8.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Incoming arguments, read from the %i (input) registers. */
  40. #define M %i0
  41. #define N %i1
  42. #define A %i2
  43. #define LDA %i3
  44. #define B %i4
  /* Source read pointers: up to four streams of A, each one LDA apart */
  /* from the previous one. */
  45. #define A1 %l0
  46. #define A2 %l1
  47. #define A3 %l2
  48. #define A4 %l3
  /* I: inner-loop counter (N >> 2), also scratch for the N & 2 and */
  /* N & 1 tests. J: outer-loop counter (M >> 2), also scratch for the */
  /* M & 2 and M & 1 tests. */
  49. #define I %l4
  50. #define J %l5
  /* Destination cursors. B1 walks the output of the 4-wide groups */
  /* (and is briefly used as scratch during setup); B2 and B3 walk the */
  /* output of the N & 2 and N & 1 remainders respectively. */
  51. #define B1 %o0
  52. #define B2 %o1
  53. #define B3 %o3
  /* M4: M shifted left by BASE_SHIFT + 2; the step added to the */
  /* destination cursor after each 4-wide group. */
  54. #define M4 %o4
  /* c01..c16: floating-point staging registers for the copy. With */
  /* DOUBLE defined they map to the even-numbered registers %f0..%f30; */
  /* otherwise to the consecutive registers %f0..%f15. */
  55. #ifdef DOUBLE
  56. #define c01 %f0
  57. #define c02 %f2
  58. #define c03 %f4
  59. #define c04 %f6
  60. #define c05 %f8
  61. #define c06 %f10
  62. #define c07 %f12
  63. #define c08 %f14
  64. #define c09 %f16
  65. #define c10 %f18
  66. #define c11 %f20
  67. #define c12 %f22
  68. #define c13 %f24
  69. #define c14 %f26
  70. #define c15 %f28
  71. #define c16 %f30
  72. #else
  73. #define c01 %f0
  74. #define c02 %f1
  75. #define c03 %f2
  76. #define c04 %f3
  77. #define c05 %f4
  78. #define c06 %f5
  79. #define c07 %f6
  80. #define c08 %f7
  81. #define c09 %f8
  82. #define c10 %f9
  83. #define c11 %f10
  84. #define c12 %f11
  85. #define c13 %f12
  86. #define c14 %f13
  87. #define c15 %f14
  88. #define c16 %f15
  89. #endif
  /*
   * Packing copy from A into B.
   *
   * In:  M, N  - counts driving the outer (M) and inner (N) loops
   *      A     - source base pointer
   *      LDA   - distance between consecutive source streams, in
   *              elements (scaled by BASE_SHIFT below)
   *      B     - destination base pointer
   * Out: %o0 is cleared in the delay slot of the final return.
   *
   * Source streams are consumed four at a time (M >> 2 passes), then
   * two (M & 2), then one (M & 1). Within each pass the streams are
   * read four elements at a time (N >> 2 iterations), then two
   * (N & 2), then one (N & 1). The 4-wide, 2-wide and 1-wide pieces
   * go to three separate destination cursors: B1 (or B), B2 and B3.
   *
   * Assumes (1 << BASE_SHIFT) equals SIZE bytes, both presumably
   * defined in common.h - TODO confirm. PROLOGUE, SAVESP, LDF, STF and
   * EPILOGUE are macros that are not visible in this file.
   *
   * Every branch below is followed by a delay-slot instruction; none
   * of the branches is annulled, so that instruction always executes.
   */
  90. PROLOGUE
  91. SAVESP
  /* M4 = M << (BASE_SHIFT + 2): destination step between 4-wide groups. */
  92. sll M, BASE_SHIFT + 2, M4
  /* B2 = B + (M << BASE_SHIFT) * (N & -4): start of the 2-wide output. */
  /* B3 = B + (M << BASE_SHIFT) * (N & -2): start of the 1-wide output. */
  /* B1 is only scratch here; it is reloaded from B before use. */
  93. and N, -4, B2
  94. and N, -2, B3
  95. sll M, BASE_SHIFT, B1
  96. smul B1, B2, B2
  97. smul B1, B3, B3
  98. add B, B2, B2
  99. add B, B3, B3
  /* J = M >> 2: number of passes over four source streams. */
  100. sra M, 2, J
  101. cmp J, 0
  102. ble,pn %icc, .LL100
  /* Delay slot: scale LDA by BASE_SHIFT. Runs whether or not the */
  /* branch is taken, so the later sections also see the scaled LDA. */
  103. sll LDA, BASE_SHIFT, LDA
  /* ---- Pass over four source streams A1..A4 ---- */
  104. .LL11:
  105. add A, LDA, A2
  106. mov A, A1
  107. add A2, LDA, A3
  /* I = N >> 2: number of 4-element groups per stream. */
  108. sra N, 2, I
  109. add A3, LDA, A4
  110. cmp I, 0
  /* B1 = output cursor for this pass; B moves on by 16 elements, */
  /* the amount one loop iteration stores. */
  111. mov B, B1
  112. add B, 16 * SIZE, B
  113. ble,pn %icc, .LL15
  /* Delay slot: A now points past the four streams of this pass. */
  114. add A4, LDA, A
  115. #define PREFETCHSIZE 8
  /* Inner loop: load 4 elements from each of A1..A4 and store them */
  /* as 16 consecutive elements at B1. */
  116. .LL12:
  117. prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0
  118. LDF [A1 + 0 * SIZE], c01
  119. LDF [A1 + 1 * SIZE], c02
  120. LDF [A1 + 2 * SIZE], c03
  121. LDF [A1 + 3 * SIZE], c04
  122. prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0
  123. LDF [A2 + 0 * SIZE], c05
  124. LDF [A2 + 1 * SIZE], c06
  125. LDF [A2 + 2 * SIZE], c07
  126. LDF [A2 + 3 * SIZE], c08
  127. prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0
  128. LDF [A3 + 0 * SIZE], c09
  129. LDF [A3 + 1 * SIZE], c10
  130. LDF [A3 + 2 * SIZE], c11
  131. LDF [A3 + 3 * SIZE], c12
  132. prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0
  133. LDF [A4 + 0 * SIZE], c13
  134. LDF [A4 + 1 * SIZE], c14
  135. LDF [A4 + 2 * SIZE], c15
  136. LDF [A4 + 3 * SIZE], c16
  /* NOTE(review): the destination prefetch uses the same function */
  /* code (0) as the source prefetches although B1 is only written */
  /* here - confirm that is intended. */
  137. prefetch [B1 + (PREFETCHSIZE + 0) * SIZE], 0
  /* Stores are interleaved with the pointer and counter updates. */
  138. STF c01, [B1 + 0 * SIZE]
  139. add A1, 4 * SIZE, A1
  140. STF c02, [B1 + 1 * SIZE]
  141. add A2, 4 * SIZE, A2
  142. STF c03, [B1 + 2 * SIZE]
  143. add A3, 4 * SIZE, A3
  144. STF c04, [B1 + 3 * SIZE]
  145. add A4, 4 * SIZE, A4
  146. STF c05, [B1 + 4 * SIZE]
  147. add I, -1, I
  148. STF c06, [B1 + 5 * SIZE]
  /* Sets the condition codes consumed by the bg at the loop end. */
  149. cmp I, 0
  150. STF c07, [B1 + 6 * SIZE]
  151. STF c08, [B1 + 7 * SIZE]
  152. #ifdef DOUBLE
  153. prefetch [B1 + (PREFETCHSIZE + 8) * SIZE], 0
  154. #endif
  155. STF c09, [B1 + 8 * SIZE]
  156. STF c10, [B1 + 9 * SIZE]
  157. STF c11, [B1 + 10 * SIZE]
  158. STF c12, [B1 + 11 * SIZE]
  159. STF c13, [B1 + 12 * SIZE]
  160. STF c14, [B1 + 13 * SIZE]
  161. STF c15, [B1 + 14 * SIZE]
  162. STF c16, [B1 + 15 * SIZE]
  163. bg,pt %icc, .LL12
  /* Delay slot: step B1 by M4 to the next 4-wide group. */
  164. add B1, M4, B1
  /* N & 2 remainder: 2 elements from each of A1..A4 -> 8 elements at B2. */
  165. .LL15:
  166. and N, 2, I
  167. cmp I, 0
  168. ble,pn %icc, .LL17
  169. nop
  170. LDF [A1 + 0 * SIZE], c01
  171. LDF [A1 + 1 * SIZE], c02
  172. LDF [A2 + 0 * SIZE], c03
  173. LDF [A2 + 1 * SIZE], c04
  174. LDF [A3 + 0 * SIZE], c05
  175. LDF [A3 + 1 * SIZE], c06
  176. LDF [A4 + 0 * SIZE], c07
  177. LDF [A4 + 1 * SIZE], c08
  178. STF c01, [B2 + 0 * SIZE]
  179. add A1, 2 * SIZE, A1
  180. STF c02, [B2 + 1 * SIZE]
  181. add A2, 2 * SIZE, A2
  182. STF c03, [B2 + 2 * SIZE]
  183. add A3, 2 * SIZE, A3
  184. STF c04, [B2 + 3 * SIZE]
  185. add A4, 2 * SIZE, A4
  186. STF c05, [B2 + 4 * SIZE]
  187. STF c06, [B2 + 5 * SIZE]
  188. STF c07, [B2 + 6 * SIZE]
  189. STF c08, [B2 + 7 * SIZE]
  190. add B2, 8 * SIZE, B2
  /* N & 1 remainder: 1 element from each of A1..A4 -> 4 elements at B3. */
  191. .LL17:
  192. and N, 1, I
  193. cmp I, 0
  194. ble,pn %icc, .LL99
  195. nop
  196. LDF [A1 + 0 * SIZE], c01
  197. LDF [A2 + 0 * SIZE], c02
  198. LDF [A3 + 0 * SIZE], c03
  199. LDF [A4 + 0 * SIZE], c04
  200. STF c01, [B3 + 0 * SIZE]
  201. STF c02, [B3 + 1 * SIZE]
  202. STF c03, [B3 + 2 * SIZE]
  203. STF c04, [B3 + 3 * SIZE]
  204. add B3, 4 * SIZE, B3
  /* Next pass over four streams, while J > 0. */
  205. .LL99:
  206. add J, -1, J
  207. cmp J, 0
  208. bg,pt %icc, .LL11
  209. nop
  /* ---- M & 2: one pass over two source streams A1, A2 ---- */
  210. .LL100:
  211. and M, 2, J
  212. cmp J, 0
  213. ble,pn %icc, .LL200
  214. nop
  215. .LL111:
  216. sra N, 2, I
  217. add A, LDA, A2
  218. cmp I, 0
  219. mov A, A1
  /* B1 = output cursor; B moves on by the 8 elements stored per */
  /* loop iteration. */
  220. mov B, B1
  221. add B, 8 * SIZE, B
  222. ble,pn %icc, .LL115
  /* Delay slot: A now points past the two streams. */
  223. add A2, LDA, A
  /* Inner loop: 4 elements from each of A1, A2 -> 8 elements at B1. */
  224. .LL112:
  225. LDF [A1 + 0 * SIZE], c01
  226. LDF [A1 + 1 * SIZE], c02
  227. LDF [A1 + 2 * SIZE], c03
  228. LDF [A1 + 3 * SIZE], c04
  229. LDF [A2 + 0 * SIZE], c05
  230. LDF [A2 + 1 * SIZE], c06
  231. LDF [A2 + 2 * SIZE], c07
  232. LDF [A2 + 3 * SIZE], c08
  233. STF c01, [B1 + 0 * SIZE]
  234. add A1, 4 * SIZE, A1
  235. STF c02, [B1 + 1 * SIZE]
  236. add A2, 4 * SIZE, A2
  237. STF c03, [B1 + 2 * SIZE]
  238. add I, -1, I
  239. STF c04, [B1 + 3 * SIZE]
  240. cmp I, 0
  241. STF c05, [B1 + 4 * SIZE]
  242. STF c06, [B1 + 5 * SIZE]
  243. STF c07, [B1 + 6 * SIZE]
  244. STF c08, [B1 + 7 * SIZE]
  245. bg,pt %icc, .LL112
  /* Delay slot: step B1 by M4 to the next 4-wide group. */
  246. add B1, M4, B1
  /* N & 2 remainder: 2 elements from each of A1, A2 -> 4 elements at B2. */
  247. .LL115:
  248. and N, 2, I
  249. cmp I, 0
  250. ble,pn %icc, .LL117
  251. nop
  252. LDF [A1 + 0 * SIZE], c01
  253. LDF [A1 + 1 * SIZE], c02
  254. LDF [A2 + 0 * SIZE], c03
  255. LDF [A2 + 1 * SIZE], c04
  256. STF c01, [B2 + 0 * SIZE]
  257. add A1, 2 * SIZE, A1
  258. STF c02, [B2 + 1 * SIZE]
  259. add A2, 2 * SIZE, A2
  260. STF c03, [B2 + 2 * SIZE]
  /* NOTE(review): the add and cmp on I in this tail have no visible */
  /* effect - no branch follows them, and both I and the condition */
  /* codes are rewritten at .LL117 before being read. */
  261. add I, -1, I
  262. STF c04, [B2 + 3 * SIZE]
  263. cmp I, 0
  264. add B2, 4 * SIZE, B2
  /* N & 1 remainder: 1 element from each of A1, A2 -> 2 elements at B3. */
  265. .LL117:
  266. and N, 1, I
  267. cmp I, 0
  268. ble,pn %icc, .LL200
  269. nop
  270. LDF [A1 + 0 * SIZE], c01
  271. LDF [A2 + 0 * SIZE], c02
  272. STF c01, [B3 + 0 * SIZE]
  273. STF c02, [B3 + 1 * SIZE]
  274. add B3, 2 * SIZE, B3
  /* ---- M & 1: one pass over a single source stream A1 ---- */
  275. .LL200:
  276. and M, 1, J
  277. cmp J, 0
  278. ble,pn %icc, .LL999
  279. nop
  280. .LL211:
  281. sra N, 2, I
  282. cmp I, 0
  /* NOTE(review): B1 is loaded here but not read afterwards; the */
  /* loop below stores through B and advances B itself. */
  283. mov B, B1
  284. ble,pn %icc, .LL215
  /* Delay slot: A1 = A (executed on both paths). */
  285. mov A, A1
  /* Inner loop: 4 elements from A1 -> 4 elements at B. */
  286. .LL212:
  287. LDF [A1 + 0 * SIZE], c01
  288. LDF [A1 + 1 * SIZE], c02
  289. LDF [A1 + 2 * SIZE], c03
  290. LDF [A1 + 3 * SIZE], c04
  291. STF c01, [B + 0 * SIZE]
  292. add I, -1, I
  293. STF c02, [B + 1 * SIZE]
  294. cmp I, 0
  295. STF c03, [B + 2 * SIZE]
  296. add A1, 4 * SIZE, A1
  297. STF c04, [B + 3 * SIZE]
  298. bg,pt %icc, .LL212
  /* Delay slot: step B by M4 to the next 4-wide group. */
  299. add B, M4, B
  /* N & 2 remainder: 2 elements from A1 -> B2. B2 is not advanced */
  /* because nothing writes through it after this point. */
  300. .LL215:
  301. and N, 2, I
  302. cmp I, 0
  303. ble,pn %icc, .LL217
  304. nop
  305. LDF [A1 + 0 * SIZE], c01
  306. LDF [A1 + 1 * SIZE], c02
  307. STF c01, [B2 + 0 * SIZE]
  308. STF c02, [B2 + 1 * SIZE]
  309. add A1, 2 * SIZE, A1
  /* N & 1 remainder: the last element from A1 -> B3. */
  310. .LL217:
  311. and N, 1, I
  312. cmp I, 0
  313. ble,pn %icc, .LL999
  314. nop
  315. LDF [A1 + 0 * SIZE], c01
  316. STF c01, [B3 + 0 * SIZE]
  /* Return to the caller; the clr in the delay slot zeroes %o0. */
  317. .LL999:
  318. return %i7 + 8
  319. clr %o0
  320. EPILOGUE