You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_ncopy_hummer_2.S 8.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases used by the copy routine that follows.            */
  /* NOTE(review): r3-r7 are presumably the incoming arguments in this  */
  /* order (m, n, a, lda, b) -- confirm against the C-side prototype.   */
  /* M: inner count; the routine's inner loops run M >> 3 / M >> 2      */
  /* times and then handle the remaining low bits of M.                 */
  40. #define M r3
  /* N: outer count; consumed two at a time (N >> 1), then N & 1.       */
  41. #define N r4
  /* A: source pointer; advanced by 2 * LDA after each pair of columns. */
  42. #define A r5
  /* LDA: distance between consecutive source columns; the routine      */
  /* shifts it left by ZBASE_SHIFT before use.                          */
  43. #define LDA r6
  /* B: destination pointer; written strictly sequentially.             */
  44. #define B r7
  /* AO1 / AO2: read cursors for the two source columns being merged.   */
  45. #define AO1 r8
  46. #define AO2 r9
  /* J: outer-loop counter (number of column pairs left).               */
  47. #define J r12
  /* INC / INC2: index registers holding 1 * SIZE and 2 * SIZE. They    */
  /* live in r30 / r31, which the routine saves and restores itself.    */
  48. #define INC r30
  49. #define INC2 r31
  /* c01..c16: floating-point staging registers f0..f15. Note that c15  */
  /* and c16 map to f14 and f15, which the routine saves on entry and   */
  /* restores on exit.                                                  */
  50. #define c01 f0
  51. #define c02 f1
  52. #define c03 f2
  53. #define c04 f3
  54. #define c05 f4
  55. #define c06 f5
  56. #define c07 f6
  57. #define c08 f7
  58. #define c09 f8
  59. #define c10 f9
  60. #define c11 f10
  61. #define c12 f11
  62. #define c13 f12
  63. #define c14 f13
  64. #define c15 f14
  65. #define c16 f15
  /*------------------------------------------------------------------*/
  /* Two-column interleaving copy from A into the packed buffer B.     */
  /*                                                                   */
  /* For every pair of source columns (AO1 = A, AO2 = A + LDA) the     */
  /* routine writes to B, in order: AO1[0], AO2[0], AO1[1], AO2[1] ... */
  /* for M elements, where one element is 2 * SIZE bytes. If N is odd, */
  /* the last column is copied to B unchanged. B is written            */
  /* contiguously throughout.                                          */
  /*                                                                   */
  /* Two code paths exist, selected by the alignment of A:             */
  /*   - A is a multiple of 2 * SIZE: paired loads (LFPDUX).           */
  /*   - otherwise (LL(100)): two scalar loads (LFDUX) per element,    */
  /*     merged with fsmfp before the paired store.                    */
  /*                                                                   */
  /* Returns immediately (after restoring registers) if M <= 0 or      */
  /* N <= 0.                                                           */
  /*                                                                   */
  /* NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, LL(), SIZE,           */
  /* ZBASE_SHIFT and the upper-case LFPDUX / STFPDUX / LFDUX names are */
  /* not defined in this file; presumably they come from common.h.     */
  /* The comments below assume the *UX forms are "update" accesses     */
  /* that add the index register to the base before the access --      */
  /* confirm against common.h and the FPU documentation.               */
  /*------------------------------------------------------------------*/
  66. PROLOGUE
  67. PROFCODE
  /* Save f14/f15 (aliased as c15/c16) in two 16-byte slots below SP,   */
  /* then r31/r30 (INC2/INC) in two 4-byte slots below those.           */
  68. li r0, -16
  69. stfpdux f14, SP, r0
  70. stfpdux f15, SP, r0
  71. stwu r31, -4(SP)
  72. stwu r30, -4(SP)
  /* Scale LDA by 2^ZBASE_SHIFT (presumably elements -> bytes; TODO     */
  /* confirm the value of ZBASE_SHIFT in common.h).                     */
  73. slwi LDA, LDA, ZBASE_SHIFT
  /* Nothing to do for an empty block: go straight to the epilogue.     */
  74. cmpwi cr0, M, 0
  75. ble- LL(99)
  76. cmpwi cr0, N, 0
  77. ble- LL(99)
  78. li INC, 1 * SIZE
  79. li INC2, 2 * SIZE
  /* Pre-bias B by one store stride so that the first STFPDUX, which    */
  /* steps by INC2, lands on the original B.                            */
  80. subi B, B, 2 * SIZE
  /* If A is not a multiple of 2 * SIZE, use the scalar-load path.      */
  81. andi. r0, A, 2 * SIZE - 1
  82. bne LL(100)
  /* ---------------- aligned path: paired loads ---------------------- */
  /* Pre-bias A by one load stride (INC2), matching the bias on B.      */
  83. subi A, A, 2 * SIZE
  /* J = number of column pairs; skip to the odd-column tail if zero.   */
  84. srawi. J, N, 1
  85. ble LL(20)
  86. .align 4
  /* Outer loop: one iteration per pair of columns.                     */
  87. LL(11):
  88. mr AO1, A
  89. add AO2, A, LDA
  /* Advance A past both columns for the next outer iteration.          */
  90. add A, AO2, LDA
  /* CTR = M / 8; the branch uses the CR0 result recorded by srawi.     */
  91. srawi. r0, M, 3
  92. mtspr CTR, r0
  93. ble LL(15)
  94. .align 4
  /* Main loop: 8 elements from each column per iteration. Loads        */
  /* alternate AO1/AO2 so that storing c01..c16 in register order       */
  /* produces the interleaved layout.                                   */
  95. LL(12):
  96. LFPDUX c01, AO1, INC2
  97. LFPDUX c02, AO2, INC2
  98. LFPDUX c03, AO1, INC2
  99. LFPDUX c04, AO2, INC2
  100. LFPDUX c05, AO1, INC2
  101. LFPDUX c06, AO2, INC2
  102. LFPDUX c07, AO1, INC2
  103. LFPDUX c08, AO2, INC2
  104. LFPDUX c09, AO1, INC2
  105. LFPDUX c10, AO2, INC2
  106. LFPDUX c11, AO1, INC2
  107. LFPDUX c12, AO2, INC2
  108. LFPDUX c13, AO1, INC2
  109. LFPDUX c14, AO2, INC2
  110. LFPDUX c15, AO1, INC2
  111. LFPDUX c16, AO2, INC2
  112. STFPDUX c01, B, INC2
  113. STFPDUX c02, B, INC2
  114. STFPDUX c03, B, INC2
  115. STFPDUX c04, B, INC2
  116. STFPDUX c05, B, INC2
  117. STFPDUX c06, B, INC2
  118. STFPDUX c07, B, INC2
  119. STFPDUX c08, B, INC2
  120. STFPDUX c09, B, INC2
  121. STFPDUX c10, B, INC2
  122. STFPDUX c11, B, INC2
  123. STFPDUX c12, B, INC2
  124. STFPDUX c13, B, INC2
  125. STFPDUX c14, B, INC2
  126. STFPDUX c15, B, INC2
  127. STFPDUX c16, B, INC2
  128. bdnz LL(12)
  129. .align 4
  /* Remainder of M (0..7 elements), handled by its binary digits.      */
  /* The andi. result is never negative, so "ble" here acts as          */
  /* "branch if zero".                                                  */
  130. LL(15):
  131. andi. r0, M, 7
  132. ble LL(19)
  /* 4 leftover elements per column?                                    */
  133. andi. r0, M, 4
  134. beq LL(16)
  135. LFPDUX c01, AO1, INC2
  136. LFPDUX c02, AO2, INC2
  137. LFPDUX c03, AO1, INC2
  138. LFPDUX c04, AO2, INC2
  139. LFPDUX c05, AO1, INC2
  140. LFPDUX c06, AO2, INC2
  141. LFPDUX c07, AO1, INC2
  142. LFPDUX c08, AO2, INC2
  143. STFPDUX c01, B, INC2
  144. STFPDUX c02, B, INC2
  145. STFPDUX c03, B, INC2
  146. STFPDUX c04, B, INC2
  147. STFPDUX c05, B, INC2
  148. STFPDUX c06, B, INC2
  149. STFPDUX c07, B, INC2
  150. STFPDUX c08, B, INC2
  151. .align 4
  /* 2 leftover elements per column?                                    */
  152. LL(16):
  153. andi. r0, M, 2
  154. beq LL(17)
  155. LFPDUX c01, AO1, INC2
  156. LFPDUX c02, AO2, INC2
  157. LFPDUX c03, AO1, INC2
  158. LFPDUX c04, AO2, INC2
  159. STFPDUX c01, B, INC2
  160. STFPDUX c02, B, INC2
  161. STFPDUX c03, B, INC2
  162. STFPDUX c04, B, INC2
  163. .align 4
  /* 1 leftover element per column?                                     */
  164. LL(17):
  165. andi. r0, M, 1
  166. beq LL(19)
  167. LFPDUX c01, AO1, INC2
  168. LFPDUX c02, AO2, INC2
  169. STFPDUX c01, B, INC2
  170. STFPDUX c02, B, INC2
  171. .align 4
  /* Next pair of columns, if any remain.                               */
  172. LL(19):
  173. addic. J, J, -1
  174. bgt LL(11)
  175. .align 4
  /* Odd N: one last column is copied to B without interleaving.        */
  176. LL(20):
  177. andi. J, N, 1
  178. ble LL(99)
  179. mr AO1, A
  /* CTR = M / 4.                                                       */
  180. srawi. r0, M, 2
  181. mtspr CTR, r0
  182. ble LL(25)
  183. .align 4
  /* 4 elements per iteration; only the odd-numbered staging registers  */
  /* are used here.                                                     */
  184. LL(22):
  185. LFPDUX c01, AO1, INC2
  186. LFPDUX c03, AO1, INC2
  187. LFPDUX c05, AO1, INC2
  188. LFPDUX c07, AO1, INC2
  189. STFPDUX c01, B, INC2
  190. STFPDUX c03, B, INC2
  191. STFPDUX c05, B, INC2
  192. STFPDUX c07, B, INC2
  193. bdnz LL(22)
  194. .align 4
  /* Remainder of M (0..3 elements) for the single column.              */
  195. LL(25):
  196. andi. r0, M, 3
  197. ble LL(99)
  198. andi. r0, M, 2
  199. beq LL(27)
  200. LFPDUX c01, AO1, INC2
  201. LFPDUX c03, AO1, INC2
  202. STFPDUX c01, B, INC2
  203. STFPDUX c03, B, INC2
  204. .align 4
  205. LL(27):
  206. andi. r0, M, 1
  207. beq LL(99)
  208. LFPDUX c01, AO1, INC2
  209. STFPDUX c01, B, INC2
  210. .align 4
  /* Epilogue for the aligned path: undo the prologue in reverse.       */
  /* SP currently points at the saved r30. Step 4 bytes below it so the */
  /* two lwzu (+4 each) reload r30 then r31; then step 12 bytes further */
  /* down so the two lfpdux (+16 each) reload f15 then f14; the final   */
  /* addi returns SP to its value on entry.                             */
  211. LL(99):
  212. addi SP, SP, -4
  213. lwzu r30, 4(SP)
  214. lwzu r31, 4(SP)
  215. subi SP, SP, 12
  216. li r0, 16
  217. lfpdux f15, SP, r0
  218. lfpdux f14, SP, r0
  219. addi SP, SP, 16
  220. blr
  221. .align 4
  /* ---------------- unaligned path: scalar loads -------------------- */
  /* Reached when A is not a multiple of 2 * SIZE. Each element is read */
  /* as two scalars stepping by INC (1 * SIZE), so A is pre-biased by   */
  /* 1 * SIZE instead of 2 * SIZE. Stores to B are still paired; B was  */
  /* already pre-biased before the alignment test.                      */
  222. LL(100):
  223. subi A, A, 1 * SIZE
  /* J = number of column pairs; skip to the odd-column tail if zero.   */
  224. srawi. J, N, 1
  225. ble LL(120)
  226. .align 4
  /* Outer loop: one iteration per pair of columns.                     */
  227. LL(111):
  228. mr AO1, A
  229. add AO2, A, LDA
  230. add A, AO2, LDA
  /* CTR = M / 4 (half the unroll of the aligned path, because each     */
  /* element needs two staging registers here).                         */
  231. srawi. r0, M, 2
  232. mtspr CTR, r0
  233. ble LL(115)
  234. .align 4
  /* Main loop: 4 elements from each column per iteration. Each         */
  /* odd/even register pair (c01/c02, c03/c04, ...) holds the two       */
  /* halves of one element; fsmfp folds the even register into the odd  */
  /* one (presumably into its secondary half -- confirm against the     */
  /* FPU documentation) so that a single STFPDUX writes the whole       */
  /* element. The fsmfp instructions are interleaved with the remaining */
  /* loads and the first stores; keep this ordering.                    */
  235. LL(112):
  236. LFDUX c01, AO1, INC
  237. LFDUX c02, AO1, INC
  238. LFDUX c03, AO2, INC
  239. LFDUX c04, AO2, INC
  240. LFDUX c05, AO1, INC
  241. LFDUX c06, AO1, INC
  242. LFDUX c07, AO2, INC
  243. LFDUX c08, AO2, INC
  244. LFDUX c09, AO1, INC
  245. LFDUX c10, AO1, INC
  246. LFDUX c11, AO2, INC
  247. LFDUX c12, AO2, INC
  248. fsmfp c01, c02
  249. LFDUX c13, AO1, INC
  250. fsmfp c03, c04
  251. LFDUX c14, AO1, INC
  252. fsmfp c05, c06
  253. LFDUX c15, AO2, INC
  254. fsmfp c07, c08
  255. LFDUX c16, AO2, INC
  256. fsmfp c09, c10
  257. STFPDUX c01, B, INC2
  258. fsmfp c11, c12
  259. STFPDUX c03, B, INC2
  260. fsmfp c13, c14
  261. STFPDUX c05, B, INC2
  262. fsmfp c15, c16
  263. STFPDUX c07, B, INC2
  264. STFPDUX c09, B, INC2
  265. STFPDUX c11, B, INC2
  266. STFPDUX c13, B, INC2
  267. STFPDUX c15, B, INC2
  268. bdnz LL(112)
  269. .align 4
  /* Remainder of M (0..3 elements) for the column pair.                */
  270. LL(115):
  271. andi. r0, M, 3
  272. ble LL(119)
  /* 2 leftover elements per column?                                    */
  273. andi. r0, M, 2
  274. beq LL(117)
  275. LFDUX c01, AO1, INC
  276. LFDUX c02, AO1, INC
  277. LFDUX c03, AO2, INC
  278. LFDUX c04, AO2, INC
  279. LFDUX c05, AO1, INC
  280. LFDUX c06, AO1, INC
  281. LFDUX c07, AO2, INC
  282. LFDUX c08, AO2, INC
  283. fsmfp c01, c02
  284. fsmfp c03, c04
  285. fsmfp c05, c06
  286. fsmfp c07, c08
  287. STFPDUX c01, B, INC2
  288. STFPDUX c03, B, INC2
  289. STFPDUX c05, B, INC2
  290. STFPDUX c07, B, INC2
  291. .align 4
  /* 1 leftover element per column?                                     */
  292. LL(117):
  293. andi. r0, M, 1
  294. beq LL(119)
  295. LFDUX c01, AO1, INC
  296. LFDUX c02, AO1, INC
  297. LFDUX c03, AO2, INC
  298. LFDUX c04, AO2, INC
  299. fsmfp c01, c02
  300. fsmfp c03, c04
  301. STFPDUX c01, B, INC2
  302. STFPDUX c03, B, INC2
  303. .align 4
  /* Next pair of columns, if any remain.                               */
  304. LL(119):
  305. addic. J, J, -1
  306. bgt LL(111)
  307. .align 4
  /* Odd N on the unaligned path: copy the last column on its own.      */
  308. LL(120):
  309. andi. J, N, 1
  310. ble LL(999)
  311. mr AO1, A
  /* CTR = M / 4.                                                       */
  312. srawi. r0, M, 2
  313. mtspr CTR, r0
  314. ble LL(125)
  315. .align 4
  /* 4 elements per iteration: 8 scalar loads, 4 merges, 4 paired       */
  /* stores.                                                            */
  316. LL(122):
  317. LFDUX c01, AO1, INC
  318. LFDUX c02, AO1, INC
  319. LFDUX c03, AO1, INC
  320. LFDUX c04, AO1, INC
  321. LFDUX c05, AO1, INC
  322. LFDUX c06, AO1, INC
  323. LFDUX c07, AO1, INC
  324. LFDUX c08, AO1, INC
  325. fsmfp c01, c02
  326. fsmfp c03, c04
  327. fsmfp c05, c06
  328. fsmfp c07, c08
  329. STFPDUX c01, B, INC2
  330. STFPDUX c03, B, INC2
  331. STFPDUX c05, B, INC2
  332. STFPDUX c07, B, INC2
  333. bdnz LL(122)
  334. .align 4
  /* Remainder of M (0..3 elements) for the single column.              */
  335. LL(125):
  336. andi. r0, M, 3
  337. ble LL(999)
  338. andi. r0, M, 2
  339. beq LL(127)
  340. LFDUX c01, AO1, INC
  341. LFDUX c02, AO1, INC
  342. LFDUX c03, AO1, INC
  343. LFDUX c04, AO1, INC
  344. fsmfp c01, c02
  345. fsmfp c03, c04
  346. STFPDUX c01, B, INC2
  347. STFPDUX c03, B, INC2
  348. .align 4
  349. LL(127):
  350. andi. r0, M, 1
  351. beq LL(999)
  352. LFDUX c01, AO1, INC
  353. LFDUX c02, AO1, INC
  354. fsmfp c01, c02
  355. STFPDUX c01, B, INC2
  356. .align 4
  /* Epilogue for the unaligned path. Instruction-for-instruction the   */
  /* same as the one at LL(99): reload r30, r31, f15, f14 and return SP */
  /* to its value on entry.                                             */
  357. LL(999):
  358. addi SP, SP, -4
  359. lwzu r30, 4(SP)
  360. lwzu r31, 4(SP)
  361. subi SP, SP, 12
  362. li r0, 16
  363. lfpdux f15, SP, r0
  364. lfpdux f14, SP, r0
  365. addi SP, SP, 16
  366. blr
  367. EPILOGUE