You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm_tcopy_hummer_4.S 9.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases for this routine. */
  /* r3-r7 are read before this file writes them, so they are the inputs */
  /* (presumably the ABI argument registers -- confirm against the C */
  /* prototype, which is not shown here). */
  /* M   : number of LDA-strided lines of A (consumed 4, then 2, then 1). */
  /* N   : number of consecutive elements taken from each line. */
  /* A   : source pointer. */
  /* LDA : distance between lines (shifted by BASE_SHIFT in the code). */
  /* B   : destination pointer. */
  40. #define M r3
  41. #define N r4
  42. #define A r5
  43. #define LDA r6
  44. #define B r7
  /* AO1-AO4: read pointers for up to four lines of A handled together. */
  /* r8 and r9 are also used as scratch for the -4 / -2 masks during setup. */
  45. #define AO1 r8
  46. #define AO2 r9
  47. #define AO3 r10
  48. #define AO4 r11
  /* r25-r31 are pushed on entry and popped on exit by the routine. */
  /* J    : countdown of 4-line groups (also receives M & 2 and M & 1). */
  /* B1   : write pointer for full groups of 4 elements per line. */
  /* B2   : write pointer for the leftover pair when N & 2 is set. */
  /* B3   : write pointer for the leftover element when N & 1 is set. */
  /* M4   : index that hops B1 from one 4-element group to the next. */
  /* INC  : loaded with 1 * SIZE; INC2 : loaded with 2 * SIZE. */
  49. #define J r25
  50. #define B1 r26
  51. #define B2 r27
  52. #define B3 r28
  53. #define M4 r29
  54. #define INC r30
  55. #define INC2 r31
  /* c01-c08: floating-point registers f0-f7 used to stage data between */
  /* the loads from A and the stores to B. */
  56. #define c01 f0
  57. #define c02 f1
  58. #define c03 f2
  59. #define c04 f3
  60. #define c05 f4
  61. #define c06 f5
  62. #define c07 f6
  63. #define c08 f7
  /* Copy routine: walks M lines of A (LDA apart) and writes them to B. */
  /* Lines are taken 4 at a time, then 2, then 1. Within each group, N is */
  /* taken 4 elements at a time (written through B1), then a leftover pair */
  /* (through B2), then a leftover single element (through B3). */
  /* Two variants of the same walk exist: .L10-.L34 is used when both A and */
  /* the byte-scaled LDA are multiples of 2 * SIZE, .L100-.L134 otherwise. */
  /* NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, SIZE, BASE_SHIFT, SP, the */
  /* upper-case load/store mnemonics and fsmfp are not defined in this file */
  /* (presumably common.h or the target assembler). Comments below assume */
  /* the usual update-indexed behavior (pointer += index, then access at */
  /* the new pointer), that LFPDUX/STFPDUX move two elements at once, and */
  /* that SIZE == 1 << BASE_SHIFT -- confirm. */
  64. PROLOGUE
  65. PROFCODE
  /* Push r31..r25 (INC2, INC, M4, B3, B2, B1, J) one word at a time. */
  66. stwu r31, -4(SP)
  67. stwu r30, -4(SP)
  68. stwu r29, -4(SP)
  69. stwu r28, -4(SP)
  70. stwu r27, -4(SP)
  71. stwu r26, -4(SP)
  72. stwu r25, -4(SP)
  /* LDA <<= BASE_SHIFT; M4 = M << (2 + BASE_SHIFT). */
  73. slwi LDA, LDA, BASE_SHIFT
  74. slwi M4, M, 2 + BASE_SHIFT
  /* r8/r9 (later AO1/AO2) hold the masks -4 and -2 for rounding N down. */
  75. li r8, -4
  76. li r9, -2
  /* B2 = B + (((N & -4) * M) << BASE_SHIFT): area for leftover pairs. */
  /* B3 = B + (((N & -2) * M) << BASE_SHIFT): area for leftover singles. */
  77. and B2, N, r8
  78. and B3, N, r9
  79. mullw B2, B2, M
  80. mullw B3, B3, M
  81. slwi B2, B2, BASE_SHIFT
  82. slwi B3, B3, BASE_SHIFT
  83. add B2, B2, B
  84. add B3, B3, B
  /* Nothing to do when M <= 0 or N <= 0; .L99 still restores r25-r31. */
  85. cmpwi cr0, M, 0
  86. ble- .L99
  87. cmpwi cr0, N, 0
  88. ble- .L99
  /* Pre-bias B2/B3 by one INC2 step so the first indexed store with INC2 */
  /* lands on the addresses computed above. */
  89. subi B2, B2, 2 * SIZE
  90. subi B3, B3, 2 * SIZE
  /* In the 4-line loop, seven INC2 stores follow the M4 store, moving B1 */
  /* by 14 * SIZE; remove that from M4 so the net hop is 4 * M * SIZE. */
  91. subi M4, M4, 14 * SIZE
  92. li INC, 1 * SIZE
  93. li INC2, 2 * SIZE
  /* Take the .L100 variant unless A and LDA are both multiples of 2 * SIZE. */
  94. andi. r0, A, 2 * SIZE - 1
  95. bne .L100
  96. andi. r0, LDA, 2 * SIZE - 1
  97. bne .L100
  /* Aligned variant: pre-bias A by one INC2 step; J = M >> 2 groups. */
  98. subi A, A, 2 * SIZE
  99. srawi. J, M, 2
  100. ble .L20
  101. .align 4
  /* .L10: set up one group of four lines. */
  102. .L10:
  /* AO1..AO4 = four consecutive lines; A moves on to the following group. */
  103. mr AO1, A
  104. add AO2, A, LDA
  105. add AO3, AO2, LDA
  106. add AO4, AO3, LDA
  107. add A, AO4, LDA
  /* B1 = B - M4 so that the first store indexed by M4 lands on B. */
  108. sub B1, B, M4
  /* The next group of lines starts 16 * SIZE further into B. */
  109. addi B, B, 16 * SIZE
  /* CTR = N >> 2 iterations of .L12; skip the loop when that is zero. */
  110. srawi. r0, N, 2
  111. mtspr CTR, r0
  112. ble .L15
  113. .align 4
  /* .L12: two INC2 loads per line (c01/c02 from AO1 ... c07/c08 from AO4), */
  /* then eight stores through B1 in c01..c08 order. */
  114. .L12:
  115. LFPDUX c01, AO1, INC2
  116. LFPDUX c02, AO1, INC2
  117. LFPDUX c03, AO2, INC2
  118. LFPDUX c04, AO2, INC2
  119. LFPDUX c05, AO3, INC2
  120. LFPDUX c06, AO3, INC2
  121. LFPDUX c07, AO4, INC2
  122. LFPDUX c08, AO4, INC2
  /* First store hops B1 by M4; the remaining seven step by INC2. */
  123. STFPDUX c01, B1, M4
  124. STFPDUX c02, B1, INC2
  125. STFPDUX c03, B1, INC2
  126. STFPDUX c04, B1, INC2
  127. STFPDUX c05, B1, INC2
  128. STFPDUX c06, B1, INC2
  129. STFPDUX c07, B1, INC2
  130. STFPDUX c08, B1, INC2
  131. bdnz .L12
  132. .align 4
  /* .L15: leftovers of N for this group; nothing to do when N & 3 == 0. */
  133. .L15:
  134. andi. r0, N, 3
  135. ble .L19
  /* N & 2: one INC2 load per line, stored through B2. */
  136. andi. r0, N, 2
  137. ble .L17
  138. LFPDUX c01, AO1, INC2
  139. LFPDUX c03, AO2, INC2
  140. LFPDUX c05, AO3, INC2
  141. LFPDUX c07, AO4, INC2
  142. STFPDUX c01, B2, INC2
  143. STFPDUX c03, B2, INC2
  144. STFPDUX c05, B2, INC2
  145. STFPDUX c07, B2, INC2
  146. .align 4
  /* .L17: N & 1: one LFDUX per line; fsmfp combines c01 with c02 and c03 */
  /* with c04 (presumably packing two scalars into one pair -- confirm) */
  /* before the two stores through B3. */
  147. .L17:
  148. andi. r0, N, 1
  149. ble .L19
  150. LFDUX c01, AO1, INC2
  151. LFDUX c02, AO2, INC2
  152. LFDUX c03, AO3, INC2
  153. LFDUX c04, AO4, INC2
  154. fsmfp c01, c02
  155. fsmfp c03, c04
  156. STFPDUX c01, B3, INC2
  157. STFPDUX c03, B3, INC2
  158. .align 4
  /* .L19: next group of four lines while J > 0. */
  159. .L19:
  160. addic. J, J, -1
  161. bgt .L10
  162. .align 4
  /* .L20: two remaining lines when M & 2 is set. */
  163. .L20:
  164. andi. J, M, 2
  /* M4 += 8 * SIZE: the 2-line loop has three INC2 stores after the M4 */
  /* store (6 * SIZE) instead of seven. The addi sits between andi. and */
  /* ble, so it runs on both outcomes; the branch tests the andi. result. */
  165. addi M4, M4, 8 * SIZE
  166. ble .L30
  167. mr AO1, A
  168. add AO2, A, LDA
  169. add A, AO2, LDA
  170. sub B1, B, M4
  171. addi B, B, 8 * SIZE
  172. srawi. r0, N, 2
  173. mtspr CTR, r0
  174. ble .L23
  175. .align 4
  /* .L22: two INC2 loads from each of the two lines, four stores via B1. */
  176. .L22:
  177. LFPDUX c01, AO1, INC2
  178. LFPDUX c02, AO1, INC2
  179. LFPDUX c03, AO2, INC2
  180. LFPDUX c04, AO2, INC2
  181. STFPDUX c01, B1, M4
  182. STFPDUX c02, B1, INC2
  183. STFPDUX c03, B1, INC2
  184. STFPDUX c04, B1, INC2
  185. bdnz .L22
  186. .align 4
  /* .L23: N & 2 leftover for the two lines, stored through B2. */
  187. .L23:
  188. andi. r0, N, 2
  189. ble .L24
  190. LFPDUX c01, AO1, INC2
  191. LFPDUX c03, AO2, INC2
  192. STFPDUX c01, B2, INC2
  193. STFPDUX c03, B2, INC2
  194. .align 4
  /* .L24: N & 1 leftover: one element per line, combined by fsmfp and */
  /* written with a single store through B3. */
  195. .L24:
  196. andi. r0, N, 1
  197. ble .L30
  198. LFDUX c01, AO1, INC2
  199. LFDUX c02, AO2, INC2
  200. fsmfp c01, c02
  201. STFPDUX c01, B3, INC2
  202. .align 4
  /* .L30: one remaining line when M & 1 is set. */
  203. .L30:
  204. andi. J, M, 1
  /* M4 += 4 * SIZE: only one INC2 store follows the M4 store here. */
  205. addi M4, M4, 4 * SIZE
  206. ble .L99
  207. mr AO1, A
  208. sub B1, B, M4
  209. srawi. r0, N, 2
  210. mtspr CTR, r0
  211. ble .L33
  212. .align 4
  /* .L32: two INC2 loads from the line, two stores via B1. */
  213. .L32:
  214. LFPDUX c01, AO1, INC2
  215. LFPDUX c02, AO1, INC2
  216. STFPDUX c01, B1, M4
  217. STFPDUX c02, B1, INC2
  218. bdnz .L32
  219. .align 4
  /* .L33: N & 2 leftover for the single line, stored through B2. */
  220. .L33:
  221. andi. r0, N, 2
  222. ble .L34
  223. LFPDUX c01, AO1, INC2
  224. STFPDUX c01, B2, INC2
  225. .align 4
  /* .L34: N & 1 leftover: final element, moved with the non-update forms */
  /* at AO1 + INC2 and B3 + INC2. */
  226. .L34:
  227. andi. r0, N, 1
  228. ble .L99
  229. LFDX c01, AO1, INC2
  230. STFDX c01, B3, INC2
  231. .align 4
  /* .L99: exit for the aligned variant and for the early M/N <= 0 checks. */
  /* SP is backed off by 4 so each lwzu 4(SP) reads the next saved word */
  /* (r25 first, r31 last); the final addi steps SP past the r31 slot. */
  232. .L99:
  233. addi SP, SP, -4
  234. lwzu r25, 4(SP)
  235. lwzu r26, 4(SP)
  236. lwzu r27, 4(SP)
  237. lwzu r28, 4(SP)
  238. lwzu r29, 4(SP)
  239. lwzu r30, 4(SP)
  240. lwzu r31, 4(SP)
  241. addi SP, SP, 4
  242. blr
  /* .L100: variant used when A or LDA is not a multiple of 2 * SIZE. */
  /* Same walk as above, but A is read one element at a time with INC; */
  /* LFDUX followed by LFSDUX on the same register presumably fills the */
  /* two halves of a pair (confirm). Stores are unchanged. */
  243. .L100:
  /* Pre-bias A by one INC step. */
  244. subi A, A, SIZE
  245. srawi. J, M, 2
  246. ble .L120
  247. .align 4
  /* .L110: set up one group of four lines (mirrors .L10). */
  248. .L110:
  249. mr AO1, A
  250. add AO2, A, LDA
  251. add AO3, AO2, LDA
  252. add AO4, AO3, LDA
  253. add A, AO4, LDA
  254. sub B1, B, M4
  255. addi B, B, 16 * SIZE
  256. srawi. r0, N, 2
  257. mtspr CTR, r0
  258. ble .L115
  259. .align 4
  /* .L112: four INC loads per line. c01/c02 come from AO1, c03/c04 from */
  /* AO2, c05/c06 from AO3, c07/c08 from AO4; stores match .L12. */
  260. .L112:
  261. LFDUX c01, AO1, INC
  262. LFDUX c03, AO2, INC
  263. LFDUX c05, AO3, INC
  264. LFDUX c07, AO4, INC
  265. LFSDUX c01, AO1, INC
  266. LFSDUX c03, AO2, INC
  267. LFSDUX c05, AO3, INC
  268. LFSDUX c07, AO4, INC
  269. LFDUX c02, AO1, INC
  270. LFDUX c04, AO2, INC
  271. LFDUX c06, AO3, INC
  272. LFDUX c08, AO4, INC
  273. LFSDUX c02, AO1, INC
  274. LFSDUX c04, AO2, INC
  275. LFSDUX c06, AO3, INC
  276. LFSDUX c08, AO4, INC
  277. STFPDUX c01, B1, M4
  278. STFPDUX c02, B1, INC2
  279. STFPDUX c03, B1, INC2
  280. STFPDUX c04, B1, INC2
  281. STFPDUX c05, B1, INC2
  282. STFPDUX c06, B1, INC2
  283. STFPDUX c07, B1, INC2
  284. STFPDUX c08, B1, INC2
  285. bdnz .L112
  286. .align 4
  /* .L115: leftovers of N for this group (mirrors .L15). */
  287. .L115:
  288. andi. r0, N, 3
  289. ble .L119
  /* N & 2: two INC loads per line, stored through B2. */
  290. andi. r0, N, 2
  291. ble .L117
  292. LFDUX c01, AO1, INC
  293. LFDUX c03, AO2, INC
  294. LFDUX c05, AO3, INC
  295. LFDUX c07, AO4, INC
  296. LFSDUX c01, AO1, INC
  297. LFSDUX c03, AO2, INC
  298. LFSDUX c05, AO3, INC
  299. LFSDUX c07, AO4, INC
  300. STFPDUX c01, B2, INC2
  301. STFPDUX c03, B2, INC2
  302. STFPDUX c05, B2, INC2
  303. STFPDUX c07, B2, INC2
  304. .align 4
  /* .L117: N & 1: one element per line, combined by fsmfp, two stores */
  /* through B3 (mirrors .L17). */
  305. .L117:
  306. andi. r0, N, 1
  307. ble .L119
  308. LFDUX c01, AO1, INC
  309. LFDUX c02, AO2, INC
  310. LFDUX c03, AO3, INC
  311. LFDUX c04, AO4, INC
  312. fsmfp c01, c02
  313. fsmfp c03, c04
  314. STFPDUX c01, B3, INC2
  315. STFPDUX c03, B3, INC2
  316. .align 4
  /* .L119: next group of four lines while J > 0. */
  317. .L119:
  318. addic. J, J, -1
  319. bgt .L110
  320. .align 4
  /* .L120: two remaining lines when M & 2 is set (mirrors .L20, including */
  /* the M4 adjustment placed between andi. and ble). */
  321. .L120:
  322. andi. J, M, 2
  323. addi M4, M4, 8 * SIZE
  324. ble .L130
  325. mr AO1, A
  326. add AO2, A, LDA
  327. add A, AO2, LDA
  328. sub B1, B, M4
  329. addi B, B, 8 * SIZE
  330. srawi. r0, N, 2
  331. mtspr CTR, r0
  332. ble .L123
  333. .align 4
  /* .L122: four INC loads from each of the two lines, four stores via B1. */
  334. .L122:
  335. LFDUX c01, AO1, INC
  336. LFDUX c03, AO2, INC
  337. LFSDUX c01, AO1, INC
  338. LFSDUX c03, AO2, INC
  339. LFDUX c02, AO1, INC
  340. LFDUX c04, AO2, INC
  341. LFSDUX c02, AO1, INC
  342. LFSDUX c04, AO2, INC
  343. STFPDUX c01, B1, M4
  344. STFPDUX c02, B1, INC2
  345. STFPDUX c03, B1, INC2
  346. STFPDUX c04, B1, INC2
  347. bdnz .L122
  348. .align 4
  /* .L123: N & 2 leftover for the two lines, stored through B2. */
  349. .L123:
  350. andi. r0, N, 2
  351. ble .L124
  352. LFDUX c01, AO1, INC
  353. LFDUX c03, AO2, INC
  354. LFSDUX c01, AO1, INC
  355. LFSDUX c03, AO2, INC
  356. STFPDUX c01, B2, INC2
  357. STFPDUX c03, B2, INC2
  358. .align 4
  /* .L124: N & 1 leftover: one element per line, combined by fsmfp, one */
  /* store through B3. */
  359. .L124:
  360. andi. r0, N, 1
  361. ble .L130
  362. LFDUX c01, AO1, INC
  363. LFDUX c02, AO2, INC
  364. fsmfp c01, c02
  365. STFPDUX c01, B3, INC2
  366. .align 4
  /* .L130: one remaining line when M & 1 is set (mirrors .L30). */
  367. .L130:
  368. andi. J, M, 1
  369. addi M4, M4, 4 * SIZE
  370. ble .L999
  371. mr AO1, A
  372. sub B1, B, M4
  373. srawi. r0, N, 2
  374. mtspr CTR, r0
  375. ble .L133
  376. .align 4
  /* .L132: four INC loads from the line into c01..c04; fsmfp combines */
  /* them into c01 and c03, which are stored through B1. */
  377. .L132:
  378. LFDUX c01, AO1, INC
  379. LFDUX c02, AO1, INC
  380. LFDUX c03, AO1, INC
  381. LFDUX c04, AO1, INC
  382. fsmfp c01, c02
  383. fsmfp c03, c04
  384. STFPDUX c01, B1, M4
  385. STFPDUX c03, B1, INC2
  386. bdnz .L132
  387. .align 4
  /* .L133: N & 2 leftover: two INC loads combined by fsmfp, one store */
  /* through B2. */
  388. .L133:
  389. andi. r0, N, 2
  390. ble .L134
  391. LFDUX c01, AO1, INC
  392. LFDUX c02, AO1, INC
  393. fsmfp c01, c02
  394. STFPDUX c01, B2, INC2
  395. .align 4
  /* .L134: N & 1 leftover: final element, moved with the non-update forms */
  /* at AO1 + INC and B3 + INC2. */
  396. .L134:
  397. andi. r0, N, 1
  398. ble .L999
  399. LFDX c01, AO1, INC
  400. STFDX c01, B3, INC2
  401. .align 4
  /* .L999: exit for the .L100 variant; same restore sequence as .L99. */
  402. .L999:
  403. addi SP, SP, -4
  404. lwzu r25, 4(SP)
  405. lwzu r26, 4(SP)
  406. lwzu r27, 4(SP)
  407. lwzu r28, 4(SP)
  408. lwzu r29, 4(SP)
  409. lwzu r30, 4(SP)
  410. lwzu r31, 4(SP)
  411. addi SP, SP, 4
  412. blr
  413. EPILOGUE