You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_tcopy_hummer_4.S 13 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register aliases used by the copy routine below.
   *
   * Incoming values (r3..r7), as the routine uses them:
   *   M   - number of LDA-strided source vectors; consumed in groups of
   *         four, then two, then one.
   *   N   - number of elements copied from each source vector; consumed
   *         in groups of four, then two, then one.
   *   A   - source pointer.
   *   LDA - distance between consecutive source vectors; the routine
   *         shifts it left by ZBASE_SHIFT before using it as a byte offset.
   *   B   - destination pointer.
   */
  40. #define M r3
  41. #define N r4
  42. #define A r5
  43. #define LDA r6
  44. #define B r7
  /*
   * AO1..AO4 - read cursors for up to four source vectors processed together.
   * NOTE: r8 and r9 are also used directly as scratch mask registers
   * during setup, before AO1/AO2 are first assigned.
   */
  45. #define AO1 r8
  46. #define AO2 r9
  47. #define AO3 r10
  48. #define AO4 r11
  /*
   * Registers r25..r31 are saved on entry and restored on exit by the routine.
   *   J    - outer loop counter.
   *   B1   - write cursor for the groups of four elements.
   *   B2   - write cursor for the two-element remainder (N & 2).
   *   B3   - write cursor for the one-element remainder (N & 1).
   *   M4   - byte step applied to B1 at the first store of each block,
   *          derived from M << (2 + ZBASE_SHIFT) minus a bias.
   *   INC  - 1 * SIZE, step for the scalar loads.
   *   INC2 - 2 * SIZE, step for the paired loads and stores.
   */
  49. #define J r25
  50. #define B1 r26
  51. #define B2 r27
  52. #define B3 r28
  53. #define M4 r29
  54. #define INC r30
  55. #define INC2 r31
  /*
   * c01..c16 - data registers f0..f15 that carry elements from A to B.
   * f14 and f15 (c15, c16) are saved and restored by the routine.
   */
  56. #define c01 f0
  57. #define c02 f1
  58. #define c03 f2
  59. #define c04 f3
  60. #define c05 f4
  61. #define c06 f5
  62. #define c07 f6
  63. #define c08 f7
  64. #define c09 f8
  65. #define c10 f9
  66. #define c11 f10
  67. #define c12 f11
  68. #define c13 f12
  69. #define c14 f13
  70. #define c15 f14
  71. #define c16 f15
  /*
   * Copy (packing) routine: reads M source vectors of N elements each
   * from A, consecutive vectors being LDA elements apart, and writes them
   * to B in a blocked layout. One element is 2 * SIZE bytes (one paired
   * load/store); presumably a complex value given the ZBASE_SHIFT
   * scaling - TODO confirm against common.h.
   *
   * Destination layout, as established by the pointer setup below:
   *   B1 region (starts at B): blocks built from groups of four elements.
   *      Successive blocks of the same source-vector group are
   *      M << (2 + ZBASE_SHIFT) bytes apart.
   *   B2 region (starts at B + ((N & -4) * M << ZBASE_SHIFT)):
   *      the two-element remainder (N & 2) of every vector.
   *   B3 region (starts at B + ((N & -2) * M << ZBASE_SHIFT)):
   *      the one-element remainder (N & 1) of every vector.
   *
   * Two code paths with the same store pattern:
   *   - A is a multiple of 2 * SIZE bytes: paired loads (LFPDUX).
   *   - otherwise (LL(100) onwards): scalar loads (LFDUX / LFSDUX) that
   *     are combined into pairs before the paired stores.
   *
   * All loads and stores except the final STFPDX are update forms, so
   * every cursor is biased by one step before its first use (presumably
   * the base register is advanced by the index before the access -
   * confirm against the FP2 instruction reference).
   */
  72. PROLOGUE
  73. PROFCODE
  /* Save f14 and f15 with paired stores (16 bytes each), then r31..r25. */
  74. li r0, -16
  75. stfpdux f14, SP, r0
  76. stfpdux f15, SP, r0
  77. stwu r31, -4(SP)
  78. stwu r30, -4(SP)
  79. stwu r29, -4(SP)
  80. stwu r28, -4(SP)
  81. stwu r27, -4(SP)
  82. stwu r26, -4(SP)
  83. stwu r25, -4(SP)
  /* Scale LDA to bytes; M4 = byte size of four elements times M. */
  84. slwi LDA, LDA, ZBASE_SHIFT
  85. slwi M4, M, 2 + ZBASE_SHIFT
  /* r8/r9 serve as scratch masks here (they become AO1/AO2 later): */
  /* B2 = B + ((N & -4) * M << ZBASE_SHIFT), B3 = B + ((N & -2) * M << ZBASE_SHIFT). */
  86. li r8, -4
  87. li r9, -2
  88. and B2, N, r8
  89. and B3, N, r9
  90. mullw B2, B2, M
  91. mullw B3, B3, M
  92. slwi B2, B2, ZBASE_SHIFT
  93. slwi B3, B3, ZBASE_SHIFT
  94. add B2, B2, B
  95. add B3, B3, B
  /* Nothing to copy when M <= 0 or N <= 0; registers are already saved, */
  /* so leave through the common epilogue. */
  96. cmpwi cr0, M, 0
  97. ble- LL(99)
  98. cmpwi cr0, N, 0
  99. ble- LL(99)
  /* Bias B2/B3 by one INC2 step for the update-form stores. */
  100. subi B2, B2, 2 * SIZE
  101. subi B3, B3, 2 * SIZE
  /* A 16-store block takes fifteen INC2 steps (30 * SIZE) after its first */
  /* store; removing them from M4 makes the first store of the next block */
  /* land exactly M << (2 + ZBASE_SHIFT) bytes after the previous one. */
  102. subi M4, M4, 30 * SIZE
  103. li INC, 1 * SIZE
  104. li INC2, 2 * SIZE
  /* Choose the path: A not a multiple of 2 * SIZE bytes -> scalar-load path. */
  105. andi. r0, A, 2 * SIZE - 1
  106. bne LL(100)
  /* ---- Paired-load path ---- */
  /* Bias A by one INC2 step for the update-form loads. */
  107. subi A, A, 2 * SIZE
  /* J = M / 4: number of four-vector groups. */
  108. srawi. J, M, 2
  109. ble LL(20)
  110. .align 4
  /* Outer loop: four source vectors per pass. */
  111. LL(10):
  112. mr AO1, A
  113. add AO2, A, LDA
  114. add AO3, AO2, LDA
  115. add AO4, AO3, LDA
  116. add A, AO4, LDA
  /* B1 = B - M4 so that the first store (step M4) lands on B. */
  /* The next group of four vectors starts 32 * SIZE bytes further into B. */
  117. sub B1, B, M4
  118. addi B, B, 32 * SIZE
  /* CTR = N / 4 inner iterations. */
  119. srawi. r0, N, 2
  120. mtspr CTR, r0
  121. ble LL(15)
  122. .align 4
  /* Inner loop: four elements from each of the four vectors. Loads are */
  /* interleaved across AO1..AO4; stores go out in register order, so */
  /* each vector's four elements (c01-c04, c05-c08, ...) end up contiguous. */
  123. LL(12):
  124. LFPDUX c01, AO1, INC2
  125. LFPDUX c05, AO2, INC2
  126. LFPDUX c09, AO3, INC2
  127. LFPDUX c13, AO4, INC2
  128. LFPDUX c02, AO1, INC2
  129. LFPDUX c06, AO2, INC2
  130. LFPDUX c10, AO3, INC2
  131. LFPDUX c14, AO4, INC2
  132. LFPDUX c03, AO1, INC2
  133. LFPDUX c07, AO2, INC2
  134. LFPDUX c11, AO3, INC2
  135. LFPDUX c15, AO4, INC2
  136. LFPDUX c04, AO1, INC2
  137. LFPDUX c08, AO2, INC2
  138. LFPDUX c12, AO3, INC2
  139. LFPDUX c16, AO4, INC2
  /* First store jumps to the next block (step M4); the rest step by INC2. */
  140. STFPDUX c01, B1, M4
  141. STFPDUX c02, B1, INC2
  142. STFPDUX c03, B1, INC2
  143. STFPDUX c04, B1, INC2
  144. STFPDUX c05, B1, INC2
  145. STFPDUX c06, B1, INC2
  146. STFPDUX c07, B1, INC2
  147. STFPDUX c08, B1, INC2
  148. STFPDUX c09, B1, INC2
  149. STFPDUX c10, B1, INC2
  150. STFPDUX c11, B1, INC2
  151. STFPDUX c12, B1, INC2
  152. STFPDUX c13, B1, INC2
  153. STFPDUX c14, B1, INC2
  154. STFPDUX c15, B1, INC2
  155. STFPDUX c16, B1, INC2
  156. bdnz LL(12)
  157. .align 4
  /* Remainder of N for this four-vector group. The masked value is never */
  /* negative, so each "ble" after "andi." is taken only when it is zero. */
  158. LL(15):
  159. andi. r0, N, 3
  160. ble LL(19)
  /* N & 2: two elements from each of the four vectors, appended at B2. */
  161. andi. r0, N, 2
  162. ble LL(17)
  163. LFPDUX c01, AO1, INC2
  164. LFPDUX c02, AO1, INC2
  165. LFPDUX c03, AO2, INC2
  166. LFPDUX c04, AO2, INC2
  167. LFPDUX c05, AO3, INC2
  168. LFPDUX c06, AO3, INC2
  169. LFPDUX c07, AO4, INC2
  170. LFPDUX c08, AO4, INC2
  171. STFPDUX c01, B2, INC2
  172. STFPDUX c02, B2, INC2
  173. STFPDUX c03, B2, INC2
  174. STFPDUX c04, B2, INC2
  175. STFPDUX c05, B2, INC2
  176. STFPDUX c06, B2, INC2
  177. STFPDUX c07, B2, INC2
  178. STFPDUX c08, B2, INC2
  179. .align 4
  /* N & 1: one element from each of the four vectors, appended at B3. */
  180. LL(17):
  181. andi. r0, N, 1
  182. ble LL(19)
  183. LFPDUX c01, AO1, INC2
  184. LFPDUX c02, AO2, INC2
  185. LFPDUX c03, AO3, INC2
  186. LFPDUX c04, AO4, INC2
  187. STFPDUX c01, B3, INC2
  188. STFPDUX c02, B3, INC2
  189. STFPDUX c03, B3, INC2
  190. STFPDUX c04, B3, INC2
  191. .align 4
  /* Next four-vector group. */
  192. LL(19):
  193. addic. J, J, -1
  194. bgt LL(10)
  195. .align 4
  /* M & 2: one group of two source vectors. Blocks now hold 8 stores */
  /* (seven INC2 steps = 14 * SIZE), so 16 * SIZE of the bias is given back */
  /* to M4. The addi sits between andi. and ble; it is not a record form, */
  /* so the branch still tests the andi. result. */
  196. LL(20):
  197. andi. J, M, 2
  198. addi M4, M4, 16 * SIZE
  199. ble LL(30)
  200. mr AO1, A
  201. add AO2, A, LDA
  202. add A, AO2, LDA
  203. sub B1, B, M4
  204. addi B, B, 16 * SIZE
  205. srawi. r0, N, 2
  206. mtspr CTR, r0
  207. ble LL(23)
  208. .align 4
  /* Inner loop: four elements from each of the two vectors. */
  209. LL(22):
  210. LFPDUX c01, AO1, INC2
  211. LFPDUX c02, AO1, INC2
  212. LFPDUX c03, AO1, INC2
  213. LFPDUX c04, AO1, INC2
  214. LFPDUX c05, AO2, INC2
  215. LFPDUX c06, AO2, INC2
  216. LFPDUX c07, AO2, INC2
  217. LFPDUX c08, AO2, INC2
  218. STFPDUX c01, B1, M4
  219. STFPDUX c02, B1, INC2
  220. STFPDUX c03, B1, INC2
  221. STFPDUX c04, B1, INC2
  222. STFPDUX c05, B1, INC2
  223. STFPDUX c06, B1, INC2
  224. STFPDUX c07, B1, INC2
  225. STFPDUX c08, B1, INC2
  226. bdnz LL(22)
  227. .align 4
  /* N & 2 remainder for the two-vector group, appended at B2. */
  228. LL(23):
  229. andi. r0, N, 2
  230. ble LL(24)
  231. LFPDUX c01, AO1, INC2
  232. LFPDUX c02, AO1, INC2
  233. LFPDUX c03, AO2, INC2
  234. LFPDUX c04, AO2, INC2
  235. STFPDUX c01, B2, INC2
  236. STFPDUX c02, B2, INC2
  237. STFPDUX c03, B2, INC2
  238. STFPDUX c04, B2, INC2
  239. .align 4
  /* N & 1 remainder for the two-vector group, appended at B3. */
  240. LL(24):
  241. andi. r0, N, 1
  242. ble LL(30)
  243. LFPDUX c01, AO1, INC2
  244. LFPDUX c02, AO2, INC2
  245. STFPDUX c01, B3, INC2
  246. STFPDUX c02, B3, INC2
  247. .align 4
  /* M & 1: the last single source vector. Blocks now hold 4 stores */
  /* (three INC2 steps = 6 * SIZE), so another 8 * SIZE is added to M4. */
  248. LL(30):
  249. andi. J, M, 1
  250. addi M4, M4, 8 * SIZE
  251. ble LL(99)
  252. mr AO1, A
  253. sub B1, B, M4
  254. srawi. r0, N, 2
  255. mtspr CTR, r0
  256. ble LL(33)
  257. .align 4
  /* Inner loop: four elements of the single vector. */
  258. LL(32):
  259. LFPDUX c01, AO1, INC2
  260. LFPDUX c02, AO1, INC2
  261. LFPDUX c03, AO1, INC2
  262. LFPDUX c04, AO1, INC2
  263. STFPDUX c01, B1, M4
  264. STFPDUX c02, B1, INC2
  265. STFPDUX c03, B1, INC2
  266. STFPDUX c04, B1, INC2
  267. bdnz LL(32)
  268. .align 4
  /* N & 2 remainder of the single vector, appended at B2. */
  269. LL(33):
  270. andi. r0, N, 2
  271. ble LL(34)
  272. LFPDUX c01, AO1, INC2
  273. LFPDUX c02, AO1, INC2
  274. STFPDUX c01, B2, INC2
  275. STFPDUX c02, B2, INC2
  276. .align 4
  /* N & 1 remainder of the single vector. This is the last write through */
  /* B3, and a non-update store (STFPDX) is used here. */
  277. LL(34):
  278. andi. r0, N, 1
  279. ble LL(99)
  280. LFPDUX c01, AO1, INC2
  281. STFPDX c01, B3, INC2
  282. .align 4
  /* Epilogue: restore r25..r31, then f15 and f14, and return. */
  /* SP is first moved 4 below the r25 slot so each lwzu (pre-increment */
  /* by 4) lands on the next saved word; the subi 12 plus the paired-load */
  /* step of 16 then reaches the f15 slot. */
  283. LL(99):
  284. addi SP, SP, -4
  285. lwzu r25, 4(SP)
  286. lwzu r26, 4(SP)
  287. lwzu r27, 4(SP)
  288. lwzu r28, 4(SP)
  289. lwzu r29, 4(SP)
  290. lwzu r30, 4(SP)
  291. lwzu r31, 4(SP)
  292. subi SP, SP, 12
  293. li r0, 16
  294. lfpdux f15, SP, r0
  295. lfpdux f14, SP, r0
  296. addi SP, SP, 16
  297. blr
  298. .align 4
  /* ---- Scalar-load path (A not a multiple of 2 * SIZE bytes) ---- */
  /* Same structure and store pattern as above. Each element is read as */
  /* two SIZE-byte scalars, so A is biased by one INC step instead. */
  299. LL(100):
  300. subi A, A, SIZE
  301. srawi. J, M, 2
  302. ble LL(120)
  303. .align 4
  /* Outer loop: four source vectors per pass. */
  304. LL(110):
  305. mr AO1, A
  306. add AO2, A, LDA
  307. add AO3, AO2, LDA
  308. add AO4, AO3, LDA
  309. add A, AO4, LDA
  310. sub B1, B, M4
  311. addi B, B, 32 * SIZE
  312. srawi. r0, N, 2
  313. mtspr CTR, r0
  314. ble LL(115)
  315. .align 4
  /* Inner loop: each register is filled by an LFDUX followed by an LFSDUX */
  /* on the same cursor (presumably primary half, then secondary half - */
  /* confirm against the FP2 instruction reference), giving one full */
  /* element per register before the paired stores. */
  316. LL(112):
  317. LFDUX c01, AO1, INC
  318. LFDUX c05, AO2, INC
  319. LFDUX c09, AO3, INC
  320. LFDUX c13, AO4, INC
  321. LFSDUX c01, AO1, INC
  322. LFSDUX c05, AO2, INC
  323. LFSDUX c09, AO3, INC
  324. LFSDUX c13, AO4, INC
  325. LFDUX c02, AO1, INC
  326. LFDUX c06, AO2, INC
  327. LFDUX c10, AO3, INC
  328. LFDUX c14, AO4, INC
  329. LFSDUX c02, AO1, INC
  330. LFSDUX c06, AO2, INC
  331. LFSDUX c10, AO3, INC
  332. LFSDUX c14, AO4, INC
  333. LFDUX c03, AO1, INC
  334. LFDUX c07, AO2, INC
  335. LFDUX c11, AO3, INC
  336. LFDUX c15, AO4, INC
  337. LFSDUX c03, AO1, INC
  338. LFSDUX c07, AO2, INC
  339. LFSDUX c11, AO3, INC
  340. LFSDUX c15, AO4, INC
  341. LFDUX c04, AO1, INC
  342. LFDUX c08, AO2, INC
  343. LFDUX c12, AO3, INC
  344. LFDUX c16, AO4, INC
  345. LFSDUX c04, AO1, INC
  346. LFSDUX c08, AO2, INC
  347. LFSDUX c12, AO3, INC
  348. LFSDUX c16, AO4, INC
  349. STFPDUX c01, B1, M4
  350. STFPDUX c02, B1, INC2
  351. STFPDUX c03, B1, INC2
  352. STFPDUX c04, B1, INC2
  353. STFPDUX c05, B1, INC2
  354. STFPDUX c06, B1, INC2
  355. STFPDUX c07, B1, INC2
  356. STFPDUX c08, B1, INC2
  357. STFPDUX c09, B1, INC2
  358. STFPDUX c10, B1, INC2
  359. STFPDUX c11, B1, INC2
  360. STFPDUX c12, B1, INC2
  361. STFPDUX c13, B1, INC2
  362. STFPDUX c14, B1, INC2
  363. STFPDUX c15, B1, INC2
  364. STFPDUX c16, B1, INC2
  365. bdnz LL(112)
  366. .align 4
  /* Remainder of N for this four-vector group. */
  367. LL(115):
  368. andi. r0, N, 3
  369. ble LL(119)
  /* N & 2: four scalars (two elements) from each vector. Each fsmfp */
  /* cX, cY presumably moves cY's value into the secondary half of cX so */
  /* that cX holds one element for a paired store (confirm against the */
  /* FP2 reference); only the odd-numbered registers are stored. The */
  /* merges are interleaved with the remaining loads and the stores. */
  370. andi. r0, N, 2
  371. ble LL(117)
  372. LFDUX c01, AO1, INC
  373. LFDUX c02, AO1, INC
  374. LFDUX c03, AO1, INC
  375. LFDUX c04, AO1, INC
  376. LFDUX c05, AO2, INC
  377. LFDUX c06, AO2, INC
  378. LFDUX c07, AO2, INC
  379. LFDUX c08, AO2, INC
  380. LFDUX c09, AO3, INC
  381. LFDUX c10, AO3, INC
  382. LFDUX c11, AO3, INC
  383. LFDUX c12, AO3, INC
  384. fsmfp c01, c02
  385. LFDUX c13, AO4, INC
  386. fsmfp c03, c04
  387. LFDUX c14, AO4, INC
  388. fsmfp c05, c06
  389. LFDUX c15, AO4, INC
  390. fsmfp c07, c08
  391. LFDUX c16, AO4, INC
  392. fsmfp c09, c10
  393. STFPDUX c01, B2, INC2
  394. fsmfp c11, c12
  395. STFPDUX c03, B2, INC2
  396. fsmfp c13, c14
  397. STFPDUX c05, B2, INC2
  398. fsmfp c15, c16
  399. STFPDUX c07, B2, INC2
  400. STFPDUX c09, B2, INC2
  401. STFPDUX c11, B2, INC2
  402. STFPDUX c13, B2, INC2
  403. STFPDUX c15, B2, INC2
  404. .align 4
  /* N & 1: two scalars (one element) from each of the four vectors -> B3. */
  405. LL(117):
  406. andi. r0, N, 1
  407. ble LL(119)
  408. LFDUX c01, AO1, INC
  409. LFDUX c02, AO1, INC
  410. LFDUX c03, AO2, INC
  411. LFDUX c04, AO2, INC
  412. LFDUX c05, AO3, INC
  413. fsmfp c01, c02
  414. LFDUX c06, AO3, INC
  415. fsmfp c03, c04
  416. LFDUX c07, AO4, INC
  417. fsmfp c05, c06
  418. LFDUX c08, AO4, INC
  419. fsmfp c07, c08
  420. STFPDUX c01, B3, INC2
  421. STFPDUX c03, B3, INC2
  422. STFPDUX c05, B3, INC2
  423. STFPDUX c07, B3, INC2
  424. .align 4
  /* Next four-vector group. */
  425. LL(119):
  426. addic. J, J, -1
  427. bgt LL(110)
  428. .align 4
  /* M & 2: one group of two source vectors; M4 adjusted as in LL(20). */
  429. LL(120):
  430. andi. J, M, 2
  431. addi M4, M4, 16 * SIZE
  432. ble LL(130)
  433. mr AO1, A
  434. add AO2, A, LDA
  435. add A, AO2, LDA
  436. sub B1, B, M4
  437. addi B, B, 16 * SIZE
  438. srawi. r0, N, 2
  439. mtspr CTR, r0
  440. ble LL(123)
  441. .align 4
  /* Inner loop: eight scalars (four elements) from each of the two */
  /* vectors, merged pairwise with fsmfp and stored as eight pairs. */
  442. LL(122):
  443. LFDUX c01, AO1, INC
  444. LFDUX c02, AO1, INC
  445. LFDUX c03, AO1, INC
  446. LFDUX c04, AO1, INC
  447. LFDUX c05, AO1, INC
  448. LFDUX c06, AO1, INC
  449. LFDUX c07, AO1, INC
  450. LFDUX c08, AO1, INC
  451. LFDUX c09, AO2, INC
  452. LFDUX c10, AO2, INC
  453. LFDUX c11, AO2, INC
  454. LFDUX c12, AO2, INC
  455. fsmfp c01, c02
  456. LFDUX c13, AO2, INC
  457. fsmfp c03, c04
  458. LFDUX c14, AO2, INC
  459. fsmfp c05, c06
  460. LFDUX c15, AO2, INC
  461. fsmfp c07, c08
  462. LFDUX c16, AO2, INC
  463. fsmfp c09, c10
  464. STFPDUX c01, B1, M4
  465. fsmfp c11, c12
  466. STFPDUX c03, B1, INC2
  467. fsmfp c13, c14
  468. STFPDUX c05, B1, INC2
  469. fsmfp c15, c16
  470. STFPDUX c07, B1, INC2
  471. STFPDUX c09, B1, INC2
  472. STFPDUX c11, B1, INC2
  473. STFPDUX c13, B1, INC2
  474. STFPDUX c15, B1, INC2
  475. bdnz LL(122)
  476. .align 4
  /* N & 2 remainder for the two-vector group, appended at B2. */
  477. LL(123):
  478. andi. r0, N, 2
  479. ble LL(124)
  480. LFDUX c01, AO1, INC
  481. LFDUX c02, AO1, INC
  482. LFDUX c03, AO1, INC
  483. LFDUX c04, AO1, INC
  484. LFDUX c05, AO2, INC
  485. fsmfp c01, c02
  486. LFDUX c06, AO2, INC
  487. fsmfp c03, c04
  488. LFDUX c07, AO2, INC
  489. fsmfp c05, c06
  490. LFDUX c08, AO2, INC
  491. fsmfp c07, c08
  492. STFPDUX c01, B2, INC2
  493. STFPDUX c03, B2, INC2
  494. STFPDUX c05, B2, INC2
  495. STFPDUX c07, B2, INC2
  496. .align 4
  /* N & 1 remainder for the two-vector group, appended at B3. */
  497. LL(124):
  498. andi. r0, N, 1
  499. ble LL(130)
  500. LFDUX c01, AO1, INC
  501. LFDUX c02, AO1, INC
  502. LFDUX c03, AO2, INC
  503. LFDUX c04, AO2, INC
  504. fsmfp c01, c02
  505. fsmfp c03, c04
  506. STFPDUX c01, B3, INC2
  507. STFPDUX c03, B3, INC2
  508. .align 4
  /* M & 1: the last single source vector; M4 adjusted as in LL(30). */
  509. LL(130):
  510. andi. J, M, 1
  511. addi M4, M4, 8 * SIZE
  512. ble LL(999)
  513. mr AO1, A
  514. sub B1, B, M4
  515. srawi. r0, N, 2
  516. mtspr CTR, r0
  517. ble LL(133)
  518. .align 4
  /* Inner loop: eight scalars (four elements) of the single vector. */
  519. LL(132):
  520. LFDUX c01, AO1, INC
  521. LFDUX c02, AO1, INC
  522. LFDUX c03, AO1, INC
  523. LFDUX c04, AO1, INC
  524. LFDUX c05, AO1, INC
  525. fsmfp c01, c02
  526. LFDUX c06, AO1, INC
  527. fsmfp c03, c04
  528. LFDUX c07, AO1, INC
  529. fsmfp c05, c06
  530. LFDUX c08, AO1, INC
  531. fsmfp c07, c08
  532. STFPDUX c01, B1, M4
  533. STFPDUX c03, B1, INC2
  534. STFPDUX c05, B1, INC2
  535. STFPDUX c07, B1, INC2
  536. bdnz LL(132)
  537. .align 4
  /* N & 2 remainder of the single vector, appended at B2. */
  538. LL(133):
  539. andi. r0, N, 2
  540. ble LL(134)
  541. LFDUX c01, AO1, INC
  542. LFDUX c02, AO1, INC
  543. LFDUX c03, AO1, INC
  544. LFDUX c04, AO1, INC
  545. fsmfp c01, c02
  546. fsmfp c03, c04
  547. STFPDUX c01, B2, INC2
  548. STFPDUX c03, B2, INC2
  549. .align 4
  /* N & 1 remainder of the single vector; last write through B3, done */
  /* with the non-update store (STFPDX). */
  550. LL(134):
  551. andi. r0, N, 1
  552. ble LL(999)
  553. LFDUX c01, AO1, INC
  554. LFDUX c02, AO1, INC
  555. fsmfp c01, c02
  556. STFPDX c01, B3, INC2
  557. .align 4
  /* Epilogue for the scalar-load path; same sequence as LL(99). */
  558. LL(999):
  559. addi SP, SP, -4
  560. lwzu r25, 4(SP)
  561. lwzu r26, 4(SP)
  562. lwzu r27, 4(SP)
  563. lwzu r28, 4(SP)
  564. lwzu r29, 4(SP)
  565. lwzu r30, 4(SP)
  566. lwzu r31, 4(SP)
  567. subi SP, SP, 12
  568. li r0, 16
  569. lfpdux f15, SP, r0
  570. lfpdux f14, SP, r0
  571. addi SP, SP, 16
  572. blr
  573. EPILOGUE