You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_ncopy_hummer_4.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666
  /*********************************************************************/
  /* Copyright 2009, 2010 The University of Texas at Austin.           */
  /* All rights reserved.                                              */
  /*                                                                   */
  /* Redistribution and use in source and binary forms, with or        */
  /* without modification, are permitted provided that the following   */
  /* conditions are met:                                               */
  /*                                                                   */
  /*   1. Redistributions of source code must retain the above         */
  /*      copyright notice, this list of conditions and the following  */
  /*      disclaimer.                                                  */
  /*                                                                   */
  /*   2. Redistributions in binary form must reproduce the above      */
  /*      copyright notice, this list of conditions and the following  */
  /*      disclaimer in the documentation and/or other materials       */
  /*      provided with the distribution.                              */
  /*                                                                   */
  /*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
  /*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
  /*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
  /*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
  /*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
  /*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
  /*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
  /*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
  /*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
  /*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
  /*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
  /*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
  /*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
  /*    POSSIBILITY OF SUCH DAMAGE.                                    */
  /*                                                                   */
  /* The views and conclusions contained in the software and           */
  /* documentation are those of the authors and should not be          */
  /* interpreted as representing official policies, either expressed   */
  /* or implied, of The University of Texas at Austin.                 */
  /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included. */
  #define ASSEMBLER
  #include "common.h"

  /*
   * Register aliases used by the routine in this file.
   * The role notes describe how each register is used by that routine.
   */

  /* Values supplied by the caller. */
  #define M       r3      /* elements copied from every panel (loop trip count)   */
  #define N       r4      /* number of panels; handled in groups of 4, 2, then 1  */
  #define A       r5      /* source cursor; advanced by LDA once per panel        */
  #define LDA     r6      /* panel stride; shifted left by ZBASE_SHIFT on entry   */
  #define B       r7      /* destination cursor; advanced by every paired store   */

  /* Read cursors, one per panel of the current group. */
  #define AO1     r8
  #define AO2     r9
  #define AO3     r10
  #define AO4     r11

  /* Countdown over panel groups. */
  #define J       r12

  /* Index registers for the update-form loads/stores (r30/r31 are saved on entry). */
  #define INC     r30     /* loaded with 1 * SIZE */
  #define INC2    r31     /* loaded with 2 * SIZE */

  /* Floating-point staging registers f0 .. f15 (f14/f15 are saved on entry). */
  #define c01     f0
  #define c02     f1
  #define c03     f2
  #define c04     f3
  #define c05     f4
  #define c06     f5
  #define c07     f6
  #define c08     f7
  #define c09     f8
  #define c10     f9
  #define c11     f10
  #define c12     f11
  #define c13     f12
  #define c14     f13
  #define c15     f14
  #define c16     f15
  /*
   * Copy routine: reads M two-value elements (2 * SIZE bytes each) from each
   * of N panels of A, consecutive panels being LDA (scaled by ZBASE_SHIFT)
   * apart, and writes them to B as one contiguous stream.
   *
   * Panels are consumed four at a time, then a remainder of two, then one.
   * Inside a group the output is interleaved element by element:
   *     B <- AO1[i], AO2[i], AO3[i], AO4[i]        for i = 0 .. M-1
   *
   * Two code paths perform the same copy:
   *   LL(11)  .. LL(37)   A is a multiple of 2 * SIZE: one paired load
   *                       (LFPDUX) per element.
   *   LL(100) .. LL(137)  any other A: the two halves of an element are
   *                       loaded one value at a time and joined in a
   *                       register pair before the paired store.
   *
   * LFPDUX, STFPDUX, LFDUX, LFSDUX, LL(), PROLOGUE, PROFCODE and EPILOGUE
   * are not defined in this file (presumably they come from common.h).
   * The comments assume the upper-case load/store macros map to the
   * update-indexed forms, i.e. the base register is advanced by the index
   * register before the access -- TODO confirm against common.h.
   *
   * NOTE(review): B is always written with paired stores and is never
   * checked for alignment; presumably the caller guarantees it.
   */
          PROLOGUE
          PROFCODE

          /* Save f14/f15 (used as c15/c16) and r31/r30 (INC2/INC) below SP. */
          li      r0, -16
          stfpdux f14, SP, r0
          stfpdux f15, SP, r0
          stwu    r31, -4(SP)
          stwu    r30, -4(SP)

          slwi    LDA, LDA, ZBASE_SHIFT

          /* Nothing to copy when either dimension is <= 0. */
          cmpwi   cr0, M, 0
          ble-    LL(99)
          cmpwi   cr0, N, 0
          ble-    LL(99)

          li      INC,  1 * SIZE
          li      INC2, 2 * SIZE

          /* Bias B down by one stride: every store advances B by INC2 first. */
          subi    B, B, 2 * SIZE

          /* A not a multiple of 2 * SIZE -> take the one-value-at-a-time path. */
          andi.   r0, A, 2 * SIZE - 1
          bne     LL(100)

  /* ------------------------------------------------------------------ */
  /* Path 1: paired loads                                                */
  /* ------------------------------------------------------------------ */

          /* Same bias for A: the paired loads advance by INC2 first. */
          subi    A, A, 2 * SIZE

          srawi.  J, N, 2                 /* J = N / 4 groups of four panels */
          ble     LL(20)
          .align 4

  /* Four panels per pass. */
  LL(11):
          mr      AO1, A
          add     AO2, A,   LDA
          add     AO3, AO2, LDA
          add     AO4, AO3, LDA
          add     A,   AO4, LDA           /* A -> first panel of the next group */

          srawi.  r0, M, 2                /* CTR = M / 4 */
          mtspr   CTR, r0
          ble     LL(15)
          .align 4

  /* Four elements from each of the four panels per iteration. */
  LL(12):
          LFPDUX  c01, AO1, INC2
          LFPDUX  c05, AO2, INC2
          LFPDUX  c09, AO3, INC2
          LFPDUX  c13, AO4, INC2

          LFPDUX  c02, AO1, INC2
          LFPDUX  c06, AO2, INC2
          LFPDUX  c10, AO3, INC2
          LFPDUX  c14, AO4, INC2

          LFPDUX  c03, AO1, INC2
          LFPDUX  c07, AO2, INC2
          LFPDUX  c11, AO3, INC2
          LFPDUX  c15, AO4, INC2

          LFPDUX  c04, AO1, INC2
          LFPDUX  c08, AO2, INC2
          LFPDUX  c12, AO3, INC2
          LFPDUX  c16, AO4, INC2

          STFPDUX c01, B, INC2
          STFPDUX c05, B, INC2
          STFPDUX c09, B, INC2
          STFPDUX c13, B, INC2

          STFPDUX c02, B, INC2
          STFPDUX c06, B, INC2
          STFPDUX c10, B, INC2
          STFPDUX c14, B, INC2

          STFPDUX c03, B, INC2
          STFPDUX c07, B, INC2
          STFPDUX c11, B, INC2
          STFPDUX c15, B, INC2

          STFPDUX c04, B, INC2
          STFPDUX c08, B, INC2
          STFPDUX c12, B, INC2
          STFPDUX c16, B, INC2
          bdnz    LL(12)
          .align 4

  /* Leftover M % 4 elements of the four-panel group. */
  LL(15):
          /* andi. never produces a negative result, so "zero" is the only skip case. */
          andi.   r0, M, 3
          beq     LL(19)

          andi.   r0, M, 2
          beq     LL(17)

          LFPDUX  c01, AO1, INC2
          LFPDUX  c05, AO2, INC2
          LFPDUX  c09, AO3, INC2
          LFPDUX  c13, AO4, INC2

          LFPDUX  c02, AO1, INC2
          LFPDUX  c06, AO2, INC2
          LFPDUX  c10, AO3, INC2
          LFPDUX  c14, AO4, INC2

          STFPDUX c01, B, INC2
          STFPDUX c05, B, INC2
          STFPDUX c09, B, INC2
          STFPDUX c13, B, INC2

          STFPDUX c02, B, INC2
          STFPDUX c06, B, INC2
          STFPDUX c10, B, INC2
          STFPDUX c14, B, INC2
          .align 4

  LL(17):
          andi.   r0, M, 1
          beq     LL(19)

          LFPDUX  c01, AO1, INC2
          LFPDUX  c05, AO2, INC2
          LFPDUX  c09, AO3, INC2
          LFPDUX  c13, AO4, INC2

          STFPDUX c01, B, INC2
          STFPDUX c05, B, INC2
          STFPDUX c09, B, INC2
          STFPDUX c13, B, INC2
          .align 4

  LL(19):
          addic.  J, J, -1
          bgt     LL(11)
          .align 4

  /* Remainder: two panels (bit 1 of N). */
  LL(20):
          andi.   J, N, 2
          beq     LL(30)

          mr      AO1, A
          add     AO2, A,   LDA
          add     A,   AO2, LDA

          srawi.  r0, M, 2
          mtspr   CTR, r0
          ble     LL(25)
          .align 4

  LL(22):
          LFPDUX  c01, AO1, INC2
          LFPDUX  c05, AO2, INC2
          LFPDUX  c02, AO1, INC2
          LFPDUX  c06, AO2, INC2

          LFPDUX  c03, AO1, INC2
          LFPDUX  c07, AO2, INC2
          LFPDUX  c04, AO1, INC2
          LFPDUX  c08, AO2, INC2

          STFPDUX c01, B, INC2
          STFPDUX c05, B, INC2
          STFPDUX c02, B, INC2
          STFPDUX c06, B, INC2

          STFPDUX c03, B, INC2
          STFPDUX c07, B, INC2
          STFPDUX c04, B, INC2
          STFPDUX c08, B, INC2
          bdnz    LL(22)
          .align 4

  LL(25):
          andi.   r0, M, 3
          beq     LL(30)

          andi.   r0, M, 2
          beq     LL(27)

          LFPDUX  c01, AO1, INC2
          LFPDUX  c05, AO2, INC2
          LFPDUX  c02, AO1, INC2
          LFPDUX  c06, AO2, INC2

          STFPDUX c01, B, INC2
          STFPDUX c05, B, INC2
          STFPDUX c02, B, INC2
          STFPDUX c06, B, INC2
          .align 4

  LL(27):
          andi.   r0, M, 1
          beq     LL(30)

          LFPDUX  c01, AO1, INC2
          LFPDUX  c05, AO2, INC2

          STFPDUX c01, B, INC2
          STFPDUX c05, B, INC2
          .align 4

  /* Remainder: a single panel (bit 0 of N). */
  LL(30):
          andi.   J, N, 1
          beq     LL(99)

          mr      AO1, A

          srawi.  r0, M, 2
          mtspr   CTR, r0
          ble     LL(35)
          .align 4

  LL(32):
          LFPDUX  c01, AO1, INC2
          LFPDUX  c02, AO1, INC2
          LFPDUX  c03, AO1, INC2
          LFPDUX  c04, AO1, INC2

          STFPDUX c01, B, INC2
          STFPDUX c02, B, INC2
          STFPDUX c03, B, INC2
          STFPDUX c04, B, INC2
          bdnz    LL(32)
          .align 4

  LL(35):
          andi.   r0, M, 3
          beq     LL(99)

          andi.   r0, M, 2
          beq     LL(37)

          LFPDUX  c01, AO1, INC2
          LFPDUX  c02, AO1, INC2

          STFPDUX c01, B, INC2
          STFPDUX c02, B, INC2
          .align 4

  LL(37):
          andi.   r0, M, 1
          beq     LL(99)

          LFPDUX  c01, AO1, INC2
          STFPDUX c01, B, INC2
          .align 4

  /*
   * Exit for path 1 and for the early outs.
   * Save area relative to the entry SP: r30 at -40, r31 at -36,
   * f15 at -32, f14 at -16.  SP is -40 here.
   */
  LL(99):
          addi    SP, SP, -4              /* so the first +4 update lands on r30 */
          lwzu    r30, 4(SP)
          lwzu    r31, 4(SP)
          subi    SP, SP, 12              /* 16 below the f15 slot */
          li      r0, 16
          lfpdux  f15, SP, r0
          lfpdux  f14, SP, r0
          addi    SP, SP, 16              /* back to the entry SP */
          blr
          .align 4

  /* ------------------------------------------------------------------ */
  /* Path 2: A is not a multiple of 2 * SIZE                             */
  /* ------------------------------------------------------------------ */

  LL(100):
          /* Bias A by one value: these loads advance by INC (1 * SIZE) first. */
          subi    A, A, 1 * SIZE

          srawi.  J, N, 2
          ble     LL(120)
          .align 4

  /* Four panels per pass. */
  LL(111):
          mr      AO1, A
          add     AO2, A,   LDA
          add     AO3, AO2, LDA
          add     AO4, AO3, LDA
          add     A,   AO4, LDA

          srawi.  r0, M, 2
          mtspr   CTR, r0
          ble     LL(115)
          .align 4

  /*
   * Each register is filled by an LFDUX followed by an LFSDUX from the
   * same panel (presumably primary half, then secondary half).
   */
  LL(112):
          LFDUX   c01, AO1, INC
          LFDUX   c05, AO2, INC
          LFDUX   c09, AO3, INC
          LFDUX   c13, AO4, INC

          LFSDUX  c01, AO1, INC
          LFSDUX  c05, AO2, INC
          LFSDUX  c09, AO3, INC
          LFSDUX  c13, AO4, INC

          LFDUX   c02, AO1, INC
          LFDUX   c06, AO2, INC
          LFDUX   c10, AO3, INC
          LFDUX   c14, AO4, INC

          LFSDUX  c02, AO1, INC
          LFSDUX  c06, AO2, INC
          LFSDUX  c10, AO3, INC
          LFSDUX  c14, AO4, INC

          LFDUX   c03, AO1, INC
          LFDUX   c07, AO2, INC
          LFDUX   c11, AO3, INC
          LFDUX   c15, AO4, INC

          LFSDUX  c03, AO1, INC
          LFSDUX  c07, AO2, INC
          LFSDUX  c11, AO3, INC
          LFSDUX  c15, AO4, INC

          LFDUX   c04, AO1, INC
          LFDUX   c08, AO2, INC
          LFDUX   c12, AO3, INC
          LFDUX   c16, AO4, INC

          LFSDUX  c04, AO1, INC
          LFSDUX  c08, AO2, INC
          LFSDUX  c12, AO3, INC
          LFSDUX  c16, AO4, INC

          STFPDUX c01, B, INC2
          STFPDUX c05, B, INC2
          STFPDUX c09, B, INC2
          STFPDUX c13, B, INC2

          STFPDUX c02, B, INC2
          STFPDUX c06, B, INC2
          STFPDUX c10, B, INC2
          STFPDUX c14, B, INC2

          STFPDUX c03, B, INC2
          STFPDUX c07, B, INC2
          STFPDUX c11, B, INC2
          STFPDUX c15, B, INC2

          STFPDUX c04, B, INC2
          STFPDUX c08, B, INC2
          STFPDUX c12, B, INC2
          STFPDUX c16, B, INC2
          bdnz    LL(112)
          .align 4

  /*
   * Leftover M % 4 elements.  From here on each value goes to its own
   * register and "fsmfp cX, cY" joins the two (presumably cY's primary
   * half is copied into cX's secondary half); only cX is stored.
   */
  LL(115):
          andi.   r0, M, 3
          beq     LL(119)

          andi.   r0, M, 2
          beq     LL(117)

          LFDUX   c01, AO1, INC
          LFDUX   c02, AO1, INC
          LFDUX   c05, AO2, INC
          LFDUX   c06, AO2, INC

          LFDUX   c09, AO3, INC
          LFDUX   c10, AO3, INC
          LFDUX   c13, AO4, INC
          LFDUX   c14, AO4, INC

          LFDUX   c03, AO1, INC
          LFDUX   c04, AO1, INC
          LFDUX   c07, AO2, INC
          LFDUX   c08, AO2, INC

          /* The joins are interleaved with the remaining loads and first stores. */
          fsmfp   c01, c02
          LFDUX   c11, AO3, INC
          fsmfp   c05, c06
          LFDUX   c12, AO3, INC
          fsmfp   c09, c10
          LFDUX   c15, AO4, INC
          fsmfp   c13, c14
          LFDUX   c16, AO4, INC

          fsmfp   c03, c04
          STFPDUX c01, B, INC2
          fsmfp   c07, c08
          STFPDUX c05, B, INC2
          fsmfp   c11, c12
          STFPDUX c09, B, INC2
          fsmfp   c15, c16
          STFPDUX c13, B, INC2

          STFPDUX c03, B, INC2
          STFPDUX c07, B, INC2
          STFPDUX c11, B, INC2
          STFPDUX c15, B, INC2
          .align 4

  LL(117):
          andi.   r0, M, 1
          beq     LL(119)

          LFDUX   c01, AO1, INC
          LFDUX   c02, AO1, INC
          LFDUX   c03, AO2, INC
          LFDUX   c04, AO2, INC

          LFDUX   c05, AO3, INC
          LFDUX   c06, AO3, INC
          LFDUX   c07, AO4, INC
          LFDUX   c08, AO4, INC

          fsmfp   c01, c02
          fsmfp   c03, c04
          fsmfp   c05, c06
          fsmfp   c07, c08

          STFPDUX c01, B, INC2
          STFPDUX c03, B, INC2
          STFPDUX c05, B, INC2
          STFPDUX c07, B, INC2
          .align 4

  LL(119):
          addic.  J, J, -1
          bgt     LL(111)
          .align 4

  /* Remainder: two panels (bit 1 of N). */
  LL(120):
          andi.   J, N, 2
          beq     LL(130)

          mr      AO1, A
          add     AO2, A,   LDA
          add     A,   AO2, LDA

          srawi.  r0, M, 2
          mtspr   CTR, r0
          ble     LL(125)
          .align 4

  LL(122):
          LFDUX   c01, AO1, INC
          LFDUX   c02, AO1, INC
          LFDUX   c09, AO2, INC
          LFDUX   c10, AO2, INC

          LFDUX   c03, AO1, INC
          LFDUX   c04, AO1, INC
          LFDUX   c11, AO2, INC
          LFDUX   c12, AO2, INC

          LFDUX   c05, AO1, INC
          LFDUX   c06, AO1, INC
          LFDUX   c13, AO2, INC
          LFDUX   c14, AO2, INC

          fsmfp   c01, c02
          LFDUX   c07, AO1, INC
          fsmfp   c09, c10
          LFDUX   c08, AO1, INC
          fsmfp   c03, c04
          LFDUX   c15, AO2, INC
          fsmfp   c11, c12
          LFDUX   c16, AO2, INC

          fsmfp   c05, c06
          STFPDUX c01, B, INC2
          fsmfp   c13, c14
          STFPDUX c09, B, INC2
          fsmfp   c07, c08
          STFPDUX c03, B, INC2
          fsmfp   c15, c16
          STFPDUX c11, B, INC2

          STFPDUX c05, B, INC2
          STFPDUX c13, B, INC2
          STFPDUX c07, B, INC2
          STFPDUX c15, B, INC2
          bdnz    LL(122)
          .align 4

  LL(125):
          andi.   r0, M, 3
          beq     LL(130)

          andi.   r0, M, 2
          beq     LL(127)

          LFDUX   c01, AO1, INC
          LFDUX   c02, AO1, INC
          LFDUX   c03, AO2, INC
          LFDUX   c04, AO2, INC

          LFDUX   c05, AO1, INC
          LFDUX   c06, AO1, INC
          LFDUX   c07, AO2, INC
          LFDUX   c08, AO2, INC

          fsmfp   c01, c02
          fsmfp   c03, c04
          fsmfp   c05, c06
          fsmfp   c07, c08

          STFPDUX c01, B, INC2
          STFPDUX c03, B, INC2
          STFPDUX c05, B, INC2
          STFPDUX c07, B, INC2
          .align 4

  LL(127):
          andi.   r0, M, 1
          beq     LL(130)

          LFDUX   c01, AO1, INC
          LFDUX   c02, AO1, INC
          LFDUX   c03, AO2, INC
          LFDUX   c04, AO2, INC

          fsmfp   c01, c02
          fsmfp   c03, c04

          STFPDUX c01, B, INC2
          STFPDUX c03, B, INC2
          .align 4

  /* Remainder: a single panel (bit 0 of N). */
  LL(130):
          andi.   J, N, 1
          beq     LL(999)

          mr      AO1, A

          srawi.  r0, M, 2
          mtspr   CTR, r0
          ble     LL(135)
          .align 4

  LL(132):
          LFDUX   c01, AO1, INC
          LFDUX   c02, AO1, INC
          LFDUX   c03, AO1, INC
          LFDUX   c04, AO1, INC

          LFDUX   c05, AO1, INC
          LFDUX   c06, AO1, INC
          LFDUX   c07, AO1, INC
          LFDUX   c08, AO1, INC

          fsmfp   c01, c02
          fsmfp   c03, c04
          fsmfp   c05, c06
          fsmfp   c07, c08

          STFPDUX c01, B, INC2
          STFPDUX c03, B, INC2
          STFPDUX c05, B, INC2
          STFPDUX c07, B, INC2
          bdnz    LL(132)
          .align 4

  LL(135):
          andi.   r0, M, 3
          beq     LL(999)

          andi.   r0, M, 2
          beq     LL(137)

          LFDUX   c01, AO1, INC
          LFDUX   c02, AO1, INC
          LFDUX   c03, AO1, INC
          LFDUX   c04, AO1, INC

          fsmfp   c01, c02
          fsmfp   c03, c04

          STFPDUX c01, B, INC2
          STFPDUX c03, B, INC2
          .align 4

  LL(137):
          andi.   r0, M, 1
          beq     LL(999)

          LFDUX   c01, AO1, INC
          LFDUX   c02, AO1, INC

          fsmfp   c01, c02
          STFPDUX c01, B, INC2
          .align 4

  /* Exit for path 2: same restore sequence as LL(99). */
  LL(999):
          addi    SP, SP, -4
          lwzu    r30, 4(SP)
          lwzu    r31, 4(SP)
          subi    SP, SP, 12
          li      r0, 16
          lfpdux  f15, SP, r0
          lfpdux  f14, SP, r0
          addi    SP, SP, 16
          blr
          .align 4

          EPILOGUE