You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_n.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer arguments that the routine reads straight from registers on entry:
     M  - row count (length of each column of A and of y),
     N  - column count (number of x elements consumed),
     A  - pointer to the current column of the matrix,
     LDA - distance between consecutive columns (scaled by BASE_SHIFT in the prologue),
     X / INCX - source vector pointer and its stride (also scaled by BASE_SHIFT). */
  40. #define M $4
  41. #define N $5
  42. #define A $8
  43. #define LDA $9
  44. #define X $10
  45. #define INCX $11
  /* Arguments that the prologue loads from the caller's stack (0, 8 and 16 bytes above $sp). */
  46. #define Y $2
  47. #define INCY $6
  48. #define BUFFER $7
  /* YORIG - base of the contiguous working copy of y (either Y itself or BUFFER). */
  49. #define YORIG $3
  /* XX / YY - running pointers used by the copy loops and the compute loops. */
  50. #define XX $12
  51. #define YY $13
  /* I - row-block / element counter, J - column counter. */
  52. #define I $14
  53. #define J $15
  /* AO1 / AO2 - pointers into the two columns of A being processed.
     $16 and $17 are saved on the stack in the prologue and restored at .L999. */
  54. #define AO1 $16
  55. #define AO2 $17
  /* ALPHA - scalar that multiplies every x element before it is used. */
  56. #define ALPHA $f15
  /* a1-a4 hold elements of the AO1 column, a5-a8 elements of the AO2 column
     (in the single-column path only a1-a4 are used). */
  57. #define a1 $f0
  58. #define a2 $f1
  59. #define a3 $f2
  60. #define a4 $f3
  61. #define a5 $f4
  62. #define a6 $f5
  63. #define a7 $f6
  64. #define a8 $f7
  /* x1 / x2 - the x elements for the current column(s), already multiplied by ALPHA. */
  65. #define x1 $f8
  66. #define x2 $f9
  /* y1-y8 - up to eight consecutive elements of the working y vector.
     $f15 is skipped here because it is ALPHA. */
  67. #define y1 $f10
  68. #define y2 $f11
  69. #define y3 $f12
  70. #define y4 $f13
  71. #define y5 $f14
  72. #define y6 $f16
  73. #define y7 $f17
  74. #define y8 $f18
  /* t1-t4 - accumulators for the unrolled loops.  $f20-$f22 (t2-t4) are saved and
     restored by the routine when __64BIT__ is not defined. */
  75. #define t1 $f19
  76. #define t2 $f20
  77. #define t3 $f21
  78. #define t4 $f22
  /*
   * Matrix-vector multiply-accumulate kernel (the file name suggests GEMV,
   * non-transposed).  For every column j < N and every row i < M:
   *
   *     y[i] += (ALPHA * x[j]) * A[i + j * LDA]
   *
   * NOTE(review): this reading assumes MUL d, a, b is d = a * b and
   * MADD d, c, a, b is d = c + a * b; both macros, like LD/ST/LDARG/SDARG,
   * SIZE, BASE_SHIFT and NOP, are not defined in this file (presumably they
   * come from common.h) -- confirm there.
   *
   * Read directly from registers: M, N, ALPHA, A, LDA, X, INCX.
   * Loaded from the caller's stack: Y, INCY, BUFFER.
   *
   * Layout of the routine:
   *   .L02 / .L06     gather a strided y into BUFFER
   *   .L11 - .L19     process two columns of A per pass
   *   .L21 - .L27     process the last column when N is odd
   *   .L900 - .L906   scatter BUFFER back into the strided y
   *   .L999           restore saved registers and return
   *
   * The instruction written directly after each branch or jump is used as
   * its delay slot (hence the explicit NOPs).  This relies on the assembler
   * not reordering instructions, which is presumably arranged by
   * common.h / PROLOGUE -- confirm.
   */
  79. PROLOGUE
  /* Fetch the three stack-passed arguments before $sp is moved. */
  80. LDARG Y, 0($sp)
  81. LDARG INCY, 8($sp)
  82. LDARG BUFFER, 16($sp)
  /* Reserve the save area: 16 bytes for $16/$17, or 48 bytes when $f20-$f22 are saved too. */
  83. #ifdef __64BIT__
  84. daddiu $sp, $sp, -16
  85. #else
  86. daddiu $sp, $sp, -48
  87. #endif
  /* Save $16/$17, which are used below as AO1/AO2. */
  88. SDARG $16, 0($sp)
  89. SDARG $17, 8($sp)
  /* Scale the column stride by BASE_SHIFT (presumably elements -> bytes). */
  90. dsll LDA, LDA, BASE_SHIFT
  /* Without __64BIT__, also preserve $f20-$f22 (used as t2-t4). */
  91. #ifndef __64BIT__
  92. sdc1 $f20, 16($sp)
  93. sdc1 $f21, 24($sp)
  94. sdc1 $f22, 32($sp)
  95. #endif
  /* Nothing to do when M <= 0 or N <= 0.  The INCX/INCY scaling sits in the delay slots. */
  96. blez M, .L999
  97. dsll INCX, INCX, BASE_SHIFT
  98. blez N, .L999
  99. dsll INCY, INCY, BASE_SHIFT
  /* If the scaled INCY equals SIZE, y is contiguous: work on Y in place
     (YORIG = Y is set in the delay slot) and skip the gather. */
  100. li YORIG, SIZE
  101. beq INCY, YORIG, .L10
  102. move YORIG, Y
  /* Strided y: BUFFER becomes the working vector.  I = M / 4 gather iterations,
     XX walks the strided source, YY walks BUFFER. */
  103. dsra I, M, 2
  104. move YORIG, BUFFER
  105. move XX, Y
  106. blez I, .L05
  107. move YY, BUFFER
  108. .align 3
  /* Gather loop: copy four strided y elements into BUFFER per iteration. */
  109. .L02:
  110. LD a1, 0 * SIZE(XX)
  111. daddu XX, XX, INCY
  112. LD a2, 0 * SIZE(XX)
  113. daddu XX, XX, INCY
  114. LD a3, 0 * SIZE(XX)
  115. daddu XX, XX, INCY
  116. LD a4, 0 * SIZE(XX)
  117. daddu XX, XX, INCY
  118. ST a1, 0 * SIZE(YY)
  119. ST a2, 1 * SIZE(YY)
  120. ST a3, 2 * SIZE(YY)
  121. ST a4, 3 * SIZE(YY)
  122. daddiu I, I, -1
  123. bgtz I, .L02
  124. daddiu YY, YY, 4 * SIZE
  125. .align 3
  /* Gather the remaining M % 4 elements one at a time. */
  126. .L05:
  127. andi I, M, 3
  128. blez I, .L10
  129. NOP
  130. .align 3
  131. .L06:
  132. LD a1, 0 * SIZE(XX)
  133. daddu XX, XX, INCY
  134. ST a1, 0 * SIZE(YY)
  135. daddiu I, I, -1
  136. bgtz I, .L06
  137. daddiu YY, YY, 1 * SIZE
  138. .align 3
  /* Main part: J = N / 2 passes, each handling two columns of A. */
  139. .L10:
  140. dsra J, N, 1
  141. blez J, .L20
  142. NOP
  143. .align 3
  /* Start of a two-column pass: fetch two x elements, set AO1/AO2 to the two
     columns, advance A by two columns, rewind YY, and scale x by ALPHA. */
  144. .L11:
  145. LD x1, 0 * SIZE(X)
  146. daddu X, X, INCX
  147. LD x2, 0 * SIZE(X)
  148. daddu X, X, INCX
  149. move AO1, A
  150. daddu AO2, A, LDA
  151. daddu A, AO2, LDA
  152. move YY, YORIG
  153. MUL x1, ALPHA, x1
  /* I = M / 8 row blocks.  The x2 scaling is in the delay slot, so it runs on both paths. */
  154. dsra I, M, 3
  155. blez I, .L15
  156. MUL x2, ALPHA, x2
  /* Preload for the first 8-row block: rows 0-3 of both columns and y[0..7]. */
  157. LD a1, 0 * SIZE(AO1)
  158. LD y1, 0 * SIZE(YY)
  159. LD a2, 1 * SIZE(AO1)
  160. LD y2, 1 * SIZE(YY)
  161. LD a3, 2 * SIZE(AO1)
  162. LD y3, 2 * SIZE(YY)
  163. LD a4, 3 * SIZE(AO1)
  164. LD y4, 3 * SIZE(YY)
  165. LD a5, 0 * SIZE(AO2)
  166. LD y5, 4 * SIZE(YY)
  167. LD a6, 1 * SIZE(AO2)
  168. LD y6, 5 * SIZE(YY)
  169. LD a7, 2 * SIZE(AO2)
  170. LD y7, 6 * SIZE(YY)
  171. LD a8, 3 * SIZE(AO2)
  /* If this is the only 8-row block, go straight to the drain code at .L13. */
  172. daddiu I, I, -1
  173. blez I, .L13
  174. LD y8, 7 * SIZE(YY)
  175. .align 3
  /* Steady-state loop: finish the current 8 rows while loading the operands
     for the following block (offsets 8..15), then advance all pointers by 8. */
  176. .L12:
  /* Rows 0-3, column AO1; meanwhile load rows 4-7 of AO1 and the next block's y[8..11]. */
  177. MADD t1, y1, x1, a1
  178. LD a1, 4 * SIZE(AO1)
  179. MADD t2, y2, x1, a2
  180. LD a2, 5 * SIZE(AO1)
  181. LD y1, 8 * SIZE(YY)
  182. LD y2, 9 * SIZE(YY)
  183. MADD t3, y3, x1, a3
  184. LD a3, 6 * SIZE(AO1)
  185. MADD t4, y4, x1, a4
  186. LD a4, 7 * SIZE(AO1)
  187. LD y3, 10 * SIZE(YY)
  188. LD y4, 11 * SIZE(YY)
  /* Rows 0-3, column AO2; meanwhile load rows 4-7 of AO2. */
  189. MADD t1, t1, x2, a5
  190. LD a5, 4 * SIZE(AO2)
  191. MADD t2, t2, x2, a6
  192. LD a6, 5 * SIZE(AO2)
  193. MADD t3, t3, x2, a7
  194. LD a7, 6 * SIZE(AO2)
  195. MADD t4, t4, x2, a8
  196. LD a8, 7 * SIZE(AO2)
  197. ST t1, 0 * SIZE(YY)
  198. ST t2, 1 * SIZE(YY)
  199. ST t3, 2 * SIZE(YY)
  200. ST t4, 3 * SIZE(YY)
  /* Rows 4-7, column AO1; meanwhile load the next block's rows 0-3 and y[12..15]. */
  201. MADD t1, y5, x1, a1
  202. LD a1, 8 * SIZE(AO1)
  203. MADD t2, y6, x1, a2
  204. LD a2, 9 * SIZE(AO1)
  205. LD y5, 12 * SIZE(YY)
  206. LD y6, 13 * SIZE(YY)
  207. MADD t3, y7, x1, a3
  208. LD a3, 10 * SIZE(AO1)
  209. MADD t4, y8, x1, a4
  210. LD a4, 11 * SIZE(AO1)
  211. LD y7, 14 * SIZE(YY)
  212. LD y8, 15 * SIZE(YY)
  /* Rows 4-7, column AO2; meanwhile load the next block's rows 0-3 of AO2. */
  213. MADD t1, t1, x2, a5
  214. LD a5, 8 * SIZE(AO2)
  215. MADD t2, t2, x2, a6
  216. LD a6, 9 * SIZE(AO2)
  217. MADD t3, t3, x2, a7
  218. LD a7, 10 * SIZE(AO2)
  219. MADD t4, t4, x2, a8
  220. LD a8, 11 * SIZE(AO2)
  221. ST t1, 4 * SIZE(YY)
  222. ST t2, 5 * SIZE(YY)
  223. ST t3, 6 * SIZE(YY)
  224. ST t4, 7 * SIZE(YY)
  225. daddiu I, I, -1
  226. daddiu YY, YY, 8 * SIZE
  227. daddiu AO1, AO1, 8 * SIZE
  228. bgtz I, .L12
  229. daddiu AO2, AO2, 8 * SIZE
  230. .align 3
  /* Drain: last 8-row block.  Same arithmetic as .L12, but nothing is loaded
     for a following block (only rows 4-7 of this block are still fetched). */
  231. .L13:
  232. MADD t1, y1, x1, a1
  233. LD a1, 4 * SIZE(AO1)
  234. MADD t2, y2, x1, a2
  235. LD a2, 5 * SIZE(AO1)
  236. MADD t3, y3, x1, a3
  237. LD a3, 6 * SIZE(AO1)
  238. MADD t4, y4, x1, a4
  239. LD a4, 7 * SIZE(AO1)
  240. MADD t1, t1, x2, a5
  241. LD a5, 4 * SIZE(AO2)
  242. MADD t2, t2, x2, a6
  243. LD a6, 5 * SIZE(AO2)
  244. MADD t3, t3, x2, a7
  245. LD a7, 6 * SIZE(AO2)
  246. MADD t4, t4, x2, a8
  247. LD a8, 7 * SIZE(AO2)
  /* Store rows 0-3 interleaved with the start of rows 4-7. */
  248. ST t1, 0 * SIZE(YY)
  249. MADD t1, y5, x1, a1
  250. ST t2, 1 * SIZE(YY)
  251. MADD t2, y6, x1, a2
  252. ST t3, 2 * SIZE(YY)
  253. MADD t3, y7, x1, a3
  254. ST t4, 3 * SIZE(YY)
  255. MADD t4, y8, x1, a4
  /* Pointers are advanced by 8 here, so rows 4-7 are stored at negative offsets below. */
  256. MADD t1, t1, x2, a5
  257. daddiu AO1, AO1, 8 * SIZE
  258. MADD t2, t2, x2, a6
  259. daddiu AO2, AO2, 8 * SIZE
  260. MADD t3, t3, x2, a7
  261. daddiu YY, YY, 8 * SIZE
  262. MADD t4, t4, x2, a8
  263. NOP
  264. ST t1, -4 * SIZE(YY)
  265. ST t2, -3 * SIZE(YY)
  266. ST t3, -2 * SIZE(YY)
  267. ST t4, -1 * SIZE(YY)
  268. .align 3
  /* Row tail, two columns: handle 4 rows if bit 2 of M is set. */
  269. .L15:
  270. andi I, M, 4
  271. NOP
  272. blez I, .L16
  273. NOP
  274. LD a1, 0 * SIZE(AO1)
  275. LD y1, 0 * SIZE(YY)
  276. LD a2, 1 * SIZE(AO1)
  277. LD y2, 1 * SIZE(YY)
  278. LD a3, 2 * SIZE(AO1)
  279. LD y3, 2 * SIZE(YY)
  280. LD a4, 3 * SIZE(AO1)
  281. LD y4, 3 * SIZE(YY)
  282. LD a5, 0 * SIZE(AO2)
  283. MADD y1, y1, x1, a1
  284. LD a6, 1 * SIZE(AO2)
  285. MADD y2, y2, x1, a2
  286. LD a7, 2 * SIZE(AO2)
  287. MADD y3, y3, x1, a3
  288. LD a8, 3 * SIZE(AO2)
  289. MADD y4, y4, x1, a4
  290. MADD y1, y1, x2, a5
  291. daddiu YY, YY, 4 * SIZE
  292. MADD y2, y2, x2, a6
  293. daddiu AO1, AO1, 4 * SIZE
  294. MADD y3, y3, x2, a7
  295. daddiu AO2, AO2, 4 * SIZE
  296. MADD y4, y4, x2, a8
  /* YY was already advanced, hence the negative store offsets. */
  297. ST y1, -4 * SIZE(YY)
  298. ST y2, -3 * SIZE(YY)
  299. ST y3, -2 * SIZE(YY)
  300. ST y4, -1 * SIZE(YY)
  301. .align 3
  /* Row tail, two columns: handle 2 rows if bit 1 of M is set. */
  302. .L16:
  303. andi I, M, 2
  304. NOP
  305. blez I, .L17
  306. NOP
  307. LD a1, 0 * SIZE(AO1)
  308. LD y1, 0 * SIZE(YY)
  309. LD a2, 1 * SIZE(AO1)
  310. LD y2, 1 * SIZE(YY)
  311. LD a5, 0 * SIZE(AO2)
  312. LD a6, 1 * SIZE(AO2)
  313. MADD y1, y1, x1, a1
  314. NOP
  315. MADD y2, y2, x1, a2
  316. daddiu YY, YY, 2 * SIZE
  317. MADD y1, y1, x2, a5
  318. daddiu AO1, AO1, 2 * SIZE
  319. MADD y2, y2, x2, a6
  320. daddiu AO2, AO2, 2 * SIZE
  321. ST y1, -2 * SIZE(YY)
  322. ST y2, -1 * SIZE(YY)
  323. .align 3
  /* Row tail, two columns: handle the final row if M is odd. */
  324. .L17:
  325. andi I, M, 1
  326. NOP
  327. blez I, .L19
  328. NOP
  329. LD y1, 0 * SIZE(YY)
  330. LD a1, 0 * SIZE(AO1)
  331. LD a5, 0 * SIZE(AO2)
  332. MADD y1, y1, x1, a1
  333. MADD y1, y1, x2, a5
  334. ST y1, 0 * SIZE(YY)
  335. .align 3
  /* Next pair of columns. */
  336. .L19:
  337. daddiu J, J, -1
  338. bgtz J, .L11
  339. NOP
  340. .align 3
  /* If N is odd, one column is left; otherwise go to the write-back stage. */
  341. .L20:
  342. andi J, N, 1
  343. blez J, .L900
  344. NOP
  345. .align 3
  /* Single-column pass: same structure as .L11-.L17 but using only x1 / AO1. */
  346. .L21:
  347. LD x1, 0 * SIZE(X)
  348. daddu X, X, INCX
  349. move YY, YORIG
  350. move AO1, A
  /* I = M / 8 row blocks; the ALPHA scaling of x1 is in the delay slot. */
  351. dsra I, M, 3
  352. blez I, .L25
  353. MUL x1, ALPHA, x1
  /* Preload rows 0-3 of the column and y[0..7] for the first block. */
  354. LD a1, 0 * SIZE(AO1)
  355. LD y1, 0 * SIZE(YY)
  356. LD a2, 1 * SIZE(AO1)
  357. LD y2, 1 * SIZE(YY)
  358. LD a3, 2 * SIZE(AO1)
  359. LD y3, 2 * SIZE(YY)
  360. LD a4, 3 * SIZE(AO1)
  361. LD y4, 3 * SIZE(YY)
  362. LD y5, 4 * SIZE(YY)
  363. LD y6, 5 * SIZE(YY)
  364. LD y7, 6 * SIZE(YY)
  365. daddiu I, I, -1
  366. blez I, .L23
  367. LD y8, 7 * SIZE(YY)
  368. .align 3
  /* Steady-state loop for one column: compute 8 rows while loading the next block. */
  369. .L22:
  370. MADD t1, y1, x1, a1
  371. LD a1, 4 * SIZE(AO1)
  372. MADD t2, y2, x1, a2
  373. LD a2, 5 * SIZE(AO1)
  374. LD y1, 8 * SIZE(YY)
  375. LD y2, 9 * SIZE(YY)
  376. MADD t3, y3, x1, a3
  377. LD a3, 6 * SIZE(AO1)
  378. MADD t4, y4, x1, a4
  379. LD a4, 7 * SIZE(AO1)
  380. LD y3, 10 * SIZE(YY)
  381. LD y4, 11 * SIZE(YY)
  382. ST t1, 0 * SIZE(YY)
  383. ST t2, 1 * SIZE(YY)
  384. ST t3, 2 * SIZE(YY)
  385. ST t4, 3 * SIZE(YY)
  386. MADD t1, y5, x1, a1
  387. LD a1, 8 * SIZE(AO1)
  388. MADD t2, y6, x1, a2
  389. LD a2, 9 * SIZE(AO1)
  390. LD y5, 12 * SIZE(YY)
  391. LD y6, 13 * SIZE(YY)
  392. MADD t3, y7, x1, a3
  393. LD a3, 10 * SIZE(AO1)
  394. MADD t4, y8, x1, a4
  395. LD a4, 11 * SIZE(AO1)
  396. LD y7, 14 * SIZE(YY)
  397. LD y8, 15 * SIZE(YY)
  398. ST t1, 4 * SIZE(YY)
  399. ST t2, 5 * SIZE(YY)
  400. ST t3, 6 * SIZE(YY)
  401. ST t4, 7 * SIZE(YY)
  402. daddiu I, I, -1
  403. daddiu YY, YY, 8 * SIZE
  404. bgtz I, .L22
  405. daddiu AO1, AO1, 8 * SIZE
  406. .align 3
  /* Drain: last 8-row block of the single column, no loads for a following block. */
  407. .L23:
  408. MADD t1, y1, x1, a1
  409. LD a1, 4 * SIZE(AO1)
  410. MADD t2, y2, x1, a2
  411. LD a2, 5 * SIZE(AO1)
  412. MADD t3, y3, x1, a3
  413. LD a3, 6 * SIZE(AO1)
  414. MADD t4, y4, x1, a4
  415. LD a4, 7 * SIZE(AO1)
  416. ST t1, 0 * SIZE(YY)
  417. MADD t1, y5, x1, a1
  418. ST t2, 1 * SIZE(YY)
  419. MADD t2, y6, x1, a2
  420. ST t3, 2 * SIZE(YY)
  421. MADD t3, y7, x1, a3
  422. ST t4, 3 * SIZE(YY)
  423. MADD t4, y8, x1, a4
  424. ST t1, 4 * SIZE(YY)
  425. ST t2, 5 * SIZE(YY)
  426. ST t3, 6 * SIZE(YY)
  427. ST t4, 7 * SIZE(YY)
  428. daddiu AO1, AO1, 8 * SIZE
  429. daddiu YY, YY, 8 * SIZE
  430. .align 3
  /* Row tail, one column: handle 4 rows if bit 2 of M is set. */
  431. .L25:
  432. andi I, M, 4
  433. NOP
  434. blez I, .L26
  435. NOP
  436. LD a1, 0 * SIZE(AO1)
  437. LD y1, 0 * SIZE(YY)
  438. LD a2, 1 * SIZE(AO1)
  439. LD y2, 1 * SIZE(YY)
  440. LD a3, 2 * SIZE(AO1)
  441. LD y3, 2 * SIZE(YY)
  442. LD a4, 3 * SIZE(AO1)
  443. LD y4, 3 * SIZE(YY)
  444. MADD y1, y1, x1, a1
  445. MADD y2, y2, x1, a2
  446. MADD y3, y3, x1, a3
  447. daddiu YY, YY, 4 * SIZE
  448. MADD y4, y4, x1, a4
  449. daddiu AO1, AO1, 4 * SIZE
  450. ST y1, -4 * SIZE(YY)
  451. ST y2, -3 * SIZE(YY)
  452. ST y3, -2 * SIZE(YY)
  453. ST y4, -1 * SIZE(YY)
  454. .align 3
  /* Row tail, one column: handle 2 rows if bit 1 of M is set. */
  455. .L26:
  456. andi I, M, 2
  457. NOP
  458. blez I, .L27
  459. NOP
  460. LD a1, 0 * SIZE(AO1)
  461. LD y1, 0 * SIZE(YY)
  462. LD a2, 1 * SIZE(AO1)
  463. LD y2, 1 * SIZE(YY)
  464. MADD y1, y1, x1, a1
  465. daddiu YY, YY, 2 * SIZE
  466. MADD y2, y2, x1, a2
  467. daddiu AO1, AO1, 2 * SIZE
  468. ST y1, -2 * SIZE(YY)
  469. ST y2, -1 * SIZE(YY)
  470. .align 3
  /* Row tail, one column: handle the final row if M is odd. */
  471. .L27:
  472. andi I, M, 1
  473. NOP
  474. blez I, .L900
  475. NOP
  476. LD y1, 0 * SIZE(YY)
  477. LD a1, 0 * SIZE(AO1)
  478. MADD y1, y1, x1, a1
  479. ST y1, 0 * SIZE(YY)
  480. .align 3
  /* Write-back stage.  If the scaled INCY equals SIZE the result was computed
     in place and we are done; otherwise scatter BUFFER back into strided y.
     I = M / 4 is computed in the branch delay slot. */
  481. .L900:
  482. li YORIG, SIZE
  483. beq INCY, YORIG, .L999
  484. dsra I, M, 2
  /* XX = BUFFER is set in the delay slot, so it is valid at .L905 as well. */
  485. blez I, .L905
  486. move XX, BUFFER
  487. .align 3
  /* Scatter loop: copy four elements from BUFFER to strided y per iteration. */
  488. .L902:
  489. LD a1, 0 * SIZE(XX)
  490. LD a2, 1 * SIZE(XX)
  491. LD a3, 2 * SIZE(XX)
  492. LD a4, 3 * SIZE(XX)
  493. ST a1, 0 * SIZE(Y)
  494. daddu Y, Y, INCY
  495. ST a2, 0 * SIZE(Y)
  496. daddu Y, Y, INCY
  497. ST a3, 0 * SIZE(Y)
  498. daddu Y, Y, INCY
  499. ST a4, 0 * SIZE(Y)
  500. daddu Y, Y, INCY
  501. daddiu I, I, -1
  502. bgtz I, .L902
  503. daddiu XX, XX, 4 * SIZE
  504. .align 3
  /* Scatter the remaining M % 4 elements one at a time. */
  505. .L905:
  506. andi I, M, 3
  507. blez I, .L999
  508. NOP
  509. .align 3
  510. .L906:
  511. LD a1, 0 * SIZE(XX)
  512. daddiu XX, XX, 1 * SIZE
  513. ST a1, 0 * SIZE(Y)
  514. daddiu I, I, -1
  515. bgtz I, .L906
  516. daddu Y, Y, INCY
  517. .align 3
  /* Common exit: restore the registers saved in the prologue and return.
     The stack pointer is restored in the delay slot of the return jump. */
  518. .L999:
  519. LDARG $16, 0($sp)
  520. LDARG $17, 8($sp)
  521. #ifndef __64BIT__
  522. ldc1 $f20, 16($sp)
  523. ldc1 $f21, 24($sp)
  524. ldc1 $f22, 32($sp)
  525. #endif
  526. j $31
  527. #ifdef __64BIT__
  528. daddiu $sp, $sp, 16
  529. #else
  530. daddiu $sp, $sp, 48
  531. #endif
  532. EPILOGUE