You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Register aliases used by the kernel below. Integer aliases name MIPS  */
  /* general registers and the $f aliases name floating point registers.   */
  /* The role noted next to each alias is the one the code in this file    */
  /* gives it.                                                              */
  /* NOTE(review): ASSEMBLER is defined before the include, presumably to  */
  /* select the assembler view of common.h - confirm against that header.  */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Dimensions and operands that are already in registers at entry. */
  40. #define M $4 /* length of each dot product - the inner loops step over M elements */
  41. #define N $5 /* number of columns - one y element is produced per column */
  42. #define A $8 /* start of the next unprocessed column of the matrix */
  43. #define LDA $9 /* distance between columns, shifted left by BASE_SHIFT at entry */
  44. #define X $10 /* input vector x */
  45. #define INCX $11 /* stride of x, shifted left by BASE_SHIFT at entry */
  /* Operands that the routine loads from the stack at entry. */
  46. #define Y $2 /* read pointer into y */
  47. #define INCY $6 /* stride of y, shifted left by BASE_SHIFT at entry */
  48. #define BUFFER $7 /* scratch area that receives a packed copy of x when INCX differs from SIZE */
  /* Working pointers and counters. */
  49. #define XORIG $3 /* base of the contiguous x data read by the compute loops (X or BUFFER) */
  50. #define XX $12 /* running x pointer inside one column pass */
  51. #define YY $13 /* running store pointer - fills BUFFER while packing, then writes y */
  52. #define I $14 /* inner loop counter */
  53. #define J $15 /* outer loop counter over columns */
  /* $16 and $17 are saved on the stack at entry and restored at exit. */
  54. #define AO1 $16 /* read pointer into the first column of the current pair */
  55. #define AO2 $17 /* read pointer into the second column (AO1 column + LDA) */
  /* ALPHA is only read in this file, never written, so it has to hold the */
  /* scale factor when the routine is entered.                              */
  56. #define ALPHA $f15
  /* a1..a8 stage matrix elements loaded from AO1 and AO2. */
  57. #define a1 $f0
  58. #define a2 $f1
  59. #define a3 $f2
  60. #define a4 $f3
  61. #define a5 $f4
  62. #define a6 $f5
  63. #define a7 $f6
  64. #define a8 $f7
  /* Accumulators. In the two column path y1 and y3 collect products for    */
  /* the AO1 column and y2 and y4 collect products for the AO2 column. The  */
  /* pairs are added together before y is updated.                           */
  65. #define y1 $f8
  66. #define y2 $f9
  67. #define y3 $f10
  68. #define y4 $f11
  /* x1..x8 stage eight consecutive x elements. The numbering skips $f15   */
  /* because that register is ALPHA.                                        */
  69. #define x1 $f12
  70. #define x2 $f13
  71. #define x3 $f14
  72. #define x4 $f16
  73. #define x5 $f17
  74. #define x6 $f18
  75. #define x7 $f19
  76. #define x8 $f20 /* saved and restored around the routine when __64BIT__ is not defined */
  /* Transposed matrix times vector kernel.                                   */
  /*                                                                          */
  /* For each of the N columns of A the routine forms the dot product of the */
  /* column (M elements, read through AO1 or AO2) with the vector x, then     */
  /* reads one element of y, combines it with ALPHA and the dot product       */
  /* through MADD, and stores the result back to the same y position.         */
  /* NOTE(review): MADD, MOV, ADD, LD, ST, MTC, LDARG, SDARG, SIZE and        */
  /* BASE_SHIFT come from common.h and are not visible here. The comments     */
  /* below assume MADD d, c, a, b computes d = c + a * b and that MTC $0, r   */
  /* clears r - confirm against common.h.                                     */
  /*                                                                          */
  /* Inputs: M, N, A, LDA, X and INCX are read from registers. Y, INCY and    */
  /* BUFFER are loaded from 0, 8 and 16 bytes above the incoming $sp. ALPHA   */
  /* is read from its floating point register without being set here.         */
  /*                                                                          */
  /* Stack frame: 16 bytes when __64BIT__ is defined, otherwise 32 bytes.     */
  /* It holds $16 and $17, plus $f20 when __64BIT__ is not defined.           */
  /*                                                                          */
  /* Layout of the code:                                                      */
  /*   .L02 .L06  copy strided x into BUFFER so the compute loops can read    */
  /*              x with unit stride (skipped when INCX equals SIZE)          */
  /*   .L11       start of one pass over a pair of columns (N / 2 passes)     */
  /*   .L12 .L13  eight rows per step, loads issued ahead of their use        */
  /*   .L15       four more rows when bit 2 of M is set                       */
  /*   .L18       the last M mod 4 rows, one at a time                        */
  /*   .L19       update two y elements                                       */
  /*   .L20       same sequence for the single left over column when N is odd */
  /*   .L999      restore saved registers and return                          */
  /*                                                                          */
  /* The code is laid out for MIPS branch delay slots: the instruction        */
  /* written after a branch or jump is expected to execute as well (see the   */
  /* $sp restore placed after j $31). This assumes PROLOGUE switches the      */
  /* assembler to noreorder mode - TODO confirm in common.h.                  */
  77. PROLOGUE
  /* Fetch the stack arguments before $sp is moved. */
  78. LDARG Y, 0($sp)
  79. LDARG INCY, 8($sp)
  80. LDARG BUFFER, 16($sp)
  81. #ifdef __64BIT__
  82. daddiu $sp, $sp, -16
  83. #else
  84. daddiu $sp, $sp, -32
  85. #endif
  86. MTC $0, y1 /* presumably clears accumulator y1 */
  87. SDARG $16, 0($sp) /* $16 and $17 are used as AO1 and AO2 */
  88. SDARG $17, 8($sp)
  89. dsll LDA, LDA, BASE_SHIFT /* presumably converts the element count to a byte offset */
  90. #ifndef __64BIT__
  91. sdc1 $f20, 16($sp)
  92. #endif
  /* Leave immediately when either dimension is zero or negative. The stride */
  /* scaling sits in the slots after the branches.                            */
  93. blez M, .L999
  94. dsll INCX, INCX, BASE_SHIFT
  95. blez N, .L999
  96. dsll INCY, INCY, BASE_SHIFT
  /* When the scaled INCX equals SIZE, x is used in place (XORIG = X) and    */
  /* the packing loops are skipped. Otherwise XORIG is redirected to BUFFER. */
  97. li XORIG, SIZE
  98. beq INCX, XORIG, .L10
  99. move XORIG, X
  100. dsra I, M, 2 /* I = number of groups of four elements to pack */
  101. move XORIG, BUFFER
  102. blez I, .L05
  103. move YY, BUFFER /* YY is the write cursor into BUFFER */
  104. .align 3
  /* Pack four x elements per iteration into BUFFER. */
  105. .L02:
  106. LD a1, 0 * SIZE(X)
  107. daddu X, X, INCX
  108. LD a2, 0 * SIZE(X)
  109. daddu X, X, INCX
  110. LD a3, 0 * SIZE(X)
  111. daddu X, X, INCX
  112. LD a4, 0 * SIZE(X)
  113. daddu X, X, INCX
  114. ST a1, 0 * SIZE(YY)
  115. ST a2, 1 * SIZE(YY)
  116. ST a3, 2 * SIZE(YY)
  117. ST a4, 3 * SIZE(YY)
  118. daddiu I, I, -1
  119. bgtz I, .L02
  120. daddiu YY, YY, 4 * SIZE
  121. .align 3
  /* Pack the remaining M mod 4 elements one at a time. */
  122. .L05:
  123. andi I, M, 3
  124. blez I, .L10
  125. NOP
  126. .align 3
  127. .L06:
  128. LD a1, 0 * SIZE(X)
  129. daddu X, X, INCX
  130. ST a1, 0 * SIZE(YY)
  131. daddiu I, I, -1
  132. bgtz I, .L06
  133. daddiu YY, YY, 1 * SIZE
  134. .align 3
  /* Column pair loop: J = N / 2 passes. From here on YY is the store       */
  /* cursor for y and Y is the load cursor for y.                            */
  135. .L10:
  136. dsra J, N, 1
  137. blez J, .L20
  138. move YY, Y
  139. .align 3
  /* Start of one pass: AO1 and AO2 point at two adjacent columns, A moves  */
  /* on by two columns, and y2, y3 and y4 are copied from the cleared y1.   */
  140. .L11:
  141. move AO1, A
  142. MOV y2, y1
  143. daddu AO2, A, LDA
  144. MOV y3, y1
  145. daddu A, AO2, LDA
  146. MOV y4, y1
  147. dsra I, M, 3 /* I = number of groups of eight rows */
  148. blez I, .L15
  149. move XX, XORIG /* restart x at its first element for this pass */
  /* Preload rows 0 to 3 of both columns and x elements 0 to 7 so the loop  */
  /* body can start with multiply-adds.                                      */
  150. LD a1, 0 * SIZE(AO1)
  151. LD x1, 0 * SIZE(XX)
  152. LD a2, 0 * SIZE(AO2)
  153. LD x2, 1 * SIZE(XX)
  154. LD a3, 1 * SIZE(AO1)
  155. LD x3, 2 * SIZE(XX)
  156. LD a4, 1 * SIZE(AO2)
  157. LD x4, 3 * SIZE(XX)
  158. LD a5, 2 * SIZE(AO1)
  159. LD x5, 4 * SIZE(XX)
  160. LD a6, 2 * SIZE(AO2)
  161. LD x6, 5 * SIZE(XX)
  162. LD a7, 3 * SIZE(AO1)
  163. LD x7, 6 * SIZE(XX)
  164. LD a8, 3 * SIZE(AO2)
  165. daddiu I, I, -1
  166. blez I, .L13 /* only one group of eight: go straight to the drain code */
  167. LD x8, 7 * SIZE(XX)
  168. .align 3
  /* Steady state for eight rows of two columns. Each multiply-add consumes */
  /* a value loaded earlier and is followed by the load that refills the    */
  /* same register. Rows 8 to 11 and x elements 8 to 15 are fetched for the */
  /* next trip before the pointers advance by eight elements.               */
  169. .L12:
  170. MADD y1, y1, x1, a1
  171. LD a1, 4 * SIZE(AO1)
  172. MADD y2, y2, x1, a2
  173. LD a2, 4 * SIZE(AO2)
  174. MADD y3, y3, x2, a3
  175. LD a3, 5 * SIZE(AO1)
  176. MADD y4, y4, x2, a4
  177. LD a4, 5 * SIZE(AO2)
  178. LD x1, 8 * SIZE(XX)
  179. LD x2, 9 * SIZE(XX)
  180. MADD y1, y1, x3, a5
  181. LD a5, 6 * SIZE(AO1)
  182. MADD y2, y2, x3, a6
  183. LD a6, 6 * SIZE(AO2)
  184. MADD y3, y3, x4, a7
  185. LD a7, 7 * SIZE(AO1)
  186. MADD y4, y4, x4, a8
  187. LD a8, 7 * SIZE(AO2)
  188. LD x3, 10 * SIZE(XX)
  189. LD x4, 11 * SIZE(XX)
  190. MADD y1, y1, x5, a1
  191. LD a1, 8 * SIZE(AO1)
  192. MADD y2, y2, x5, a2
  193. LD a2, 8 * SIZE(AO2)
  194. MADD y3, y3, x6, a3
  195. LD a3, 9 * SIZE(AO1)
  196. MADD y4, y4, x6, a4
  197. LD a4, 9 * SIZE(AO2)
  198. LD x5, 12 * SIZE(XX)
  199. LD x6, 13 * SIZE(XX)
  200. MADD y1, y1, x7, a5
  201. LD a5,10 * SIZE(AO1)
  202. MADD y2, y2, x7, a6
  203. LD a6,10 * SIZE(AO2)
  204. MADD y3, y3, x8, a7
  205. LD a7,11 * SIZE(AO1)
  206. MADD y4, y4, x8, a8
  207. LD a8,11 * SIZE(AO2)
  208. LD x7, 14 * SIZE(XX)
  209. LD x8, 15 * SIZE(XX)
  210. daddiu I, I, -1
  211. daddiu XX, XX, 8 * SIZE
  212. daddiu AO1, AO1, 8 * SIZE
  213. bgtz I, .L12
  214. daddiu AO2, AO2, 8 * SIZE
  215. .align 3
  /* Drain: finish the last group of eight rows. Only rows 4 to 7 of the    */
  /* group are still loaded here, nothing beyond the group is read.         */
  216. .L13:
  217. MADD y1, y1, x1, a1
  218. LD a1, 4 * SIZE(AO1)
  219. MADD y2, y2, x1, a2
  220. LD a2, 4 * SIZE(AO2)
  221. MADD y3, y3, x2, a3
  222. LD a3, 5 * SIZE(AO1)
  223. MADD y4, y4, x2, a4
  224. LD a4, 5 * SIZE(AO2)
  225. MADD y1, y1, x3, a5
  226. LD a5, 6 * SIZE(AO1)
  227. MADD y2, y2, x3, a6
  228. LD a6, 6 * SIZE(AO2)
  229. MADD y3, y3, x4, a7
  230. LD a7, 7 * SIZE(AO1)
  231. MADD y4, y4, x4, a8
  232. LD a8, 7 * SIZE(AO2)
  233. MADD y1, y1, x5, a1
  234. MADD y2, y2, x5, a2
  235. MADD y3, y3, x6, a3
  236. MADD y4, y4, x6, a4
  237. MADD y1, y1, x7, a5
  238. daddiu XX, XX, 8 * SIZE
  239. MADD y2, y2, x7, a6
  240. daddiu AO1, AO1, 8 * SIZE
  241. MADD y3, y3, x8, a7
  242. daddiu AO2, AO2, 8 * SIZE
  243. MADD y4, y4, x8, a8
  244. NOP
  245. .align 3
  /* Four row tail, taken when bit 2 of M is set. */
  246. .L15:
  247. andi I, M, 4
  248. NOP
  249. blez I, .L17
  250. NOP
  251. LD a1, 0 * SIZE(AO1)
  252. LD x1, 0 * SIZE(XX)
  253. LD a2, 0 * SIZE(AO2)
  254. LD a3, 1 * SIZE(AO1)
  255. LD x2, 1 * SIZE(XX)
  256. LD a4, 1 * SIZE(AO2)
  257. LD a5, 2 * SIZE(AO1)
  258. LD x3, 2 * SIZE(XX)
  259. MADD y1, y1, x1, a1
  260. LD a6, 2 * SIZE(AO2)
  261. MADD y2, y2, x1, a2
  262. LD a7, 3 * SIZE(AO1)
  263. MADD y3, y3, x2, a3
  264. LD x4, 3 * SIZE(XX)
  265. MADD y4, y4, x2, a4
  266. LD a8, 3 * SIZE(AO2)
  267. MADD y1, y1, x3, a5
  268. MADD y2, y2, x3, a6
  269. daddiu XX, XX, 4 * SIZE
  270. MADD y3, y3, x4, a7
  271. daddiu AO1, AO1, 4 * SIZE
  272. MADD y4, y4, x4, a8
  273. daddiu AO2, AO2, 4 * SIZE
  274. .align 3
  /* Fold the split accumulators: y1 now carries the whole AO1 column sum   */
  /* and y2 the whole AO2 column sum. Then handle the last M mod 4 rows.    */
  275. .L17:
  276. andi I, M, 3
  277. ADD y1, y1, y3
  278. blez I, .L19
  279. ADD y2, y2, y4
  280. .align 3
  281. .L18:
  282. LD x1, 0 * SIZE(XX)
  283. LD a1, 0 * SIZE(AO1)
  284. LD a2, 0 * SIZE(AO2)
  285. daddiu I, I, -1
  286. daddiu XX, XX, 1 * SIZE
  287. daddiu AO1, AO1, 1 * SIZE
  288. daddiu AO2, AO2, 1 * SIZE
  289. MADD y1, y1, x1, a1
  290. bgtz I, .L18
  291. MADD y2, y2, x1, a2
  292. .align 3
  /* Update two y elements: load through Y, combine with ALPHA and the      */
  /* column sums, store through YY. Both cursors advance by INCY per        */
  /* element. y1 is cleared again for the next pass.                        */
  293. .L19:
  294. LD a1, 0 * SIZE(Y)
  295. daddu Y, Y, INCY
  296. LD a2, 0 * SIZE(Y)
  297. daddu Y, Y, INCY
  298. MADD a1, a1, ALPHA, y1
  299. daddiu J, J, -1
  300. MADD a2, a2, ALPHA, y2
  301. MTC $0, y1
  302. ST a1, 0 * SIZE(YY)
  303. daddu YY, YY, INCY
  304. ST a2, 0 * SIZE(YY)
  305. bgtz J, .L11
  306. daddu YY, YY, INCY
  307. .align 3
  /* Left over single column, processed only when N is odd. Same structure  */
  /* as the pair path but with AO1 only. y1 and y3 are the two partial sums.*/
  308. .L20:
  309. andi J, N, 1
  310. MOV y3, y1
  311. blez J, .L999
  312. move AO1, A
  313. dsra I, M, 3 /* I = number of groups of eight rows */
  314. NOP
  315. blez I, .L25
  316. move XX, XORIG
  /* Preload rows 0 to 3 of the column and x elements 0 to 7. */
  317. LD a1, 0 * SIZE(AO1)
  318. LD x1, 0 * SIZE(XX)
  319. LD a3, 1 * SIZE(AO1)
  320. LD x2, 1 * SIZE(XX)
  321. LD a5, 2 * SIZE(AO1)
  322. LD x3, 2 * SIZE(XX)
  323. LD a7, 3 * SIZE(AO1)
  324. LD x4, 3 * SIZE(XX)
  325. LD x5, 4 * SIZE(XX)
  326. LD x6, 5 * SIZE(XX)
  327. LD x7, 6 * SIZE(XX)
  328. daddiu I, I, -1
  329. blez I, .L23
  330. LD x8, 7 * SIZE(XX)
  331. .align 3
  /* Steady state for eight rows of one column, loads issued ahead of use. */
  332. .L22:
  333. MADD y1, y1, x1, a1
  334. LD a1, 4 * SIZE(AO1)
  335. MADD y3, y3, x2, a3
  336. LD a3, 5 * SIZE(AO1)
  337. LD x1, 8 * SIZE(XX)
  338. LD x2, 9 * SIZE(XX)
  339. MADD y1, y1, x3, a5
  340. LD a5, 6 * SIZE(AO1)
  341. MADD y3, y3, x4, a7
  342. LD a7, 7 * SIZE(AO1)
  343. LD x3, 10 * SIZE(XX)
  344. LD x4, 11 * SIZE(XX)
  345. MADD y1, y1, x5, a1
  346. LD a1, 8 * SIZE(AO1)
  347. MADD y3, y3, x6, a3
  348. LD a3, 9 * SIZE(AO1)
  349. LD x5, 12 * SIZE(XX)
  350. LD x6, 13 * SIZE(XX)
  351. MADD y1, y1, x7, a5
  352. LD a5, 10 * SIZE(AO1)
  353. MADD y3, y3, x8, a7
  354. LD a7, 11 * SIZE(AO1)
  355. LD x7, 14 * SIZE(XX)
  356. LD x8, 15 * SIZE(XX)
  357. daddiu I, I, -1
  358. daddiu XX, XX, 8 * SIZE
  359. bgtz I, .L22
  360. daddiu AO1, AO1, 8 * SIZE
  361. .align 3
  /* Drain the last group of eight rows without reading past it. */
  362. .L23:
  363. MADD y1, y1, x1, a1
  364. LD a1, 4 * SIZE(AO1)
  365. MADD y3, y3, x2, a3
  366. LD a3, 5 * SIZE(AO1)
  367. MADD y1, y1, x3, a5
  368. LD a5, 6 * SIZE(AO1)
  369. MADD y3, y3, x4, a7
  370. LD a7, 7 * SIZE(AO1)
  371. MADD y1, y1, x5, a1
  372. MADD y3, y3, x6, a3
  373. MADD y1, y1, x7, a5
  374. MADD y3, y3, x8, a7
  375. daddiu XX, XX, 8 * SIZE
  376. daddiu AO1, AO1, 8 * SIZE
  377. .align 3
  /* Four row tail, taken when bit 2 of M is set. */
  378. .L25:
  379. andi I, M, 4
  380. NOP
  381. blez I, .L27
  382. NOP
  383. LD a1, 0 * SIZE(AO1)
  384. LD x1, 0 * SIZE(XX)
  385. LD a3, 1 * SIZE(AO1)
  386. LD x2, 1 * SIZE(XX)
  387. LD a5, 2 * SIZE(AO1)
  388. LD x3, 2 * SIZE(XX)
  389. MADD y1, y1, x1, a1
  390. LD a7, 3 * SIZE(AO1)
  391. MADD y3, y3, x2, a3
  392. LD x4, 3 * SIZE(XX)
  393. MADD y1, y1, x3, a5
  394. daddiu XX, XX, 4 * SIZE
  395. MADD y3, y3, x4, a7
  396. daddiu AO1, AO1, 4 * SIZE
  397. .align 3
  /* Fold y3 into y1, then handle the last M mod 4 rows one at a time. */
  398. .L27:
  399. andi I, M, 3
  400. ADD y1, y1, y3
  401. blez I, .L29
  402. NOP
  403. .align 3
  404. .L28:
  405. LD x1, 0 * SIZE(XX)
  406. LD a1, 0 * SIZE(AO1)
  407. daddiu I, I, -1
  408. daddiu XX, XX, 1 * SIZE
  409. daddiu AO1, AO1, 1 * SIZE
  410. bgtz I, .L28
  411. MADD y1, y1, x1, a1
  412. .align 3
  /* Update the final y element from the single column sum. */
  413. .L29:
  414. LD a1, 0 * SIZE(Y)
  415. daddu Y, Y, INCY
  416. MADD a1, a1, ALPHA, y1
  417. NOP
  418. ST a1, 0 * SIZE(YY)
  419. daddu YY, YY, INCY
  420. .align 3
  /* Common exit: reload the registers saved at entry and return. The $sp   */
  /* adjustment that releases the frame is written after the jump.          */
  421. .L999:
  422. LDARG $16, 0($sp)
  423. LDARG $17, 8($sp)
  424. #ifndef __64BIT__
  425. ldc1 $f20, 16($sp)
  426. #endif
  427. j $31
  428. #ifdef __64BIT__
  429. daddiu $sp, $sp, 16
  430. #else
  431. daddiu $sp, $sp, 32
  432. #endif
  433. EPILOGUE