You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ger.S 9.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register map for the kernel below.
   *
   * M is the length of the inner, unit-stride sweep over A and x;
   * N is the outer-loop trip count (one y element is consumed per pass).
   */
  40. #define M %i0
  41. #define N %i1
  /*
   * 32-bit build with DOUBLE: the entry code spills %i3/%i4 to the stack
   * and reloads that slot as ALPHA, so X is used straight from %i5 while
   * INCX, Y and INCY are loaded from the stack into %i2..%i4.
   */
  42. #if defined(DOUBLE) && !defined(__64BIT__)
  43. #define X %i5
  44. #define INCX %i2
  45. #define Y %i3
  46. #define INCY %i4
  /*
   * All other builds: X and INCX are used straight from %i4/%i5, and
   * Y and INCY are loaded from the stack into %i2/%i3 by the entry code.
   */
  47. #else
  48. #define X %i4
  49. #define INCX %i5
  50. #define Y %i2
  51. #define INCY %i3
  52. #endif
  /* A, LDA and BUFFER are always loaded from the stack by the entry code. */
  53. #define A %l0
  54. #define LDA %l1
  55. #define BUFFER %l2
  /* I: inner-loop counter; J: counter for the x-copy loops and the outer loop. */
  56. #define I %l3
  57. #define J %l4
  /*
   * A1: cursor into A for the current outer pass.
   * X1: cursor into the x data (also the write cursor while packing x).
   * XX: base of the contiguous x data, either X itself or BUFFER.
   */
  58. #define A1 %o0
  59. #define X1 %o2
  60. #define XX %o3
  /*
   * Floating-point aliases.
   *   t1-t4  product temporaries (x[i] * y1)
   *   x1-x8  elements of x
   *   a1-a8  elements of A (also used as scratch while packing x)
   *   y1     ALPHA * current y element
   *   ALPHA  the scalar multiplier
   * a9-a16 and y2 are defined but not referenced by the visible code.
   *
   * DOUBLE build: aliases step through the even-numbered registers only.
   */
  61. #ifdef DOUBLE
  62. #define t1 %f0
  63. #define t2 %f2
  64. #define t3 %f4
  65. #define t4 %f6
  66. #define x1 %f8
  67. #define x2 %f10
  68. #define x3 %f12
  69. #define x4 %f14
  70. #define x5 %f16
  71. #define x6 %f18
  72. #define x7 %f20
  73. #define x8 %f22
  74. #define a1 %f24
  75. #define a2 %f26
  76. #define a3 %f28
  77. #define a4 %f30
  78. #define a5 %f32
  79. #define a6 %f34
  80. #define a7 %f36
  81. #define a8 %f38
  82. #define a9 %f40
  83. #define a10 %f42
  84. #define a11 %f44
  85. #define a12 %f46
  86. #define a13 %f48
  87. #define a14 %f50
  88. #define a15 %f52
  89. #define a16 %f54
  90. #define y1 %f56
  91. #define y2 %f58
  92. #define ALPHA %f60
  /* Single-precision build: same roles, consecutive registers %f0..%f30. */
  93. #else
  94. #define t1 %f0
  95. #define t2 %f1
  96. #define t3 %f2
  97. #define t4 %f3
  98. #define x1 %f4
  99. #define x2 %f5
  100. #define x3 %f6
  101. #define x4 %f7
  102. #define x5 %f8
  103. #define x6 %f9
  104. #define x7 %f10
  105. #define x8 %f11
  106. #define a1 %f12
  107. #define a2 %f13
  108. #define a3 %f14
  109. #define a4 %f15
  110. #define a5 %f16
  111. #define a6 %f17
  112. #define a7 %f18
  113. #define a8 %f19
  114. #define a9 %f20
  115. #define a10 %f21
  116. #define a11 %f22
  117. #define a12 %f23
  118. #define a13 %f24
  119. #define a14 %f25
  120. #define a15 %f26
  121. #define a16 %f27
  122. #define y1 %f28
  123. #define y2 %f29
  124. #define ALPHA %f30
  125. #endif
  /* Prefetch distance, in elements, ahead of A1 in the unrolled inner loop. */
  126. #define PREFETCHSIZE 60
  /*
   * Rank-1 update kernel.
   *
   * For every outer pass j in [0, N) and every i in [0, M), with strides
   * counted in elements:
   *
   *     A[i + j * LDA] += ALPHA * y[j * INCY] * x[i * INCX]
   *
   * If x is not unit-stride it is first packed contiguously into BUFFER,
   * so the inner loop always reads x with unit stride. Returns
   * immediately when M <= 0 or N <= 0. The return sequence clears %o0
   * in its delay slot, i.e. the result is 0.
   *
   * PROLOGUE, SAVESP, EPILOGUE, STACK_START, SIZE, BASE_SHIFT and the
   * LDF/STF/FMOV/FMUL/FADD mnemonics are not defined in this file;
   * presumably they come from common.h and select the precision-specific
   * forms -- confirm there.
   *
   * Note on branches: none of the branches below use the annul bit, so
   * the instruction following each branch (its delay slot) is executed
   * whether or not the branch is taken.
   */
  127. PROLOGUE
  128. SAVESP
  129. nop
  /* ---- Argument marshalling: 32-bit ABI ---- */
  130. #ifndef __64BIT__
  131. #ifdef DOUBLE
  /* Spill both halves of alpha so it can be reloaded as one FP value. */
  132. st %i3, [%sp + STACK_START + 16]
  133. st %i4, [%sp + STACK_START + 20]
  /* Remaining arguments live on the stack. */
  134. ld [%sp + STACK_START + 28], INCX
  135. ld [%sp + STACK_START + 32], Y
  136. ld [%sp + STACK_START + 36], INCY
  137. ld [%sp + STACK_START + 40], A
  138. ld [%sp + STACK_START + 44], LDA
  139. ld [%sp + STACK_START + 48], BUFFER
  140. #else
  /* Single precision: alpha fits in %i3 alone; spill it for the FP reload. */
  141. st %i3, [%sp + STACK_START + 16]
  142. ld [%sp + STACK_START + 28], Y
  143. ld [%sp + STACK_START + 32], INCY
  144. ld [%sp + STACK_START + 36], A
  145. ld [%sp + STACK_START + 40], LDA
  146. ld [%sp + STACK_START + 44], BUFFER
  147. #endif
  /* Reload the spilled alpha into the FP register file. */
  148. LDF [%sp + STACK_START + 16], ALPHA
  /* ---- Argument marshalling: 64-bit ABI ---- */
  149. #else
  150. ldx [%sp + STACK_START + 56], Y
  151. ldx [%sp + STACK_START + 64], INCY
  152. ldx [%sp + STACK_START + 72], A
  153. ldx [%sp + STACK_START + 80], LDA
  154. ldx [%sp + STACK_START + 88], BUFFER
  /* alpha is already in an FP register: %f6 for DOUBLE, %f7 otherwise. */
  155. #ifdef DOUBLE
  156. FMOV %f6, ALPHA
  157. #else
  158. FMOV %f7, ALPHA
  159. #endif
  160. #endif
  /*
   * Scale LDA, INCX and INCY by BASE_SHIFT (presumably elements -> bytes).
   * The INCX and INCY shifts sit in branch delay slots, so they are
   * executed even when the early exit to .LL999 is taken.
   */
  161. sll LDA, BASE_SHIFT, LDA
  162. cmp M, 0
  163. ble %icc, .LL999
  164. sll INCX, BASE_SHIFT, INCX
  165. cmp N, 0
  166. ble %icc, .LL999
  167. sll INCY, BASE_SHIFT, INCY
  /*
   * Unit stride (scaled INCX == SIZE): use x in place. The delay slot
   * sets XX = X on both paths; on the strided path XX is then
   * overwritten with BUFFER.
   */
  168. cmp INCX, SIZE
  169. be %icc, .LL10
  170. mov X, XX
  /* ---- Pack strided x into BUFFER ---- */
  171. mov BUFFER, XX
  172. mov BUFFER, X1
  /* J = M / 8: number of full 8-element groups to copy. */
  173. sra M, 3, J
  174. cmp J, 0
  175. ble,pn %icc, .LL05
  176. nop
  /* Copy loop, unrolled by 8: gather with stride INCX, store contiguously. */
  177. .LL01:
  178. LDF [X], a1
  179. add X, INCX, X
  180. LDF [X], a2
  181. add X, INCX, X
  182. LDF [X], a3
  183. add X, INCX, X
  184. LDF [X], a4
  185. add X, INCX, X
  186. LDF [X], a5
  187. add X, INCX, X
  188. LDF [X], a6
  189. add X, INCX, X
  190. LDF [X], a7
  191. add X, INCX, X
  192. LDF [X], a8
  193. add X, INCX, X
  194. STF a1, [X1 + 0 * SIZE]
  195. STF a2, [X1 + 1 * SIZE]
  196. STF a3, [X1 + 2 * SIZE]
  197. STF a4, [X1 + 3 * SIZE]
  198. STF a5, [X1 + 4 * SIZE]
  199. STF a6, [X1 + 5 * SIZE]
  200. STF a7, [X1 + 6 * SIZE]
  201. STF a8, [X1 + 7 * SIZE]
  202. add X1, 8 * SIZE, X1
  203. deccc J
  204. bg,pn %icc, .LL01
  205. nop
  /*
   * Copy the remaining M % 8 elements one at a time. The andcc result
   * is in [0, 7], so ble here is taken only when it is zero.
   */
  206. .LL05:
  207. andcc M, 7, J
  208. ble,pn %icc, .LL10
  209. nop
  210. .LL06:
  211. LDF [X], a1
  212. add X, INCX, X
  213. STF a1, [X1 + 0 * SIZE]
  214. add X1, 1 * SIZE, X1
  215. deccc J
  216. bg,pn %icc, .LL06
  217. nop
  /*
   * ---- Outer loop: J counts down from N ----
   * N <= 0 was already rejected above on every path that reaches this
   * label, so the check here is a redundant guard.
   */
  218. .LL10:
  219. mov N, J
  220. cmp N, 0
  221. ble,pn %icc, .LL999
  222. nop
  /*
   * One outer pass: rewind the x cursor, take the current A position
   * and step A by LDA for the next pass, then form y1 = ALPHA * y[j]
   * and step Y by INCY.
   */
  223. .LL11:
  224. mov XX, X1
  225. mov A, A1
  226. add A, LDA, A
  227. LDF [Y], y1
  228. add Y, INCY, Y
  229. FMUL ALPHA, y1, y1
  /* I = M / 8: number of full 8-element groups in this pass. */
  230. sra M, 3, I
  231. cmp I, 0
  232. ble,pn %icc, .LL15
  233. nop
  /* Pipeline fill: load the first group of 8 x and 8 A elements. */
  234. LDF [X1 + 0 * SIZE], x1
  235. LDF [A1 + 0 * SIZE], a1
  236. LDF [X1 + 1 * SIZE], x2
  237. LDF [A1 + 1 * SIZE], a2
  238. LDF [X1 + 2 * SIZE], x3
  239. LDF [A1 + 2 * SIZE], a3
  240. LDF [X1 + 3 * SIZE], x4
  241. LDF [A1 + 3 * SIZE], a4
  242. LDF [X1 + 4 * SIZE], x5
  243. LDF [A1 + 4 * SIZE], a5
  244. LDF [X1 + 5 * SIZE], x6
  245. LDF [A1 + 5 * SIZE], a6
  246. LDF [X1 + 6 * SIZE], x7
  247. LDF [A1 + 6 * SIZE], a7
  248. LDF [X1 + 7 * SIZE], x8
  249. LDF [A1 + 7 * SIZE], a8
  /*
   * Start the group: a1 and a2 are finished here; the products for
   * elements 3..6 are left pending in t3, t4, t1, t2. This is exactly
   * the state expected on entry to .LL12 and .LL13.
   */
  250. FMUL x1, y1, t1
  251. FMUL x2, y1, t2
  252. FMUL x3, y1, t3
  253. FMUL x4, y1, t4
  254. FADD a1, t1, a1
  255. FMUL x5, y1, t1
  256. FADD a2, t2, a2
  257. FMUL x6, y1, t2
  /* Only one group? Skip the steady-state loop and drain directly. */
  258. deccc I
  259. ble,pn %icc, .LL13
  260. nop
  /*
   * Steady state: finish and store elements 3..8 of the current group
   * while loading the next group (offsets 8..15) and starting its first
   * two elements. Each register is reloaded only after its last use for
   * the current group, so the instruction order matters.
   */
  261. .LL12:
  262. prefetch [A1 + PREFETCHSIZE * SIZE], 0
  263. FADD a3, t3, a3
  264. LDF [X1 + 8 * SIZE], x1
  265. FMUL x7, y1, t3
  266. LDF [X1 + 9 * SIZE], x2
  267. FADD a4, t4, a4
  268. LDF [X1 + 10 * SIZE], x3
  269. FMUL x8, y1, t4
  270. LDF [X1 + 11 * SIZE], x4
  271. FADD a5, t1, a5
  272. STF a1, [A1 + 0 * SIZE]
  273. LDF [A1 + 8 * SIZE], a1
  274. FMUL x1, y1, t1
  275. STF a2, [A1 + 1 * SIZE]
  276. LDF [A1 + 9 * SIZE], a2
  277. FADD a6, t2, a6
  278. STF a3, [A1 + 2 * SIZE]
  279. LDF [A1 + 10 * SIZE], a3
  280. FMUL x2, y1, t2
  281. STF a4, [A1 + 3 * SIZE]
  282. LDF [A1 + 11 * SIZE], a4
  283. FADD a7, t3, a7
  284. LDF [X1 + 12 * SIZE], x5
  285. FMUL x3, y1, t3
  286. LDF [X1 + 13 * SIZE], x6
  287. FADD a8, t4, a8
  288. LDF [X1 + 14 * SIZE], x7
  289. FMUL x4, y1, t4
  290. LDF [X1 + 15 * SIZE], x8
  291. FADD a1, t1, a1
  292. STF a5, [A1 + 4 * SIZE]
  /*
   * The loop counter is decremented early; the bg at the bottom of the
   * loop consumes these condition codes, so nothing in between may
   * modify the integer condition codes (the add below is the non-cc form).
   */
  293. deccc I
  294. LDF [A1 + 12 * SIZE], a5
  295. FMUL x5, y1, t1
  296. STF a6, [A1 + 5 * SIZE]
  297. LDF [A1 + 13 * SIZE], a6
  298. FADD a2, t2, a2
  299. STF a7, [A1 + 6 * SIZE]
  300. LDF [A1 + 14 * SIZE], a7
  301. FMUL x6, y1, t2
  302. STF a8, [A1 + 7 * SIZE]
  303. LDF [A1 + 15 * SIZE], a8
  304. add A1, 8 * SIZE, A1
  /* The X1 advance is in the delay slot: it runs on loop exit as well. */
  305. bg,pn %icc, .LL12
  306. add X1, 8 * SIZE, X1
  /* Pipeline drain: finish elements 3..8 of the last group and store all 8. */
  307. .LL13:
  308. FADD a3, t3, a3
  309. FMUL x7, y1, t3
  310. FADD a4, t4, a4
  311. FMUL x8, y1, t4
  312. FADD a5, t1, a5
  313. FADD a6, t2, a6
  314. FADD a7, t3, a7
  315. FADD a8, t4, a8
  316. STF a1, [A1 + 0 * SIZE]
  317. STF a2, [A1 + 1 * SIZE]
  318. STF a3, [A1 + 2 * SIZE]
  319. STF a4, [A1 + 3 * SIZE]
  320. STF a5, [A1 + 4 * SIZE]
  321. STF a6, [A1 + 5 * SIZE]
  322. STF a7, [A1 + 6 * SIZE]
  323. STF a8, [A1 + 7 * SIZE]
  324. add A1, 8 * SIZE, A1
  325. add X1, 8 * SIZE, X1
  /*
   * Remainder handling: bits 2, 1 and 0 of M select a 4-, 2- and
   * 1-element tail. Each andcc result is non-negative, so ble is taken
   * only when the tested bit is clear.
   */
  326. .LL15:
  327. andcc M, 4, I
  328. ble,pn %icc, .LL16
  329. nop
  /* 4-element tail. */
  330. LDF [X1 + 0 * SIZE], x1
  331. LDF [A1 + 0 * SIZE], a1
  332. LDF [X1 + 1 * SIZE], x2
  333. LDF [A1 + 1 * SIZE], a2
  334. LDF [X1 + 2 * SIZE], x3
  335. LDF [A1 + 2 * SIZE], a3
  336. LDF [X1 + 3 * SIZE], x4
  337. LDF [A1 + 3 * SIZE], a4
  338. FMUL x1, y1, t1
  339. FMUL x2, y1, t2
  340. FMUL x3, y1, t3
  341. FMUL x4, y1, t4
  342. FADD a1, t1, a1
  343. FADD a2, t2, a2
  344. FADD a3, t3, a3
  345. FADD a4, t4, a4
  346. STF a1, [A1 + 0 * SIZE]
  347. STF a2, [A1 + 1 * SIZE]
  348. STF a3, [A1 + 2 * SIZE]
  349. add X1, 4 * SIZE, X1
  350. STF a4, [A1 + 3 * SIZE]
  351. add A1, 4 * SIZE, A1
  352. .LL16:
  353. andcc M, 2, I
  354. ble,pn %icc, .LL17
  355. nop
  /* 2-element tail. */
  356. LDF [X1 + 0 * SIZE], x1
  357. LDF [X1 + 1 * SIZE], x2
  358. LDF [A1 + 0 * SIZE], a1
  359. LDF [A1 + 1 * SIZE], a2
  360. FMUL x1, y1, t1
  361. FMUL x2, y1, t2
  362. FADD a1, t1, a1
  363. FADD a2, t2, a2
  364. STF a1, [A1 + 0 * SIZE]
  365. add X1, 2 * SIZE, X1
  366. STF a2, [A1 + 1 * SIZE]
  367. add A1, 2 * SIZE, A1
  368. .LL17:
  369. andcc M, 1, I
  370. ble,pn %icc, .LL19
  371. nop
  /* 1-element tail. */
  372. LDF [X1 + 0 * SIZE], x1
  373. add X1, 1 * SIZE, X1
  374. LDF [A1 + 0 * SIZE], a1
  375. FMUL x1, y1, t1
  376. FADD a1, t1, a1
  377. STF a1, [A1 + 0 * SIZE]
  378. add A1, 1 * SIZE, A1
  /* End of outer pass: repeat while passes remain. */
  379. .LL19:
  380. deccc J
  381. bg %icc, .LL11
  382. nop
  /*
   * Common exit. "return" has RESTORE window semantics, so the clr in
   * its delay slot writes the %o0 seen by the caller (result 0).
   */
  383. .LL999:
  384. return %i7 + 8
  385. clr %o0
  386. EPILOGUE