/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2019, The OpenBLAS project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /* Build-time configuration and register assignments for this kernel. */
#define ASSEMBLER
#include "common.h"

/*
 * Prefetch distance ahead of X.  The kernel multiplies it by SIZE (from
 * common.h, presumably the size in bytes of one floating-point value) to
 * form the initial prefetch address.  The distance depends on precision.
 */
#ifdef XDOUBLE
#define PREFETCH_SIZE ( 8 * 16 +  4)
#elif defined(DOUBLE)
#define PREFETCH_SIZE (16 * 16 +  8)
#else
#define PREFETCH_SIZE (32 * 16 + 16)
#endif

/*
 * COMPADD is 1 when COMPLEX is defined and 0 otherwise; it adjusts the
 * shift counts and bit positions that are derived from the element count.
 *
 * STRIDE is the post-increment used by every other load in the kernel
 * (the remaining loads use INCX).  For real data both steps are INCX.
 * For complex data the first step is SIZE; the kernel subtracts SIZE from
 * INCX beforehand, so a STRIDE step followed by an INCX step advances by
 * exactly one element stride.
 */
#ifndef COMPLEX
#define COMPADD	0
#define STRIDE	INCX
#else
#define COMPADD	1
#define STRIDE	SIZE
#endif

/* Scratch registers */
#define PRE1	r2	/* prefetch address */
#define I	r17	/* unrolled-loop trip count minus one (goes to ar.lc) */
#define J	r18	/* elements left over after the unrolled loop */
#define INCX16	r21	/* post-increment applied to the prefetch address */
#define PR	r30	/* copy of the predicate registers taken on entry */
#define ARLC	r31	/* copy of ar.lc taken on entry */

/* Arguments */
#define N	r32	/* element count (address of it under F_INTERFACE) */
#define X	r33	/* address of the first element */
#define INCX	r34	/* stride in elements (address of it under F_INTERFACE) */
  /*
   * Summation kernel for IA-64.
   *
   * Adds up the N elements of X that lie INCX elements apart and leaves
   * the total in f8.  Values are accumulated exactly as loaded (no absolute
   * value is taken).  When COMPLEX is defined every element contributes two
   * values, and both are added into the same total.
   *
   * Inputs:  N (r32), X (r33), INCX (r34).  When F_INTERFACE is defined,
   *          N and INCX hold addresses and are fetched through LDINT, then
   *          sign-extended from 32 bits unless USE64BITINT is defined.
   * Output:  f8.  The result is 0 when N <= 0 or INCX <= 0.
   *
   * Structure:
   *   1. Setup and early exit.
   *   2. .L52: software-pipelined loop consuming 16 values per iteration
   *      (16 real elements, or 8 complex elements).
   *   3. .L55: remainder handling driven by the low bits of N.
   *   4. .L998: fold the four partial sums f8..f11 into f8, restore ar.lc
   *      and the predicate registers, and return.
   *
   * PROLOGUE, PROFCODE, EPILOGUE, LDINT, LDFD, FADD, SIZE and BASE_SHIFT
   * are supplied by common.h.
   */
	PROLOGUE
	.prologue
	PROFCODE

	/* Start the prefetch address ahead of X, clear the first partial */
	/* sum (f0 always reads as 0.0) and save ar.lc for the epilogue.   */
	{ .mfi
	adds	PRE1 = PREFETCH_SIZE * SIZE, X
	mov	f8 = f0
	.save ar.lc, ARLC
	mov	ARLC = ar.lc
	}
	;;
	.body

#ifdef F_INTERFACE
	/* Arguments arrive by reference: replace the addresses by values. */
	{ .mmi
	LDINT	N = [N]
	LDINT	INCX = [INCX]
	nop.i 0
	}
	;;
#ifndef USE64BITINT
	{ .mii
	nop.m 0
	sxt4	N = N
	sxt4	INCX = INCX
	}
	;;
#endif
#endif

	/* p6 = (INCX <= 0), p7 = (N <= 0): either one returns at once with */
	/* f8 still 0.  I = number of full 16-value iterations, J = number  */
	/* of elements left over.                                           */
	{ .mmi
	cmp.lt	p0, p6 = r0, INCX
	cmp.lt	p0, p7 = r0, N
	shr	I = N, (4 - COMPADD)
	}
	{ .mbb
	and	J = ((1 << (4 - COMPADD)) - 1), N
	(p6) br.ret.sptk.many b0
	(p7) br.ret.sptk.many b0
	}
	;;

	/* ar.lc takes the trip count minus one.  The predicates are saved */
	/* here, i.e. only on the path that goes on to modify them.        */
	{ .mfi
	adds	I = -1, I
	mov	f10 = f0
	mov	PR = pr
	}
	/* p9 = no leftover elements; p12 = leftover holds a group of 8 values. */
	{ .mfi
	cmp.eq	p9, p0 = r0, J
	mov	f9 = f0
	tbit.z	p0, p12 = N, 3 - COMPADD
	}
	;;

	/* Prime the pipeline: stage predicate p16 set, p17..p19 clear, and */
	/* an epilogue count of 3.  INCX is converted from elements to      */
	/* bytes.                                                           */
	{ .mmi
	cmp.eq	p16, p0 = r0, r0
	cmp.ne	p17, p0 = r0, r0
	mov	ar.ec = 3
	}
	{ .mfi
	cmp.ne	p18, p0 = r0, r0
	mov	f11 = f0
	shl	INCX = INCX, BASE_SHIFT + COMPADD
	}
	;;

	/* INCX16 is the step of the prefetch address.  XDOUBLE issues two */
	/* prefetches per iteration and therefore uses half the step.      */
	{ .mmi
#ifdef XDOUBLE
	shladd	INCX16 = INCX, (3 - COMPADD), r0
#else
	shladd	INCX16 = INCX, (4 - COMPADD), r0
#endif
	cmp.ne	p19, p0 = r0, r0
	mov	ar.lc = I
	}
	/* p8 = (I < 0): fewer than 16 values in total, skip the loop.     */
	/* For complex data INCX becomes the step from the second value of */
	/* one element to the first value of the next.                     */
	{ .mmb
	cmp.gt	p8, p0 = r0, I
#ifdef COMPLEX
	adds	INCX = - SIZE, INCX
#else
	nop.m 0
#endif
	(p8) br.cond.dpnt .L55
	}
	;;

	/*
	 * Main loop.  Stage p16 loads 16 values into f32, f35, ..., f77.
	 * Two rotations later stage p18 adds the first twelve of them
	 * (now named f34, f37, ..., f67); three rotations later stage p19
	 * adds the last four (now named f71, f74, f77, f80).  The partial
	 * sums f8..f11 are used round-robin.
	 */
	.align 32
.L52:
	{ .mmf
	(p16) lfetch.nt1 [PRE1], INCX16
	(p16) LDFD	f32 = [X], STRIDE
	}
	{ .mfb
	(p19) FADD	f8 = f8, f71
	}
	;;
	{ .mmf
	(p16) LDFD	f35 = [X], INCX
	}
	{ .mfb
	(p19) FADD	f9 = f9, f74
	}
	;;
	{ .mmf
	(p16) LDFD	f38 = [X], STRIDE
	}
	{ .mfb
	(p19) FADD	f10 = f10, f77
	}
	;;
	{ .mmf
	(p16) LDFD	f41 = [X], INCX
	}
	{ .mfb
	(p19) FADD	f11 = f11, f80
	}
	;;
	{ .mmf
	(p16) LDFD	f44 = [X], STRIDE
	}
	{ .mfb
	(p18) FADD	f8 = f8, f34
	}
	;;
	{ .mmf
	(p16) LDFD	f47 = [X], INCX
	}
	{ .mfb
	(p18) FADD	f9 = f9, f37
	}
	;;
	{ .mmf
	(p16) LDFD	f50 = [X], STRIDE
	}
	{ .mfb
	(p18) FADD	f10 = f10, f40
	}
	;;
	{ .mmf
	(p16) LDFD	f53 = [X], INCX
	}
	{ .mfb
	(p18) FADD	f11 = f11, f43
	}
	;;
	{ .mmf
#ifdef XDOUBLE
	(p16) lfetch.nt1 [PRE1], INCX16
#endif
	(p16) LDFD	f56 = [X], STRIDE
	}
	{ .mfb
	(p18) FADD	f8 = f8, f46
	}
	;;
	{ .mmf
	(p16) LDFD	f59 = [X], INCX
	}
	{ .mfb
	(p18) FADD	f9 = f9, f49
	}
	;;
	{ .mmf
	(p16) LDFD	f62 = [X], STRIDE
	}
	{ .mfb
	(p18) FADD	f10 = f10, f52
	}
	;;
	{ .mmf
	(p16) LDFD	f65 = [X], INCX
	}
	{ .mfb
	(p18) FADD	f11 = f11, f55
	}
	;;
	{ .mmf
	(p16) LDFD	f68 = [X], STRIDE
	}
	{ .mfb
	(p18) FADD	f8 = f8, f58
	}
	;;
	{ .mmf
	(p16) LDFD	f71 = [X], INCX
	}
	{ .mfb
	(p18) FADD	f9 = f9, f61
	}
	;;
	{ .mmf
	(p16) LDFD	f74 = [X], STRIDE
	}
	{ .mfb
	(p18) FADD	f10 = f10, f64
	}
	;;
	{ .mmf
	(p16) LDFD	f77 = [X], INCX
	}
	{ .mfb
	(p18) FADD	f11 = f11, f67
	br.ctop.sptk.few .L52
	}
	;;

	/* Drain: the loop stops before stage p19 has run for the final  */
	/* group of loads, so its last four values are added here.       */
	FADD	f8 = f8, f71
	FADD	f9 = f9, f74
	FADD	f10 = f10, f77
	FADD	f11 = f11, f80
	.align 32
	;;

	/*
	 * Remainder.  Predicates select groups of values by the low bits
	 * of N: p12 -> 8 values, p13 -> 4, p14 -> 2, p15 -> 1 (real data
	 * only).  p9 (nothing left over) jumps directly to the reduction;
	 * the p12 load sharing its instruction group is then predicated
	 * off as well, because the tested bit is part of the leftover
	 * count.
	 */
.L55:
	(p12) LDFD	f32 = [X], STRIDE
	(p9) br.cond.dptk .L998
	;;
	(p12) LDFD	f33 = [X], INCX
	;;
	(p12) LDFD	f34 = [X], STRIDE
	;;
	(p12) LDFD	f35 = [X], INCX
	tbit.z	p0, p13 = N, (2 - COMPADD)
	;;
	(p12) LDFD	f36 = [X], STRIDE
	tbit.z	p0, p14 = N, (1 - COMPADD)
	;;
	(p12) LDFD	f37 = [X], INCX
#ifndef COMPLEX
	tbit.z	p0, p15 = N, 0
#endif
	;;
	(p12) LDFD	f38 = [X], STRIDE
	;;
	(p12) LDFD	f39 = [X], INCX
	;;
	(p13) LDFD	f40 = [X], STRIDE
	;;
	(p13) LDFD	f41 = [X], INCX
	;;
	/* From here on the adds of the p12 group overlap the later loads. */
	(p13) LDFD	f42 = [X], STRIDE
	(p12) FADD	f8 = f8, f32
	;;
	(p13) LDFD	f43 = [X], INCX
	(p12) FADD	f9 = f9, f33
	;;
	(p14) LDFD	f44 = [X], STRIDE
	(p12) FADD	f10 = f10, f34
	;;
	(p14) LDFD	f45 = [X], INCX
	(p12) FADD	f11 = f11, f35
	;;
#ifndef COMPLEX
	(p15) LDFD	f46 = [X]
#endif
	(p12) FADD	f8 = f8, f36
	;;
	(p12) FADD	f9 = f9, f37
	(p12) FADD	f10 = f10, f38
	(p12) FADD	f11 = f11, f39
	;;
	(p13) FADD	f8 = f8, f40
	(p13) FADD	f9 = f9, f41
	(p13) FADD	f10 = f10, f42
	;;
	(p13) FADD	f11 = f11, f43
	(p14) FADD	f8 = f8, f44
	(p14) FADD	f9 = f9, f45
#ifndef COMPLEX
	(p15) FADD	f10 = f10, f46
#endif
	;;

	/* Reduction and exit: f8 = (f8 + f9) + (f10 + f11). */
	.align 32
.L998:
	{ .mfi
	FADD	f8 = f8, f9
	mov	ar.lc = ARLC
	}
	{ .mmf
	FADD	f10 = f10, f11
	}
	;;
	/* Restore predicates from the entry copy.  The mask -65474 has  */
	/* bits 1..5 and every bit from 16 upward set, i.e. p1..p5 and   */
	/* the rotating predicates.                                      */
	{ .mii
	mov	pr = PR, -65474
	}
	;;
	{ .mfb
	FADD	f8 = f8, f10
	br.ret.sptk.many b0
	}
	EPILOGUE