You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dnrm2.S 7.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * Configuration and register aliases for the norm kernel in this file.
   *
   * ASSEMBLER is defined before common.h is included; presumably it selects
   * the assembler-side macros used by the routine (PROLOGUE, EPILOGUE,
   * PROFCODE, LD, SIZE, SXADDQ) -- confirm in common.h.
   */
#define ASSEMBLER
#include "common.h"

/*
 * Look-ahead distance, in elements, of the discarded load (destination $31)
 * issued once per iteration of the unit-stride main loop.
 */
#define PREFETCH_SIZE 80

/*
 * Integer registers.
 *   N, X, INCX  inputs: element count, address of the first element, stride.
 *               NOTE(review): $16-$18 look like the first three integer
 *               argument registers of the Alpha calling standard -- confirm.
 *   XX          copy of X taken inside the unit-stride loops so that loads
 *               can keep using the old base after X has been advanced.
 *   I           loop counter.  It aliases $0, which the routine also writes
 *               directly as the destination of its stride comparison.
 */
#define N	$16
#define X	$17
#define INCX	$18
#define XX	$19

#define I	$0

/*
 * Floating-point registers.
 *   a0-a3  four independent partial sums of squares; a0 ($f0) also holds the
 *          final result.
 *   t0-t3  squares that have been computed but not yet added to a0-a3.
 *   x0-x7  elements loaded from X.  x0 is $f16, the register the EV4/EV5
 *          path writes the sum into immediately before calling sqrt.
 */
#define a0	$f0
#define a1	$f1
#define a2	$f10
#define a3	$f11

#define t0	$f12
#define t1	$f13
#define t2	$f14
#define t3	$f15

#define x0	$f16
#define x1	$f17
#define x2	$f18
#define x3	$f19
#define x4	$f20
#define x5	$f21
#define x6	$f22
#define x7	$f23
  /*
   * Euclidean norm kernel: computes sqrt(x0^2 + x1^2 + ...) over N elements
   * read from X.
   *
   * Inputs (register aliases are defined at the top of this file):
   *   N     element count.  N <= 0 branches straight to the exit with a0
   *         already cleared, so the result is 0.0.
   *   X     address of the first element.
   *   INCX  stride.  It is passed through SXADDQ (from common.h; presumably
   *         it scales INCX by the element size -- confirm) and from then on
   *         is compared against SIZE and added directly to X, i.e. it is
   *         treated as a byte stride.
   *
   * Result: a0 ($f0) on builds that use the sqrtt instruction.  EV4/EV5 builds
   * call the external sqrt function instead and return whatever it leaves in
   * its result register (presumably $f0 -- confirm against the calling
   * standard).
   *
   * Structure:
   *   $L11/$L12  unit stride, 16 elements per iteration, software pipelined
   *   $L16       unit stride, remaining N mod 16 elements
   *   $L21/$L22  general stride, 8 elements per iteration, software pipelined
   *   $L26       general stride, remaining N mod 8 elements
   *   $L998      combine partial sums and take the square root
   *   $L999      common exit
   *
   * Pipelining invariant: every "addt aK, tK, aK" consumes the product that
   * the previous "mult ..., tK" produced, so one square per tK register is
   * always pending.  The instruction order below is deliberate; do not
   * reorder it.
   *
   * NOTE(review): squares are accumulated without any scaling, so elements
   * whose square overflows or underflows the T-floating range will not give
   * the mathematically exact norm.  Confirm that callers accept this.
   */
	PROLOGUE

#if defined(EV4) || defined(EV5)
	/*
	 * These builds call sqrt, so the routine needs a 16-byte frame to
	 * preserve the return address in $26 across the call.
	 */
	.frame	$30,16,$26,0
	.mask	0x4000000,-16
	/* Recompute $29 from $27; $29 is the base for the sqrt literal load. */
	ldah	$29,    0($27)		!gpdisp!1
	lda	$29,    0($29)		!gpdisp!1

	lda	$sp,  -16($sp)
	/* Load the address of sqrt now; it is consumed by the jsr at $L998. */
	ldq	$27, sqrt($29)		!literal!2
	stq	$26,    0($sp)

	PROFCODE
	.prologue	1
#else
	PROFCODE
#endif

	/* Clear the accumulators, rescale the stride, reject N <= 0. */
	fclr	a0
	SXADDQ	INCX, 0, INCX
	fclr	a1
	ble	N, $L999

	/* $0 = (INCX == SIZE); anything else takes the strided path at $L20. */
	fclr	a2
	cmpeq	INCX, SIZE, $0
	fclr	a3
	beq	$0, $L20

	/* ---- Unit stride: I = N / 16 blocks of 16 elements. ---- */
	fclr	t0
	sra	N, 4, I
	fclr	t1
	ble	I, $L15

	/* Preload elements 0..7 of the first block. */
	fclr	t2
	LD	x0,  0 * SIZE(X)
	fclr	t3
	LD	x1,  1 * SIZE(X)

	LD	x2,  2 * SIZE(X)
	LD	x3,  3 * SIZE(X)
	LD	x4,  4 * SIZE(X)
	LD	x5,  5 * SIZE(X)
	LD	x6,  6 * SIZE(X)
	LD	x7,  7 * SIZE(X)

	/* The last block is drained at $L12, so the loop runs I - 1 times. */
	lda	I, -1(I)
	ble	I, $L12
	.align 4

	/*
	 * Main loop.  The first half squares elements 0..7 while loading 8..15;
	 * the second half squares 8..15 while loading 16..23, which are the
	 * first eight elements of the next block.
	 */
$L11:
	addt	a0, t0, a0
	/* Load into $31: the value is discarded, only the memory access matters. */
	ldl	$31, (PREFETCH_SIZE) * SIZE(X)
	mult	x0, x0, t0
	LD	x0,  8 * SIZE(X)

	addt	a1, t1, a1
	/* Keep the old base in XX; X itself is advanced in mid-iteration. */
	mov	X, XX
	mult	x1, x1, t1
	LD	x1,  9 * SIZE(X)

	addt	a2, t2, a2
	unop
	mult	x2, x2, t2
	LD	x2, 10 * SIZE(X)

	addt	a3, t3, a3
	unop
	mult	x3, x3, t3
	LD	x3, 11 * SIZE(X)

	addt	a0, t0, a0
	unop
	mult	x4, x4, t0
	LD	x4, 12 * SIZE(X)

	addt	a1, t1, a1
	unop
	mult	x5, x5, t1
	LD	x5, 13 * SIZE(X)

	addt	a2, t2, a2
	unop
	mult	x6, x6, t2
	LD	x6, 14 * SIZE(X)

	addt	a3, t3, a3
	unop
	mult	x7, x7, t3
	LD	x7, 15 * SIZE(X)

	addt	a0, t0, a0
	unop
	mult	x0, x0, t0
	LD	x0, 16 * SIZE(X)

	addt	a1, t1, a1
	/* Advance X by one block; the remaining loads address through XX. */
	lda	X,  16 * SIZE(X)
	mult	x1, x1, t1
	LD	x1, 17 * SIZE(XX)

	addt	a2, t2, a2
	unop
	mult	x2, x2, t2
	LD	x2, 18 * SIZE(XX)

	addt	a3, t3, a3
	unop
	mult	x3, x3, t3
	LD	x3, 19 * SIZE(XX)

	addt	a0, t0, a0
	unop
	mult	x4, x4, t0
	LD	x4, 20 * SIZE(XX)

	addt	a1, t1, a1
	lda	I, -1(I)
	mult	x5, x5, t1
	LD	x5, 21 * SIZE(XX)

	addt	a2, t2, a2
	unop
	mult	x6, x6, t2
	LD	x6, 22 * SIZE(XX)

	addt	a3, t3, a3
	mult	x7, x7, t3
	LD	x7, 23 * SIZE(XX)
	bgt	I, $L11
	.align 4

	/*
	 * Drain the final block: square 0..7 while loading 8..15, then square
	 * 8..15 with no further loads.
	 */
$L12:
	addt	a0, t0, a0
	mov	X, XX
	mult	x0, x0, t0
	LD	x0,  8 * SIZE(X)

	addt	a1, t1, a1
	unop
	mult	x1, x1, t1
	LD	x1,  9 * SIZE(X)

	addt	a2, t2, a2
	unop
	mult	x2, x2, t2
	LD	x2, 10 * SIZE(X)

	addt	a3, t3, a3
	unop
	mult	x3, x3, t3
	LD	x3, 11 * SIZE(X)

	addt	a0, t0, a0
	unop
	mult	x4, x4, t0
	LD	x4, 12 * SIZE(XX)

	addt	a1, t1, a1
	unop
	mult	x5, x5, t1
	LD	x5, 13 * SIZE(XX)

	addt	a2, t2, a2
	unop
	mult	x6, x6, t2
	LD	x6, 14 * SIZE(XX)

	addt	a3, t3, a3
	/* X now points past the last full block, ready for the tail at $L15. */
	lda	X,  16 * SIZE(X)
	mult	x7, x7, t3
	LD	x7, 15 * SIZE(XX)

	addt	a0, t0, a0
	mult	x0, x0, t0
	addt	a1, t1, a1
	mult	x1, x1, t1

	addt	a2, t2, a2
	mult	x2, x2, t2
	addt	a3, t3, a3
	mult	x3, x3, t3

	addt	a0, t0, a0
	mult	x4, x4, t0
	addt	a1, t1, a1
	mult	x5, x5, t1

	addt	a2, t2, a2
	mult	x6, x6, t2
	addt	a3, t3, a3
	mult	x7, x7, t3

	/*
	 * Flush t1..t3.  t0 stays pending on purpose: it is added by the first
	 * addt of the tail loop or by the first addt at $L998.
	 */
	addt	a1, t1, a1
	addt	a2, t2, a2
	addt	a3, t3, a3
	.align 4

	/* ---- Unit stride tail: N mod 16 elements, one per iteration. ---- */
$L15:
	and	N, 15, I
	ble	I, $L998
	.align 4

$L16:
	LD	x0,  0 * SIZE(X)
	lda	X,   1 * SIZE(X)

	/* Add the previously pending square, then leave this one pending. */
	addt	a0, t0, a0
	mult	x0, x0, t0

	lda	I, -1(I)
	bgt	I, $L16
	/*
	 * Plain jump to the reduction.  br is used rather than bsr: the link
	 * value is discarded into $31 either way, and bsr is additionally
	 * hinted as a subroutine call.
	 */
	br	$31, $L998
	.align 4

	/* ---- General stride: I = N / 8 blocks of 8 elements. ---- */
$L20:
	fclr	t0
	sra	N, 3, I
	fclr	t1
	ble	I, $L25

	fclr	t2
	fclr	t3

	/* Preload seven elements; the eighth (x7) is loaded inside the loop. */
	LD	x0,  0 * SIZE(X)
	addq	X, INCX, X
	LD	x1,  0 * SIZE(X)
	addq	X, INCX, X
	LD	x2,  0 * SIZE(X)
	addq	X, INCX, X
	LD	x3,  0 * SIZE(X)
	addq	X, INCX, X

	LD	x4,  0 * SIZE(X)
	addq	X, INCX, X
	LD	x5,  0 * SIZE(X)
	addq	X, INCX, X
	LD	x6,  0 * SIZE(X)
	addq	X, INCX, X

	/* The last block is drained at $L22, so the loop runs I - 1 times. */
	lda	I, -1(I)
	ble	I, $L22
	.align 4

	/*
	 * Main strided loop: loads x7 of the current block and x0..x6 of the
	 * next one while squaring the current block.
	 */
$L21:
	addt	a0, t0, a0
	LD	x7,  0 * SIZE(X)
	mult	x0, x0, t0
	addq	X, INCX, X

	addt	a1, t1, a1
	LD	x0,  0 * SIZE(X)
	mult	x1, x1, t1
	addq	X, INCX, X

	addt	a2, t2, a2
	LD	x1,  0 * SIZE(X)
	mult	x2, x2, t2
	addq	X, INCX, X

	addt	a3, t3, a3
	LD	x2,  0 * SIZE(X)
	mult	x3, x3, t3
	addq	X, INCX, X

	addt	a0, t0, a0
	LD	x3,  0 * SIZE(X)
	mult	x4, x4, t0
	addq	X, INCX, X

	addt	a1, t1, a1
	LD	x4,  0 * SIZE(X)
	mult	x5, x5, t1
	addq	X, INCX, X

	addt	a2, t2, a2
	LD	x5,  0 * SIZE(X)
	mult	x6, x6, t2
	addq	X, INCX, X

	addt	a3, t3, a3
	LD	x6,  0 * SIZE(X)
	mult	x7, x7, t3
	addq	X, INCX, X

	lda	I, -1(I)
	bgt	I, $L21
	.align 4

	/* Drain the final strided block: one last load (x7), then squares only. */
$L22:
	addt	a0, t0, a0
	LD	x7,  0 * SIZE(X)
	mult	x0, x0, t0
	addq	X, INCX, X

	addt	a1, t1, a1
	unop
	mult	x1, x1, t1
	unop

	addt	a2, t2, a2
	mult	x2, x2, t2
	addt	a3, t3, a3
	mult	x3, x3, t3

	addt	a0, t0, a0
	mult	x4, x4, t0
	addt	a1, t1, a1
	mult	x5, x5, t1

	addt	a2, t2, a2
	mult	x6, x6, t2
	addt	a3, t3, a3
	mult	x7, x7, t3

	/* Flush t1..t3; t0 is left pending exactly as in the unit-stride path. */
	addt	a1, t1, a1
	addt	a2, t2, a2
	addt	a3, t3, a3
	.align 4

	/* ---- General stride tail: N mod 8 elements, one per iteration. ---- */
$L25:
	and	N,  7, I
	ble	I, $L998
	.align 4

$L26:
	LD	x0,  0 * SIZE(X)
	addq	X, INCX, X

	addt	a0, t0, a0
	mult	x0, x0, t0

	lda	I, -1(I)
	bgt	I, $L26
	.align 4

	/* ---- Reduction: add the pending square, then fold a0..a3 together. ---- */
$L998:
	addt	a0, t0, a0

	addt	a0, a1, a0
	addt	a2, a3, a2

#if defined(EV4) || defined(EV5)
	/* Put the total in $f16 and call sqrt through the address held in $27. */
	addt	a0, a2, $f16
	jsr	$26, ($27), sqrt	!lituse_jsr!2

	/* Recompute $29 from the return address after the call. */
	ldah	$29,    0($26)		!gpdisp!3
	lda	$29,    0($29)		!gpdisp!3
#else
	addt	a0, a2, a0
	sqrtt	a0, a0
#endif
	.align 4

	/* Common exit; also the target of the N <= 0 early-out. */
$L999:
#if defined(EV4) || defined(EV5)
	/* Restore the saved return address and release the 16-byte frame. */
	ldq	$26,    0($sp)
	lda	$sp,   16($sp)
#endif
	ret
	EPILOGUE