You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

znrm2.S 7.8 kB


/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * znrm2 kernel -- Alpha assembly, run through the C preprocessor.
   *
   * Returns sqrt(sum of v * v) over every value loaded from X.  Each of the
   * N elements contributes two consecutive SIZE-wide values (offsets 0 and
   * 1 * SIZE; presumably the real and imaginary parts of one complex
   * number -- confirm against common.h and the caller).
   *
   * Inputs (see the register defines below):
   *   N    ($16)  element count; N <= 0 returns at once with a0 == 0
   *   X    ($17)  address of the first element
   *   INCX ($18)  element stride; shifted left by ZBASE_SHIFT on entry
   *               (presumably turning it into a byte stride -- confirm)
   * Result:
   *   non-EV4/EV5 build: the square root is left in a0 ($f0).
   *   EV4/EV5 build:     whatever sqrt() returns is passed through as is.
   *
   * Structure: four partial sums a0..a3 are fed from four product
   * registers t0..t3.  The loops are software pipelined: every addt adds
   * the product written by an EARLIER mult, so t0/t1 (and inside the
   * unrolled loops t2/t3) always hold one pending product that is added
   * later.  The instruction order is hand scheduled; do not reorder.
   *
   * NOTE(review): the squares are accumulated directly with no rescaling,
   *   so extreme inputs can overflow or underflow before the square root
   *   is taken.  INCX <= 0 is not rejected by this routine.
   */
#define ASSEMBLER
#include "common.h"

/* Offset, in units of SIZE, of the discarded look-ahead load issued in the
   contiguous main loop. */
#define PREFETCH_SIZE 80

/* Integer registers: arguments, a saved copy of X, and the loop counter.
   $0 (I) is also used once as the scratch result of the stride compare. */
#define N $16
#define X $17
#define INCX $18
#define XX $19
#define I $0

/* a0..a3: the four running partial sums. */
#define a0 $f0
#define a1 $f1
#define a2 $f10
#define a3 $f11

/* t0..t3: products waiting to be added into a0..a3. */
#define t0 $f12
#define t1 $f13
#define t2 $f14
#define t3 $f15

/* x0..x7: values loaded from X. */
#define x0 $f16
#define x1 $f17
#define x2 $f18
#define x3 $f19
#define x4 $f20
#define x5 $f21
#define x6 $f22
#define x7 $f23

	PROLOGUE

#if defined(EV4) || defined(EV5)
	/* This build obtains the square root by calling sqrt() (see $L998),
	   so a 16-byte frame is created to keep the return address $26, and
	   the address of sqrt is fetched into $27 up front. */
	.frame $30,16,$26,0
	.mask 0x4000000,-16
	ldah	$29, 0($27) !gpdisp!1
	lda	$29, 0($29) !gpdisp!1

	lda	$sp, -16($sp)
	ldq	$27, sqrt($29) !literal!2
	stq	$26, 0($sp)

	PROFCODE
	.prologue 1
#else
	PROFCODE
#endif

	/* Clear the partial sums while checking the arguments. */
	fclr	a0
	sll	INCX, ZBASE_SHIFT, INCX
	fclr	a1
	ble	N, $L999		/* N <= 0: return with a0 == 0 */

	fclr	a2
	cmpeq	INCX, 2 * SIZE, $0	/* $0 = (scaled INCX == 2 * SIZE) */
	fclr	a3
	beq	$0, $L20		/* elements not adjacent: strided path */

	/* ---- Contiguous path: 8 elements (16 values) per unrolled pass ---- */
	fclr	t0
	sra	N, 3, I			/* I = number of 8-element blocks */
	fclr	t1
	ble	I, $L15			/* no full block: remainder loop only */

	/* Preload the first 8 values of the first block. */
	fclr	t2
	LD	x0, 0 * SIZE(X)
	fclr	t3
	LD	x1, 1 * SIZE(X)

	LD	x2, 2 * SIZE(X)
	LD	x3, 3 * SIZE(X)
	LD	x4, 4 * SIZE(X)
	LD	x5, 5 * SIZE(X)
	LD	x6, 6 * SIZE(X)
	LD	x7, 7 * SIZE(X)

	lda	I, -1(I)
	ble	I, $L12			/* exactly one block: drain it in $L12 */
	.align 4

	/* Main loop.  Squares 16 values and leaves x0..x7 holding the first
	   8 values of the next block.  XX keeps the value X had at loop entry
	   so the loads issued after X is advanced still use the old base.
	   The load into $31 discards its result (look-ahead touch of memory).
	   unop is a no-op filler kept from the original instruction layout. */
$L11:
	addt	a0, t0, a0
	ldl	$31, (PREFETCH_SIZE) * SIZE(X)
	mult	x0, x0, t0
	LD	x0, 8 * SIZE(X)

	addt	a1, t1, a1
	mov	X, XX
	mult	x1, x1, t1
	LD	x1, 9 * SIZE(X)

	addt	a2, t2, a2
	unop
	mult	x2, x2, t2
	LD	x2, 10 * SIZE(X)

	addt	a3, t3, a3
	unop
	mult	x3, x3, t3
	LD	x3, 11 * SIZE(X)

	addt	a0, t0, a0
	unop
	mult	x4, x4, t0
	LD	x4, 12 * SIZE(X)

	addt	a1, t1, a1
	unop
	mult	x5, x5, t1
	LD	x5, 13 * SIZE(X)

	addt	a2, t2, a2
	unop
	mult	x6, x6, t2
	LD	x6, 14 * SIZE(X)

	addt	a3, t3, a3
	unop
	mult	x7, x7, t3
	LD	x7, 15 * SIZE(X)

	addt	a0, t0, a0
	unop
	mult	x0, x0, t0
	LD	x0, 16 * SIZE(X)

	addt	a1, t1, a1
	lda	X, 16 * SIZE(X)		/* advance X; XX still has the old base */
	mult	x1, x1, t1
	LD	x1, 17 * SIZE(XX)

	addt	a2, t2, a2
	unop
	mult	x2, x2, t2
	LD	x2, 18 * SIZE(XX)

	addt	a3, t3, a3
	unop
	mult	x3, x3, t3
	LD	x3, 19 * SIZE(XX)

	addt	a0, t0, a0
	unop
	mult	x4, x4, t0
	LD	x4, 20 * SIZE(XX)

	addt	a1, t1, a1
	lda	I, -1(I)
	mult	x5, x5, t1
	LD	x5, 21 * SIZE(XX)

	addt	a2, t2, a2
	unop
	mult	x6, x6, t2
	LD	x6, 22 * SIZE(XX)

	addt	a3, t3, a3
	mult	x7, x7, t3
	LD	x7, 23 * SIZE(XX)
	bgt	I, $L11
	.align 4

	/* Drain the last full block: square the 8 values already loaded,
	   load and square values 8..15, and advance X past the block.  No
	   values beyond the block are loaded here. */
$L12:
	addt	a0, t0, a0
	mov	X, XX
	mult	x0, x0, t0
	LD	x0, 8 * SIZE(X)

	addt	a1, t1, a1
	unop
	mult	x1, x1, t1
	LD	x1, 9 * SIZE(X)

	addt	a2, t2, a2
	unop
	mult	x2, x2, t2
	LD	x2, 10 * SIZE(X)

	addt	a3, t3, a3
	unop
	mult	x3, x3, t3
	LD	x3, 11 * SIZE(X)

	addt	a0, t0, a0
	unop
	mult	x4, x4, t0
	LD	x4, 12 * SIZE(XX)

	addt	a1, t1, a1
	unop
	mult	x5, x5, t1
	LD	x5, 13 * SIZE(XX)

	addt	a2, t2, a2
	unop
	mult	x6, x6, t2
	LD	x6, 14 * SIZE(XX)

	addt	a3, t3, a3
	lda	X, 16 * SIZE(X)
	mult	x7, x7, t3
	LD	x7, 15 * SIZE(XX)

	addt	a0, t0, a0
	mult	x0, x0, t0
	addt	a1, t1, a1
	mult	x1, x1, t1

	addt	a2, t2, a2
	mult	x2, x2, t2
	addt	a3, t3, a3
	mult	x3, x3, t3

	addt	a0, t0, a0
	mult	x4, x4, t0
	addt	a1, t1, a1
	mult	x5, x5, t1

	addt	a2, t2, a2
	mult	x6, x6, t2
	addt	a3, t3, a3
	mult	x7, x7, t3

	/* Flush t2/t3 now; t0/t1 stay pending and are added by the
	   remainder loop or at $L998. */
	addt	a2, t2, a2
	addt	a3, t3, a3
	.align 4

	/* Contiguous remainder: N & 7 elements, one per iteration. */
$L15:
	and	N, 7, I
	ble	I, $L998
	.align 4

$L16:
	LD	x0, 0 * SIZE(X)
	LD	x1, 1 * SIZE(X)

	lda	X, 2 * SIZE(X)

	addt	a0, t0, a0
	mult	x0, x0, t0
	addt	a1, t1, a1
	mult	x1, x1, t1

	lda	I, -1(I)
	bgt	I, $L16
	/* Plain jump to the final reduction.  br is used instead of bsr: the
	   two perform the same branch, but bsr carries the subroutine-call
	   prediction hint, which is wrong for a jump that never returns
	   here. */
	br	$31, $L998
	.align 4

	/* ---- Strided path: 4 elements (8 values) per unrolled pass ---- */
$L20:
	fclr	t0
	sra	N, 2, I			/* I = number of 4-element blocks */
	fclr	t1
	ble	I, $L25			/* no full block: remainder loop only */

	/* Preload x0..x6; X is stepped by INCX after each element.  x7 (the
	   second value of the fourth element) is loaded inside the loop. */
	LD	x0, 0 * SIZE(X)
	fclr	t2
	LD	x1, 1 * SIZE(X)
	addq	X, INCX, X

	LD	x2, 0 * SIZE(X)
	fclr	t3
	LD	x3, 1 * SIZE(X)
	addq	X, INCX, X

	LD	x4, 0 * SIZE(X)
	lda	I, -1(I)
	LD	x5, 1 * SIZE(X)
	addq	X, INCX, X

	LD	x6, 0 * SIZE(X)
	ble	I, $L22			/* exactly one block: drain it in $L22 */
	.align 4

	/* Main strided loop: squares 8 values and reloads x0..x6 from the
	   next four elements. */
$L21:
	addt	a0, t0, a0
	LD	x7, 1 * SIZE(X)
	mult	x0, x0, t0
	addq	X, INCX, X

	addt	a1, t1, a1
	LD	x0, 0 * SIZE(X)
	mult	x1, x1, t1
	unop

	addt	a2, t2, a2
	LD	x1, 1 * SIZE(X)
	mult	x2, x2, t2
	addq	X, INCX, X

	addt	a3, t3, a3
	LD	x2, 0 * SIZE(X)
	mult	x3, x3, t3
	unop

	addt	a0, t0, a0
	LD	x3, 1 * SIZE(X)
	mult	x4, x4, t0
	addq	X, INCX, X

	addt	a1, t1, a1
	LD	x4, 0 * SIZE(X)
	mult	x5, x5, t1
	lda	I, -1(I)

	addt	a2, t2, a2
	LD	x5, 1 * SIZE(X)
	mult	x6, x6, t2
	addq	X, INCX, X

	addt	a3, t3, a3
	LD	x6, 0 * SIZE(X)
	mult	x7, x7, t3
	bgt	I, $L21
	.align 4

	/* Drain the last full strided block; only x7 still has to be
	   loaded.  X ends up pointing at the element after the block. */
$L22:
	addt	a0, t0, a0
	LD	x7, 1 * SIZE(X)
	mult	x0, x0, t0
	addq	X, INCX, X

	addt	a1, t1, a1
	mult	x1, x1, t1
	addt	a2, t2, a2
	mult	x2, x2, t2

	addt	a3, t3, a3
	mult	x3, x3, t3
	addt	a0, t0, a0
	mult	x4, x4, t0

	addt	a1, t1, a1
	mult	x5, x5, t1
	addt	a2, t2, a2
	mult	x6, x6, t2

	addt	a3, t3, a3
	mult	x7, x7, t3
	/* Flush t2/t3; t0/t1 stay pending for $L26 or $L998. */
	addt	a2, t2, a2
	addt	a3, t3, a3
	.align 4

	/* Strided remainder: N & 3 elements, one per iteration. */
$L25:
	and	N, 3, I
	ble	I, $L998
	.align 4

$L26:
	LD	x0, 0 * SIZE(X)
	lda	I, -1(I)
	LD	x1, 1 * SIZE(X)
	addq	X, INCX, X

	addt	a0, t0, a0
	mult	x0, x0, t0
	addt	a1, t1, a1
	mult	x1, x1, t1

	bgt	I, $L26
	.align 4

	/* Final reduction: add the two pending products, combine the four
	   partial sums, then take the square root. */
$L998:
	addt	a0, t0, a0
	addt	a1, t1, a1

	addt	a0, a1, a0
	addt	a2, a3, a2

#if defined(EV4) || defined(EV5)
	/* Total goes to $f16 as the argument of sqrt(); $27 was loaded with
	   the address of sqrt in the prologue.  $29 is recomputed from the
	   return address afterwards.  (Presumably these cores lack the sqrtt
	   instruction used in the other branch -- confirm.) */
	addt	a0, a2, $f16
	jsr	$26, ($27), sqrt !lituse_jsr!2

	ldah	$29, 0($26) !gpdisp!3
	lda	$29, 0($29) !gpdisp!3
#else
	addt	a0, a2, a0
	sqrtt	a0, a0
#endif
	.align 4

	/* Common exit; also reached directly when N <= 0. */
$L999:
#if defined(EV4) || defined(EV5)
	/* Restore the return address and release the 16-byte frame. */
	ldq	$26, 0($sp)
	lda	$sp, 16($sp)
#endif
	ret
	EPILOGUE