You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

snrm2.S 8.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * snrm2 kernel (PowerPC, C-preprocessed assembly).
   *
   * Computes sqrt(x[0]^2 + x[1]^2 + ... + x[N-1]^2) for N elements that
   * start at X and are INCX elements apart.
   *
   * Inputs (register bindings are the #defines below):
   *   N    (r3)  element count
   *   X    (r4)  address of the first element
   *   INCX (r5)  stride between elements
   *   When F_INTERFACE is defined, r3 and r5 hold addresses and the
   *   values are fetched through them with LDINT.
   *
   * Output:
   *   f1 = the norm, rounded to single precision by fsqrts.
   *   f1 = 0.0 when N <= 0 or INCX == 0.
   *
   * Registers:
   *   f0-f15   sixteen independent partial sums (all start at 0.0)
   *   f16-f31  load buffers for the 16-way unrolled loops
   *   f14-f31  are saved on entry and restored on exit
   *   r0, r8 (PREA), r10, CTR and cr0 are used as scratch
   *
   * Frame (STACKSIZE bytes below the incoming SP):
   *     0..143   save area for f14-f31 (8 bytes each)
   *   144..147   FZERO: a zero word, reloaded with lfs to obtain 0.0
   *
   * The squares are accumulated with fmadd (the double-precision form),
   * so only the final fsqrts rounds to single.
   *
   * NOTE(review): LFD, LFDUX, LDINT, L1_PREFETCH, SIZE and BASE_SHIFT all
   * come from common.h. The comments below assume BASE_SHIFT == log2(SIZE)
   * and that LFDUX is an update-form indexed load -- confirm there.
   */
#define ASSEMBLER
#include "common.h"

#define N	r3
#define X	r4
#define INCX	r5
#define PREA	r8

#define FZERO	144(SP)
#define STACKSIZE 160

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r10, 0

	/* Save f14-f31; all of them are overwritten below. */
	stfd	f14,   0(SP)
	stfd	f15,   8(SP)
	stfd	f16,  16(SP)
	stfd	f17,  24(SP)
	stfd	f18,  32(SP)
	stfd	f19,  40(SP)
	stfd	f20,  48(SP)
	stfd	f21,  56(SP)
	stfd	f22,  64(SP)
	stfd	f23,  72(SP)
	stfd	f24,  80(SP)
	stfd	f25,  88(SP)
	stfd	f26,  96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)
	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

	/* f1 = 0.0, built by bouncing a zero word through the frame.    */
	/* This is also the value returned by the early exits below.    */
	stw	r10, FZERO
	lfs	f1, FZERO

#ifdef F_INTERFACE
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* Scale the stride; it is compared against SIZE further down.  */
	slwi	INCX, INCX, BASE_SHIFT
	li	PREA, 4 * 16 * SIZE	/* prefetch distance: 64 elements ahead */

	/* Nothing to sum: leave f1 = 0.0 and go restore registers. */
	cmpwi	cr0, N, 0
	ble-	LL(9999)
	cmpwi	cr0, INCX, 0
	beq-	LL(9999)

	/* Clear the remaining fifteen partial sums (f1 is already 0.0). */
	fmr	f0,  f1
	fmr	f2,  f1
	fmr	f3,  f1
	fmr	f4,  f1
	fmr	f5,  f1
	fmr	f6,  f1
	fmr	f7,  f1
	fmr	f8,  f1
	fmr	f9,  f1
	fmr	f10, f1
	fmr	f11, f1
	fmr	f12, f1
	fmr	f13, f1
	fmr	f14, f1
	fmr	f15, f1

	/* Any stride other than one element takes the indexed-load path. */
	cmpwi	cr0, INCX, SIZE
	bne-	cr0, LL(1000)

/* ------------------------------------------------------------------ */
/* Contiguous path                                                     */
/* ------------------------------------------------------------------ */

	/* CTR = N / 16 full blocks; skip to the remainder if there are none. */
	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	cr0, LL(150)

	/* Pre-load the first block of 16 so the loop can overlap the      */
	/* multiply-adds of one block with the loads of the next.          */
	LFD	f16,  0 * SIZE(X)
	LFD	f17,  1 * SIZE(X)
	LFD	f18,  2 * SIZE(X)
	LFD	f19,  3 * SIZE(X)
	LFD	f20,  4 * SIZE(X)
	LFD	f21,  5 * SIZE(X)
	LFD	f22,  6 * SIZE(X)
	LFD	f23,  7 * SIZE(X)
	LFD	f24,  8 * SIZE(X)
	LFD	f25,  9 * SIZE(X)
	LFD	f26, 10 * SIZE(X)
	LFD	f27, 11 * SIZE(X)
	LFD	f28, 12 * SIZE(X)
	LFD	f29, 13 * SIZE(X)
	LFD	f30, 14 * SIZE(X)
	LFD	f31, 15 * SIZE(X)
	bdz	LL(120)			/* only one block: just drain it */
	.align 4

LL(110):
	/* Consume the buffered block while fetching the following one. */
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3

	LFD	f16, 16 * SIZE(X)
	LFD	f17, 17 * SIZE(X)
	LFD	f18, 18 * SIZE(X)
	LFD	f19, 19 * SIZE(X)

	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7

	LFD	f20, 20 * SIZE(X)
	LFD	f21, 21 * SIZE(X)
	LFD	f22, 22 * SIZE(X)
	LFD	f23, 23 * SIZE(X)

	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11

	LFD	f24, 24 * SIZE(X)
	LFD	f25, 25 * SIZE(X)
	LFD	f26, 26 * SIZE(X)
	LFD	f27, 27 * SIZE(X)

	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15

	LFD	f28, 28 * SIZE(X)
	LFD	f29, 29 * SIZE(X)
	LFD	f30, 30 * SIZE(X)
	LFD	f31, 31 * SIZE(X)

	/* The prefetch is issued before the pointer bump, except on     */
	/* POWER6 builds where it is issued after it.                    */
#ifndef POWER6
	L1_PREFETCH	X, PREA
#endif
	addi	X, X, 16 * SIZE
#ifdef POWER6
	L1_PREFETCH	X, PREA
#endif
	bdnz	LL(110)
	.align 4

LL(120):
	/* Drain the last buffered block; nothing further is loaded here. */
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3
	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7
	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11
	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15
	addi	X, X, 16 * SIZE		/* step past the block just consumed */
	.align 4

LL(150):
	/* Remainder: N mod 16 elements, one per iteration, into f0. */
	andi.	r0, N, 15
	mtspr	CTR, r0
	beq-	cr0, LL(9000)
	.align 4

LL(160):
	LFD	f16, 0 * SIZE(X)
	addi	X, X, 1 * SIZE
	fmadd	f0, f16, f16, f0
	bdnz	LL(160)
	b	LL(9000)		/* join the shared reduction */
	.align 4

/* ------------------------------------------------------------------ */
/* Strided path                                                        */
/* ------------------------------------------------------------------ */

LL(1000):
	/* Back X up by one stride so that the first LFDUX (which adds    */
	/* INCX to X before loading) lands on the original first element. */
	sub	X, X, INCX

	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	cr0, LL(1150)

	/* Pre-load the first block of 16, as in the contiguous path. */
	LFDUX	f16, X, INCX
	LFDUX	f17, X, INCX
	LFDUX	f18, X, INCX
	LFDUX	f19, X, INCX
	LFDUX	f20, X, INCX
	LFDUX	f21, X, INCX
	LFDUX	f22, X, INCX
	LFDUX	f23, X, INCX
	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX
	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX
	bdz	LL(1120)
	.align 4

LL(1110):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3

	LFDUX	f16, X, INCX
	LFDUX	f17, X, INCX
	LFDUX	f18, X, INCX
	LFDUX	f19, X, INCX

	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7

	LFDUX	f20, X, INCX
	LFDUX	f21, X, INCX
	LFDUX	f22, X, INCX
	LFDUX	f23, X, INCX

	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11

	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX

	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15

	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX
	bdnz	LL(1110)
	.align 4

LL(1120):
	/* Drain the last buffered block. */
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3
	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7
	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11
	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15
	.align 4

LL(1150):
	/* Remainder: N mod 16 elements, one per iteration, into f0. */
	andi.	r0, N, 15
	mtspr	CTR, r0
	beq-	cr0, LL(9000)
	.align 4

LL(1160):
	LFDUX	f16, X, INCX
	fmadd	f0, f16, f16, f0
	bdnz	LL(1160)
	.align 4

/* ------------------------------------------------------------------ */
/* Shared tail: fold the sixteen partial sums and take the root        */
/* ------------------------------------------------------------------ */

LL(9000):
	/* Pairwise tree reduction of f0..f15 into f0. */
	fadd	f0,  f0,  f1
	fadd	f2,  f2,  f3
	fadd	f4,  f4,  f5
	fadd	f6,  f6,  f7
	fadd	f8,  f8,  f9
	fadd	f10, f10, f11
	fadd	f12, f12, f13
	fadd	f14, f14, f15

	fadd	f0,  f0,  f2
	fadd	f4,  f4,  f6
	fadd	f8,  f8,  f10
	fadd	f12, f12, f14

	fadd	f0,  f0,  f4
	fadd	f8,  f8,  f12

	fadd	f0,  f0,  f8

	fsqrts	f1, f0			/* result goes out in f1 */
	.align 4

LL(9999):
	/* Restore f14-f31, release the frame and return. */
	lfd	f14,   0(SP)
	lfd	f15,   8(SP)
	lfd	f16,  16(SP)
	lfd	f17,  24(SP)
	lfd	f18,  32(SP)
	lfd	f19,  40(SP)
	lfd	f20,  48(SP)
	lfd	f21,  56(SP)
	lfd	f22,  64(SP)
	lfd	f23,  72(SP)
	lfd	f24,  80(SP)
	lfd	f25,  88(SP)
	lfd	f26,  96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)
	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE