You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

axpy_ppc440.S 8.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases for the routine below, selected per platform. */
  /* N, X, INCX, Y and INCY name the registers holding the routine inputs; */
  /* YY and PRE are scratch registers the routine initialises itself */
  /* (YY is set to a copy of Y, PRE to a constant offset). */
  /* The differing assignments presumably follow each platform's calling */
  /* convention -- TODO confirm against common.h and the callers. */
  /* NOTE(review): if none of linux, __FreeBSD__, _AIX or __APPLE__ is */
  /* defined, none of these aliases is defined in this file. */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  /* linux / FreeBSD, __64BIT__ not defined. */
  42. #define N r3
  43. #define X r6
  44. #define INCX r7
  45. #define Y r8
  46. #define INCY r9
  47. #define YY r5
  48. #define PRE r4
  49. #else
  /* linux / FreeBSD, __64BIT__ defined: X..INCY each move up one register. */
  50. #define N r3
  51. #define X r7
  52. #define INCX r8
  53. #define Y r9
  54. #define INCY r10
  55. #define YY r5
  56. #define PRE r4
  57. #endif
  58. #endif
  59. #if defined(_AIX) || defined(__APPLE__)
  60. #if !defined(__64BIT__) && defined(DOUBLE)
  /* AIX / Apple, no __64BIT__, DOUBLE defined. In this configuration the */
  /* routine loads INCY from the stack before using it, so r4 here is only */
  /* the register that receives that load. */
  61. #define N r3
  62. #define X r8
  63. #define INCX r9
  64. #define Y r10
  65. #define INCY r4
  66. #define YY r6
  67. #define PRE r5
  68. #else
  /* AIX / Apple, every other configuration: same mapping as the */
  /* linux / FreeBSD __64BIT__ case above. */
  69. #define N r3
  70. #define X r7
  71. #define INCX r8
  72. #define Y r9
  73. #define INCY r10
  74. #define YY r5
  75. #define PRE r4
  76. #endif
  77. #endif
  /* ALPHA: floating-point register that holds the scalar multiplier for */
  /* the whole routine (copied from f1 after f24 has been saved). */
  78. #define ALPHA f24
  /* STACKSIZE: bytes reserved below SP on entry. The routine stores eleven */
  /* 8-byte registers (f14-f24) at offsets 0..80, i.e. 88 of the 96 bytes. */
  79. #define STACKSIZE 96
  /* AXPY kernel (per the file name axpy_ppc440.S). For i = 0 .. N-1: */
  /*     y[i] = ALPHA * x[i] + y[i] */
  /* where consecutive x / y elements are INCX / INCY apart. */
  /* This reading assumes FMADD d,a,b,c computes d = a*b + c and that */
  /* LFDUX / STFDUX are update-form indexed load / store. All three are */
  /* macros defined outside this file (presumably in common.h) -- confirm. */
  /* Inputs : N count, f1 scalar, X / INCX, Y / INCY (see aliases above). */
  /* Effects: writes the results back through YY, which starts equal to Y. */
  /*          Does nothing but save / restore registers when N <= 0. */
  /* Saves f14-f24 on entry and restores them at LL(999). */
  80. PROLOGUE
  81. PROFCODE
  /* Reserve STACKSIZE bytes for the register save area. */
  82. addi SP, SP, -STACKSIZE
  /* NOTE(review): r0 is not read before srawi. overwrites it further */
  /* down, so this load of zero looks redundant -- confirm before removing. */
  83. li r0, 0
  /* Save f14-f24; f14-f23 are used as temporaries and f24 holds ALPHA. */
  84. stfd f14, 0(SP)
  85. stfd f15, 8(SP)
  86. stfd f16, 16(SP)
  87. stfd f17, 24(SP)
  88. stfd f18, 32(SP)
  89. stfd f19, 40(SP)
  90. stfd f20, 48(SP)
  91. stfd f21, 56(SP)
  92. stfd f22, 64(SP)
  93. stfd f23, 72(SP)
  94. stfd f24, 80(SP)
  95. #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE)
  /* In this configuration INCY is read from memory above our save area */
  /* (FRAMESLOT is defined outside this file). */
  96. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  97. #endif
  /* Keep the scalar in ALPHA so f1 can be reused as a load target. */
  98. fmr ALPHA, f1
  /* Scale both strides by 2^BASE_SHIFT; presumably converts element */
  /* counts to byte offsets (BASE_SHIFT comes from outside this file). */
  99. slwi INCX, INCX, BASE_SHIFT
  100. slwi INCY, INCY, BASE_SHIFT
  /* Offset used by the dcbt / dcbtst prefetch hints in the main loop. */
  101. li PRE, 2 * 16 * SIZE
  /* Nothing to do for N <= 0: go straight to the restore / return path. */
  102. cmpwi cr0, N, 0
  103. ble- LL(999)
  /* Step both pointers back by one stride: every access below goes */
  /* through LFDUX / STFDUX, which (assuming update-form semantics) add */
  /* the stride before touching memory. */
  104. sub X, X, INCX
  105. sub Y, Y, INCY
  /* YY is the store pointer into y; it trails the load pointer Y. */
  106. mr YY, Y
  /* CTR = N / 16 = number of full 16-element blocks (N > 0 here). */
  /* With no full block, skip to the remainder handling at LL(150). */
  107. srawi. r0, N, 4
  108. mtspr CTR, r0
  109. beq- LL(150)
  110. .align 4
  /* Pipeline fill: load the first 8 x values into f0-f7 and the first */
  /* 8 y values into f8-f15. */
  111. LFDUX f0, X, INCX
  112. LFDUX f1, X, INCX
  113. LFDUX f2, X, INCX
  114. LFDUX f3, X, INCX
  115. LFDUX f8, Y, INCY
  116. LFDUX f9, Y, INCY
  117. LFDUX f10, Y, INCY
  118. LFDUX f11, Y, INCY
  119. LFDUX f4, X, INCX
  120. LFDUX f5, X, INCX
  121. LFDUX f6, X, INCX
  122. LFDUX f7, X, INCX
  123. LFDUX f12, Y, INCY
  124. LFDUX f13, Y, INCY
  125. LFDUX f14, Y, INCY
  126. LFDUX f15, Y, INCY
  /* Decrement CTR; if that was the only block, skip the steady-state */
  /* loop and finish it in LL(120). */
  127. bdz LL(120)
  128. .align 4
  /* Steady-state loop: each pass produces 16 results and loads 16 new */
  /* x / y pairs. Every FMADD consumes an already-loaded pair and is */
  /* followed by reloads of the same registers for a later FMADD. */
  129. LL(110):
  /* First half: results for elements 0-7 of the block into f16-f23. */
  130. FMADD f16, ALPHA, f0, f8
  131. LFDUX f0, X, INCX
  132. LFDUX f8, Y, INCY
  133. #ifdef PPCG4
  /* Prefetch hint PRE bytes past the current x load pointer. */
  134. dcbt X, PRE
  135. #endif
  136. FMADD f17, ALPHA, f1, f9
  137. LFDUX f1, X, INCX
  138. LFDUX f9, Y, INCY
  139. FMADD f18, ALPHA, f2, f10
  140. LFDUX f2, X, INCX
  141. LFDUX f10, Y, INCY
  142. #ifdef PPCG4
  /* Prefetch-for-store hint PRE bytes past the current y load pointer. */
  143. dcbtst Y, PRE
  144. #endif
  145. FMADD f19, ALPHA, f3, f11
  146. LFDUX f3, X, INCX
  147. LFDUX f11, Y, INCY
  148. FMADD f20, ALPHA, f4, f12
  149. LFDUX f4, X, INCX
  150. LFDUX f12, Y, INCY
  151. #if defined(PPCG4) && defined(DOUBLE)
  /* Extra prefetch hints that are only emitted when DOUBLE is defined. */
  152. dcbt X, PRE
  153. #endif
  154. FMADD f21, ALPHA, f5, f13
  155. LFDUX f5, X, INCX
  156. LFDUX f13, Y, INCY
  157. FMADD f22, ALPHA, f6, f14
  158. LFDUX f6, X, INCX
  159. LFDUX f14, Y, INCY
  160. #if defined(PPCG4) && defined(DOUBLE)
  161. dcbtst Y, PRE
  162. #endif
  163. FMADD f23, ALPHA, f7, f15
  164. LFDUX f7, X, INCX
  165. LFDUX f15, Y, INCY
  /* Store results 0-3 now so f16-f19 can be reused for results 8-11. */
  166. STFDUX f16, YY, INCY
  167. STFDUX f17, YY, INCY
  168. STFDUX f18, YY, INCY
  169. STFDUX f19, YY, INCY
  /* Second half: results for elements 8-15 of the block. */
  170. FMADD f16, ALPHA, f0, f8
  171. LFDUX f0, X, INCX
  172. LFDUX f8, Y, INCY
  173. #ifdef PPCG4
  174. dcbt X, PRE
  175. #endif
  176. FMADD f17, ALPHA, f1, f9
  177. LFDUX f1, X, INCX
  178. LFDUX f9, Y, INCY
  179. FMADD f18, ALPHA, f2, f10
  180. LFDUX f2, X, INCX
  181. LFDUX f10, Y, INCY
  182. #ifdef PPCG4
  183. dcbtst Y, PRE
  184. #endif
  185. FMADD f19, ALPHA, f3, f11
  186. LFDUX f3, X, INCX
  187. LFDUX f11, Y, INCY
  /* Store results 4-7 before f20-f23 are overwritten with results 12-15. */
  188. STFDUX f20, YY, INCY
  189. STFDUX f21, YY, INCY
  190. STFDUX f22, YY, INCY
  191. STFDUX f23, YY, INCY
  192. FMADD f20, ALPHA, f4, f12
  193. LFDUX f4, X, INCX
  194. LFDUX f12, Y, INCY
  195. #if defined(PPCG4) && defined(DOUBLE)
  196. dcbt X, PRE
  197. #endif
  198. FMADD f21, ALPHA, f5, f13
  199. LFDUX f5, X, INCX
  200. LFDUX f13, Y, INCY
  201. FMADD f22, ALPHA, f6, f14
  202. LFDUX f6, X, INCX
  203. LFDUX f14, Y, INCY
  204. #if defined(PPCG4) && defined(DOUBLE)
  205. dcbtst Y, PRE
  206. #endif
  207. FMADD f23, ALPHA, f7, f15
  208. LFDUX f7, X, INCX
  209. LFDUX f15, Y, INCY
  /* Store results 8-15; f0-f15 now hold the first 8 pairs of the next block. */
  210. STFDUX f16, YY, INCY
  211. STFDUX f17, YY, INCY
  212. STFDUX f18, YY, INCY
  213. STFDUX f19, YY, INCY
  214. STFDUX f20, YY, INCY
  215. STFDUX f21, YY, INCY
  216. STFDUX f22, YY, INCY
  217. STFDUX f23, YY, INCY
  218. bdnz LL(110)
  219. .align 4
  /* Pipeline drain for the last full block: consume the 8 pairs already */
  /* loaded, load only the remaining 8 pairs, and issue no loads beyond */
  /* the block. */
  220. LL(120):
  221. FMADD f16, ALPHA, f0, f8
  222. LFDUX f0, X, INCX
  223. LFDUX f8, Y, INCY
  224. FMADD f17, ALPHA, f1, f9
  225. LFDUX f1, X, INCX
  226. LFDUX f9, Y, INCY
  227. FMADD f18, ALPHA, f2, f10
  228. LFDUX f2, X, INCX
  229. LFDUX f10, Y, INCY
  230. FMADD f19, ALPHA, f3, f11
  231. LFDUX f3, X, INCX
  232. LFDUX f11, Y, INCY
  233. FMADD f20, ALPHA, f4, f12
  234. LFDUX f4, X, INCX
  235. LFDUX f12, Y, INCY
  236. FMADD f21, ALPHA, f5, f13
  237. LFDUX f5, X, INCX
  238. LFDUX f13, Y, INCY
  239. FMADD f22, ALPHA, f6, f14
  240. LFDUX f6, X, INCX
  241. LFDUX f14, Y, INCY
  242. FMADD f23, ALPHA, f7, f15
  243. LFDUX f7, X, INCX
  244. LFDUX f15, Y, INCY
  /* Results 0-3 are stored first; after that each store of an old result */
  /* is interleaved with the FMADD of a new one. Every register in */
  /* f16-f23 is stored before the FMADD that overwrites it. */
  245. STFDUX f16, YY, INCY
  246. STFDUX f17, YY, INCY
  247. STFDUX f18, YY, INCY
  248. STFDUX f19, YY, INCY
  249. FMADD f16, ALPHA, f0, f8
  250. STFDUX f20, YY, INCY
  251. FMADD f17, ALPHA, f1, f9
  252. STFDUX f21, YY, INCY
  253. FMADD f18, ALPHA, f2, f10
  254. STFDUX f22, YY, INCY
  255. FMADD f19, ALPHA, f3, f11
  256. STFDUX f23, YY, INCY
  257. FMADD f20, ALPHA, f4, f12
  258. STFDUX f16, YY, INCY
  259. FMADD f21, ALPHA, f5, f13
  260. STFDUX f17, YY, INCY
  261. FMADD f22, ALPHA, f6, f14
  262. STFDUX f18, YY, INCY
  263. FMADD f23, ALPHA, f7, f15
  264. STFDUX f19, YY, INCY
  265. STFDUX f20, YY, INCY
  266. STFDUX f21, YY, INCY
  267. STFDUX f22, YY, INCY
  268. STFDUX f23, YY, INCY
  269. .align 4
  /* Remainder: CTR = N & 15 leftover elements; return if there are none. */
  270. LL(150):
  271. andi. r0, N, 15
  272. mtspr CTR, r0
  273. beq LL(999)
  274. .align 4
  /* One element per pass: load x and y, multiply-add, store through YY. */
  275. LL(160):
  276. LFDUX f0, X, INCX
  277. LFDUX f8, Y, INCY
  278. FMADD f16, ALPHA, f0, f8
  279. STFDUX f16, YY, INCY
  280. bdnz LL(160)
  281. .align 4
  /* Common exit: restore f14-f24, release the save area and return. */
  282. LL(999):
  283. lfd f14, 0(SP)
  284. lfd f15, 8(SP)
  285. lfd f16, 16(SP)
  286. lfd f17, 24(SP)
  287. lfd f18, 32(SP)
  288. lfd f19, 40(SP)
  289. lfd f20, 48(SP)
  290. lfd f21, 56(SP)
  291. lfd f22, 64(SP)
  292. lfd f23, 72(SP)
  293. lfd f24, 80(SP)
  294. addi SP, SP, STACKSIZE
  295. blr
  296. EPILOGUE