You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

zaxpy_ppc440.S 9.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases for the operands of this routine.  As used by the code
     below: N is the element count (compared against 0, then split into N >> 3
     and N & 7), X and Y are the load pointers, INCX and INCY are the strides
     (scaled by ZBASE_SHIFT on entry), YY is a copy of Y used for the stores,
     and PRE holds the byte offset handed to dcbt/dcbtst.
     The mapping differs per platform and word size; presumably this follows
     where each ABI places the arguments -- confirm against the caller. */
  /* linux / FreeBSD builds. */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  42. #define N r3
  43. #define X r6
  44. #define INCX r7
  45. #define Y r8
  46. #define INCY r9
  47. #define YY r4
  48. #define PRE r5
  49. #else
  /* 64-bit: INCY is fetched from the stack frame in the entry code. */
  50. #define N r3
  51. #define X r8
  52. #define INCX r9
  53. #define Y r5
  54. #define INCY r4
  55. #define YY r6
  56. #define PRE r7
  57. #endif
  58. #endif
  /* AIX / Apple builds. */
  59. #if defined(_AIX) || defined(__APPLE__)
  60. #if !defined(__64BIT__) && defined(DOUBLE)
  /* 32-bit DOUBLE: INCX, Y and INCY are all fetched from the stack frame
     in the entry code. */
  61. #define N r3
  62. #define X r10
  63. #define INCX r4
  64. #define Y r5
  65. #define INCY r6
  66. #define YY r7
  67. #define PRE r8
  68. #else
  /* All other AIX / Apple configurations: only INCY comes from the stack. */
  69. #define N r3
  70. #define X r8
  71. #define INCX r9
  72. #define Y r10
  73. #define INCY r4
  74. #define YY r5
  75. #define PRE r6
  76. #endif
  77. #endif
  /* The entry code copies f1 into ALPHA_R and f2 into ALPHA_I; both live in
     callee-saved slots (f24/f25 are spilled and restored by this routine). */
  78. #define ALPHA_R f24
  79. #define ALPHA_I f25
  /* ADD1/ADD2 are the second multiply-add applied to the first and second
     component of each result.  CONJ swaps which of the two subtracts its
     product.  Assuming FMADD/FNMSUB expand to the PowerPC fmadd/fnmsub family:
       default:  re = yr + ar*xr - ai*xi,   im = yi + ai*xr + ar*xi
       CONJ:     re = yr + ar*xr + ai*xi,   im = yi + ai*xr - ar*xi  */
  80. #ifndef CONJ
  81. #define ADD1 FNMSUB
  82. #define ADD2 FMADD
  83. #else
  84. #define ADD1 FMADD
  85. #define ADD2 FNMSUB
  86. #endif
  /* 12 saved FPRs (f14..f25) x 8 bytes each. */
  87. #define STACKSIZE 96
  /* Routine entry: spill f14-f25, fetch stack-passed arguments, scale the
     strides, and set up the loop counter.  PROLOGUE and PROFCODE are macros
     not defined in this file (presumably from common.h). */
  88. PROLOGUE
  89. PROFCODE
  /* Reserve STACKSIZE bytes and save f14..f25 at offsets 0..88; they are
     reloaded from the same offsets at LL(999). */
  90. subi SP, SP, STACKSIZE
  91. stfd f14, 0(SP)
  92. stfd f15, 8(SP)
  93. stfd f16, 16(SP)
  94. stfd f17, 24(SP)
  95. stfd f18, 32(SP)
  96. stfd f19, 40(SP)
  97. stfd f20, 48(SP)
  98. stfd f21, 56(SP)
  99. stfd f22, 64(SP)
  100. stfd f23, 72(SP)
  101. stfd f24, 80(SP)
  102. stfd f25, 88(SP)
  /* Arguments read from the frame.  STACKSIZE is added to each offset
     because SP has already been lowered by that amount above. */
  103. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  104. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  105. #endif
  106. #if defined(_AIX) || defined(__APPLE__)
  107. #ifdef __64BIT__
  108. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  109. #else
  110. #ifdef DOUBLE
  111. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  112. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  113. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  114. #else
  115. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  116. #endif
  117. #endif
  118. #endif
  /* Copy f1/f2 into ALPHA_R/ALPHA_I; f1 and f2 are reused as data registers
     by the loops.  Interleaved with this, shift both strides left by
     ZBASE_SHIFT. */
  119. fmr ALPHA_R, f1
  120. slwi INCX, INCX, ZBASE_SHIFT
  121. fmr ALPHA_I, f2
  122. slwi INCY, INCY, ZBASE_SHIFT
  /* Each element is read as LFDUX ptr,INC followed by LFDU SIZE(ptr).
     Assuming both are update-form loads as their names suggest, removing
     SIZE here makes the net pointer advance per element equal the scaled
     stride. */
  123. subi INCX, INCX, SIZE
  124. subi INCY, INCY, SIZE
  /* Offset later passed to dcbt/dcbtst in the main loop. */
  125. li PRE, 2 * 16 * SIZE
  /* Nothing to do when N <= 0. */
  126. cmpwi cr0, N, 0
  127. ble- LL(999)
  /* Pre-bias both pointers by one adjusted stride so that the first
     LFDUX lands on the original address. */
  128. sub X, X, INCX
  129. sub Y, Y, INCY
  /* YY is the store pointer; it starts equal to the biased Y and is only
     advanced by the STFDUX/STFDU stores, so it trails the Y load pointer. */
  130. mr YY, Y
  /* CTR = N >> 3 (groups of eight elements).  With N > 0 at this point the
     branch is taken only when that count is zero, i.e. N < 8. */
  131. srawi. r0, N, 3
  132. mtspr CTR, r0
  133. ble- LL(150)
  /* Prime the main loop: load the first four elements of X into f0..f7 and
     the first four elements of Y into f8..f15 (two registers per element:
     even register = first component, odd register = second component). */
  134. .align 4
  135. LFDUX f0, X, INCX
  136. LFDU f1, 1 * SIZE(X)
  137. LFDUX f2, X, INCX
  138. LFDU f3, 1 * SIZE(X)
  139. LFDUX f8, Y, INCY
  140. LFDU f9, 1 * SIZE(Y)
  141. LFDUX f10, Y, INCY
  142. LFDU f11, 1 * SIZE(Y)
  143. LFDUX f4, X, INCX
  144. LFDU f5, 1 * SIZE(X)
  145. LFDUX f6, X, INCX
  146. LFDU f7, 1 * SIZE(X)
  147. LFDUX f12, Y, INCY
  148. LFDU f13, 1 * SIZE(Y)
  149. LFDUX f14, Y, INCY
  150. LFDU f15, 1 * SIZE(Y)
  /* Decrement CTR; if only one group of eight exists, skip the steady-state
     loop and go straight to the drain code at LL(120). */
  151. bdz LL(120)
  152. .align 4
  /* Steady-state loop: each pass computes and stores eight elements.
     On entry f0..f7 hold four X elements and f8..f15 the matching four Y
     elements.  Each arithmetic instruction is paired with a load that refills
     an input register consumed by it or by the preceding instructions, so the
     same invariant holds again at the bottom of the loop.  Results are built
     in f16..f23 and written through YY.  The instruction order is
     hand-scheduled; do not reorder. */
  153. LL(110):
  /* Elements 0-1: start with alpha * (first component of x) + y. */
  154. FMADD f16, ALPHA_R, f0, f8
  155. LFDUX f8, Y, INCY
  156. FMADD f17, ALPHA_I, f0, f9
  157. LFDU f9, 1 * SIZE(Y)
  158. FMADD f18, ALPHA_R, f2, f10
  159. LFDUX f10, Y, INCY
  160. FMADD f19, ALPHA_I, f2, f11
  161. LFDU f11, 1 * SIZE(Y)
  /* Prefetch hints are emitted only for PPCG4 builds. */
  162. #ifdef PPCG4
  163. dcbt X, PRE
  164. #endif
  /* Elements 0-1: fold in the second component of x (sign chosen by
     ADD1/ADD2). */
  165. ADD1 f16, ALPHA_I, f1, f16
  166. LFDUX f0, X, INCX
  167. ADD2 f17, ALPHA_R, f1, f17
  168. LFDU f1, 1 * SIZE(X)
  169. ADD1 f18, ALPHA_I, f3, f18
  170. LFDUX f2, X, INCX
  171. ADD2 f19, ALPHA_R, f3, f19
  172. LFDU f3, 1 * SIZE(X)
  173. #ifdef PPCG4
  174. dcbtst Y, PRE
  175. #endif
  /* Elements 2-3. */
  176. FMADD f20, ALPHA_R, f4, f12
  177. LFDUX f12, Y, INCY
  178. FMADD f21, ALPHA_I, f4, f13
  179. LFDU f13, 1 * SIZE(Y)
  180. FMADD f22, ALPHA_R, f6, f14
  181. LFDUX f14, Y, INCY
  182. FMADD f23, ALPHA_I, f6, f15
  183. LFDU f15, 1 * SIZE(Y)
  /* Additional prefetch hints issued only when DOUBLE is also defined. */
  184. #if defined(PPCG4) && defined(DOUBLE)
  185. dcbt X, PRE
  186. #endif
  187. ADD1 f20, ALPHA_I, f5, f20
  188. LFDUX f4, X, INCX
  189. ADD2 f21, ALPHA_R, f5, f21
  190. LFDU f5, 1 * SIZE(X)
  191. ADD1 f22, ALPHA_I, f7, f22
  192. LFDUX f6, X, INCX
  193. ADD2 f23, ALPHA_R, f7, f23
  194. LFDU f7, 1 * SIZE(X)
  195. #if defined(PPCG4) && defined(DOUBLE)
  196. dcbtst Y, PRE
  197. #endif
  /* Store elements 0-1 so f16..f19 can be reused for elements 4-5. */
  198. STFDUX f16, YY, INCY
  199. STFDU f17, 1 * SIZE(YY)
  200. STFDUX f18, YY, INCY
  201. STFDU f19, 1 * SIZE(YY)
  /* Elements 4-5. */
  202. FMADD f16, ALPHA_R, f0, f8
  203. LFDUX f8, Y, INCY
  204. FMADD f17, ALPHA_I, f0, f9
  205. LFDU f9, 1 * SIZE(Y)
  206. FMADD f18, ALPHA_R, f2, f10
  207. LFDUX f10, Y, INCY
  208. FMADD f19, ALPHA_I, f2, f11
  209. LFDU f11, 1 * SIZE(Y)
  210. #ifdef PPCG4
  211. dcbt X, PRE
  212. #endif
  213. ADD1 f16, ALPHA_I, f1, f16
  214. LFDUX f0, X, INCX
  215. ADD2 f17, ALPHA_R, f1, f17
  216. LFDU f1, 1 * SIZE(X)
  217. ADD1 f18, ALPHA_I, f3, f18
  218. LFDUX f2, X, INCX
  219. ADD2 f19, ALPHA_R, f3, f19
  220. LFDU f3, 1 * SIZE(X)
  221. #ifdef PPCG4
  222. dcbtst Y, PRE
  223. #endif
  /* Store elements 2-3 so f20..f23 can be reused for elements 6-7. */
  224. STFDUX f20, YY, INCY
  225. STFDU f21, 1 * SIZE(YY)
  226. STFDUX f22, YY, INCY
  227. STFDU f23, 1 * SIZE(YY)
  /* Elements 6-7. */
  228. FMADD f20, ALPHA_R, f4, f12
  229. LFDUX f12, Y, INCY
  230. FMADD f21, ALPHA_I, f4, f13
  231. LFDU f13, 1 * SIZE(Y)
  232. FMADD f22, ALPHA_R, f6, f14
  233. LFDUX f14, Y, INCY
  234. FMADD f23, ALPHA_I, f6, f15
  235. LFDU f15, 1 * SIZE(Y)
  236. #if defined(PPCG4) && defined(DOUBLE)
  237. dcbt X, PRE
  238. #endif
  239. ADD1 f20, ALPHA_I, f5, f20
  240. LFDUX f4, X, INCX
  241. ADD2 f21, ALPHA_R, f5, f21
  242. LFDU f5, 1 * SIZE(X)
  243. ADD1 f22, ALPHA_I, f7, f22
  244. LFDUX f6, X, INCX
  245. ADD2 f23, ALPHA_R, f7, f23
  246. LFDU f7, 1 * SIZE(X)
  247. #if defined(PPCG4) && defined(DOUBLE)
  248. dcbtst Y, PRE
  249. #endif
  /* Store elements 4-7. */
  250. STFDUX f16, YY, INCY
  251. STFDU f17, 1 * SIZE(YY)
  252. STFDUX f18, YY, INCY
  253. STFDU f19, 1 * SIZE(YY)
  254. STFDUX f20, YY, INCY
  255. STFDU f21, 1 * SIZE(YY)
  256. STFDUX f22, YY, INCY
  257. STFDU f23, 1 * SIZE(YY)
  /* Decrement CTR and repeat while it is non-zero; the final group of eight
     is handled by the fall-through code at LL(120). */
  258. bdnz LL(110)
  259. .align 4
  /* Drain: processes the last group of eight elements.  The first half
     consumes the four elements already held in f0..f15 while loading the
     final four; the second half computes those final four with no further
     loads, interleaving the arithmetic with the pending stores.  Reached
     either by fall-through from LL(110) or directly from the bdz that
     follows the preload. */
  260. LL(120):
  /* Elements 0-1 of the group, refilling f0..f3 / f8..f11 with elements 4-5. */
  261. FMADD f16, ALPHA_R, f0, f8
  262. LFDUX f8, Y, INCY
  263. FMADD f17, ALPHA_I, f0, f9
  264. LFDU f9, 1 * SIZE(Y)
  265. FMADD f18, ALPHA_R, f2, f10
  266. LFDUX f10, Y, INCY
  267. FMADD f19, ALPHA_I, f2, f11
  268. LFDU f11, 1 * SIZE(Y)
  269. ADD1 f16, ALPHA_I, f1, f16
  270. LFDUX f0, X, INCX
  271. ADD2 f17, ALPHA_R, f1, f17
  272. LFDU f1, 1 * SIZE(X)
  273. ADD1 f18, ALPHA_I, f3, f18
  274. LFDUX f2, X, INCX
  275. ADD2 f19, ALPHA_R, f3, f19
  276. LFDU f3, 1 * SIZE(X)
  /* Elements 2-3, refilling f4..f7 / f12..f15 with elements 6-7. */
  277. FMADD f20, ALPHA_R, f4, f12
  278. LFDUX f12, Y, INCY
  279. FMADD f21, ALPHA_I, f4, f13
  280. LFDU f13, 1 * SIZE(Y)
  281. FMADD f22, ALPHA_R, f6, f14
  282. LFDUX f14, Y, INCY
  283. FMADD f23, ALPHA_I, f6, f15
  284. LFDU f15, 1 * SIZE(Y)
  285. ADD1 f20, ALPHA_I, f5, f20
  286. LFDUX f4, X, INCX
  287. ADD2 f21, ALPHA_R, f5, f21
  288. LFDU f5, 1 * SIZE(X)
  289. ADD1 f22, ALPHA_I, f7, f22
  290. LFDUX f6, X, INCX
  291. ADD2 f23, ALPHA_R, f7, f23
  292. LFDU f7, 1 * SIZE(X)
  /* Store elements 0-1; each result register is overwritten with the
     start of elements 4-5 only after its store has been issued. */
  293. STFDUX f16, YY, INCY
  294. FMADD f16, ALPHA_R, f0, f8
  295. STFDU f17, 1 * SIZE(YY)
  296. FMADD f17, ALPHA_I, f0, f9
  297. STFDUX f18, YY, INCY
  298. FMADD f18, ALPHA_R, f2, f10
  299. STFDU f19, 1 * SIZE(YY)
  300. FMADD f19, ALPHA_I, f2, f11
  301. ADD1 f16, ALPHA_I, f1, f16
  302. ADD2 f17, ALPHA_R, f1, f17
  303. ADD1 f18, ALPHA_I, f3, f18
  304. ADD2 f19, ALPHA_R, f3, f19
  /* Store elements 2-3 and begin elements 6-7 in the freed registers. */
  305. STFDUX f20, YY, INCY
  306. FMADD f20, ALPHA_R, f4, f12
  307. STFDU f21, 1 * SIZE(YY)
  308. FMADD f21, ALPHA_I, f4, f13
  309. STFDUX f22, YY, INCY
  310. FMADD f22, ALPHA_R, f6, f14
  311. STFDU f23, 1 * SIZE(YY)
  312. FMADD f23, ALPHA_I, f6, f15
  /* Finish elements 6-7 while storing elements 4-5. */
  313. ADD1 f20, ALPHA_I, f5, f20
  314. STFDUX f16, YY, INCY
  315. ADD2 f21, ALPHA_R, f5, f21
  316. STFDU f17, 1 * SIZE(YY)
  317. ADD1 f22, ALPHA_I, f7, f22
  318. STFDUX f18, YY, INCY
  319. ADD2 f23, ALPHA_R, f7, f23
  320. STFDU f19, 1 * SIZE(YY)
  /* Store elements 6-7, then fall through to the remainder handling. */
  321. STFDUX f20, YY, INCY
  322. STFDU f21, 1 * SIZE(YY)
  323. STFDUX f22, YY, INCY
  324. STFDU f23, 1 * SIZE(YY)
  325. .align 4
  /* Remainder: handle the N & 7 elements left over after the groups of
     eight (or all of them when N < 8). */
  326. LL(150):
  327. andi. r0, N, 7
  328. mtspr CTR, r0
  /* The masked value is 0..7, so this branch is taken only when it is 0. */
  329. ble LL(999)
  330. .align 4
  /* One element per pass: load one X element and one Y element, apply the
     same four multiply-adds as the unrolled code, and store through YY. */
  331. LL(160):
  332. LFDUX f0, X, INCX
  333. LFDU f1, 1 * SIZE(X)
  334. LFDUX f8, Y, INCY
  335. LFDU f9, 1 * SIZE(Y)
  336. FMADD f16, ALPHA_R, f0, f8
  337. FMADD f17, ALPHA_I, f0, f9
  338. ADD1 f16, ALPHA_I, f1, f16
  339. ADD2 f17, ALPHA_R, f1, f17
  340. STFDUX f16, YY, INCY
  341. STFDU f17, 1 * SIZE(YY)
  342. bdnz LL(160)
  343. .align 4
  /* Common exit: reload f14..f25 from the offsets they were saved at on
     entry, release the STACKSIZE-byte save area, and return. */
  344. LL(999):
  345. lfd f14, 0(SP)
  346. lfd f15, 8(SP)
  347. lfd f16, 16(SP)
  348. lfd f17, 24(SP)
  349. lfd f18, 32(SP)
  350. lfd f19, 40(SP)
  351. lfd f20, 48(SP)
  352. lfd f21, 56(SP)
  353. lfd f22, 64(SP)
  354. lfd f23, 72(SP)
  355. lfd f24, 80(SP)
  356. lfd f25, 88(SP)
  357. addi SP, SP, STACKSIZE
  /* r0 is cleared before returning; NOTE(review): the purpose of this is
     not evident from this file -- confirm before removing. */
  358. li r0, 0
  359. blr
  360. EPILOGUE