You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zdot.S 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Symbolic names for the argument registers.  When both F_INTERFACE and
     F_INTERFACE_F2C are defined, r3 is RESULT (the address the two result
     components are stored to before returning) and every other argument
     sits one register higher than in the default layout. */
  40. #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C)
  41. #define RESULT r3
  42. #define N r4
  43. #define X r5
  44. #define INCX r6
  45. #define Y r7
  46. #define INCY r8
  /* PREA is not an argument: it is loaded with the prefetch offset. */
  47. #define PREA r9
  48. #else
  49. #define N r3
  50. #define X r4
  51. #define INCX r5
  52. #define Y r6
  53. #define INCY r7
  /* PREA is not an argument: it is loaded with the prefetch offset. */
  54. #define PREA r8
  55. #endif
  /* INCXM1 / INCYM1 hold the scaled increment minus SIZE; the strided path
     uses them as the index for the first-component loads. */
  56. #define INCXM1 r10
  57. #define INCYM1 r11
  /* f0 is loaded with 0.0 at entry and then doubles as an accumulator. */
  58. #define FZERO f0
  /* Frame size in bytes: offsets 0..143 hold the saved f14-f31 (18 * 8),
     offsets 144..159 are scratch (building 0.0, moving the result to GPRs). */
  59. #define STACKSIZE 160
  /* Routine entry: allocate the frame, save f14-f31, zero all sixteen
     accumulators, normalise the arguments and pick the unit-stride or the
     strided code path.  PROLOGUE and PROFCODE are macros defined outside
     this file (presumably in common.h). */
  60. PROLOGUE
  61. PROFCODE
  /* Reserve STACKSIZE bytes below SP; r0 = 0 is stored later to build 0.0. */
  62. addi SP, SP, -STACKSIZE
  63. li r0, 0
  /* Save f14-f31 at 0..136(SP); they are reloaded just before blr. */
  64. stfd f14, 0(SP)
  65. stfd f15, 8(SP)
  66. stfd f16, 16(SP)
  67. stfd f17, 24(SP)
  68. stfd f18, 32(SP)
  69. stfd f19, 40(SP)
  70. stfd f20, 48(SP)
  71. stfd f21, 56(SP)
  72. stfd f22, 64(SP)
  73. stfd f23, 72(SP)
  74. stfd f24, 80(SP)
  75. stfd f25, 88(SP)
  76. stfd f26, 96(SP)
  77. stfd f27, 104(SP)
  78. stfd f28, 112(SP)
  79. stfd f29, 120(SP)
  80. stfd f30, 128(SP)
  81. stfd f31, 136(SP)
  /* Write an all-zero word to the scratch slot and read it back as a
     single-precision float, leaving FZERO (f0) = 0.0. */
  82. stw r0, 144(SP)
  83. lfs FZERO,144(SP)
  /* With F_INTERFACE the registers N, INCX and INCY hold addresses;
     replace each one with the integer it points to. */
  84. #ifdef F_INTERFACE
  85. LDINT N, 0(N)
  86. LDINT INCX, 0(INCX)
  87. LDINT INCY, 0(INCY)
  88. #endif
  /* Shift both increments left by ZBASE_SHIFT (defined outside this file;
     presumably this converts an element stride into a byte stride for
     two-component elements - confirm against common.h). */
  89. slwi INCX, INCX, ZBASE_SHIFT
  90. slwi INCY, INCY, ZBASE_SHIFT
  /* INCXM1 = INCX - SIZE, INCYM1 = INCY - SIZE (used by the strided path). */
  91. subi INCXM1, INCX, SIZE
  92. subi INCYM1, INCY, SIZE
  /* Clear the remaining fifteen accumulators: f1-f7 and f24-f31.
     f0 already holds 0.0. */
  93. fmr f1, FZERO
  94. fmr f2, FZERO
  95. fmr f3, FZERO
  96. fmr f4, FZERO
  97. fmr f5, FZERO
  98. fmr f6, FZERO
  99. fmr f7, FZERO
  100. fmr f24, FZERO
  101. fmr f25, FZERO
  102. fmr f26, FZERO
  103. fmr f27, FZERO
  104. fmr f28, FZERO
  105. fmr f29, FZERO
  106. fmr f30, FZERO
  107. fmr f31, FZERO
  /* PREA = offset handed to L1_PREFETCH in the unit-stride loop; it is
     halved when L1_DUALFETCH is defined (that build prefetches both X and Y). */
  108. #ifdef L1_DUALFETCH
  109. li PREA, (L1_PREFETCHSIZE) / 2
  110. #else
  111. li PREA, (L1_PREFETCHSIZE)
  112. #endif
  /* N <= 0: jump straight to the reduction; every accumulator is still
     zero, so the result components are zero. */
  113. cmpwi cr0, N, 0
  114. ble- LL(999)
  /* Stay on the unit-stride path only if both scaled increments equal
     2 * SIZE, i.e. consecutive elements are stored back to back; any other
     increment is handled by the strided code at LL(100). */
  115. cmpwi cr0, INCX, 2 * SIZE
  116. bne- cr0, LL(100)
  117. cmpwi cr0, INCY, 2 * SIZE
  118. bne- cr0, LL(100)
  /* Unit-stride path.  r0 = N >> 3 is the number of full 8-element groups
     and becomes the loop count in CTR; when it is zero, go to the
     remainder code at LL(50). */
  119. srawi. r0, N, 3
  120. mtspr CTR, r0
  121. beq- cr0, LL(50)
  122. .align 4
  /* Pipeline fill: load the first four elements (eight SIZE-sized
     components) of X into f8-f15 and of Y into f16-f23 before any
     arithmetic is issued. */
  123. LFD f8, 0 * SIZE(X)
  124. LFD f9, 1 * SIZE(X)
  125. LFD f10, 2 * SIZE(X)
  126. LFD f11, 3 * SIZE(X)
  127. LFD f16, 0 * SIZE(Y)
  128. LFD f17, 1 * SIZE(Y)
  129. LFD f18, 2 * SIZE(Y)
  130. LFD f19, 3 * SIZE(Y)
  131. LFD f12, 4 * SIZE(X)
  132. LFD f13, 5 * SIZE(X)
  133. LFD f14, 6 * SIZE(X)
  134. LFD f15, 7 * SIZE(X)
  135. LFD f20, 4 * SIZE(Y)
  136. LFD f21, 5 * SIZE(Y)
  137. LFD f22, 6 * SIZE(Y)
  138. LFD f23, 7 * SIZE(Y)
  /* Decrement CTR; if there was exactly one group, skip the steady-state
     loop and finish it in the drain code at LL(20). */
  139. bdz LL(20)
  /* LL(10): unit-stride steady-state loop, eight elements per iteration.
     On entry f8-f15 / f16-f23 already hold four elements of X / Y.
     Writing an element of X as (x0, x1) and of Y as (y0, y1), the four
     accumulator sets collect (assuming FMADD d, a, c, b computes a*c + b;
     the macro is defined outside this file):
       f0, f4, f24, f28 += x0 * y0
       f1, f5, f25, f29 += x1 * y1
       f2, f6, f26, f30 += x1 * y0
       f3, f7, f27, f31 += x0 * y1
     Loads for the next elements are interleaved with the multiply-adds
     that consume the previously loaded ones. */
  140. .align 4
  141. LL(10):
  /* Consume elements 0-1 (f8-f11 with f16-f19). */
  142. FMADD f0, f8, f16, f0
  143. FMADD f1, f9, f17, f1
  144. FMADD f2, f9, f16, f2
  145. FMADD f3, f8, f17, f3
  146. FMADD f4, f10, f18, f4
  147. FMADD f5, f11, f19, f5
  148. FMADD f6, f11, f18, f6
  149. FMADD f7, f10, f19, f7
  /* Refill the same registers with elements 4-5. */
  150. LFD f8, 8 * SIZE(X)
  151. LFD f9, 9 * SIZE(X)
  152. LFD f10, 10 * SIZE(X)
  153. LFD f11, 11 * SIZE(X)
  154. LFD f16, 8 * SIZE(Y)
  155. LFD f17, 9 * SIZE(Y)
  156. LFD f18, 10 * SIZE(Y)
  157. LFD f19, 11 * SIZE(Y)
  /* Consume elements 2-3 (f12-f15 with f20-f23). */
  158. FMADD f24, f12, f20, f24
  159. FMADD f25, f13, f21, f25
  160. FMADD f26, f13, f20, f26
  161. FMADD f27, f12, f21, f27
  162. FMADD f28, f14, f22, f28
  163. FMADD f29, f15, f23, f29
  164. FMADD f30, f15, f22, f30
  165. FMADD f31, f14, f23, f31
  /* Refill with elements 6-7. */
  166. LFD f12, 12 * SIZE(X)
  167. LFD f13, 13 * SIZE(X)
  168. LFD f14, 14 * SIZE(X)
  169. LFD f15, 15 * SIZE(X)
  170. LFD f20, 12 * SIZE(Y)
  171. LFD f21, 13 * SIZE(Y)
  172. LFD f22, 14 * SIZE(Y)
  173. LFD f23, 15 * SIZE(Y)
  /* Consume elements 4-5. */
  174. FMADD f0, f8, f16, f0
  175. FMADD f1, f9, f17, f1
  176. FMADD f2, f9, f16, f2
  177. FMADD f3, f8, f17, f3
  178. FMADD f4, f10, f18, f4
  179. FMADD f5, f11, f19, f5
  180. FMADD f6, f11, f18, f6
  181. FMADD f7, f10, f19, f7
  /* Load elements 0-1 of the NEXT group (offsets 16..19 from the current
     pointers). */
  182. LFD f8, 16 * SIZE(X)
  183. LFD f9, 17 * SIZE(X)
  184. LFD f10, 18 * SIZE(X)
  185. LFD f11, 19 * SIZE(X)
  186. LFD f16, 16 * SIZE(Y)
  187. LFD f17, 17 * SIZE(Y)
  188. LFD f18, 18 * SIZE(Y)
  189. LFD f19, 19 * SIZE(Y)
  /* Consume elements 6-7. */
  190. FMADD f24, f12, f20, f24
  191. FMADD f25, f13, f21, f25
  192. FMADD f26, f13, f20, f26
  193. FMADD f27, f12, f21, f27
  194. FMADD f28, f14, f22, f28
  195. FMADD f29, f15, f23, f29
  196. FMADD f30, f15, f22, f30
  197. FMADD f31, f14, f23, f31
  /* Load elements 2-3 of the next group (offsets 20..23). */
  198. LFD f12, 20 * SIZE(X)
  199. LFD f13, 21 * SIZE(X)
  200. LFD f14, 22 * SIZE(X)
  201. LFD f15, 23 * SIZE(X)
  202. LFD f20, 20 * SIZE(Y)
  203. LFD f21, 21 * SIZE(Y)
  204. LFD f22, 22 * SIZE(Y)
  205. LFD f23, 23 * SIZE(Y)
  /* Prefetch at offset PREA.  Unless POWER6 is defined this is issued
     before the pointers are advanced; Y is prefetched only when
     L1_DUALFETCH is defined. */
  206. #ifndef POWER6
  207. L1_PREFETCH X, PREA
  208. #ifdef L1_DUALFETCH
  209. L1_PREFETCH Y, PREA
  210. #endif
  211. #endif
  /* Advance both pointers past the eight elements just consumed. */
  212. addi X, X, 16 * SIZE
  213. addi Y, Y, 16 * SIZE
  /* With POWER6 defined the same prefetch is issued after the pointer
     update instead. */
  214. #ifdef POWER6
  215. L1_PREFETCH X, PREA
  216. #ifdef L1_DUALFETCH
  217. L1_PREFETCH Y, PREA
  218. #endif
  219. #endif
  /* Decrement CTR and loop while groups remain; the last group falls
     through to the drain code. */
  220. bdnz LL(10)
  /* LL(20): pipeline drain for the last full 8-element group on the
     unit-stride path.  Four elements are already in f8-f15 / f16-f23; the
     other four are loaded here, and nothing beyond the group is read. */
  221. .align 4
  222. LL(20):
  /* Consume elements 0-1. */
  223. FMADD f0, f8, f16, f0
  224. FMADD f1, f9, f17, f1
  225. FMADD f2, f9, f16, f2
  226. FMADD f3, f8, f17, f3
  227. FMADD f4, f10, f18, f4
  228. FMADD f5, f11, f19, f5
  229. FMADD f6, f11, f18, f6
  230. FMADD f7, f10, f19, f7
  /* Load elements 4-5. */
  231. LFD f8, 8 * SIZE(X)
  232. LFD f9, 9 * SIZE(X)
  233. LFD f10, 10 * SIZE(X)
  234. LFD f11, 11 * SIZE(X)
  235. LFD f16, 8 * SIZE(Y)
  236. LFD f17, 9 * SIZE(Y)
  237. LFD f18, 10 * SIZE(Y)
  238. LFD f19, 11 * SIZE(Y)
  /* Consume elements 2-3. */
  239. FMADD f24, f12, f20, f24
  240. FMADD f25, f13, f21, f25
  241. FMADD f26, f13, f20, f26
  242. FMADD f27, f12, f21, f27
  243. FMADD f28, f14, f22, f28
  244. FMADD f29, f15, f23, f29
  245. FMADD f30, f15, f22, f30
  246. FMADD f31, f14, f23, f31
  /* Load elements 6-7, the last loads of this group. */
  247. LFD f12, 12 * SIZE(X)
  248. LFD f13, 13 * SIZE(X)
  249. LFD f14, 14 * SIZE(X)
  250. LFD f15, 15 * SIZE(X)
  251. LFD f20, 12 * SIZE(Y)
  252. LFD f21, 13 * SIZE(Y)
  253. LFD f22, 14 * SIZE(Y)
  254. LFD f23, 15 * SIZE(Y)
  /* Consume elements 4-5 and 6-7 back to back; no further loads. */
  255. FMADD f0, f8, f16, f0
  256. FMADD f1, f9, f17, f1
  257. FMADD f2, f9, f16, f2
  258. FMADD f3, f8, f17, f3
  259. FMADD f4, f10, f18, f4
  260. FMADD f5, f11, f19, f5
  261. FMADD f6, f11, f18, f6
  262. FMADD f7, f10, f19, f7
  263. FMADD f24, f12, f20, f24
  264. FMADD f25, f13, f21, f25
  265. FMADD f26, f13, f20, f26
  266. FMADD f27, f12, f21, f27
  267. FMADD f28, f14, f22, f28
  268. FMADD f29, f15, f23, f29
  269. FMADD f30, f15, f22, f30
  270. FMADD f31, f14, f23, f31
  /* Step the pointers past this group so the remainder loop starts at the
     first unprocessed element. */
  271. addi X, X, 16 * SIZE
  272. addi Y, Y, 16 * SIZE
  /* LL(50): unit-stride remainder.  r0 = N & 7 is the number of elements
     left after the 8-element groups; if none remain, go to the reduction. */
  273. .align 4
  274. LL(50):
  275. andi. r0, N, 7
  276. mtspr CTR, r0
  277. beq LL(999)
  /* LL(60): one element per iteration.  Load both components of the
     current X and Y elements, advance the pointers by one element, and
     accumulate into f0-f3 only. */
  278. .align 4
  279. LL(60):
  280. LFD f8, 0 * SIZE(X)
  281. LFD f9, 1 * SIZE(X)
  282. LFD f16, 0 * SIZE(Y)
  283. LFD f17, 1 * SIZE(Y)
  284. addi X, X, 2 * SIZE
  285. addi Y, Y, 2 * SIZE
  286. FMADD f0, f8, f16, f0
  287. FMADD f1, f9, f17, f1
  288. FMADD f2, f9, f16, f2
  289. FMADD f3, f8, f17, f3
  290. bdnz LL(60)
  /* Skip over the strided code to the shared reduction. */
  291. b LL(999)
  /* LL(100): strided path, reached when either scaled increment differs
     from 2 * SIZE. */
  292. .align 4
  293. LL(100):
  /* F_INTERFACE builds only: for a negative increment, move the base
     pointer by -(N - 1) * INCX, where INCX is the already-scaled stride.
     Because INCX is negative this raises the pointer, so that stepping by
     INCX walks back down through the vector. */
  294. #ifdef F_INTERFACE
  295. cmpwi cr0, INCX, 0
  296. bge+ LL(102)
  297. subi r0, N, 1
  298. mullw r0, r0, INCX
  299. sub X, X, r0
  300. .align 4
  301. LL(102):
  /* Same adjustment for Y. */
  302. cmpwi cr0, INCY, 0
  303. bge+ LL(104)
  304. subi r0, N, 1
  305. mullw r0, r0, INCY
  306. sub Y, Y, r0
  307. .align 4
  308. LL(104):
  309. #endif
  /* Bias both pointers down by (increment - SIZE).  After this, an indexed
     load at X + INCXM1 reads the first component of the next element, and
     an update-form load at X + INCX reads its second component and leaves
     X at that address (assuming LFDX / LFDUX map to the indexed and
     indexed-with-update loads; the macros are defined outside this file).
     The same holds for Y. */
  310. sub X, X, INCXM1
  311. sub Y, Y, INCYM1
  /* r0 = N >> 3 full 8-element groups -> CTR; none means go to the
     strided remainder at LL(150). */
  312. srawi. r0, N, 3
  313. mtspr CTR, r0
  314. beq- LL(150)
  /* Pipeline fill: load four elements of X into f8-f15 and four of Y into
     f16-f23, alternating first-component (LFDX) and second-component
     (LFDUX) loads. */
  315. LFDX f8, X, INCXM1
  316. LFDX f16, Y, INCYM1
  317. LFDUX f9, X, INCX
  318. LFDUX f17, Y, INCY
  319. LFDX f10, X, INCXM1
  320. LFDX f18, Y, INCYM1
  321. LFDUX f11, X, INCX
  322. LFDUX f19, Y, INCY
  323. LFDX f12, X, INCXM1
  324. LFDX f20, Y, INCYM1
  325. LFDUX f13, X, INCX
  326. LFDUX f21, Y, INCY
  327. LFDX f14, X, INCXM1
  328. LFDX f22, Y, INCYM1
  329. LFDUX f15, X, INCX
  330. LFDUX f23, Y, INCY
  /* Decrement CTR; a single group goes straight to the drain at LL(120). */
  331. bdz LL(120)
  /* LL(110): strided steady-state loop, eight elements per iteration.
     Same accumulator layout and load/compute interleaving as the
     unit-stride loop, but every load is indexed; the LFDUX loads advance
     X and Y, so there is no separate pointer update (and no prefetch). */
  332. .align 4
  333. LL(110):
  /* Consume elements 0-1. */
  334. FMADD f0, f8, f16, f0
  335. FMADD f1, f9, f17, f1
  336. FMADD f2, f9, f16, f2
  337. FMADD f3, f8, f17, f3
  338. FMADD f4, f10, f18, f4
  339. FMADD f5, f11, f19, f5
  340. FMADD f6, f11, f18, f6
  341. FMADD f7, f10, f19, f7
  /* Load elements 4-5. */
  342. LFDX f8, X, INCXM1
  343. LFDX f16, Y, INCYM1
  344. LFDUX f9, X, INCX
  345. LFDUX f17, Y, INCY
  346. LFDX f10, X, INCXM1
  347. LFDX f18, Y, INCYM1
  348. LFDUX f11, X, INCX
  349. LFDUX f19, Y, INCY
  /* Consume elements 2-3. */
  350. FMADD f24, f12, f20, f24
  351. FMADD f25, f13, f21, f25
  352. FMADD f26, f13, f20, f26
  353. FMADD f27, f12, f21, f27
  354. FMADD f28, f14, f22, f28
  355. FMADD f29, f15, f23, f29
  356. FMADD f30, f15, f22, f30
  357. FMADD f31, f14, f23, f31
  /* Load elements 6-7. */
  358. LFDX f12, X, INCXM1
  359. LFDX f20, Y, INCYM1
  360. LFDUX f13, X, INCX
  361. LFDUX f21, Y, INCY
  362. LFDX f14, X, INCXM1
  363. LFDX f22, Y, INCYM1
  364. LFDUX f15, X, INCX
  365. LFDUX f23, Y, INCY
  /* Consume elements 4-5. */
  366. FMADD f0, f8, f16, f0
  367. FMADD f1, f9, f17, f1
  368. FMADD f2, f9, f16, f2
  369. FMADD f3, f8, f17, f3
  370. FMADD f4, f10, f18, f4
  371. FMADD f5, f11, f19, f5
  372. FMADD f6, f11, f18, f6
  373. FMADD f7, f10, f19, f7
  /* Load elements 0-1 of the next group. */
  374. LFDX f8, X, INCXM1
  375. LFDX f16, Y, INCYM1
  376. LFDUX f9, X, INCX
  377. LFDUX f17, Y, INCY
  378. LFDX f10, X, INCXM1
  379. LFDX f18, Y, INCYM1
  380. LFDUX f11, X, INCX
  381. LFDUX f19, Y, INCY
  /* Consume elements 6-7. */
  382. FMADD f24, f12, f20, f24
  383. FMADD f25, f13, f21, f25
  384. FMADD f26, f13, f20, f26
  385. FMADD f27, f12, f21, f27
  386. FMADD f28, f14, f22, f28
  387. FMADD f29, f15, f23, f29
  388. FMADD f30, f15, f22, f30
  389. FMADD f31, f14, f23, f31
  /* Load elements 2-3 of the next group. */
  390. LFDX f12, X, INCXM1
  391. LFDX f20, Y, INCYM1
  392. LFDUX f13, X, INCX
  393. LFDUX f21, Y, INCY
  394. LFDX f14, X, INCXM1
  395. LFDX f22, Y, INCYM1
  396. LFDUX f15, X, INCX
  397. LFDUX f23, Y, INCY
  /* Decrement CTR and repeat while groups remain. */
  398. bdnz LL(110)
  /* LL(120): pipeline drain for the last full 8-element group on the
     strided path.  Four elements are already loaded; the remaining four
     are loaded here and nothing past the group is read. */
  399. .align 4
  400. LL(120):
  /* Consume elements 0-1. */
  401. FMADD f0, f8, f16, f0
  402. FMADD f1, f9, f17, f1
  403. FMADD f2, f9, f16, f2
  404. FMADD f3, f8, f17, f3
  405. FMADD f4, f10, f18, f4
  406. FMADD f5, f11, f19, f5
  407. FMADD f6, f11, f18, f6
  408. FMADD f7, f10, f19, f7
  /* Load elements 4-5. */
  409. LFDX f8, X, INCXM1
  410. LFDX f16, Y, INCYM1
  411. LFDUX f9, X, INCX
  412. LFDUX f17, Y, INCY
  413. LFDX f10, X, INCXM1
  414. LFDX f18, Y, INCYM1
  415. LFDUX f11, X, INCX
  416. LFDUX f19, Y, INCY
  /* Consume elements 2-3. */
  417. FMADD f24, f12, f20, f24
  418. FMADD f25, f13, f21, f25
  419. FMADD f26, f13, f20, f26
  420. FMADD f27, f12, f21, f27
  421. FMADD f28, f14, f22, f28
  422. FMADD f29, f15, f23, f29
  423. FMADD f30, f15, f22, f30
  424. FMADD f31, f14, f23, f31
  /* Load elements 6-7, the last loads of this group. */
  425. LFDX f12, X, INCXM1
  426. LFDX f20, Y, INCYM1
  427. LFDUX f13, X, INCX
  428. LFDUX f21, Y, INCY
  429. LFDX f14, X, INCXM1
  430. LFDX f22, Y, INCYM1
  431. LFDUX f15, X, INCX
  432. LFDUX f23, Y, INCY
  /* Consume elements 4-5 and 6-7; no further loads. */
  433. FMADD f0, f8, f16, f0
  434. FMADD f1, f9, f17, f1
  435. FMADD f2, f9, f16, f2
  436. FMADD f3, f8, f17, f3
  437. FMADD f4, f10, f18, f4
  438. FMADD f5, f11, f19, f5
  439. FMADD f6, f11, f18, f6
  440. FMADD f7, f10, f19, f7
  441. FMADD f24, f12, f20, f24
  442. FMADD f25, f13, f21, f25
  443. FMADD f26, f13, f20, f26
  444. FMADD f27, f12, f21, f27
  445. FMADD f28, f14, f22, f28
  446. FMADD f29, f15, f23, f29
  447. FMADD f30, f15, f22, f30
  448. FMADD f31, f14, f23, f31
  /* LL(150): strided remainder.  r0 = N & 7 elements are left; if none,
     go to the reduction. */
  449. .align 4
  450. LL(150):
  451. andi. r0, N, 7
  452. mtspr CTR, r0
  453. beq LL(999)
  /* LL(160): one element per iteration, accumulated into f0-f3 only.
     The LFDUX loads advance X and Y to the next element.  After the last
     iteration execution falls through into LL(999). */
  454. .align 4
  455. LL(160):
  456. LFDX f8, X, INCXM1
  457. LFDUX f9, X, INCX
  458. LFDX f16, Y, INCYM1
  459. LFDUX f17, Y, INCY
  460. FMADD f0, f8, f16, f0
  461. FMADD f1, f9, f17, f1
  462. FMADD f2, f9, f16, f2
  463. FMADD f3, f8, f17, f3
  464. bdnz LL(160)
  /* LL(999): common exit.  Fold the sixteen partial sums into four,
     combine them into the two result components, hand the result back in
     the form the selected interface expects, restore f14-f31 and return. */
  465. .align 4
  466. LL(999):
  /* Pairwise reduction.  With (x0, x1) / (y0, y1) the two components of
     each X / Y element, this leaves
       f0 = sum x0*y0   f1 = sum x1*y1   f2 = sum x1*y0   f3 = sum x0*y1 */
  467. FADD f0, f0, f4
  468. FADD f1, f1, f5
  469. FADD f2, f2, f6
  470. FADD f3, f3, f7
  471. FADD f24, f28, f24
  472. FADD f25, f29, f25
  473. FADD f26, f30, f26
  474. FADD f27, f31, f27
  475. FADD f0, f0, f24
  476. FADD f1, f1, f25
  477. FADD f2, f2, f26
  478. FADD f3, f3, f27
  /* Final combination; the result ends up in f1 (first component) and f2
     (second component).  Treating component 0 as the real part and
     component 1 as the imaginary part:
       without CONJ: f1 = x0*y0 - x1*y1, f2 = x1*y0 + x0*y1  (plain product)
       with CONJ:    f1 = x0*y0 + x1*y1, f2 = x0*y1 - x1*y0  (X conjugated) */
  479. #ifndef CONJ
  480. FSUB f1, f0, f1
  481. FADD f2, f2, f3
  482. #else
  483. FADD f1, f0, f1
  484. FSUB f2, f3, f2
  485. #endif
  /* F2C-style interface: store both components through the RESULT
     pointer. */
  486. #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C)
  487. STFD f1, 0 * SIZE(RESULT)
  488. STFD f2, 1 * SIZE(RESULT)
  489. #endif
  /* GFORT-style interface: copy the result into general-purpose registers
     by spilling f1 / f2 to the scratch area at 144(SP) and reloading it as
     integers. */
  490. #if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT)
  491. #ifndef __64BIT__
  492. #ifndef DOUBLE
  /* 32-bit, single precision: one word per component -> r3, r4. */
  493. stfs f1, 144(SP)
  494. stfs f2, 148(SP)
  495. lwz r3, 144(SP)
  496. lwz r4, 148(SP)
  497. #else
  /* 32-bit, double precision: two words per component -> r3:r4, r5:r6. */
  498. stfd f1, 144(SP)
  499. stfd f2, 152(SP)
  500. lwz r3, 144(SP)
  501. lwz r4, 148(SP)
  502. lwz r5, 152(SP)
  503. lwz r6, 156(SP)
  504. #endif
  505. #else
  506. #ifndef DOUBLE
  /* 64-bit, single precision: both 4-byte components are loaded together
     into r3 with a single doubleword load. */
  507. stfs f1, 144(SP)
  508. stfs f2, 148(SP)
  509. ld r3, 144(SP)
  510. #else
  /* 64-bit, double precision: one doubleword per component -> r3, r4. */
  511. stfd f1, 144(SP)
  512. stfd f2, 152(SP)
  513. ld r3, 144(SP)
  514. ld r4, 152(SP)
  515. #endif
  516. #endif
  517. #endif
  /* Restore f14-f31 from the slots they were saved to at entry. */
  518. lfd f14, 0(SP)
  519. lfd f15, 8(SP)
  520. lfd f16, 16(SP)
  521. lfd f17, 24(SP)
  522. lfd f18, 32(SP)
  523. lfd f19, 40(SP)
  524. lfd f20, 48(SP)
  525. lfd f21, 56(SP)
  526. lfd f22, 64(SP)
  527. lfd f23, 72(SP)
  528. lfd f24, 80(SP)
  529. lfd f25, 88(SP)
  530. lfd f26, 96(SP)
  531. lfd f27, 104(SP)
  532. lfd f28, 112(SP)
  533. lfd f29, 120(SP)
  534. lfd f30, 128(SP)
  535. lfd f31, 136(SP)
  /* Release the frame and return.  EPILOGUE is a macro defined outside
     this file. */
  536. addi SP, SP, STACKSIZE
  537. blr
  538. EPILOGUE