You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zdot_cell.S 13 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Argument-to-register mapping. When both F_INTERFACE and */
  /* F_INTERFACE_F2C are defined, r3 is RESULT (the address the two */
  /* result values are stored to before returning) and the remaining */
  /* arguments start at r4; in every other build they start at r3. */
  /* PREA is not an argument: the code loads it with a constant and */
  /* uses it as the index register of dcbt. */
  40. #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C)
  41. #define RESULT r3
  42. #define N r4
  43. #define X r5
  44. #define INCX r6
  45. #define Y r7
  46. #define INCY r8
  47. #define PREA r9
  48. #else
  49. #define N r3
  50. #define X r4
  51. #define INCX r5
  52. #define Y r6
  53. #define INCY r7
  54. #define PREA r8
  55. #endif
  /* Hold INCX - SIZE and INCY - SIZE once the entry code has run. */
  56. #define INCXM1 r10
  57. #define INCYM1 r11
  /* f0 is both the source of zero and the first accumulator. */
  58. #define FZERO f0
  /* Frame layout used below: 0..143 saves f14-f31, 144..159 is scratch. */
  59. #define STACKSIZE 160
  /* Entry and setup. Saves f14-f31, zeroes sixteen accumulators */
  /* (f0-f7, f24-f31), converts the increments, then dispatches: */
  /*   N <= 0                          -> LL(999) (result stays zero) */
  /*   INCX or INCY != 2 * SIZE        -> LL(100) (strided path) */
  /*   N >> 3 == 0                     -> LL(50)  (remainder only) */
  /*   otherwise                       -> unit-stride unrolled path */
  60. PROLOGUE
  61. PROFCODE
  /* Leaf frame: only SP is moved, nothing is stored at a back chain. */
  62. addi SP, SP, -STACKSIZE
  63. li r0, 0
  /* f14-f31 are used as temporaries below and reloaded before blr. */
  64. stfd f14, 0(SP)
  65. stfd f15, 8(SP)
  66. stfd f16, 16(SP)
  67. stfd f17, 24(SP)
  68. stfd f18, 32(SP)
  69. stfd f19, 40(SP)
  70. stfd f20, 48(SP)
  71. stfd f21, 56(SP)
  72. stfd f22, 64(SP)
  73. stfd f23, 72(SP)
  74. stfd f24, 80(SP)
  75. stfd f25, 88(SP)
  76. stfd f26, 96(SP)
  77. stfd f27, 104(SP)
  78. stfd f28, 112(SP)
  79. stfd f29, 120(SP)
  80. stfd f30, 128(SP)
  81. stfd f31, 136(SP)
  /* Materialise 0.0 in FZERO: store an all-zero word to the scratch */
  /* slot and load it back as a single-precision value. */
  82. stw r0, 144(SP)
  83. lfs FZERO,144(SP)
  /* With F_INTERFACE the integer arguments are passed by address; */
  /* replace each pointer with the value it points to. */
  84. #ifdef F_INTERFACE
  85. LDINT N, 0(N)
  86. LDINT INCX, 0(INCX)
  87. LDINT INCY, 0(INCY)
  88. #endif
  /* Shift the increments left by ZBASE_SHIFT. ZBASE_SHIFT and SIZE come */
  /* from common.h; presumably this turns an element count into a byte */
  /* stride of 2 * SIZE per element - TODO confirm against common.h. */
  89. slwi INCX, INCX, ZBASE_SHIFT
  90. slwi INCY, INCY, ZBASE_SHIFT
  /* Stride minus one SIZE: the strided path uses it to address the */
  /* first value of a pair from a pointer parked on the second value. */
  91. subi INCXM1, INCX, SIZE
  92. subi INCYM1, INCY, SIZE
  /* Clear the remaining fifteen accumulators (f0 is already zero). */
  93. fmr f1, FZERO
  94. fmr f2, FZERO
  95. fmr f3, FZERO
  96. fmr f4, FZERO
  97. fmr f5, FZERO
  98. fmr f6, FZERO
  99. fmr f7, FZERO
  100. fmr f24, FZERO
  101. fmr f25, FZERO
  102. fmr f26, FZERO
  103. fmr f27, FZERO
  104. fmr f28, FZERO
  105. fmr f29, FZERO
  106. fmr f30, FZERO
  107. fmr f31, FZERO
  /* Offset added to X and Y by the dcbt hints in the unit-stride loop. */
  108. li PREA, 16 * 10 * SIZE
  /* Nothing to do for N <= 0; LL(999) then reduces all-zero sums. */
  109. cmpwi cr0, N, 0
  110. ble- LL(999)
  /* The fast path requires both scaled increments to equal 2 * SIZE. */
  111. cmpwi cr0, INCX, 2 * SIZE
  112. bne- cr0, LL(100)
  113. cmpwi cr0, INCY, 2 * SIZE
  114. bne- cr0, LL(100)
  /* CTR = N >> 3: number of full groups of 8 pairs. The record form */
  /* sets cr0, so the beq- skips the unrolled code when there is none. */
  115. srawi. r0, N, 3
  116. mtspr CTR, r0
  117. beq- cr0, LL(50)
  /* Unit-stride path, pipeline fill. Loads the first four pairs of X */
  /* (offsets 0..7 -> f8-f15) and of Y (offsets 0..5 and 7 -> f16-f21, */
  /* f23). Y offset 6 (f22) is deliberately not loaded here: the first */
  /* load issued at LL(10) / LL(20) fetches it. */
  118. .align 4
  119. LFD f8, 0 * SIZE(X)
  120. LFD f9, 1 * SIZE(X)
  121. LFD f10, 2 * SIZE(X)
  122. LFD f11, 3 * SIZE(X)
  123. LFD f16, 0 * SIZE(Y)
  124. LFD f17, 1 * SIZE(Y)
  125. LFD f18, 2 * SIZE(Y)
  126. LFD f19, 3 * SIZE(Y)
  127. LFD f12, 4 * SIZE(X)
  128. LFD f13, 5 * SIZE(X)
  129. LFD f14, 6 * SIZE(X)
  130. LFD f15, 7 * SIZE(X)
  131. LFD f20, 4 * SIZE(Y)
  132. LFD f21, 5 * SIZE(Y)
  133. LFD f23, 7 * SIZE(Y)
  /* Decrement CTR; with exactly one group left go straight to the */
  /* drain code, otherwise fall into the steady-state loop. */
  134. bdz LL(20)
  /* Unit-stride steady-state loop: one iteration consumes 8 pairs */
  /* (16 values) from each of X and Y. Every FMADD is interleaved with */
  /* the load of an operand needed half an iteration later. */
  /* For a pair x = (x0, x1), y = (y0, y1) the four sums kept are */
  /*   x0*y0, x1*y1, x1*y0, x0*y1 */
  /* in (f0,f1,f2,f3), (f4,f5,f6,f7), (f24,f25,f26,f27) and */
  /* (f28,f29,f30,f31) for the pairs held in (f8,f9,f16,f17), */
  /* (f10,f11,f18,f19), (f12,f13,f20,f21) and (f14,f15,f22,f23). */
  135. .align 4
  136. LL(10):
  /* First half: consume offsets 0..7, load offsets 8..15. */
  137. FMADD f0, f8, f16, f0
  138. LFD f22, 6 * SIZE(Y)
  139. FMADD f3, f8, f17, f3
  140. LFD f8, 8 * SIZE(X)
  141. FMADD f1, f9, f17, f1
  142. LFD f17, 9 * SIZE(Y)
  143. FMADD f2, f9, f16, f2
  144. LFD f9, 9 * SIZE(X)
  145. FMADD f4, f10, f18, f4
  146. LFD f16, 8 * SIZE(Y)
  147. FMADD f7, f10, f19, f7
  148. LFD f10, 10 * SIZE(X)
  149. FMADD f5, f11, f19, f5
  150. LFD f19, 11 * SIZE(Y)
  151. FMADD f6, f11, f18, f6
  152. LFD f11, 11 * SIZE(X)
  153. FMADD f24, f12, f20, f24
  154. LFD f18, 10 * SIZE(Y)
  155. FMADD f27, f12, f21, f27
  156. LFD f12, 12 * SIZE(X)
  157. FMADD f25, f13, f21, f25
  158. LFD f21, 13 * SIZE(Y)
  159. FMADD f26, f13, f20, f26
  160. LFD f13, 13 * SIZE(X)
  161. FMADD f28, f14, f22, f28
  162. LFD f20, 12 * SIZE(Y)
  163. FMADD f31, f14, f23, f31
  164. LFD f14, 14 * SIZE(X)
  165. FMADD f29, f15, f23, f29
  166. LFD f23, 15 * SIZE(Y)
  167. FMADD f30, f15, f22, f30
  168. LFD f15, 15 * SIZE(X)
  /* Second half: consume offsets 8..15, load offsets 16..23. After */
  /* the pointer bump below these are offsets 0..7 of the next */
  /* iteration; Y offset 22 (next f22) is again left for later. */
  169. FMADD f0, f8, f16, f0
  170. LFD f22, 14 * SIZE(Y)
  171. FMADD f3, f8, f17, f3
  172. LFD f8, 16 * SIZE(X)
  173. FMADD f1, f9, f17, f1
  174. LFD f17, 17 * SIZE(Y)
  175. FMADD f2, f9, f16, f2
  176. LFD f9, 17 * SIZE(X)
  177. FMADD f4, f10, f18, f4
  178. LFD f16, 16 * SIZE(Y)
  179. FMADD f7, f10, f19, f7
  180. LFD f10, 18 * SIZE(X)
  181. FMADD f5, f11, f19, f5
  182. LFD f19, 19 * SIZE(Y)
  183. FMADD f6, f11, f18, f6
  184. LFD f11, 19 * SIZE(X)
  185. FMADD f24, f12, f20, f24
  186. LFD f18, 18 * SIZE(Y)
  187. FMADD f27, f12, f21, f27
  188. LFD f12, 20 * SIZE(X)
  189. FMADD f25, f13, f21, f25
  190. LFD f21, 21 * SIZE(Y)
  191. FMADD f26, f13, f20, f26
  192. LFD f13, 21 * SIZE(X)
  193. FMADD f28, f14, f22, f28
  194. LFD f20, 20 * SIZE(Y)
  195. FMADD f31, f14, f23, f31
  196. LFD f14, 22 * SIZE(X)
  197. FMADD f29, f15, f23, f29
  198. LFD f23, 23 * SIZE(Y)
  199. FMADD f30, f15, f22, f30
  200. LFD f15, 23 * SIZE(X)
  /* Cache-touch hints PREA bytes past the current pointers, then */
  /* advance both pointers by the 16 values just consumed. */
  201. dcbt X, PREA
  202. addi X, X, 16 * SIZE
  203. dcbt Y, PREA
  204. addi Y, Y, 16 * SIZE
  205. bdnz LL(10)
  /* Unit-stride drain: handles the last full group of 8 pairs. Same */
  /* accumulator/operand mapping as LL(10), but the second half issues */
  /* no loads beyond offset 15, so nothing past the group is read. */
  206. .align 4
  207. LL(20):
  /* First half: consume offsets 0..7, load offsets 8..15. */
  208. FMADD f0, f8, f16, f0
  209. LFD f22, 6 * SIZE(Y)
  210. FMADD f3, f8, f17, f3
  211. LFD f8, 8 * SIZE(X)
  212. FMADD f1, f9, f17, f1
  213. LFD f17, 9 * SIZE(Y)
  214. FMADD f2, f9, f16, f2
  215. LFD f9, 9 * SIZE(X)
  216. FMADD f4, f10, f18, f4
  217. LFD f16, 8 * SIZE(Y)
  218. FMADD f7, f10, f19, f7
  219. LFD f10, 10 * SIZE(X)
  220. FMADD f5, f11, f19, f5
  221. LFD f19, 11 * SIZE(Y)
  222. FMADD f6, f11, f18, f6
  223. LFD f11, 11 * SIZE(X)
  224. FMADD f24, f12, f20, f24
  225. LFD f18, 10 * SIZE(Y)
  226. FMADD f27, f12, f21, f27
  227. LFD f12, 12 * SIZE(X)
  228. FMADD f25, f13, f21, f25
  229. LFD f21, 13 * SIZE(Y)
  230. FMADD f26, f13, f20, f26
  231. LFD f13, 13 * SIZE(X)
  232. FMADD f28, f14, f22, f28
  233. LFD f20, 12 * SIZE(Y)
  234. FMADD f31, f14, f23, f31
  235. LFD f14, 14 * SIZE(X)
  236. FMADD f29, f15, f23, f29
  237. LFD f23, 15 * SIZE(Y)
  238. FMADD f30, f15, f22, f30
  239. LFD f15, 15 * SIZE(X)
  /* Second half: consume offsets 8..15. The only load left is Y */
  /* offset 14; X and Y are then moved past the group so the */
  /* remainder code at LL(50) continues from the right place. */
  240. FMADD f0, f8, f16, f0
  241. LFD f22, 14 * SIZE(Y)
  242. FMADD f3, f8, f17, f3
  243. addi X, X, 16 * SIZE
  244. FMADD f1, f9, f17, f1
  245. addi Y, Y, 16 * SIZE
  246. FMADD f2, f9, f16, f2
  247. nop
  248. FMADD f4, f10, f18, f4
  249. FMADD f7, f10, f19, f7
  250. FMADD f5, f11, f19, f5
  251. FMADD f6, f11, f18, f6
  252. FMADD f24, f12, f20, f24
  253. FMADD f27, f12, f21, f27
  254. FMADD f25, f13, f21, f25
  255. FMADD f26, f13, f20, f26
  256. FMADD f28, f14, f22, f28
  257. FMADD f31, f14, f23, f31
  258. FMADD f29, f15, f23, f29
  259. FMADD f30, f15, f22, f30
  /* Unit-stride remainder: the N & 7 pairs left after the unrolled */
  /* groups, one pair per iteration, accumulated into f0-f3 only. */
  260. .align 4
  261. LL(50):
  /* andi. sets cr0, so beq leaves when there is no remainder. */
  262. andi. r0, N, 7
  263. mtspr CTR, r0
  264. beq LL(999)
  265. .align 4
  266. LL(60):
  267. LFD f8, 0 * SIZE(X)
  268. LFD f9, 1 * SIZE(X)
  269. LFD f16, 0 * SIZE(Y)
  270. LFD f17, 1 * SIZE(Y)
  /* Step both pointers to the next pair before the arithmetic. */
  271. addi X, X, 2 * SIZE
  272. addi Y, Y, 2 * SIZE
  /* f0 += x0*y0, f3 += x0*y1, f1 += x1*y1, f2 += x1*y0 */
  273. FMADD f0, f8, f16, f0
  274. FMADD f3, f8, f17, f3
  275. FMADD f1, f9, f17, f1
  276. FMADD f2, f9, f16, f2
  277. bdnz LL(60)
  /* Jump over the strided path to the final reduction. */
  278. b LL(999)
  /* Strided path, setup and pipeline fill. Taken when either scaled */
  /* increment differs from 2 * SIZE. */
  279. .align 4
  280. LL(100):
  /* F_INTERFACE only: for a negative increment, move the pointer by */
  /* -(N - 1) * INC so that stepping by INC visits the N elements */
  /* starting from the other end of the vector. */
  281. #ifdef F_INTERFACE
  282. cmpwi cr0, INCX, 0
  283. bge+ LL(102)
  284. subi r0, N, 1
  285. mullw r0, r0, INCX
  286. sub X, X, r0
  287. .align 4
  288. LL(102):
  289. cmpwi cr0, INCY, 0
  290. bge+ LL(104)
  291. subi r0, N, 1
  292. mullw r0, r0, INCY
  293. sub Y, Y, r0
  294. .align 4
  295. LL(104):
  296. #endif
  /* Bias the pointers by -(INC - SIZE). From here on: */
  /*   pointer + INCM1 = first value of the next pair  (LFDX) */
  /*   pointer + INC   = second value of the next pair (LFDUX) */
  /* LFDX / LFDUX are macros from common.h; this assumes they expand to */
  /* the indexed load and the indexed load with update - TODO confirm. */
  297. sub X, X, INCXM1
  298. sub Y, Y, INCYM1
  /* CTR = N >> 3 full groups of 8 pairs; none -> remainder loop. */
  299. srawi. r0, N, 3
  300. mtspr CTR, r0
  301. beq- LL(150)
  /* Fill: pairs 0..2 of X and Y, plus pair 3 of X only. Pair 3 of Y */
  /* (f22, f23) is fetched by the first loads at LL(110) / LL(120). */
  302. LFDX f8, X, INCXM1
  303. LFDX f16, Y, INCYM1
  304. LFDUX f9, X, INCX
  305. LFDUX f17, Y, INCY
  306. LFDX f10, X, INCXM1
  307. LFDX f18, Y, INCYM1
  308. LFDUX f11, X, INCX
  309. LFDUX f19, Y, INCY
  310. LFDX f12, X, INCXM1
  311. LFDX f20, Y, INCYM1
  312. LFDUX f13, X, INCX
  313. LFDUX f21, Y, INCY
  314. LFDX f14, X, INCXM1
  315. LFDUX f15, X, INCX
  /* Decrement CTR; a single group goes straight to the drain code. */
  316. bdz LL(120)
  /* Strided steady-state loop: one iteration consumes 8 pairs from */
  /* each vector. The four sums per pair (x0*y0, x1*y1, x1*y0, x0*y1) */
  /* go to (f0,f1,f2,f3), (f4,f5,f6,f7), (f24,f25,f26,f27) and */
  /* (f28,f29,f30,f31). X loads run in the order f8..f15; Y loads run */
  /* one pair behind (f22, f23, then f16..f21), so each half first */
  /* completes the current group's last Y pair and then starts the */
  /* next group. Pointers advance only through the LFDUX loads. */
  317. .align 4
  318. LL(110):
  /* First half: 4 pairs consumed, next 4 pairs loaded. */
  319. FMADD f0, f8, f16, f0
  320. LFDX f22, Y, INCYM1
  321. FMADD f3, f8, f17, f3
  322. LFDX f8, X, INCXM1
  323. FMADD f1, f9, f17, f1
  324. LFDUX f23, Y, INCY
  325. FMADD f2, f9, f16, f2
  326. LFDUX f9, X, INCX
  327. FMADD f4, f10, f18, f4
  328. LFDX f16, Y, INCYM1
  329. FMADD f7, f10, f19, f7
  330. LFDX f10, X, INCXM1
  331. FMADD f5, f11, f19, f5
  332. LFDUX f17, Y, INCY
  333. FMADD f6, f11, f18, f6
  334. LFDUX f11, X, INCX
  335. FMADD f24, f12, f20, f24
  336. LFDX f18, Y, INCYM1
  337. FMADD f27, f12, f21, f27
  338. LFDX f12, X, INCXM1
  339. FMADD f25, f13, f21, f25
  340. LFDUX f19, Y, INCY
  341. FMADD f26, f13, f20, f26
  342. LFDUX f13, X, INCX
  343. FMADD f28, f14, f22, f28
  344. LFDX f20, Y, INCYM1
  345. FMADD f31, f14, f23, f31
  346. LFDX f14, X, INCXM1
  347. FMADD f29, f15, f23, f29
  348. LFDUX f21, Y, INCY
  349. FMADD f30, f15, f22, f30
  350. LFDUX f15, X, INCX
  /* Second half: identical schedule; what it loads is the fill for */
  /* the next iteration (or for the drain code at LL(120)). */
  351. FMADD f0, f8, f16, f0
  352. LFDX f22, Y, INCYM1
  353. FMADD f3, f8, f17, f3
  354. LFDX f8, X, INCXM1
  355. FMADD f1, f9, f17, f1
  356. LFDUX f23, Y, INCY
  357. FMADD f2, f9, f16, f2
  358. LFDUX f9, X, INCX
  359. FMADD f4, f10, f18, f4
  360. LFDX f16, Y, INCYM1
  361. FMADD f7, f10, f19, f7
  362. LFDX f10, X, INCXM1
  363. FMADD f5, f11, f19, f5
  364. LFDUX f17, Y, INCY
  365. FMADD f6, f11, f18, f6
  366. LFDUX f11, X, INCX
  367. FMADD f24, f12, f20, f24
  368. LFDX f18, Y, INCYM1
  369. FMADD f27, f12, f21, f27
  370. LFDX f12, X, INCXM1
  371. FMADD f25, f13, f21, f25
  372. LFDUX f19, Y, INCY
  373. FMADD f26, f13, f20, f26
  374. LFDUX f13, X, INCX
  375. FMADD f28, f14, f22, f28
  376. LFDX f20, Y, INCYM1
  377. FMADD f31, f14, f23, f31
  378. LFDX f14, X, INCXM1
  379. FMADD f29, f15, f23, f29
  380. LFDUX f21, Y, INCY
  381. FMADD f30, f15, f22, f30
  382. LFDUX f15, X, INCX
  383. bdnz LL(110)
  /* Strided drain: last full group of 8 pairs. The first half matches */
  /* LL(110); the second half loads only the group's final Y pair */
  /* (f22, f23), so no element beyond the group is read and X / Y end */
  /* up parked where the remainder loop expects them. */
  384. .align 4
  385. LL(120):
  /* First half: 4 pairs consumed, last 4 pairs of the group loaded. */
  386. FMADD f0, f8, f16, f0
  387. LFDX f22, Y, INCYM1
  388. FMADD f3, f8, f17, f3
  389. LFDX f8, X, INCXM1
  390. FMADD f1, f9, f17, f1
  391. LFDUX f23, Y, INCY
  392. FMADD f2, f9, f16, f2
  393. LFDUX f9, X, INCX
  394. FMADD f4, f10, f18, f4
  395. LFDX f16, Y, INCYM1
  396. FMADD f7, f10, f19, f7
  397. LFDX f10, X, INCXM1
  398. FMADD f5, f11, f19, f5
  399. LFDUX f17, Y, INCY
  400. FMADD f6, f11, f18, f6
  401. LFDUX f11, X, INCX
  402. FMADD f24, f12, f20, f24
  403. LFDX f18, Y, INCYM1
  404. FMADD f27, f12, f21, f27
  405. LFDX f12, X, INCXM1
  406. FMADD f25, f13, f21, f25
  407. LFDUX f19, Y, INCY
  408. FMADD f26, f13, f20, f26
  409. LFDUX f13, X, INCX
  410. FMADD f28, f14, f22, f28
  411. LFDX f20, Y, INCYM1
  412. FMADD f31, f14, f23, f31
  413. LFDX f14, X, INCXM1
  414. FMADD f29, f15, f23, f29
  415. LFDUX f21, Y, INCY
  416. FMADD f30, f15, f22, f30
  417. LFDUX f15, X, INCX
  /* Second half: fetch the outstanding Y pair, then arithmetic only. */
  418. FMADD f0, f8, f16, f0
  419. LFDX f22, Y, INCYM1
  420. FMADD f3, f8, f17, f3
  421. LFDUX f23, Y, INCY
  422. FMADD f1, f9, f17, f1
  423. FMADD f2, f9, f16, f2
  424. FMADD f4, f10, f18, f4
  425. FMADD f7, f10, f19, f7
  426. FMADD f5, f11, f19, f5
  427. FMADD f6, f11, f18, f6
  428. FMADD f24, f12, f20, f24
  429. FMADD f27, f12, f21, f27
  430. FMADD f25, f13, f21, f25
  431. FMADD f26, f13, f20, f26
  432. FMADD f28, f14, f22, f28
  433. FMADD f31, f14, f23, f31
  434. FMADD f29, f15, f23, f29
  435. FMADD f30, f15, f22, f30
  /* Strided remainder: the N & 7 pairs left over, one per iteration, */
  /* accumulated into f0-f3 only. Falls through to LL(999) when done. */
  436. .align 4
  437. LL(150):
  /* andi. sets cr0, so beq leaves when there is no remainder. */
  438. andi. r0, N, 7
  439. mtspr CTR, r0
  440. beq LL(999)
  441. .align 4
  442. LL(160):
  /* First value via pointer + INCM1 (pointer unchanged), second value */
  /* via pointer + INC with the pointer updated to that address. */
  443. LFDX f8, X, INCXM1
  444. LFDUX f9, X, INCX
  445. LFDX f16, Y, INCYM1
  446. LFDUX f17, Y, INCY
  /* f0 += x0*y0, f3 += x0*y1, f1 += x1*y1, f2 += x1*y0 */
  447. FMADD f0, f8, f16, f0
  448. FMADD f3, f8, f17, f3
  449. FMADD f1, f9, f17, f1
  450. FMADD f2, f9, f16, f2
  451. bdnz LL(160)
  /* Common exit: reduce the four accumulator sets, form the two result */
  /* values, hand them over in the way the build's interface expects, */
  /* restore f14-f31 and return. */
  452. .align 4
  453. LL(999):
  /* Fold sets 2-4 into set 1. Afterwards: */
  /*   f0 = sum x0*y0, f1 = sum x1*y1, f2 = sum x1*y0, f3 = sum x0*y1 */
  454. FADD f0, f0, f4
  455. FADD f1, f1, f5
  456. FADD f2, f2, f6
  457. FADD f3, f3, f7
  458. FADD f24, f28, f24
  459. FADD f25, f29, f25
  460. FADD f26, f30, f26
  461. FADD f27, f31, f27
  462. FADD f0, f0, f24
  463. FADD f1, f1, f25
  464. FADD f2, f2, f26
  465. FADD f3, f3, f27
  /* Result pair is (f1, f2). Reading (x0, x1) / (y0, y1) as */
  /* (real, imaginary) parts - the usual interleaved layout, not */
  /* stated in this file - these are: */
  /*   without CONJ: sum x*y        = (f0 - f1, f2 + f3) */
  /*   with CONJ:    sum conj(x)*y  = (f0 + f1, f3 - f2) */
  466. #ifndef CONJ
  467. FSUB f1, f0, f1
  468. FADD f2, f2, f3
  469. #else
  470. FADD f1, f0, f1
  471. FSUB f2, f3, f2
  472. #endif
  /* F2C interface: write both values through the RESULT pointer. */
  473. #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C)
  474. STFD f1, 0 * SIZE(RESULT)
  475. STFD f2, 1 * SIZE(RESULT)
  476. #endif
  /* GFORT interface: copy the two values into general registers by */
  /* bouncing them through the scratch area at 144(SP). Which registers */
  /* receive them depends on word size (__64BIT__) and on DOUBLE. */
  477. #if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT)
  478. #ifndef __64BIT__
  479. #ifndef DOUBLE
  /* 32-bit, single: one word each in r3 and r4. */
  480. stfs f1, 144(SP)
  481. stfs f2, 148(SP)
  482. lwz r3, 144(SP)
  483. lwz r4, 148(SP)
  484. #else
  /* 32-bit, double: two words each, in r3:r4 and r5:r6. */
  485. stfd f1, 144(SP)
  486. stfd f2, 152(SP)
  487. lwz r3, 144(SP)
  488. lwz r4, 148(SP)
  489. lwz r5, 152(SP)
  490. lwz r6, 156(SP)
  491. #endif
  492. #else
  493. #ifndef DOUBLE
  /* 64-bit, single: both values packed into the doubleword in r3. */
  494. stfs f1, 144(SP)
  495. stfs f2, 148(SP)
  496. ld r3, 144(SP)
  497. #else
  /* 64-bit, double: one doubleword each in r3 and r4. */
  498. stfd f1, 144(SP)
  499. stfd f2, 152(SP)
  500. ld r3, 144(SP)
  501. ld r4, 152(SP)
  502. #endif
  503. #endif
  504. #endif
  /* In every build the values are also still in f1 and f2 here. */
  /* Reload the registers saved at entry and release the frame. */
  505. lfd f14, 0(SP)
  506. lfd f15, 8(SP)
  507. lfd f16, 16(SP)
  508. lfd f17, 24(SP)
  509. lfd f18, 32(SP)
  510. lfd f19, 40(SP)
  511. lfd f20, 48(SP)
  512. lfd f21, 56(SP)
  513. lfd f22, 64(SP)
  514. lfd f23, 72(SP)
  515. lfd f24, 80(SP)
  516. lfd f25, 88(SP)
  517. lfd f26, 96(SP)
  518. lfd f27, 104(SP)
  519. lfd f28, 112(SP)
  520. lfd f29, 120(SP)
  521. lfd f30, 128(SP)
  522. lfd f31, 136(SP)
  523. addi SP, SP, STACKSIZE
  524. blr
  525. EPILOGUE