You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

snrm2_hummer.S 12 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define N r3 /* element count */
  41. #define X r4 /* input pointer */
  42. #define INCX r5 /* stride between elements; converted to a byte stride with BASE_SHIFT */
  43. #define INCX2 r6 /* twice INCX */
  44. #define X2 r7 /* second input pointer, used only by the strided path */
  45. #define C1 f1 /* C1-C8: sum-of-squares accumulators; C1 (f1) also holds the final result */
  46. #define C2 f0
  47. #define C3 f2
  48. #define C4 f3
  49. #define C5 f4
  50. #define C6 f5
  51. #define C7 f6
  52. #define C8 f7
  53. #define A1 f8 /* A1-A16: registers the input elements are loaded into */
  54. #define A2 f9
  55. #define A3 f10
  56. #define A4 f11
  57. #define A5 f12
  58. #define A6 f13
  59. #define A7 f14 /* f14-f23 (A7-A16) are saved on entry and restored before every blr */
  60. #define A8 f15
  61. #define A9 f16
  62. #define A10 f17
  63. #define A11 f18
  64. #define A12 f19
  65. #define A13 f20
  66. #define A14 f21
  67. #define A15 f22
  68. #define A16 f23
  69. PROLOGUE /* routine: accumulates x[i]^2 over N elements of X and leaves the square root of the sum in f1 */
  70. PROFCODE
  71. li r10, -16 /* each saved register below takes a 16-byte stack slot */
  72. stfpdux f14, SP, r10 /* save f14-f23; SP moves down by 16 with every store */
  73. stfpdux f15, SP, r10
  74. stfpdux f16, SP, r10
  75. stfpdux f17, SP, r10
  76. stfpdux f18, SP, r10
  77. stfpdux f19, SP, r10
  78. stfpdux f20, SP, r10
  79. stfpdux f21, SP, r10
  80. stfpdux f22, SP, r10
  81. stfpdux f23, SP, r10
  82. li r10, 0
  83. stwu r10, -4(SP) /* push four zero words: a 16-byte scratch area at 0(SP)..12(SP) */
  84. stwu r10, -4(SP)
  85. stwu r10, -4(SP)
  86. stwu r10, -4(SP)
  87. #ifdef F_INTERFACE
  88. LDINT N, 0(N) /* with F_INTERFACE, N and INCX arrive as addresses: load the values */
  89. LDINT INCX, 0(INCX)
  90. #endif
  91. lfpdx C1, SP, r10 # Zero clear (C1 is loaded from the zeroed scratch area, r10 = 0)
  92. slwi INCX, INCX, BASE_SHIFT /* INCX is now a byte stride */
  93. add INCX2, INCX, INCX /* INCX2 = 2 * INCX */
  94. fpmr C2, C1 /* copy the zero into the other seven accumulators */
  95. fpmr C3, C1
  96. fpmr C4, C1
  97. fpmr C5, C1
  98. fpmr C6, C1
  99. fpmr C7, C1
  100. fpmr C8, C1
  101. cmpwi cr0, N, 0
  102. ble LL(99) /* N <= 0: return with C1 still zero */
  103. cmpwi cr0, INCX, 0
  104. beq LL(99) /* INCX == 0: same early return. NOTE(review): a negative INCX is not rejected here and takes the LL(100) path - confirm callers account for that */
  105. cmpwi cr0, INCX, SIZE
  106. bne LL(100) /* stride is not exactly one element: strided path */
  107. andi. r0, X, 2 * SIZE - 1
  108. beq LL(05) /* X is aligned to 2 * SIZE: no peel needed */
  109. LFD C1, 0(X) /* otherwise consume one element first */
  110. addi X, X, 1 * SIZE
  111. addi N, N, -1
  112. cmpwi cr0, N, 0
  113. fmul C1, C1, C1 /* C1 = x[0]^2 */
  114. ble LL(998) /* that was the only element: go to the scalar reduction */
  115. .align 4
  116. LL(05): /* unit-stride main loop: 16 loads per pass, N >> 5 passes */
  117. srawi. r0, N, 5 /* r0 = N >> 5: number of full passes of the unrolled loop */
  118. sub X, X, INCX2 /* pre-bias X by one step; X is advanced only by the loads below */
  119. mtspr CTR, r0
  120. beq- LL(15) /* no full pass: only the remainder is left */
  121. LFPDUX A1, X, INCX2 /* preload A1-A16 for the first pass */
  122. LFPDUX A2, X, INCX2
  123. LFPDUX A3, X, INCX2
  124. LFPDUX A4, X, INCX2
  125. LFPDUX A5, X, INCX2
  126. LFPDUX A6, X, INCX2
  127. LFPDUX A7, X, INCX2
  128. LFPDUX A8, X, INCX2
  129. LFPDUX A9, X, INCX2
  130. LFPDUX A10, X, INCX2
  131. LFPDUX A11, X, INCX2
  132. LFPDUX A12, X, INCX2
  133. LFPDUX A13, X, INCX2
  134. LFPDUX A14, X, INCX2
  135. LFPDUX A15, X, INCX2
  136. LFPDUX A16, X, INCX2
  137. bdz LL(13) /* single pass: nothing more to load, just accumulate */
  138. .align 4
  139. LL(12):
  140. fpmadd C1, A1, A1, C1 /* C += A * A on the data already loaded; each one is followed by the reload for the next pass */
  141. LFPDUX A1, X, INCX2
  142. fpmadd C2, A2, A2, C2
  143. LFPDUX A2, X, INCX2
  144. fpmadd C3, A3, A3, C3
  145. LFPDUX A3, X, INCX2
  146. fpmadd C4, A4, A4, C4
  147. LFPDUX A4, X, INCX2
  148. fpmadd C5, A5, A5, C5
  149. LFPDUX A5, X, INCX2
  150. fpmadd C6, A6, A6, C6
  151. LFPDUX A6, X, INCX2
  152. fpmadd C7, A7, A7, C7
  153. LFPDUX A7, X, INCX2
  154. fpmadd C8, A8, A8, C8
  155. LFPDUX A8, X, INCX2
  156. fpmadd C1, A9, A9, C1 /* A9-A16 reuse accumulators C1-C8 */
  157. LFPDUX A9, X, INCX2
  158. fpmadd C2, A10, A10, C2
  159. LFPDUX A10, X, INCX2
  160. fpmadd C3, A11, A11, C3
  161. LFPDUX A11, X, INCX2
  162. fpmadd C4, A12, A12, C4
  163. LFPDUX A12, X, INCX2
  164. fpmadd C5, A13, A13, C5
  165. LFPDUX A13, X, INCX2
  166. fpmadd C6, A14, A14, C6
  167. LFPDUX A14, X, INCX2
  168. fpmadd C7, A15, A15, C7
  169. LFPDUX A15, X, INCX2
  170. fpmadd C8, A16, A16, C8
  171. LFPDUX A16, X, INCX2
  172. bdnz LL(12)
  173. .align 4
  174. LL(13):
  175. fpmadd C1, A1, A1, C1 /* drain: accumulate the last loaded A1-A16, no further loads */
  176. fpmadd C2, A2, A2, C2
  177. fpmadd C3, A3, A3, C3
  178. fpmadd C4, A4, A4, C4
  179. fpmadd C5, A5, A5, C5
  180. fpmadd C6, A6, A6, C6
  181. fpmadd C7, A7, A7, C7
  182. fpmadd C8, A8, A8, C8
  183. fpmadd C1, A9, A9, C1
  184. fpmadd C2, A10, A10, C2
  185. fpmadd C3, A11, A11, C3
  186. fpmadd C4, A12, A12, C4
  187. fpmadd C5, A13, A13, C5
  188. fpmadd C6, A14, A14, C6
  189. fpmadd C7, A15, A15, C7
  190. fpmadd C8, A16, A16, C8
  191. .align 4
  192. LL(15): /* unit-stride remainder: handle N & 31 one bit at a time */
  193. andi. r0, N, 31
  194. beq LL(98) /* N is a multiple of 32: no remainder */
  195. andi. r0, N, 16
  196. beq LL(16)
  197. LFPDUX A1, X, INCX2 /* remainder bit 16: eight loads into A1-A8 */
  198. LFPDUX A2, X, INCX2
  199. LFPDUX A3, X, INCX2
  200. LFPDUX A4, X, INCX2
  201. LFPDUX A5, X, INCX2
  202. LFPDUX A6, X, INCX2
  203. LFPDUX A7, X, INCX2
  204. LFPDUX A8, X, INCX2
  205. fpmadd C1, A1, A1, C1
  206. fpmadd C2, A2, A2, C2
  207. fpmadd C3, A3, A3, C3
  208. fpmadd C4, A4, A4, C4
  209. fpmadd C5, A5, A5, C5
  210. fpmadd C6, A6, A6, C6
  211. fpmadd C7, A7, A7, C7
  212. fpmadd C8, A8, A8, C8
  213. .align 4
  214. LL(16):
  215. andi. r0, N, 8
  216. beq LL(17)
  217. LFPDUX A1, X, INCX2 /* remainder bit 8: four loads */
  218. LFPDUX A2, X, INCX2
  219. LFPDUX A3, X, INCX2
  220. LFPDUX A4, X, INCX2
  221. fpmadd C1, A1, A1, C1
  222. fpmadd C2, A2, A2, C2
  223. fpmadd C3, A3, A3, C3
  224. fpmadd C4, A4, A4, C4
  225. .align 4
  226. LL(17):
  227. andi. r0, N, 4
  228. beq LL(18)
  229. LFPDUX A1, X, INCX2 /* remainder bit 4: two loads */
  230. LFPDUX A2, X, INCX2
  231. fpmadd C1, A1, A1, C1
  232. fpmadd C2, A2, A2, C2
  233. .align 4
  234. LL(18):
  235. andi. r0, N, 2
  236. beq LL(19)
  237. LFPDUX A1, X, INCX2 /* remainder bit 2: one load, accumulated into C3 */
  238. fpmadd C3, A1, A1, C3
  239. .align 4
  240. LL(19):
  241. andi. r0, N, 1
  242. beq LL(98)
  243. LFDX A1, X, INCX2 /* remainder bit 1: the last odd element, read at X + INCX2 */
  244. fmadd C4, A1, A1, C4 /* scalar multiply-add into C4 */
  245. .align 4
  246. LL(98): /* unit-stride reduction: fold the eight accumulators, then take the square root */
  247. fpadd C1, C1, C5 /* fold C5-C8 into C1-C4 */
  248. lis r3, 0x3f00 /* r3 = 0x3f000000, single-precision 0.5 */
  249. fpadd C2, C2, C6
  250. lis r4, 0x4040 /* r4 = 0x40400000, single-precision 3.0 */
  251. fpadd C3, C3, C7
  252. stw r3, 4(SP) /* scratch word 1 = 0.5 */
  253. fpadd C4, C4, C8
  254. stw r4, 8(SP) /* scratch word 2 = 3.0 */
  255. fpadd C1, C1, C2
  256. fpadd C3, C3, C4
  257. lfs f10, 0(SP) /* f10 = 0.0: scratch word 0 still holds the zero pushed on entry. It must not be 4(SP), which holds 0.5 by now */
  258. fpadd C1, C1, C3
  259. lfs f11, 4(SP) /* f11 = 0.5 */
  260. lfs f12, 8(SP) /* f12 = 3.0 */
  261. fsmtp C2, C1 /* move the secondary half of C1 into the primary half of C2 */
  262. fadd C1, C2, C1 /* C1 = total sum of squares s (both halves added) */
  263. fcmpu cr0, f10, C1
  264. beq cr0, LL(99) /* s == 0: return it unchanged instead of feeding zero to frsqrte */
  265. #ifndef HUMMER_EMULATOR
  266. frsqrte f9, f1 /* y0 = estimate of 1 / sqrt(s) */
  267. li r10, 16
  268. fmul f2, f1, f9 /* s * y0 */
  269. lfpdux f23, SP, r10 /* the restores of f23-f14 are interleaved with the arithmetic */
  270. fmul f3, f9, f11 /* 0.5 * y0 */
  271. lfpdux f22, SP, r10
  272. fnmsub f4, f2, f9, f12 /* 3 - s * y0^2 */
  273. lfpdux f21, SP, r10
  274. fmul f9, f3, f4 /* y1 = 0.5 * y0 * (3 - s * y0^2): one Newton step on the estimate */
  275. lfpdux f20, SP, r10
  276. fadd f13, f11, f11 /* 1.0 */
  277. lfpdux f19, SP, r10
  278. fmul f12, f1, f9 /* r = s * y1, approximately sqrt(s) */
  279. lfpdux f18, SP, r10
  280. fmul f11, f12, f11 /* 0.5 * r */
  281. lfpdux f17, SP, r10
  282. fnmsub f1, f12, f9, f13 /* 1 - r * y1 */
  283. lfpdux f16, SP, r10
  284. lfpdux f15, SP, r10
  285. lfpdux f14, SP, r10
  286. addi SP, SP, 16 /* SP is back to its value on entry */
  287. fmadd f1, f11, f1, f12 /* result = r + 0.5 * r * (1 - r * y1) */
  288. blr
  289. #else
  290. fsqrt f1, f1 /* emulator build: plain square root of s */
  291. li r10, 16
  292. lfpdux f23, SP, r10 /* restore f23-f14 in reverse order of the saves */
  293. lfpdux f22, SP, r10
  294. lfpdux f21, SP, r10
  295. lfpdux f20, SP, r10
  296. lfpdux f19, SP, r10
  297. lfpdux f18, SP, r10
  298. lfpdux f17, SP, r10
  299. lfpdux f16, SP, r10
  300. lfpdux f15, SP, r10
  301. lfpdux f14, SP, r10
  302. addi SP, SP, 16
  303. blr
  304. #endif
  305. .align 4
  306. LL(99): /* early return: restore the saved registers and return f1 as it stands */
  307. li r10, 16
  308. lfpdux f23, SP, r10 /* the first load also steps SP over the 16-byte scratch area */
  309. lfpdux f22, SP, r10
  310. lfpdux f21, SP, r10
  311. lfpdux f20, SP, r10
  312. lfpdux f19, SP, r10
  313. lfpdux f18, SP, r10
  314. lfpdux f17, SP, r10
  315. lfpdux f16, SP, r10
  316. lfpdux f15, SP, r10
  317. lfpdux f14, SP, r10
  318. addi SP, SP, 16 /* SP is back to its value on entry */
  319. blr
  320. .align 4
  321. LL(100): /* strided path: scalar loads through two interleaved pointers, 16 elements per pass */
  322. sub X2, X, INCX /* X2 = X - INCX: its first load (step INCX2) reads element 1 */
  323. sub X, X, INCX2 /* X = X - 2 * INCX: its first load (step INCX2) reads element 0 */
  324. srawi. r0, N, 4 /* r0 = N >> 4: number of full passes */
  325. mtspr CTR, r0
  326. beq- LL(115) /* no full pass: only the remainder is left */
  327. LFDUX A1, X, INCX2 /* preload 16 elements, alternating between X and X2 */
  328. LFDUX A2, X2, INCX2
  329. LFDUX A3, X, INCX2
  330. LFDUX A4, X2, INCX2
  331. LFDUX A5, X, INCX2
  332. LFDUX A6, X2, INCX2
  333. LFDUX A7, X, INCX2
  334. LFDUX A8, X2, INCX2
  335. LFDUX A9, X, INCX2
  336. LFDUX A10, X2, INCX2
  337. LFDUX A11, X, INCX2
  338. LFDUX A12, X2, INCX2
  339. LFDUX A13, X, INCX2
  340. LFDUX A14, X2, INCX2
  341. LFDUX A15, X, INCX2
  342. LFDUX A16, X2, INCX2
  343. bdz LL(113) /* single pass: nothing more to load, just accumulate */
  344. .align 4
  345. LL(112):
  346. fmadd C1, A1, A1, C1 /* C += A * A on the data already loaded; each one is followed by the reload for the next pass */
  347. LFDUX A1, X, INCX2
  348. fmadd C2, A2, A2, C2
  349. LFDUX A2, X2, INCX2
  350. fmadd C3, A3, A3, C3
  351. LFDUX A3, X, INCX2
  352. fmadd C4, A4, A4, C4
  353. LFDUX A4, X2, INCX2
  354. fmadd C5, A5, A5, C5
  355. LFDUX A5, X, INCX2
  356. fmadd C6, A6, A6, C6
  357. LFDUX A6, X2, INCX2
  358. fmadd C7, A7, A7, C7
  359. LFDUX A7, X, INCX2
  360. fmadd C8, A8, A8, C8
  361. LFDUX A8, X2, INCX2
  362. fmadd C1, A9, A9, C1 /* A9-A16 reuse accumulators C1-C8 */
  363. LFDUX A9, X, INCX2
  364. fmadd C2, A10, A10, C2
  365. LFDUX A10, X2, INCX2
  366. fmadd C3, A11, A11, C3
  367. LFDUX A11, X, INCX2
  368. fmadd C4, A12, A12, C4
  369. LFDUX A12, X2, INCX2
  370. fmadd C5, A13, A13, C5
  371. LFDUX A13, X, INCX2
  372. fmadd C6, A14, A14, C6
  373. LFDUX A14, X2, INCX2
  374. fmadd C7, A15, A15, C7
  375. LFDUX A15, X, INCX2
  376. fmadd C8, A16, A16, C8
  377. LFDUX A16, X2, INCX2
  378. bdnz LL(112)
  379. .align 4
  380. LL(113):
  381. fmadd C1, A1, A1, C1 /* drain: accumulate the last loaded A1-A16, no further loads */
  382. fmadd C2, A2, A2, C2
  383. fmadd C3, A3, A3, C3
  384. fmadd C4, A4, A4, C4
  385. fmadd C5, A5, A5, C5
  386. fmadd C6, A6, A6, C6
  387. fmadd C7, A7, A7, C7
  388. fmadd C8, A8, A8, C8
  389. fmadd C1, A9, A9, C1
  390. fmadd C2, A10, A10, C2
  391. fmadd C3, A11, A11, C3
  392. fmadd C4, A12, A12, C4
  393. fmadd C5, A13, A13, C5
  394. fmadd C6, A14, A14, C6
  395. fmadd C7, A15, A15, C7
  396. fmadd C8, A16, A16, C8
  397. .align 4
  398. LL(115): /* strided remainder: handle N & 15 one bit at a time */
  399. andi. r0, N, 15
  400. beq LL(998) /* N is a multiple of 16: no remainder */
  401. andi. r0, N, 8
  402. beq LL(116)
  403. LFDUX A1, X, INCX2 /* remainder bit 8: eight elements */
  404. LFDUX A2, X2, INCX2
  405. LFDUX A3, X, INCX2
  406. LFDUX A4, X2, INCX2
  407. LFDUX A5, X, INCX2
  408. LFDUX A6, X2, INCX2
  409. LFDUX A7, X, INCX2
  410. LFDUX A8, X2, INCX2
  411. fmadd C1, A1, A1, C1
  412. fmadd C2, A2, A2, C2
  413. fmadd C3, A3, A3, C3
  414. fmadd C4, A4, A4, C4
  415. fmadd C5, A5, A5, C5
  416. fmadd C6, A6, A6, C6
  417. fmadd C7, A7, A7, C7
  418. fmadd C8, A8, A8, C8
  419. .align 4
  420. LL(116):
  421. andi. r0, N, 4
  422. beq LL(117)
  423. LFDUX A1, X, INCX2 /* remainder bit 4: four elements */
  424. LFDUX A2, X2, INCX2
  425. LFDUX A3, X, INCX2
  426. LFDUX A4, X2, INCX2
  427. fmadd C1, A1, A1, C1
  428. fmadd C2, A2, A2, C2
  429. fmadd C3, A3, A3, C3
  430. fmadd C4, A4, A4, C4
  431. .align 4
  432. LL(117):
  433. andi. r0, N, 2
  434. beq LL(118)
  435. LFDUX A1, X, INCX2 /* remainder bit 2: two elements */
  436. LFDUX A2, X2, INCX2
  437. fmadd C1, A1, A1, C1
  438. fmadd C2, A2, A2, C2
  439. .align 4
  440. LL(118):
  441. andi. r0, N, 1
  442. beq LL(998)
  443. LFDX A1, X, INCX2 /* remainder bit 1: the last element, read at X + INCX2 */
  444. fmadd C1, A1, A1, C1
  445. .align 4
  446. LL(998): /* scalar reduction: fold the eight accumulators with fadd, then take the square root */
  447. fadd C1, C1, C5 /* fold C5-C8 into C1-C4 */
  448. lis r3, 0x3f00 /* r3 = 0x3f000000, single-precision 0.5 */
  449. fadd C2, C2, C6
  450. lis r4, 0x4040 /* r4 = 0x40400000, single-precision 3.0 */
  451. fadd C3, C3, C7
  452. stw r3, 4(SP) /* scratch word 1 = 0.5 */
  453. fadd C4, C4, C8
  454. stw r4, 8(SP) /* scratch word 2 = 3.0 */
  455. fadd C1, C1, C2
  456. lfs f10, 0(SP) /* f10 = 0.0: scratch word 0 still holds the zero pushed on entry */
  457. fadd C3, C3, C4
  458. lfs f11, 4(SP) /* f11 = 0.5 */
  459. lfs f12, 8(SP) /* f12 = 3.0 */
  460. fadd C1, C1, C3 /* C1 = total sum of squares s */
  461. fcmpu cr0, f10, C1
  462. beq cr0, LL(999) /* s == 0: return it unchanged instead of feeding zero to frsqrte */
  463. #ifndef HUMMER_EMULATOR
  464. frsqrte f9, f1 /* y0 = estimate of 1 / sqrt(s) */
  465. li r10, 16
  466. fmul f2, f1, f9 /* s * y0 */
  467. lfpdux f23, SP, r10 /* the restores of f23-f14 are interleaved with the arithmetic */
  468. fmul f3, f9, f11 /* 0.5 * y0 */
  469. lfpdux f22, SP, r10
  470. fnmsub f4, f2, f9, f12 /* 3 - s * y0^2 */
  471. lfpdux f21, SP, r10
  472. fmul f9, f3, f4 /* y1 = 0.5 * y0 * (3 - s * y0^2): one Newton step on the estimate */
  473. lfpdux f20, SP, r10
  474. fadd f13, f11, f11 /* 1.0 */
  475. lfpdux f19, SP, r10
  476. fmul f12, f1, f9 /* r = s * y1, approximately sqrt(s) */
  477. lfpdux f18, SP, r10
  478. fmul f11, f12, f11 /* 0.5 * r */
  479. lfpdux f17, SP, r10
  480. fnmsub f1, f12, f9, f13 /* 1 - r * y1 */
  481. lfpdux f16, SP, r10
  482. lfpdux f15, SP, r10
  483. lfpdux f14, SP, r10
  484. addi SP, SP, 16 /* SP is back to its value on entry */
  485. fmadd f1, f11, f1, f12 /* result = r + 0.5 * r * (1 - r * y1) */
  486. blr
  487. #else
  488. fsqrt f1, f1 /* emulator build: plain square root of s */
  489. li r10, 16
  490. lfpdux f23, SP, r10 /* restore f23-f14 in reverse order of the saves */
  491. lfpdux f22, SP, r10
  492. lfpdux f21, SP, r10
  493. lfpdux f20, SP, r10
  494. lfpdux f19, SP, r10
  495. lfpdux f18, SP, r10
  496. lfpdux f17, SP, r10
  497. lfpdux f16, SP, r10
  498. lfpdux f15, SP, r10
  499. lfpdux f14, SP, r10
  500. addi SP, SP, 16
  501. blr
  502. #endif
  503. .align 4
  504. LL(999): /* zero-sum return for the scalar path: restore the saved registers, f1 unchanged */
  505. li r10, 16
  506. lfpdux f23, SP, r10
  507. lfpdux f22, SP, r10
  508. lfpdux f21, SP, r10
  509. lfpdux f20, SP, r10
  510. lfpdux f19, SP, r10
  511. lfpdux f18, SP, r10
  512. lfpdux f17, SP, r10
  513. lfpdux f16, SP, r10
  514. lfpdux f15, SP, r10
  515. lfpdux f14, SP, r10
  516. addi SP, SP, 16 /* SP is back to its value on entry */
  517. blr
  518. EPILOGUE