You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

znrm2_hummer.S 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases used by this routine. */
  /* Integer side: N is the element count, X the vector pointer, INCX the */
  /* stride (scaled by BASE_SHIFT at entry), INCX2 = INCX + INCX, X2 a second */
  /* pointer kept SIZE bytes after X on the unaligned path, and XX a copy of */
  /* the original X that the second pass walks. */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  43. #define INCX2 r6
  44. #define X2 r7
  45. #define XX r8
  /* C1-C4: four accumulators (running maxima in the first pass, partial */
  /* sums in the second). C1 is f1, which holds the result at blr. */
  46. #define C1 f1
  47. #define C2 f0
  48. #define C3 f2
  49. #define C4 f3
  /* ALPHA: largest absolute value found by the first pass. */
  /* ALPHA_R: 1.0 / ALPHA, the scale factor applied in the second pass. */
  50. #define ALPHA f4
  51. #define ALPHA_R f5
  /* A1-A8: values loaded from the vector. */
  52. #define A1 f6
  53. #define A2 f7
  54. #define A3 f8
  55. #define A4 f9
  56. #define A5 f10
  57. #define A6 f11
  58. #define A7 f12
  59. #define A8 f13
  /* F1-F8: differences (accumulator - candidate) that drive the selects. */
  /* f14-f29 are saved on the stack at entry and restored before every blr. */
  60. #define F1 f14
  61. #define F2 f15
  62. #define F3 f16
  63. #define F4 f17
  64. #define F5 f18
  65. #define F6 f19
  66. #define F7 f20
  67. #define F8 f21
  /* T1-T8: temporaries (absolute values in pass one, scaled values in pass two). */
  68. #define T1 f22
  69. #define T2 f23
  70. #define T3 f24
  71. #define T4 f25
  72. #define T5 f26
  73. #define T6 f27
  74. #define T7 f28
  75. #define T8 f29
  /* Routine entry. PROLOGUE and PROFCODE are macros defined outside this */
  /* file (presumably in common.h). */
  76. PROLOGUE
  77. PROFCODE
  /* Save f14-f29 below SP. Each stfpdux is an update-form store with */
  /* r10 = -16, so SP moves down 16 bytes per register. */
  78. li r10, -16
  79. stfpdux f14, SP, r10
  80. stfpdux f15, SP, r10
  81. stfpdux f16, SP, r10
  82. stfpdux f17, SP, r10
  83. stfpdux f18, SP, r10
  84. stfpdux f19, SP, r10
  85. stfpdux f20, SP, r10
  86. stfpdux f21, SP, r10
  87. stfpdux f22, SP, r10
  88. stfpdux f23, SP, r10
  89. stfpdux f24, SP, r10
  90. stfpdux f25, SP, r10
  91. stfpdux f26, SP, r10
  92. stfpdux f27, SP, r10
  93. stfpdux f28, SP, r10
  94. stfpdux f29, SP, r10
  /* Push a 16-byte scratch area of single-precision constants. */
  /* After the four stwu: 0(SP) = 0, 4(SP) = 0, 8(SP) = 0x3f800000 (1.0), */
  /* 12(SP) = 0x3f800000. */
  95. li r10, 0
  96. lis r11, 0x3f80
  97. stwu r11, -4(SP)
  98. stwu r11, -4(SP)
  99. stwu r10, -4(SP)
  100. stwu r10, -4(SP)
  /* With F_INTERFACE, N and INCX arrive as pointers and are dereferenced. */
  101. #ifdef F_INTERFACE
  102. LDINT N, 0(N)
  103. LDINT INCX, 0(INCX)
  104. #endif
  /* Clear the four accumulators from the zero words at 0(SP), and build the */
  /* strides: INCX <<= BASE_SHIFT, INCX2 = 2 * INCX (used as the step of */
  /* every vector load below). */
  105. lfpsx C1, SP, r10 # Zero clear
  106. slwi INCX, INCX, BASE_SHIFT
  107. add INCX2, INCX, INCX
  108. fpmr C2, C1
  109. fpmr C3, C1
  110. fpmr C4, C1
  /* N <= 0 or INCX <= 0: leave through LL(99) with f1 still zero. */
  111. cmpwi cr0, N, 0
  112. ble LL(99)
  113. cmpwi cr0, INCX, 0
  114. ble LL(99)
  /* Keep the original pointer for the second pass, then choose the path: */
  /* fall through when X is a multiple of 2 * SIZE, otherwise go to LL(100). */
  115. mr XX, X
  116. andi. r0, X, 2 * SIZE - 1
  117. bne LL(100)
  118. /* aligned */
  /* Pass 1, aligned path: find the largest absolute value in the vector. */
  /* LFPDUX is a macro defined outside this file; from its use it looks like */
  /* an update-form paired load that advances X by INCX2 first - TODO confirm. */
  /* X is pre-decremented so the first load lands on the first element. */
  119. sub X, X, INCX2
  /* CTR = N / 8 iterations of the unrolled loop; skip to the tail if zero. */
  120. srawi. r0, N, 3
  121. mtspr CTR, r0
  122. beq- LL(15)
  /* Pipeline fill: load eight elements and take |.| of the first four. */
  123. LFPDUX A1, X, INCX2
  124. LFPDUX A2, X, INCX2
  125. LFPDUX A3, X, INCX2
  126. LFPDUX A4, X, INCX2
  127. LFPDUX A5, X, INCX2
  128. fpabs T1, A1
  129. LFPDUX A6, X, INCX2
  130. fpabs T2, A2
  131. LFPDUX A7, X, INCX2
  132. fpabs T3, A3
  133. LFPDUX A8, X, INCX2
  134. fpabs T4, A4
  135. bdz LL(13)
  136. .align 4
  /* Steady state. Each fpsub/fpsel pair is a running maximum: */
  /* F = C - T, then C = (F >= 0) ? C : T. Loads for the next iteration are */
  /* interleaved with the compares of the current one. */
  137. LL(12):
  138. fpsub F1, C1, T1
  139. LFPDUX A1, X, INCX2
  140. fpsub F2, C2, T2
  141. LFPDUX A2, X, INCX2
  142. fpsub F3, C3, T3
  143. LFPDUX A3, X, INCX2
  144. fpsub F4, C4, T4
  145. LFPDUX A4, X, INCX2
  146. fpabs T5, A5
  147. fpabs T6, A6
  148. fpabs T7, A7
  149. fpabs T8, A8
  150. fpsel C1, F1, C1, T1
  151. LFPDUX A5, X, INCX2
  152. fpsel C2, F2, C2, T2
  153. LFPDUX A6, X, INCX2
  154. fpsel C3, F3, C3, T3
  155. LFPDUX A7, X, INCX2
  156. fpsel C4, F4, C4, T4
  157. LFPDUX A8, X, INCX2
  158. fpsub F5, C1, T5
  159. fpsub F6, C2, T6
  160. fpsub F7, C3, T7
  161. fpsub F8, C4, T8
  162. fpabs T1, A1
  163. fpabs T2, A2
  164. fpabs T3, A3
  165. fpabs T4, A4
  166. fpsel C1, F5, C1, T5
  167. fpsel C2, F6, C2, T6
  168. fpsel C3, F7, C3, T7
  169. fpsel C4, F8, C4, T8
  170. bdnz LL(12)
  171. .align 4
  /* Pipeline drain: fold in the eight elements already loaded. */
  172. LL(13):
  173. fpabs T5, A5
  174. fpabs T6, A6
  175. fpabs T7, A7
  176. fpabs T8, A8
  177. fpsub F1, C1, T1
  178. fpsub F2, C2, T2
  179. fpsub F3, C3, T3
  180. fpsub F4, C4, T4
  181. fpsel C1, F1, C1, T1
  182. fpsel C2, F2, C2, T2
  183. fpsel C3, F3, C3, T3
  184. fpsel C4, F4, C4, T4
  185. fpsub F5, C1, T5
  186. fpsub F6, C2, T6
  187. fpsub F7, C3, T7
  188. fpsub F8, C4, T8
  189. fpsel C1, F5, C1, T5
  190. fpsel C2, F6, C2, T6
  191. fpsel C3, F7, C3, T7
  192. fpsel C4, F8, C4, T8
  193. .align 4
  /* Tail: the remaining N mod 8 elements, handled as 4, then 2, then 1. */
  194. LL(15):
  195. andi. r0, N, 7
  196. beq LL(20)
  197. andi. r0, N, 4
  198. beq LL(16)
  199. LFPDUX A1, X, INCX2
  200. LFPDUX A2, X, INCX2
  201. LFPDUX A3, X, INCX2
  202. LFPDUX A4, X, INCX2
  203. fpabs A1, A1
  204. fpabs A2, A2
  205. fpabs A3, A3
  206. fpabs A4, A4
  207. fpsub F1, C1, A1
  208. fpsub F2, C2, A2
  209. fpsub F3, C3, A3
  210. fpsub F4, C4, A4
  211. fpsel C1, F1, C1, A1
  212. fpsel C2, F2, C2, A2
  213. fpsel C3, F3, C3, A3
  214. fpsel C4, F4, C4, A4
  215. .align 4
  /* Two leftover elements. */
  216. LL(16):
  217. andi. r0, N, 2
  218. beq LL(17)
  219. LFPDUX A1, X, INCX2
  220. LFPDUX A2, X, INCX2
  221. fpabs A1, A1
  222. fpabs A2, A2
  223. fpsub F1, C1, A1
  224. fpsub F2, C2, A2
  225. fpsel C1, F1, C1, A1
  226. fpsel C2, F2, C2, A2
  227. .align 4
  /* One leftover element. */
  228. LL(17):
  229. andi. r0, N, 1
  230. beq LL(20)
  231. LFPDUX A1, X, INCX2
  232. fpabs A1, A1
  233. fpsub F1, C1, A1
  234. fpsel C1, F1, C1, A1
  235. .align 4
  /* End of pass 1 on the aligned path: reduce the four accumulators to one */
  /* scalar maximum (ALPHA), form the scale factor 1.0 / ALPHA, and clear the */
  /* accumulators for pass 2. */
  236. LL(20):
  /* Pairwise maxima: C1 = max(C1, C2), C3 = max(C3, C4), C1 = max(C1, C3). */
  237. fpsub F1, C1, C2
  238. fpsub F2, C3, C4
  239. fpsel C1, F1, C1, C2
  240. fpsel C3, F2, C3, C4
  241. fpsub F1, C1, C3
  242. fpsel C1, F1, C1, C3
  /* Bring the secondary half of C1 into C2 and take the larger of the two */
  /* halves as the scalar ALPHA. */
  243. fsmtp C2, C1
  244. fsub F1, C1, C2
  245. fsel ALPHA, F1, C1, C2
  /* Zero guard: ALPHA == 0 means every component of the vector is zero. */
  /* Dividing by it would give an infinite scale factor and the scaled */
  /* products in pass 2 would become NaN. The primary half of C1 (f1, the */
  /* return register) is 0.0 at this point, so leave through the common */
  /* exit. 0(SP) still holds the 0.0 word stored at entry; ALPHA_R is only */
  /* a scratch register here and is reloaded just below. */
  lfs ALPHA_R, 0(SP)
  fcmpu cr0, ALPHA, ALPHA_R
  beq cr0, LL(99)
  246. li r10, 0
  /* ALPHA_R = 1.0 / ALPHA, then copy it into both halves (fsmfp). */
  247. lfs ALPHA_R, 8(SP) # load 1.0
  248. fdiv ALPHA_R, ALPHA_R, ALPHA
  249. lfpsx C1, SP, r10 # Zero clear
  250. fpmr C2, C1
  251. fpmr C3, C1
  252. fpmr C4, C1
  253. fsmfp ALPHA_R, ALPHA_R
  /* XX is the original X, which already passed the same alignment test at */
  /* entry to reach this path, so this branch is always taken and the six */
  /* instructions after it are never executed from here. */
  254. andi. r0, XX, 2 * SIZE - 1
  255. beq LL(21)
  256. LFD C1, 0 * SIZE(XX)
  257. add XX, XX, INCX
  258. cmpwi cr0, N, 0
  259. fmul C1, ALPHA_R, C1
  260. fmul C1, C1, C1
  261. ble LL(98)
  262. .align 4
  /* Pass 2, aligned path: accumulate sum((ALPHA_R * x)^2) over the vector, */
  /* walking XX (the saved original pointer) with the same INCX2 step. */
  263. LL(21):
  264. sub XX, XX, INCX2
  /* CTR = N / 8 unrolled iterations; skip to the tail if zero. */
  265. srawi. r0, N, 3
  266. mtspr CTR, r0
  267. beq- LL(25)
  /* Pipeline fill: load eight elements and scale the first four. */
  268. LFPDUX A1, XX, INCX2
  269. LFPDUX A2, XX, INCX2
  270. LFPDUX A3, XX, INCX2
  271. LFPDUX A4, XX, INCX2
  272. LFPDUX A5, XX, INCX2
  273. LFPDUX A6, XX, INCX2
  274. LFPDUX A7, XX, INCX2
  275. LFPDUX A8, XX, INCX2
  276. fpmul T1, ALPHA_R, A1
  277. fpmul T2, ALPHA_R, A2
  278. fpmul T3, ALPHA_R, A3
  279. fpmul T4, ALPHA_R, A4
  280. bdz LL(23)
  281. .align 4
  /* Steady state: C += T * T for the scaled values in flight, while the */
  /* next eight elements are loaded and scaled. */
  282. LL(22):
  283. fpmadd C1, T1, T1, C1
  284. LFPDUX A1, XX, INCX2
  285. fpmul T1, ALPHA_R, A5
  286. LFPDUX A2, XX, INCX2
  287. fpmadd C2, T2, T2, C2
  288. LFPDUX A3, XX, INCX2
  289. fpmul T2, ALPHA_R, A6
  290. LFPDUX A4, XX, INCX2
  291. fpmadd C3, T3, T3, C3
  292. fpmul T3, ALPHA_R, A7
  293. fpmadd C4, T4, T4, C4
  294. fpmul T4, ALPHA_R, A8
  295. fpmadd C1, T1, T1, C1
  296. LFPDUX A5, XX, INCX2
  297. fpmul T1, ALPHA_R, A1
  298. LFPDUX A6, XX, INCX2
  299. fpmadd C2, T2, T2, C2
  300. LFPDUX A7, XX, INCX2
  301. fpmul T2, ALPHA_R, A2
  302. LFPDUX A8, XX, INCX2
  303. fpmadd C3, T3, T3, C3
  304. fpmul T3, ALPHA_R, A3
  305. fpmadd C4, T4, T4, C4
  306. fpmul T4, ALPHA_R, A4
  307. bdnz LL(22)
  308. .align 4
  /* Pipeline drain: accumulate the eight elements already loaded. */
  309. LL(23):
  310. fpmadd C1, T1, T1, C1
  311. fpmul T1, ALPHA_R, A5
  312. fpmadd C2, T2, T2, C2
  313. fpmul T2, ALPHA_R, A6
  314. fpmadd C3, T3, T3, C3
  315. fpmul T3, ALPHA_R, A7
  316. fpmadd C4, T4, T4, C4
  317. fpmul T4, ALPHA_R, A8
  318. fpmadd C1, T1, T1, C1
  319. fpmadd C2, T2, T2, C2
  320. fpmadd C3, T3, T3, C3
  321. fpmadd C4, T4, T4, C4
  322. .align 4
  /* Tail: the remaining N mod 8 elements, handled as 4, then 2, then 1. */
  323. LL(25):
  324. andi. r0, N, 7
  325. beq LL(98)
  326. andi. r0, N, 4
  327. beq LL(26)
  328. LFPDUX A1, XX, INCX2
  329. LFPDUX A2, XX, INCX2
  330. LFPDUX A3, XX, INCX2
  331. LFPDUX A4, XX, INCX2
  332. fpmul A1, ALPHA_R, A1
  333. fpmul A2, ALPHA_R, A2
  334. fpmul A3, ALPHA_R, A3
  335. fpmul A4, ALPHA_R, A4
  336. fpmadd C1, A1, A1, C1
  337. fpmadd C2, A2, A2, C2
  338. fpmadd C3, A3, A3, C3
  339. fpmadd C4, A4, A4, C4
  340. .align 4
  /* Two leftover elements. */
  341. LL(26):
  342. andi. r0, N, 2
  343. beq LL(27)
  344. LFPDUX A1, XX, INCX2
  345. LFPDUX A2, XX, INCX2
  346. fpmul A1, ALPHA_R, A1
  347. fpmul A2, ALPHA_R, A2
  348. fpmadd C1, A1, A1, C1
  349. fpmadd C2, A2, A2, C2
  350. .align 4
  /* One leftover element. */
  351. LL(27):
  352. andi. r0, N, 1
  353. beq LL(98)
  354. LFPDUX A1, XX, INCX2
  355. fpmul A1, ALPHA_R, A1
  356. fpmadd C1, A1, A1, C1
  357. .align 4
  /* Finish on the aligned path: add the partial sums, take the square root, */
  /* multiply by ALPHA, restore f14-f29 and return the result in f1. */
  /* r3 and r4 (N and X) are reused as scratch here. The scratch words are */
  /* overwritten so that 4(SP) = 0x3f000000 (0.5) and 8(SP) = 0x40400000 */
  /* (3.0); 0(SP) still holds 0.0. */
  358. LL(98):
  359. fpadd C1, C1, C2
  360. lis r3, 0x3f00
  361. fpadd C3, C3, C4
  362. lis r4, 0x4040
  363. stw r3, 4(SP)
  364. stw r4, 8(SP)
  365. fpadd C1, C1, C3
  366. lfs f10, 0(SP)
  /* Add the secondary half of the sum to the primary half. */
  367. fsmtp C2, C1
  368. lfs f11, 4(SP)
  369. fadd C1, C2, C1
  370. lfs f12, 8(SP)
  /* A sum of exactly zero returns through LL(99) with f1 = 0. */
  371. fcmpu cr0, f10, C1
  372. beq cr0, LL(99)
  373. #ifndef HUMMER_EMULATOR
  /* Square root without fsqrt, with the register restores interleaved. */
  /* With s = f1 and y = frsqrte(s): */
  /*   f9  = 0.5 * y * (3.0 - s * y * y)      refined 1/sqrt(s) */
  /*   f12 = s * f9                            approximate sqrt(s) */
  /*   f1  = f12 + 0.5 * f12 * (1.0 - f12 * f9)  corrected sqrt(s) */
  374. frsqrte f9, C1
  375. li r10, 16
  376. fmul f2, f1, f9
  377. lfpdux f29, SP, r10
  378. fmul f3, f9, f11
  379. lfpdux f28, SP, r10
  380. fnmsub f7, f2, f9, f12
  381. lfpdux f27, SP, r10
  382. fmul f9, f3, f7
  383. lfpdux f26, SP, r10
  /* f13 = 0.5 + 0.5 = 1.0 */
  384. fadd f13, f11, f11
  385. lfpdux f25, SP, r10
  386. fmul f12, f1, f9
  387. lfpdux f24, SP, r10
  388. fmul f11, f12, f11
  389. lfpdux f23, SP, r10
  390. lfpdux f22, SP, r10
  391. lfpdux f21, SP, r10
  392. fnmsub f1, f12, f9, f13
  393. lfpdux f20, SP, r10
  394. lfpdux f19, SP, r10
  395. lfpdux f18, SP, r10
  396. fmadd f1, f11, f1, f12
  397. lfpdux f17, SP, r10
  398. lfpdux f16, SP, r10
  399. lfpdux f15, SP, r10
  400. lfpdux f14, SP, r10
  /* Release the last 16 bytes, undo the scaling, and return. */
  401. addi SP, SP, 16
  402. fmul C1, ALPHA, C1
  403. blr
  404. #else
  /* HUMMER_EMULATOR build: use fsqrt directly, then restore and return. */
  405. fsqrt C1, C1
  406. li r10, 16
  407. lfpdux f29, SP, r10
  408. lfpdux f28, SP, r10
  409. lfpdux f27, SP, r10
  410. lfpdux f26, SP, r10
  411. lfpdux f25, SP, r10
  412. lfpdux f24, SP, r10
  413. lfpdux f23, SP, r10
  414. lfpdux f22, SP, r10
  415. lfpdux f21, SP, r10
  416. lfpdux f20, SP, r10
  417. lfpdux f19, SP, r10
  418. lfpdux f18, SP, r10
  419. lfpdux f17, SP, r10
  420. lfpdux f16, SP, r10
  421. lfpdux f15, SP, r10
  422. lfpdux f14, SP, r10
  423. fmul C1, ALPHA, C1
  424. addi SP, SP, 16
  425. blr
  426. #endif
  427. .align 4
  /* Common early exit: restore f14-f29 and return whatever is in f1. */
  /* It is reached when N <= 0 or INCX <= 0, and when the final sum compares */
  /* equal to zero; in those cases f1 holds zero. */
  /* Each lfpdux steps SP up by 16 before loading, so the first one skips */
  /* the 16-byte scratch area and reads the slot where f29 was saved. */
  428. LL(99):
  429. li r10, 16
  430. lfpdux f29, SP, r10
  431. lfpdux f28, SP, r10
  432. lfpdux f27, SP, r10
  433. lfpdux f26, SP, r10
  434. lfpdux f25, SP, r10
  435. lfpdux f24, SP, r10
  436. lfpdux f23, SP, r10
  437. lfpdux f22, SP, r10
  438. lfpdux f21, SP, r10
  439. lfpdux f20, SP, r10
  440. lfpdux f19, SP, r10
  441. lfpdux f18, SP, r10
  442. lfpdux f17, SP, r10
  443. lfpdux f16, SP, r10
  444. lfpdux f15, SP, r10
  445. lfpdux f14, SP, r10
  /* SP now points at the f14 slot; release it to restore the entry SP. */
  446. addi SP, SP, 16
  447. blr
  448. .align 4
  /* Pass 1, unaligned path (X is not a multiple of 2 * SIZE): find the */
  /* largest absolute value using two pointers, X and X2 = X + SIZE, both */
  /* stepped by INCX2. */
  /* LFDUX and LFSDUX are macros defined outside this file; from their use */
  /* they look like update-form loads that fill the primary and the secondary */
  /* half of a register respectively - TODO confirm against common.h. */
  /* Loads through X presumably fetch the first value of each element and */
  /* loads through X2 the second (real and imaginary parts). */
  449. LL(100):
  450. sub X, X, INCX2
  451. addi X2, X, SIZE
  /* CTR = N / 8 unrolled iterations; skip to the tail if zero. */
  452. srawi. r0, N, 3
  453. mtspr CTR, r0
  454. beq- LL(105)
  /* Pipeline fill: four elements into A1-A4 (two primary-half loads, then */
  /* two secondary-half loads per pointer), four more into A5-A8. */
  455. LFDUX A1, X, INCX2
  456. LFDUX A2, X2, INCX2
  457. LFDUX A3, X, INCX2
  458. LFDUX A4, X2, INCX2
  459. LFSDUX A1, X, INCX2
  460. LFSDUX A2, X2, INCX2
  461. LFSDUX A3, X, INCX2
  462. LFSDUX A4, X2, INCX2
  463. LFDUX A5, X, INCX2
  464. LFDUX A6, X2, INCX2
  465. LFDUX A7, X, INCX2
  466. LFDUX A8, X2, INCX2
  467. LFSDUX A5, X, INCX2
  468. fpabs T1, A1
  469. LFSDUX A6, X2, INCX2
  470. fpabs T2, A2
  471. LFSDUX A7, X, INCX2
  472. fpabs T3, A3
  473. LFSDUX A8, X2, INCX2
  474. fpabs T4, A4
  475. bdz LL(103)
  476. .align 4
  /* Steady state: running maximum via F = C - T, C = (F >= 0) ? C : T, */
  /* interleaved with the loads for the next iteration. */
  477. LL(102):
  478. fpsub F1, C1, T1
  479. LFDUX A1, X, INCX2
  480. fpsub F2, C2, T2
  481. LFDUX A2, X2, INCX2
  482. fpsub F3, C3, T3
  483. LFDUX A3, X, INCX2
  484. fpsub F4, C4, T4
  485. LFDUX A4, X2, INCX2
  486. fpabs T5, A5
  487. LFSDUX A1, X, INCX2
  488. fpabs T6, A6
  489. LFSDUX A2, X2, INCX2
  490. fpabs T7, A7
  491. LFSDUX A3, X, INCX2
  492. fpabs T8, A8
  493. LFSDUX A4, X2, INCX2
  494. fpsel C1, F1, C1, T1
  495. LFDUX A5, X, INCX2
  496. fpsel C2, F2, C2, T2
  497. LFDUX A6, X2, INCX2
  498. fpsel C3, F3, C3, T3
  499. LFDUX A7, X, INCX2
  500. fpsel C4, F4, C4, T4
  501. LFDUX A8, X2, INCX2
  502. fpsub F5, C1, T5
  503. LFSDUX A5, X, INCX2
  504. fpsub F6, C2, T6
  505. LFSDUX A6, X2, INCX2
  506. fpsub F7, C3, T7
  507. LFSDUX A7, X, INCX2
  508. fpsub F8, C4, T8
  509. LFSDUX A8, X2, INCX2
  510. fpabs T1, A1
  511. fpabs T2, A2
  512. fpabs T3, A3
  513. fpabs T4, A4
  514. fpsel C1, F5, C1, T5
  515. fpsel C2, F6, C2, T6
  516. fpsel C3, F7, C3, T7
  517. fpsel C4, F8, C4, T8
  518. bdnz LL(102)
  519. .align 4
  /* Pipeline drain: fold in the eight elements already loaded. */
  520. LL(103):
  521. fpabs T5, A5
  522. fpabs T6, A6
  523. fpabs T7, A7
  524. fpabs T8, A8
  525. fpsub F1, C1, T1
  526. fpsub F2, C2, T2
  527. fpsub F3, C3, T3
  528. fpsub F4, C4, T4
  529. fpsel C1, F1, C1, T1
  530. fpsel C2, F2, C2, T2
  531. fpsel C3, F3, C3, T3
  532. fpsel C4, F4, C4, T4
  533. fpsub F5, C1, T5
  534. fpsub F6, C2, T6
  535. fpsub F7, C3, T7
  536. fpsub F8, C4, T8
  537. fpsel C1, F5, C1, T5
  538. fpsel C2, F6, C2, T6
  539. fpsel C3, F7, C3, T7
  540. fpsel C4, F8, C4, T8
  541. .align 4
  /* Tail: the remaining N mod 8 elements, handled as 4, then 2, then 1. */
  542. LL(105):
  543. andi. r0, N, 7
  544. beq LL(120)
  545. andi. r0, N, 4
  546. beq LL(106)
  547. LFDUX A1, X, INCX2
  548. LFDUX A2, X2, INCX2
  549. LFDUX A3, X, INCX2
  550. LFDUX A4, X2, INCX2
  551. LFSDUX A1, X, INCX2
  552. LFSDUX A2, X2, INCX2
  553. LFSDUX A3, X, INCX2
  554. LFSDUX A4, X2, INCX2
  555. fpabs A1, A1
  556. fpabs A2, A2
  557. fpabs A3, A3
  558. fpabs A4, A4
  559. fpsub F1, C1, A1
  560. fpsub F2, C2, A2
  561. fpsub F3, C3, A3
  562. fpsub F4, C4, A4
  563. fpsel C1, F1, C1, A1
  564. fpsel C2, F2, C2, A2
  565. fpsel C3, F3, C3, A3
  566. fpsel C4, F4, C4, A4
  567. .align 4
  /* Two leftover elements: only primary halves are loaded, so the scalar */
  /* fabs/fsub/fsel forms are used instead of the paired ones. */
  568. LL(106):
  569. andi. r0, N, 2
  570. beq LL(107)
  571. LFDUX A1, X, INCX2
  572. LFDUX A2, X2, INCX2
  573. LFDUX A3, X, INCX2
  574. LFDUX A4, X2, INCX2
  575. fabs A1, A1
  576. fabs A2, A2
  577. fabs A3, A3
  578. fabs A4, A4
  579. fsub F1, C1, A1
  580. fsub F2, C2, A2
  581. fsub F3, C3, A3
  582. fsub F4, C4, A4
  583. fsel C1, F1, C1, A1
  584. fsel C2, F2, C2, A2
  585. fsel C3, F3, C3, A3
  586. fsel C4, F4, C4, A4
  587. .align 4
  /* One leftover element, again with scalar instructions. */
  588. LL(107):
  589. andi. r0, N, 1
  590. beq LL(120)
  591. LFDUX A1, X, INCX2
  592. LFDUX A2, X2, INCX2
  593. fabs A1, A1
  594. fabs A2, A2
  595. fsub F1, C1, A1
  596. fsub F2, C2, A2
  597. fsel C1, F1, C1, A1
  598. fsel C2, F2, C2, A2
  599. .align 4
  /* End of pass 1 on the unaligned path: reduce the four accumulators to */
  /* one scalar maximum (ALPHA), form the scale factor 1.0 / ALPHA, and clear */
  /* the accumulators for pass 2. */
  600. LL(120):
  /* Pairwise maxima: C1 = max(C1, C2), C3 = max(C3, C4), C1 = max(C1, C3). */
  601. fpsub F1, C1, C2
  602. fpsub F2, C3, C4
  603. fpsel C1, F1, C1, C2
  604. fpsel C3, F2, C3, C4
  605. fpsub F1, C1, C3
  606. fpsel C1, F1, C1, C3
  /* Bring the secondary half of C1 into C2 and take the larger of the two */
  /* halves as the scalar ALPHA. */
  607. fsmtp C2, C1
  608. fsub F1, C1, C2
  609. fsel ALPHA, F1, C1, C2
  /* Zero guard: ALPHA == 0 means every component of the vector is zero. */
  /* Dividing by it would give an infinite scale factor and the scaled */
  /* products in pass 2 would become NaN. The primary half of C1 (f1, the */
  /* return register) is 0.0 at this point, so leave through the common */
  /* exit. 0(SP) still holds the 0.0 word stored at entry; ALPHA_R is only */
  /* a scratch register here and is reloaded just below. */
  lfs ALPHA_R, 0(SP)
  fcmpu cr0, ALPHA, ALPHA_R
  beq cr0, LL(99)
  610. li r10, 0
  /* ALPHA_R = 1.0 / ALPHA, then copy it into both halves (fsmfp). */
  611. lfs ALPHA_R, 8(SP) # load 1.0
  612. fdiv ALPHA_R, ALPHA_R, ALPHA
  613. lfpsx C1, SP, r10 # Zero clear
  614. fpmr C2, C1
  615. fpmr C3, C1
  616. fpmr C4, C1
  617. fsmfp ALPHA_R, ALPHA_R
  /* Pass 2, unaligned path: accumulate sum((ALPHA_R * x)^2), walking XX */
  /* (the saved original pointer) and X2 = XX + SIZE with the same load */
  /* pattern as pass 1. */
  618. sub XX, XX, INCX2
  619. addi X2, XX, SIZE
  /* CTR = N / 8 unrolled iterations; skip to the tail if zero. */
  620. srawi. r0, N, 3
  621. mtspr CTR, r0
  622. beq- LL(125)
  /* Pipeline fill: load eight elements and scale the first four. */
  623. LFDUX A1, XX, INCX2
  624. LFDUX A2, X2, INCX2
  625. LFDUX A3, XX, INCX2
  626. LFDUX A4, X2, INCX2
  627. LFSDUX A1, XX, INCX2
  628. LFSDUX A2, X2, INCX2
  629. LFSDUX A3, XX, INCX2
  630. LFSDUX A4, X2, INCX2
  631. LFDUX A5, XX, INCX2
  632. LFDUX A6, X2, INCX2
  633. LFDUX A7, XX, INCX2
  634. LFDUX A8, X2, INCX2
  635. LFSDUX A5, XX, INCX2
  636. fpmul T1, ALPHA_R, A1
  637. LFSDUX A6, X2, INCX2
  638. fpmul T2, ALPHA_R, A2
  639. LFSDUX A7, XX, INCX2
  640. fpmul T3, ALPHA_R, A3
  641. LFSDUX A8, X2, INCX2
  642. fpmul T4, ALPHA_R, A4
  643. bdz LL(123)
  644. .align 4
  /* Steady state: C += T * T for the scaled values in flight, while the */
  /* next eight elements are loaded and scaled. */
  645. LL(122):
  646. fpmadd C1, T1, T1, C1
  647. LFDUX A1, XX, INCX2
  648. fpmul T1, ALPHA_R, A5
  649. LFDUX A2, X2, INCX2
  650. fpmadd C2, T2, T2, C2
  651. LFDUX A3, XX, INCX2
  652. fpmul T2, ALPHA_R, A6
  653. LFDUX A4, X2, INCX2
  654. fpmadd C3, T3, T3, C3
  655. LFSDUX A1, XX, INCX2
  656. fpmul T3, ALPHA_R, A7
  657. LFSDUX A2, X2, INCX2
  658. fpmadd C4, T4, T4, C4
  659. LFSDUX A3, XX, INCX2
  660. fpmul T4, ALPHA_R, A8
  661. LFSDUX A4, X2, INCX2
  662. fpmadd C1, T1, T1, C1
  663. LFDUX A5, XX, INCX2
  664. fpmul T1, ALPHA_R, A1
  665. LFDUX A6, X2, INCX2
  666. fpmadd C2, T2, T2, C2
  667. LFDUX A7, XX, INCX2
  668. fpmul T2, ALPHA_R, A2
  669. LFDUX A8, X2, INCX2
  670. fpmadd C3, T3, T3, C3
  671. LFSDUX A5, XX, INCX2
  672. fpmul T3, ALPHA_R, A3
  673. LFSDUX A6, X2, INCX2
  674. fpmadd C4, T4, T4, C4
  675. LFSDUX A7, XX, INCX2
  676. fpmul T4, ALPHA_R, A4
  677. LFSDUX A8, X2, INCX2
  678. bdnz LL(122)
  679. .align 4
  /* Pipeline drain: accumulate the eight elements already loaded. */
  680. LL(123):
  681. fpmadd C1, T1, T1, C1
  682. fpmul T1, ALPHA_R, A5
  683. fpmadd C2, T2, T2, C2
  684. fpmul T2, ALPHA_R, A6
  685. fpmadd C3, T3, T3, C3
  686. fpmul T3, ALPHA_R, A7
  687. fpmadd C4, T4, T4, C4
  688. fpmul T4, ALPHA_R, A8
  689. fpmadd C1, T1, T1, C1
  690. fpmadd C2, T2, T2, C2
  691. fpmadd C3, T3, T3, C3
  692. fpmadd C4, T4, T4, C4
  693. .align 4
  /* Tail: the remaining N mod 8 elements, handled as 4, then 2, then 1. */
  694. LL(125):
  695. andi. r0, N, 7
  696. beq LL(998)
  697. andi. r0, N, 4
  698. beq LL(126)
  699. LFDUX A1, XX, INCX2
  700. LFDUX A2, X2, INCX2
  701. LFDUX A3, XX, INCX2
  702. LFDUX A4, X2, INCX2
  703. LFSDUX A1, XX, INCX2
  704. LFSDUX A2, X2, INCX2
  705. LFSDUX A3, XX, INCX2
  706. LFSDUX A4, X2, INCX2
  707. fpmul A1, ALPHA_R, A1
  708. fpmul A2, ALPHA_R, A2
  709. fpmul A3, ALPHA_R, A3
  710. fpmul A4, ALPHA_R, A4
  711. fpmadd C1, A1, A1, C1
  712. fpmadd C2, A2, A2, C2
  713. fpmadd C3, A3, A3, C3
  714. fpmadd C4, A4, A4, C4
  715. .align 4
  /* Two leftover elements: only primary halves are loaded, so the scalar */
  /* fmul/fmadd forms are used. */
  716. LL(126):
  717. andi. r0, N, 2
  718. beq LL(127)
  719. LFDUX A1, XX, INCX2
  720. LFDUX A2, X2, INCX2
  721. LFDUX A3, XX, INCX2
  722. LFDUX A4, X2, INCX2
  723. fmul A1, ALPHA_R, A1
  724. fmul A2, ALPHA_R, A2
  725. fmul A3, ALPHA_R, A3
  726. fmul A4, ALPHA_R, A4
  727. fmadd C1, A1, A1, C1
  728. fmadd C2, A2, A2, C2
  729. fmadd C3, A3, A3, C3
  730. fmadd C4, A4, A4, C4
  731. .align 4
  /* One leftover element, again with scalar instructions. */
  732. LL(127):
  733. andi. r0, N, 1
  734. beq LL(998)
  735. LFDUX A1, XX, INCX2
  736. LFDUX A2, X2, INCX2
  737. fmul A1, ALPHA_R, A1
  738. fmul A2, ALPHA_R, A2
  739. fmadd C1, A1, A1, C1
  740. fmadd C2, A2, A2, C2
  741. .align 4
  /* Finish on the unaligned path: add the partial sums, take the square */
  /* root, multiply by ALPHA, restore f14-f29 and return the result in f1. */
  /* r3 and r4 (N and X) are reused as scratch here. The scratch words are */
  /* overwritten so that 4(SP) = 0x3f000000 (0.5) and 8(SP) = 0x40400000 */
  /* (3.0); 0(SP) still holds 0.0. */
  742. LL(998):
  743. fpadd C1, C1, C2
  744. lis r3, 0x3f00
  745. fpadd C3, C3, C4
  746. lis r4, 0x4040
  747. stw r3, 4(SP)
  748. stw r4, 8(SP)
  749. fpadd C1, C1, C3
  750. lfs f10, 0(SP)
  /* Add the secondary half of the sum to the primary half. */
  751. fsmtp C2, C1
  752. lfs f11, 4(SP)
  753. fadd C1, C2, C1
  754. lfs f12, 8(SP)
  /* A sum of exactly zero returns through LL(99) with f1 = 0. */
  755. fcmpu cr0, f10, C1
  756. beq cr0, LL(99)
  757. #ifndef HUMMER_EMULATOR
  /* Square root without fsqrt, with the register restores interleaved. */
  /* With s = f1 and y = frsqrte(s): */
  /*   f9  = 0.5 * y * (3.0 - s * y * y)      refined 1/sqrt(s) */
  /*   f12 = s * f9                            approximate sqrt(s) */
  /*   f1  = f12 + 0.5 * f12 * (1.0 - f12 * f9)  corrected sqrt(s) */
  758. frsqrte f9, C1
  759. li r10, 16
  760. fmul f2, f1, f9
  761. lfpdux f29, SP, r10
  762. fmul f3, f9, f11
  763. lfpdux f28, SP, r10
  764. fnmsub f7, f2, f9, f12
  765. lfpdux f27, SP, r10
  766. fmul f9, f3, f7
  767. lfpdux f26, SP, r10
  /* f13 = 0.5 + 0.5 = 1.0 */
  768. fadd f13, f11, f11
  769. lfpdux f25, SP, r10
  770. fmul f12, f1, f9
  771. lfpdux f24, SP, r10
  772. fmul f11, f12, f11
  773. lfpdux f23, SP, r10
  774. lfpdux f22, SP, r10
  775. fnmsub f1, f12, f9, f13
  776. lfpdux f21, SP, r10
  777. lfpdux f20, SP, r10
  778. lfpdux f19, SP, r10
  779. lfpdux f18, SP, r10
  780. fmadd f1, f11, f1, f12
  781. lfpdux f17, SP, r10
  782. lfpdux f16, SP, r10
  783. lfpdux f15, SP, r10
  784. lfpdux f14, SP, r10
  /* Release the last 16 bytes, undo the scaling, and return. */
  785. addi SP, SP, 16
  786. fmul C1, ALPHA, C1
  787. blr
  788. #else
  /* HUMMER_EMULATOR build: use fsqrt directly, then restore and return. */
  789. fsqrt C1, C1
  790. li r10, 16
  791. lfpdux f29, SP, r10
  792. lfpdux f28, SP, r10
  793. lfpdux f27, SP, r10
  794. lfpdux f26, SP, r10
  795. lfpdux f25, SP, r10
  796. lfpdux f24, SP, r10
  797. lfpdux f23, SP, r10
  798. lfpdux f22, SP, r10
  799. lfpdux f21, SP, r10
  800. lfpdux f20, SP, r10
  801. lfpdux f19, SP, r10
  802. lfpdux f18, SP, r10
  803. lfpdux f17, SP, r10
  804. lfpdux f16, SP, r10
  805. lfpdux f15, SP, r10
  806. lfpdux f14, SP, r10
  807. fmul C1, ALPHA, C1
  808. addi SP, SP, 16
  809. blr
  810. #endif
  811. .align 4
  /* Restore-and-return sequence identical to LL(99). */
  /* NOTE(review): no branch in the visible routine targets LL(999) (the */
  /* unaligned path exits through LL(99)), and the blr above prevents */
  /* fall-through, so this copy looks unreachable - confirm before removing. */
  812. LL(999):
  813. li r10, 16
  814. lfpdux f29, SP, r10
  815. lfpdux f28, SP, r10
  816. lfpdux f27, SP, r10
  817. lfpdux f26, SP, r10
  818. lfpdux f25, SP, r10
  819. lfpdux f24, SP, r10
  820. lfpdux f23, SP, r10
  821. lfpdux f22, SP, r10
  822. lfpdux f21, SP, r10
  823. lfpdux f20, SP, r10
  824. lfpdux f19, SP, r10
  825. lfpdux f18, SP, r10
  826. lfpdux f17, SP, r10
  827. lfpdux f16, SP, r10
  828. lfpdux f15, SP, r10
  829. lfpdux f14, SP, r10
  830. addi SP, SP, 16
  831. blr
  /* EPILOGUE is a macro defined outside this file; it closes the routine */
  /* opened by PROLOGUE. */
  832. EPILOGUE