You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dnrm2_hummer.S 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer registers. N and INCX are re-loaded through 0(N) / 0(INCX) when */
  /* F_INTERFACE is defined; INCX is shifted left by BASE_SHIFT, INCX2 is set */
  /* to INCX + INCX, and XX receives a copy of X for the second sweep. */
  /* X2 is not referenced by the code in this file section. */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  43. #define INCX2 r6
  44. #define X2 r7
  45. #define XX r8
  /* C1..C4: accumulators (running maxima in the first sweep, partial sums */
  /* of squares in the second). C1 is f1 and holds the value at every blr. */
  46. #define C1 f1
  47. #define C2 f0
  48. #define C3 f2
  49. #define C4 f3
  /* ALPHA: largest |x| found by the first sweep; ALPHA_R: 1.0 / ALPHA. */
  50. #define ALPHA f4
  51. #define ALPHA_R f5
  /* A1..A8: values loaded from the vector. */
  52. #define A1 f6
  53. #define A2 f7
  54. #define A3 f8
  55. #define A4 f9
  56. #define A5 f10
  57. #define A6 f11
  58. #define A7 f12
  59. #define A8 f13
  /* F1..F8: differences (accumulator - candidate) used as fsel/fpsel selectors. */
  /* F1..F8 and T1..T8 live in f14..f29, the registers saved on entry. */
  60. #define F1 f14
  61. #define F2 f15
  62. #define F3 f16
  63. #define F4 f17
  64. #define F5 f18
  65. #define F6 f19
  66. #define F7 f20
  67. #define F8 f21
  /* T1..T8: |x| values (first sweep) or ALPHA_R * x values (second sweep). */
  68. #define T1 f22
  69. #define T2 f23
  70. #define T3 f24
  71. #define T4 f25
  72. #define T5 f26
  73. #define T6 f27
  74. #define T7 f28
  75. #define T8 f29
  /* Entry: save f14..f29, build a 16-byte constant area on the stack, */
  /* check the arguments and dispatch on the stride. The file name suggests */
  /* this is the BLAS dnrm2 kernel (scaled Euclidean norm) - confirm with caller. */
  76. PROLOGUE
  77. PROFCODE
  /* Push f14..f29 below SP, 16 bytes apart (update-form store, SP moves down). */
  78. li r10, -16
  79. stfpdux f14, SP, r10
  80. stfpdux f15, SP, r10
  81. stfpdux f16, SP, r10
  82. stfpdux f17, SP, r10
  83. stfpdux f18, SP, r10
  84. stfpdux f19, SP, r10
  85. stfpdux f20, SP, r10
  86. stfpdux f21, SP, r10
  87. stfpdux f22, SP, r10
  88. stfpdux f23, SP, r10
  89. stfpdux f24, SP, r10
  90. stfpdux f25, SP, r10
  91. stfpdux f26, SP, r10
  92. stfpdux f27, SP, r10
  93. stfpdux f28, SP, r10
  94. stfpdux f29, SP, r10
  /* Push four words so that 0(SP) = 0, 4(SP) = 0, 8(SP) = 0x3f800000 and */
  /* 12(SP) = 0x3f800000. The word at 8(SP) is later read with lfs as 1.0. */
  95. li r10, 0
  96. lis r11, 0x3f80
  97. stwu r11, -4(SP)
  98. stwu r11, -4(SP)
  99. stwu r10, -4(SP)
  100. stwu r10, -4(SP)
  /* With F_INTERFACE the count and stride arrive by reference. */
  101. #ifdef F_INTERFACE
  102. LDINT N, 0(N)
  103. LDINT INCX, 0(INCX)
  104. #endif
  /* r10 is 0 here, so this loads the two zero words at 0(SP) into C1. */
  105. lfpsx C1, SP, r10 # Zero clear
  /* Scale the stride by BASE_SHIFT (presumably elements -> bytes) and keep */
  /* twice that value in INCX2. */
  106. slwi INCX, INCX, BASE_SHIFT
  107. add INCX2, INCX, INCX
  108. fpmr C2, C1
  109. fpmr C3, C1
  110. fpmr C4, C1
  /* N <= 0 or INCX <= 0: leave through LL(99) with C1 still zero. */
  111. cmpwi cr0, N, 0
  112. ble LL(99)
  113. cmpwi cr0, INCX, 0
  114. ble LL(99)
  /* Keep the original pointer for the second sweep. */
  115. mr XX, X
  /* Stride different from SIZE: use the strided code at LL(100). */
  116. cmpwi cr0, INCX, SIZE
  117. bne LL(100)
  /* Unit stride. If X is already a multiple of 2 * SIZE go straight to the */
  /* paired loads; otherwise take one leading element on its own first. */
  118. andi. r0, X, 2 * SIZE - 1
  119. beq LL(05)
  120. LFD C1, 0 * SIZE(X)
  121. add X, X, INCX
  122. addi N, N, -1
  123. cmpwi cr0, N, 0
  124. fabs C1, C1
  /* Nothing left after the peeled element: go reduce the maxima. */
  125. ble LL(20)
  /* First sweep, unit stride: track the largest |x| in C1..C4. */
  /* LFPDUX is a macro from common.h; presumably an update-form paired load */
  /* (two elements per register) - confirm. Each load advances X by INCX2. */
  126. .align 4
  127. LL(05):
  /* Pre-bias X so the first update-form load lands on the current element. */
  128. sub X, X, INCX2
  /* CTR = N / 16: one loop trip consumes eight loads of INCX2 bytes each. */
  129. srawi. r0, N, 4
  130. mtspr CTR, r0
  131. beq- LL(15)
  /* Pipeline prologue: fetch A1..A8 and start |A1|..|A4|. */
  132. LFPDUX A1, X, INCX2
  133. LFPDUX A2, X, INCX2
  134. LFPDUX A3, X, INCX2
  135. LFPDUX A4, X, INCX2
  136. LFPDUX A5, X, INCX2
  137. fpabs T1, A1
  138. LFPDUX A6, X, INCX2
  139. fpabs T2, A2
  140. LFPDUX A7, X, INCX2
  141. fpabs T3, A3
  142. LFPDUX A8, X, INCX2
  143. fpabs T4, A4
  /* Only one block of sixteen: skip the steady-state loop. */
  144. bdz LL(13)
  145. .align 4
  /* Steady state. Fn = Cn - Tn, then fpsel keeps Cn when Fn >= 0 and takes */
  /* Tn otherwise, i.e. Cn = max(Cn, Tn). Loads for the next block are */
  /* interleaved with the arithmetic on the current one. */
  146. LL(12):
  147. fpsub F1, C1, T1
  148. LFPDUX A1, X, INCX2
  149. fpsub F2, C2, T2
  150. LFPDUX A2, X, INCX2
  151. fpsub F3, C3, T3
  152. LFPDUX A3, X, INCX2
  153. fpsub F4, C4, T4
  154. LFPDUX A4, X, INCX2
  155. fpabs T5, A5
  156. fpabs T6, A6
  157. fpabs T7, A7
  158. fpabs T8, A8
  159. fpsel C1, F1, C1, T1
  160. LFPDUX A5, X, INCX2
  161. fpsel C2, F2, C2, T2
  162. LFPDUX A6, X, INCX2
  163. fpsel C3, F3, C3, T3
  164. LFPDUX A7, X, INCX2
  165. fpsel C4, F4, C4, T4
  166. LFPDUX A8, X, INCX2
  167. fpsub F5, C1, T5
  168. fpsub F6, C2, T6
  169. fpsub F7, C3, T7
  170. fpsub F8, C4, T8
  171. fpabs T1, A1
  172. fpabs T2, A2
  173. fpabs T3, A3
  174. fpabs T4, A4
  175. fpsel C1, F5, C1, T5
  176. fpsel C2, F6, C2, T6
  177. fpsel C3, F7, C3, T7
  178. fpsel C4, F8, C4, T8
  179. bdnz LL(12)
  180. .align 4
  /* Pipeline drain: fold the last loaded block (T1..T4, A5..A8) into C1..C4. */
  181. LL(13):
  182. fpabs T5, A5
  183. fpabs T6, A6
  184. fpabs T7, A7
  185. fpabs T8, A8
  186. fpsub F1, C1, T1
  187. fpsub F2, C2, T2
  188. fpsub F3, C3, T3
  189. fpsub F4, C4, T4
  190. fpsel C1, F1, C1, T1
  191. fpsel C2, F2, C2, T2
  192. fpsel C3, F3, C3, T3
  193. fpsel C4, F4, C4, T4
  194. fpsub F5, C1, T5
  195. fpsub F6, C2, T6
  196. fpsub F7, C3, T7
  197. fpsub F8, C4, T8
  198. fpsel C1, F5, C1, T5
  199. fpsel C2, F6, C2, T6
  200. fpsel C3, F7, C3, T7
  201. fpsel C4, F8, C4, T8
  202. .align 4
  /* Remainder (N mod 16), handled bit by bit: 8, 4, 2, then 1 element. */
  203. LL(15):
  204. andi. r0, N, 15
  205. beq LL(20)
  206. andi. r0, N, 8
  207. beq LL(16)
  208. LFPDUX A1, X, INCX2
  209. LFPDUX A2, X, INCX2
  210. LFPDUX A3, X, INCX2
  211. LFPDUX A4, X, INCX2
  212. fpabs A1, A1
  213. fpabs A2, A2
  214. fpabs A3, A3
  215. fpabs A4, A4
  216. fpsub F1, C1, A1
  217. fpsub F2, C2, A2
  218. fpsub F3, C3, A3
  219. fpsub F4, C4, A4
  220. fpsel C1, F1, C1, A1
  221. fpsel C2, F2, C2, A2
  222. fpsel C3, F3, C3, A3
  223. fpsel C4, F4, C4, A4
  224. .align 4
  225. LL(16):
  226. andi. r0, N, 4
  227. beq LL(17)
  228. LFPDUX A1, X, INCX2
  229. LFPDUX A2, X, INCX2
  230. fpabs A1, A1
  231. fpabs A2, A2
  232. fpsub F1, C1, A1
  233. fpsub F2, C2, A2
  234. fpsel C1, F1, C1, A1
  235. fpsel C2, F2, C2, A2
  236. .align 4
  237. LL(17):
  238. andi. r0, N, 2
  239. beq LL(18)
  240. LFPDUX A1, X, INCX2
  241. fpabs A1, A1
  242. fpsub F1, C1, A1
  243. fpsel C1, F1, C1, A1
  244. .align 4
  /* Last single element: scalar load and scalar max into C1. */
  245. LL(18):
  246. andi. r0, N, 1
  247. beq LL(20)
  248. LFDUX A1, X, INCX2
  249. fabs A1, A1
  250. fsub F1, C1, A1
  251. fsel C1, F1, C1, A1
  /*
   * LL(20): end of the first sweep on the unit-stride path.
   *
   * Folds the four running maxima C1..C4 into one value (max via
   * subtract + select), stores it in ALPHA, then prepares the second sweep:
   * C1..C4 are cleared from the zero words at 0(SP) and ALPHA_R is set to
   * 1.0 / ALPHA and copied across the register pair with fsmfp.
   *
   * Fix: for an all-zero vector ALPHA is 0.0. The original code divided
   * 1.0 by it, so the second sweep accumulated (inf * 0)^2 = NaN, the
   * "sum == 0" test at LL(98) never matched and NaN was returned. ALPHA is
   * now compared with 0.0 before the divide; on equality the routine leaves
   * through LL(99), which restores f14..f29 and returns C1 (0.0 here).
   */
        .align 4
  LL(20):
        fpsub   F1, C1, C2
        fpsub   F2, C3, C4
        fpsel   C1, F1, C1, C2
        fpsel   C3, F2, C3, C4
        fpsub   F1, C1, C3
        fpsel   C1, F1, C1, C3
        /* Bring the other half of C1 into C2 and take the scalar max. */
        fsmtp   C2, C1
        fsub    F1, C1, C2
        fsel    ALPHA, F1, C1, C2
        li      r10, 0
        lfpsx   C1, SP, r10             # Zero clear
        /* Largest magnitude is zero: the norm is zero, skip the divide. */
        fcmpu   cr0, C1, ALPHA
        beq     cr0, LL(99)
        lfs     ALPHA_R, 8(SP)          # load 1.0
        fdiv    ALPHA_R, ALPHA_R, ALPHA
        fpmr    C2, C1
        fpmr    C3, C1
        fpmr    C4, C1
        fsmfp   ALPHA_R, ALPHA_R
  /* Second sweep, unit stride: accumulate (ALPHA_R * x)^2 into C1..C4, */
  /* reading the vector again through XX. */
  /* Same alignment peel as in the first sweep; N was already decremented */
  /* there, so it is only tested here. */
  271. andi. r0, XX, 2 * SIZE - 1
  272. beq LL(21)
  273. LFD C1, 0 * SIZE(XX)
  274. add XX, XX, INCX
  275. cmpwi cr0, N, 0
  276. fmul C1, ALPHA_R, C1
  277. fmul C1, C1, C1
  /* The peeled element was the only one: go to the final reduction. */
  278. ble LL(998)
  279. .align 4
  280. LL(21):
  /* Pre-bias XX for the update-form loads; CTR = N / 16. */
  281. sub XX, XX, INCX2
  282. srawi. r0, N, 4
  283. mtspr CTR, r0
  284. beq- LL(25)
  /* Pipeline prologue: fetch A1..A8 and scale A1..A4. */
  285. LFPDUX A1, XX, INCX2
  286. LFPDUX A2, XX, INCX2
  287. LFPDUX A3, XX, INCX2
  288. LFPDUX A4, XX, INCX2
  289. LFPDUX A5, XX, INCX2
  290. LFPDUX A6, XX, INCX2
  291. LFPDUX A7, XX, INCX2
  292. LFPDUX A8, XX, INCX2
  293. fpmul T1, ALPHA_R, A1
  294. fpmul T2, ALPHA_R, A2
  295. fpmul T3, ALPHA_R, A3
  296. fpmul T4, ALPHA_R, A4
  297. bdz LL(23)
  298. .align 4
  /* Steady state: Cn += Tn * Tn for the scaled values, while the next */
  /* block is loaded and scaled. */
  299. LL(22):
  300. fpmadd C1, T1, T1, C1
  301. LFPDUX A1, XX, INCX2
  302. fpmul T1, ALPHA_R, A5
  303. LFPDUX A2, XX, INCX2
  304. fpmadd C2, T2, T2, C2
  305. LFPDUX A3, XX, INCX2
  306. fpmul T2, ALPHA_R, A6
  307. LFPDUX A4, XX, INCX2
  308. fpmadd C3, T3, T3, C3
  309. fpmul T3, ALPHA_R, A7
  310. fpmadd C4, T4, T4, C4
  311. fpmul T4, ALPHA_R, A8
  312. fpmadd C1, T1, T1, C1
  313. LFPDUX A5, XX, INCX2
  314. fpmul T1, ALPHA_R, A1
  315. LFPDUX A6, XX, INCX2
  316. fpmadd C2, T2, T2, C2
  317. LFPDUX A7, XX, INCX2
  318. fpmul T2, ALPHA_R, A2
  319. LFPDUX A8, XX, INCX2
  320. fpmadd C3, T3, T3, C3
  321. fpmul T3, ALPHA_R, A3
  322. fpmadd C4, T4, T4, C4
  323. fpmul T4, ALPHA_R, A4
  324. bdnz LL(22)
  325. .align 4
  /* Pipeline drain for the last loaded block. */
  326. LL(23):
  327. fpmadd C1, T1, T1, C1
  328. fpmul T1, ALPHA_R, A5
  329. fpmadd C2, T2, T2, C2
  330. fpmul T2, ALPHA_R, A6
  331. fpmadd C3, T3, T3, C3
  332. fpmul T3, ALPHA_R, A7
  333. fpmadd C4, T4, T4, C4
  334. fpmul T4, ALPHA_R, A8
  335. fpmadd C1, T1, T1, C1
  336. fpmadd C2, T2, T2, C2
  337. fpmadd C3, T3, T3, C3
  338. fpmadd C4, T4, T4, C4
  339. .align 4
  /* Remainder (N mod 16): 8, 4, 2, then 1 element. */
  340. LL(25):
  341. andi. r0, N, 15
  342. beq LL(98)
  343. andi. r0, N, 8
  344. beq LL(26)
  345. LFPDUX A1, XX, INCX2
  346. LFPDUX A2, XX, INCX2
  347. LFPDUX A3, XX, INCX2
  348. LFPDUX A4, XX, INCX2
  349. fpmul A1, ALPHA_R, A1
  350. fpmul A2, ALPHA_R, A2
  351. fpmul A3, ALPHA_R, A3
  352. fpmul A4, ALPHA_R, A4
  353. fpmadd C1, A1, A1, C1
  354. fpmadd C2, A2, A2, C2
  355. fpmadd C3, A3, A3, C3
  356. fpmadd C4, A4, A4, C4
  357. .align 4
  358. LL(26):
  359. andi. r0, N, 4
  360. beq LL(27)
  361. LFPDUX A1, XX, INCX2
  362. LFPDUX A2, XX, INCX2
  363. fpmul A1, ALPHA_R, A1
  364. fpmul A2, ALPHA_R, A2
  365. fpmadd C1, A1, A1, C1
  366. fpmadd C2, A2, A2, C2
  367. .align 4
  368. LL(27):
  369. andi. r0, N, 2
  370. beq LL(28)
  371. LFPDUX A1, XX, INCX2
  372. fpmul A1, ALPHA_R, A1
  373. fpmadd C1, A1, A1, C1
  374. .align 4
  /* Last single element: scalar load, scale and accumulate into C1. */
  375. LL(28):
  376. andi. r0, N, 1
  377. beq LL(98)
  378. LFDUX A1, XX, INCX2
  379. fmul A1, ALPHA_R, A1
  380. fmadd C1, A1, A1, C1
  /* LL(98): final reduction. Adds C1..C4 together, adds the two halves of */
  /* the result, takes the square root and multiplies by ALPHA. f14..f29 are */
  /* reloaded and SP is restored before blr. */
  381. .align 4
  382. LL(98):
  383. fpadd C1, C1, C2
  /* r3/r4 (N/X, no longer needed) are reused to place the single-precision */
  /* bit patterns 0x3f000000 (0.5) and 0x40400000 (3.0) at 4(SP) and 8(SP). */
  384. lis r3, 0x3f00
  385. fpadd C3, C3, C4
  386. lis r4, 0x4040
  387. stw r3, 4(SP)
  388. stw r4, 8(SP)
  389. fpadd C1, C1, C3
  /* f10 = 0.0 (word stored at 0(SP) on entry), f11 = 0.5, f12 = 3.0. */
  390. lfs f10, 0(SP)
  391. fsmtp C2, C1
  392. lfs f11, 4(SP)
  393. fadd C1, C2, C1
  394. lfs f12, 8(SP)
  /* Sum equal to zero: take the exit that skips the square root. */
  395. fcmpu cr0, f10, C1
  396. beq cr0, LL(99)
  397. #ifndef HUMMER_EMULATOR
  /* Square root from the reciprocal-square-root estimate, refined with */
  /* multiply-add steps using the 0.5 / 3.0 constants, interleaved with the */
  /* reloads of f29..f14 (each lfpdux moves SP up by 16). */
  398. frsqrte f9, C1
  399. li r10, 16
  400. fmul f2, f1, f9
  401. lfpdux f29, SP, r10
  402. fmul f3, f9, f11
  403. lfpdux f28, SP, r10
  404. fnmsub f7, f2, f9, f12
  405. lfpdux f27, SP, r10
  406. fmul f9, f3, f7
  407. lfpdux f26, SP, r10
  408. fadd f13, f11, f11
  409. lfpdux f25, SP, r10
  410. fmul f12, f1, f9
  411. lfpdux f24, SP, r10
  412. fmul f11, f12, f11
  413. lfpdux f23, SP, r10
  414. lfpdux f22, SP, r10
  415. fnmsub f1, f12, f9, f13
  416. lfpdux f21, SP, r10
  417. lfpdux f20, SP, r10
  418. lfpdux f19, SP, r10
  419. lfpdux f18, SP, r10
  420. fmadd f1, f11, f1, f12
  421. lfpdux f17, SP, r10
  422. lfpdux f16, SP, r10
  423. lfpdux f15, SP, r10
  424. lfpdux f14, SP, r10
  /* Undo the scaling applied in the second sweep. */
  425. fmul C1, ALPHA, C1
  426. addi SP, SP, 16
  427. blr
  428. #else
  /* HUMMER_EMULATOR build: use fsqrt directly instead of the estimate. */
  429. fsqrt C1, C1
  430. li r10, 16
  431. lfpdux f29, SP, r10
  432. lfpdux f28, SP, r10
  433. lfpdux f27, SP, r10
  434. lfpdux f26, SP, r10
  435. lfpdux f25, SP, r10
  436. lfpdux f24, SP, r10
  437. lfpdux f23, SP, r10
  438. lfpdux f22, SP, r10
  439. lfpdux f21, SP, r10
  440. lfpdux f20, SP, r10
  441. lfpdux f19, SP, r10
  442. lfpdux f18, SP, r10
  443. lfpdux f17, SP, r10
  444. lfpdux f16, SP, r10
  445. lfpdux f15, SP, r10
  446. lfpdux f14, SP, r10
  447. fmul C1, ALPHA, C1
  448. addi SP, SP, 16
  449. blr
  450. #endif
  /* LL(99): exit without a square root. Reached when N <= 0, INCX <= 0 or */
  /* the accumulated sum compares equal to zero. C1 is left untouched; */
  /* f29..f14 are reloaded (SP moves up 16 per load) and SP is restored. */
  451. .align 4
  452. LL(99):
  453. li r10, 16
  454. lfpdux f29, SP, r10
  455. lfpdux f28, SP, r10
  456. lfpdux f27, SP, r10
  457. lfpdux f26, SP, r10
  458. lfpdux f25, SP, r10
  459. lfpdux f24, SP, r10
  460. lfpdux f23, SP, r10
  461. lfpdux f22, SP, r10
  462. lfpdux f21, SP, r10
  463. lfpdux f20, SP, r10
  464. lfpdux f19, SP, r10
  465. lfpdux f18, SP, r10
  466. lfpdux f17, SP, r10
  467. lfpdux f16, SP, r10
  468. lfpdux f15, SP, r10
  469. lfpdux f14, SP, r10
  470. addi SP, SP, 16
  471. blr
  /* First sweep, stride other than SIZE: track the largest |x| in C1..C4. */
  /* Elements are fetched one at a time, advancing X by INCX. Each register */
  /* is targeted by one LFDUX and one LFSDUX (macros from common.h); */
  /* presumably these fill the two halves of the register pair so the */
  /* paired fp* instructions process two elements at once - confirm. */
  472. .align 4
  473. LL(100):
  /* Pre-bias X for the update-form loads; CTR = N / 16. */
  474. sub X, X, INCX
  475. srawi. r0, N, 4
  476. mtspr CTR, r0
  477. beq- LL(105)
  /* Pipeline prologue: sixteen element loads into A1..A8, start |A1|..|A4|. */
  478. LFDUX A1, X, INCX
  479. LFDUX A2, X, INCX
  480. LFDUX A3, X, INCX
  481. LFDUX A4, X, INCX
  482. LFSDUX A1, X, INCX
  483. LFSDUX A2, X, INCX
  484. LFSDUX A3, X, INCX
  485. LFSDUX A4, X, INCX
  486. LFDUX A5, X, INCX
  487. LFDUX A6, X, INCX
  488. LFDUX A7, X, INCX
  489. LFDUX A8, X, INCX
  490. LFSDUX A5, X, INCX
  491. fpabs T1, A1
  492. LFSDUX A6, X, INCX
  493. fpabs T2, A2
  494. LFSDUX A7, X, INCX
  495. fpabs T3, A3
  496. LFSDUX A8, X, INCX
  497. fpabs T4, A4
  498. bdz LL(103)
  499. .align 4
  /* Steady state: Cn = max(Cn, Tn) via subtract + select, with the loads */
  /* for the next sixteen elements interleaved. */
  500. LL(102):
  501. fpsub F1, C1, T1
  502. LFDUX A1, X, INCX
  503. fpsub F2, C2, T2
  504. LFDUX A2, X, INCX
  505. fpsub F3, C3, T3
  506. LFDUX A3, X, INCX
  507. fpsub F4, C4, T4
  508. LFDUX A4, X, INCX
  509. fpabs T5, A5
  510. LFSDUX A1, X, INCX
  511. fpabs T6, A6
  512. LFSDUX A2, X, INCX
  513. fpabs T7, A7
  514. LFSDUX A3, X, INCX
  515. fpabs T8, A8
  516. LFSDUX A4, X, INCX
  517. fpsel C1, F1, C1, T1
  518. LFDUX A5, X, INCX
  519. fpsel C2, F2, C2, T2
  520. LFDUX A6, X, INCX
  521. fpsel C3, F3, C3, T3
  522. LFDUX A7, X, INCX
  523. fpsel C4, F4, C4, T4
  524. LFDUX A8, X, INCX
  525. fpsub F5, C1, T5
  526. LFSDUX A5, X, INCX
  527. fpsub F6, C2, T6
  528. LFSDUX A6, X, INCX
  529. fpsub F7, C3, T7
  530. LFSDUX A7, X, INCX
  531. fpsub F8, C4, T8
  532. LFSDUX A8, X, INCX
  533. fpabs T1, A1
  534. fpabs T2, A2
  535. fpabs T3, A3
  536. fpabs T4, A4
  537. fpsel C1, F5, C1, T5
  538. fpsel C2, F6, C2, T6
  539. fpsel C3, F7, C3, T7
  540. fpsel C4, F8, C4, T8
  541. bdnz LL(102)
  542. .align 4
  /* Pipeline drain for the last loaded block. */
  543. LL(103):
  544. fpabs T5, A5
  545. fpabs T6, A6
  546. fpabs T7, A7
  547. fpabs T8, A8
  548. fpsub F1, C1, T1
  549. fpsub F2, C2, T2
  550. fpsub F3, C3, T3
  551. fpsub F4, C4, T4
  552. fpsel C1, F1, C1, T1
  553. fpsel C2, F2, C2, T2
  554. fpsel C3, F3, C3, T3
  555. fpsel C4, F4, C4, T4
  556. fpsub F5, C1, T5
  557. fpsub F6, C2, T6
  558. fpsub F7, C3, T7
  559. fpsub F8, C4, T8
  560. fpsel C1, F5, C1, T5
  561. fpsel C2, F6, C2, T6
  562. fpsel C3, F7, C3, T7
  563. fpsel C4, F8, C4, T8
  564. .align 4
  /* Remainder (N mod 16): 8 elements with paired ops, then 4, 2 and 1 */
  /* elements with scalar ops. */
  565. LL(105):
  566. andi. r0, N, 15
  567. beq LL(120)
  568. andi. r0, N, 8
  569. beq LL(106)
  570. LFDUX A1, X, INCX
  571. LFDUX A2, X, INCX
  572. LFDUX A3, X, INCX
  573. LFDUX A4, X, INCX
  574. LFSDUX A1, X, INCX
  575. LFSDUX A2, X, INCX
  576. LFSDUX A3, X, INCX
  577. LFSDUX A4, X, INCX
  578. fpabs A1, A1
  579. fpabs A2, A2
  580. fpabs A3, A3
  581. fpabs A4, A4
  582. fpsub F1, C1, A1
  583. fpsub F2, C2, A2
  584. fpsub F3, C3, A3
  585. fpsub F4, C4, A4
  586. fpsel C1, F1, C1, A1
  587. fpsel C2, F2, C2, A2
  588. fpsel C3, F3, C3, A3
  589. fpsel C4, F4, C4, A4
  590. .align 4
  591. LL(106):
  592. andi. r0, N, 4
  593. beq LL(107)
  594. LFDUX A1, X, INCX
  595. LFDUX A2, X, INCX
  596. LFDUX A3, X, INCX
  597. LFDUX A4, X, INCX
  598. fabs A1, A1
  599. fabs A2, A2
  600. fabs A3, A3
  601. fabs A4, A4
  602. fsub F1, C1, A1
  603. fsub F2, C2, A2
  604. fsub F3, C3, A3
  605. fsub F4, C4, A4
  606. fsel C1, F1, C1, A1
  607. fsel C2, F2, C2, A2
  608. fsel C3, F3, C3, A3
  609. fsel C4, F4, C4, A4
  610. .align 4
  611. LL(107):
  612. andi. r0, N, 2
  613. beq LL(108)
  614. LFDUX A1, X, INCX
  615. LFDUX A2, X, INCX
  616. fabs A1, A1
  617. fabs A2, A2
  618. fsub F1, C1, A1
  619. fsub F2, C2, A2
  620. fsel C1, F1, C1, A1
  621. fsel C2, F2, C2, A2
  622. .align 4
  623. LL(108):
  624. andi. r0, N, 1
  625. beq LL(120)
  626. LFDUX A1, X, INCX
  627. fabs A1, A1
  628. fsub F1, C1, A1
  629. fsel C1, F1, C1, A1
  /*
   * LL(120): end of the first sweep on the strided path.
   *
   * Folds the four running maxima C1..C4 into one value (max via
   * subtract + select), stores it in ALPHA, then prepares the second sweep:
   * C1..C4 are cleared from the zero words at 0(SP) and ALPHA_R is set to
   * 1.0 / ALPHA and copied across the register pair with fsmfp.
   *
   * Fix: for an all-zero vector ALPHA is 0.0. The original code divided
   * 1.0 by it, so the second sweep accumulated (inf * 0)^2 = NaN, the
   * "sum == 0" test at LL(998) never matched and NaN was returned. ALPHA
   * is now compared with 0.0 before the divide; on equality the routine
   * leaves through LL(999), which restores f14..f29 and returns C1
   * (0.0 here).
   */
        .align 4
  LL(120):
        fpsub   F1, C1, C2
        fpsub   F2, C3, C4
        fpsel   C1, F1, C1, C2
        fpsel   C3, F2, C3, C4
        fpsub   F1, C1, C3
        fpsel   C1, F1, C1, C3
        /* Bring the other half of C1 into C2 and take the scalar max. */
        fsmtp   C2, C1
        fsub    F1, C1, C2
        fsel    ALPHA, F1, C1, C2
        li      r10, 0
        lfpsx   C1, SP, r10             # Zero clear
        /* Largest magnitude is zero: the norm is zero, skip the divide. */
        fcmpu   cr0, C1, ALPHA
        beq     cr0, LL(999)
        lfs     ALPHA_R, 8(SP)          # load 1.0
        fdiv    ALPHA_R, ALPHA_R, ALPHA
        fpmr    C2, C1
        fpmr    C3, C1
        fpmr    C4, C1
        fsmfp   ALPHA_R, ALPHA_R
  /* Second sweep, stride other than SIZE: accumulate (ALPHA_R * x)^2 into */
  /* C1..C4, reading the vector again through XX with the same load pattern */
  /* as the first strided sweep. */
  /* Pre-bias XX for the update-form loads; CTR = N / 16. */
  649. sub XX, XX, INCX
  650. srawi. r0, N, 4
  651. mtspr CTR, r0
  652. beq- LL(125)
  /* Pipeline prologue: sixteen element loads into A1..A8, scale A1..A4. */
  653. LFDUX A1, XX, INCX
  654. LFDUX A2, XX, INCX
  655. LFDUX A3, XX, INCX
  656. LFDUX A4, XX, INCX
  657. LFSDUX A1, XX, INCX
  658. LFSDUX A2, XX, INCX
  659. LFSDUX A3, XX, INCX
  660. LFSDUX A4, XX, INCX
  661. LFDUX A5, XX, INCX
  662. LFDUX A6, XX, INCX
  663. LFDUX A7, XX, INCX
  664. LFDUX A8, XX, INCX
  665. LFSDUX A5, XX, INCX
  666. fpmul T1, ALPHA_R, A1
  667. LFSDUX A6, XX, INCX
  668. fpmul T2, ALPHA_R, A2
  669. LFSDUX A7, XX, INCX
  670. fpmul T3, ALPHA_R, A3
  671. LFSDUX A8, XX, INCX
  672. fpmul T4, ALPHA_R, A4
  673. bdz LL(123)
  674. .align 4
  /* Steady state: Cn += Tn * Tn while the next sixteen elements are loaded */
  /* and scaled. */
  675. LL(122):
  676. fpmadd C1, T1, T1, C1
  677. LFDUX A1, XX, INCX
  678. fpmul T1, ALPHA_R, A5
  679. LFDUX A2, XX, INCX
  680. fpmadd C2, T2, T2, C2
  681. LFDUX A3, XX, INCX
  682. fpmul T2, ALPHA_R, A6
  683. LFDUX A4, XX, INCX
  684. fpmadd C3, T3, T3, C3
  685. LFSDUX A1, XX, INCX
  686. fpmul T3, ALPHA_R, A7
  687. LFSDUX A2, XX, INCX
  688. fpmadd C4, T4, T4, C4
  689. LFSDUX A3, XX, INCX
  690. fpmul T4, ALPHA_R, A8
  691. LFSDUX A4, XX, INCX
  692. fpmadd C1, T1, T1, C1
  693. LFDUX A5, XX, INCX
  694. fpmul T1, ALPHA_R, A1
  695. LFDUX A6, XX, INCX
  696. fpmadd C2, T2, T2, C2
  697. LFDUX A7, XX, INCX
  698. fpmul T2, ALPHA_R, A2
  699. LFDUX A8, XX, INCX
  700. fpmadd C3, T3, T3, C3
  701. LFSDUX A5, XX, INCX
  702. fpmul T3, ALPHA_R, A3
  703. LFSDUX A6, XX, INCX
  704. fpmadd C4, T4, T4, C4
  705. LFSDUX A7, XX, INCX
  706. fpmul T4, ALPHA_R, A4
  707. LFSDUX A8, XX, INCX
  708. bdnz LL(122)
  709. .align 4
  /* Pipeline drain for the last loaded block. */
  710. LL(123):
  711. fpmadd C1, T1, T1, C1
  712. fpmul T1, ALPHA_R, A5
  713. fpmadd C2, T2, T2, C2
  714. fpmul T2, ALPHA_R, A6
  715. fpmadd C3, T3, T3, C3
  716. fpmul T3, ALPHA_R, A7
  717. fpmadd C4, T4, T4, C4
  718. fpmul T4, ALPHA_R, A8
  719. fpmadd C1, T1, T1, C1
  720. fpmadd C2, T2, T2, C2
  721. fpmadd C3, T3, T3, C3
  722. fpmadd C4, T4, T4, C4
  723. .align 4
  /* Remainder (N mod 16): 8 elements with paired ops, then 4, 2 and 1 */
  /* elements with scalar ops. */
  724. LL(125):
  725. andi. r0, N, 15
  726. beq LL(998)
  727. andi. r0, N, 8
  728. beq LL(126)
  729. LFDUX A1, XX, INCX
  730. LFDUX A2, XX, INCX
  731. LFDUX A3, XX, INCX
  732. LFDUX A4, XX, INCX
  733. LFSDUX A1, XX, INCX
  734. LFSDUX A2, XX, INCX
  735. LFSDUX A3, XX, INCX
  736. LFSDUX A4, XX, INCX
  737. fpmul A1, ALPHA_R, A1
  738. fpmul A2, ALPHA_R, A2
  739. fpmul A3, ALPHA_R, A3
  740. fpmul A4, ALPHA_R, A4
  741. fpmadd C1, A1, A1, C1
  742. fpmadd C2, A2, A2, C2
  743. fpmadd C3, A3, A3, C3
  744. fpmadd C4, A4, A4, C4
  745. .align 4
  746. LL(126):
  747. andi. r0, N, 4
  748. beq LL(127)
  749. LFDUX A1, XX, INCX
  750. LFDUX A2, XX, INCX
  751. LFDUX A3, XX, INCX
  752. LFDUX A4, XX, INCX
  753. fmul A1, ALPHA_R, A1
  754. fmul A2, ALPHA_R, A2
  755. fmul A3, ALPHA_R, A3
  756. fmul A4, ALPHA_R, A4
  757. fmadd C1, A1, A1, C1
  758. fmadd C2, A2, A2, C2
  759. fmadd C3, A3, A3, C3
  760. fmadd C4, A4, A4, C4
  761. .align 4
  762. LL(127):
  763. andi. r0, N, 2
  764. beq LL(128)
  765. LFDUX A1, XX, INCX
  766. LFDUX A2, XX, INCX
  767. fmul A1, ALPHA_R, A1
  768. fmul A2, ALPHA_R, A2
  769. fmadd C1, A1, A1, C1
  770. fmadd C2, A2, A2, C2
  771. .align 4
  772. LL(128):
  773. andi. r0, N, 1
  774. beq LL(998)
  775. LFDUX A1, XX, INCX
  776. fmul A1, ALPHA_R, A1
  777. fmadd C1, A1, A1, C1
  /* LL(998): final reduction used by the strided sweep and by the */
  /* single-element case of the unit-stride peel. Adds C1..C4 together, adds */
  /* the two halves of the result, takes the square root and multiplies by */
  /* ALPHA. f14..f29 are reloaded and SP is restored before blr. */
  778. .align 4
  779. LL(998):
  780. fpadd C1, C1, C2
  /* r3/r4 (N/X, no longer needed) are reused to place the single-precision */
  /* bit patterns 0x3f000000 (0.5) and 0x40400000 (3.0) at 4(SP) and 8(SP). */
  781. lis r3, 0x3f00
  782. fpadd C3, C3, C4
  783. lis r4, 0x4040
  784. stw r3, 4(SP)
  785. stw r4, 8(SP)
  786. fpadd C1, C1, C3
  /* f10 = 0.0 (word stored at 0(SP) on entry), f11 = 0.5, f12 = 3.0. */
  787. lfs f10, 0(SP)
  788. fsmtp C2, C1
  789. lfs f11, 4(SP)
  790. fadd C1, C2, C1
  791. lfs f12, 8(SP)
  /* Sum equal to zero: take the exit that skips the square root. */
  792. fcmpu cr0, f10, C1
  793. beq cr0, LL(999)
  794. #ifndef HUMMER_EMULATOR
  /* Square root from the reciprocal-square-root estimate, refined with */
  /* multiply-add steps using the 0.5 / 3.0 constants, interleaved with the */
  /* reloads of f29..f14 (each lfpdux moves SP up by 16). */
  795. frsqrte f9, C1
  796. li r10, 16
  797. fmul f2, f1, f9
  798. lfpdux f29, SP, r10
  799. fmul f3, f9, f11
  800. lfpdux f28, SP, r10
  801. fnmsub f7, f2, f9, f12
  802. lfpdux f27, SP, r10
  803. fmul f9, f3, f7
  804. lfpdux f26, SP, r10
  805. fadd f13, f11, f11
  806. lfpdux f25, SP, r10
  807. fmul f12, f1, f9
  808. lfpdux f24, SP, r10
  809. fmul f11, f12, f11
  810. lfpdux f23, SP, r10
  811. lfpdux f22, SP, r10
  812. lfpdux f21, SP, r10
  813. fnmsub f1, f12, f9, f13
  814. lfpdux f20, SP, r10
  815. lfpdux f19, SP, r10
  816. lfpdux f18, SP, r10
  817. fmadd f1, f11, f1, f12
  818. lfpdux f17, SP, r10
  819. lfpdux f16, SP, r10
  820. lfpdux f15, SP, r10
  821. lfpdux f14, SP, r10
  /* Undo the scaling applied in the second sweep. */
  822. fmul C1, ALPHA, C1
  823. addi SP, SP, 16
  824. blr
  825. #else
  /* HUMMER_EMULATOR build: use fsqrt directly instead of the estimate. */
  826. fsqrt C1, C1
  827. li r10, 16
  828. lfpdux f29, SP, r10
  829. lfpdux f28, SP, r10
  830. lfpdux f27, SP, r10
  831. lfpdux f26, SP, r10
  832. lfpdux f25, SP, r10
  833. lfpdux f24, SP, r10
  834. lfpdux f23, SP, r10
  835. lfpdux f22, SP, r10
  836. lfpdux f21, SP, r10
  837. lfpdux f20, SP, r10
  838. lfpdux f19, SP, r10
  839. lfpdux f18, SP, r10
  840. lfpdux f17, SP, r10
  841. lfpdux f16, SP, r10
  842. lfpdux f15, SP, r10
  843. lfpdux f14, SP, r10
  844. fmul C1, ALPHA, C1
  845. addi SP, SP, 16
  846. blr
  847. #endif
  /* LL(999): exit without a square root, taken from LL(998) when the */
  /* accumulated sum compares equal to zero. C1 is left untouched; f29..f14 */
  /* are reloaded (SP moves up 16 per load) and SP is restored. */
  848. .align 4
  849. LL(999):
  850. li r10, 16
  851. lfpdux f29, SP, r10
  852. lfpdux f28, SP, r10
  853. lfpdux f27, SP, r10
  854. lfpdux f26, SP, r10
  855. lfpdux f25, SP, r10
  856. lfpdux f24, SP, r10
  857. lfpdux f23, SP, r10
  858. lfpdux f22, SP, r10
  859. lfpdux f21, SP, r10
  860. lfpdux f20, SP, r10
  861. lfpdux f19, SP, r10
  862. lfpdux f18, SP, r10
  863. lfpdux f17, SP, r10
  864. lfpdux f16, SP, r10
  865. lfpdux f15, SP, r10
  866. lfpdux f14, SP, r10
  867. addi SP, SP, 16
  868. blr
  869. EPILOGUE