You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

nrm2.S 17 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Register aliases and stack-frame slot offsets used by the routine below. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* r3, r4, r5: N, X and INCX; all three are read at entry before being written. */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  /* r6, r7: copies of N and X taken before the first pass and consumed by the second pass. */
  43. #define NN r6
  44. #define XX r7
  /* r8: loaded with L1_PREFETCHSIZE and passed to L1_PREFETCH. */
  45. #define PREA r8
  /* Constant slots placed above the f14-f31 save area (bytes 0..143 of the frame): */
  /*   FZERO: 4-byte word, written with 0 and read back with lfs.                   */
  /*   FONE:  4-byte word, written with 0x3f800000 (single-precision 1.0).          */
  /*   FMAX:  two 4-byte words, 0x5fe00000 at 152 and 0 at 156.                     */
  /*   FINF:  8-byte slot, written only under DOUBLE with 0x7ff0000000000000.       */
  46. #define FZERO 144(SP)
  47. #define FONE 148(SP)
  48. #define FMAX 152(SP)
  49. #define FINF 160(SP)
  /* Frame size: 144 bytes of saved FPRs + 24 bytes of constants. */
  50. #define STACKSIZE 168
  /* Entry of the nrm2 kernel.                                                  */
  /* Computes max|x_i| in a first pass, then sum((x_i / max)^2) in a second     */
  /* pass, and leaves max * sqrt(sum) in f1. This section builds the frame and  */
  /* constants, validates N and INCX, consumes the first element, and picks     */
  /* the unit-stride or strided path.                                           */
  /* PROLOGUE, PROFCODE, LDINT, LFD, LL, SIZE, BASE_SHIFT and L1_PREFETCHSIZE   */
  /* come from common.h, which is not visible here.                             */
  51. PROLOGUE
  52. PROFCODE
  /* Allocate the frame and stage the constant bit patterns in GPRs:            */
  /* r10 = 0, r11 = 0x3f800000 (single 1.0), r12 = 0x5fe00000.                  */
  53. addi SP, SP, -STACKSIZE
  54. li r10, 0
  55. lis r11, 0x3f80
  56. lis r12, 0x5fe0
  /* Save f14-f31; every one of them is used as scratch further down. */
  57. stfd f14, 0(SP)
  58. stfd f15, 8(SP)
  59. stfd f16, 16(SP)
  60. stfd f17, 24(SP)
  61. stfd f18, 32(SP)
  62. stfd f19, 40(SP)
  63. stfd f20, 48(SP)
  64. stfd f21, 56(SP)
  65. stfd f22, 64(SP)
  66. stfd f23, 72(SP)
  67. stfd f24, 80(SP)
  68. stfd f25, 88(SP)
  69. stfd f26, 96(SP)
  70. stfd f27, 104(SP)
  71. stfd f28, 112(SP)
  72. stfd f29, 120(SP)
  73. stfd f30, 128(SP)
  74. stfd f31, 136(SP)
  /* Spill the constants so they can be reloaded into FPRs. */
  75. stw r10, FZERO
  76. stw r11, FONE
  77. stw r12, FMAX
  78. #ifdef DOUBLE
  /* r12 = 0x7ff0000000000000, the bit pattern of double-precision +Inf.        */
  /* The ori/oris with 0x0 leave r12 unchanged.                                 */
  79. lis r12, 0x7ff0
  80. ori r12, r12, 0x0
  81. sldi r12, r12, 32
  82. oris r12, r12,0x0
  83. ori r12, r12, 0x0
  84. std r12, FINF
  85. #endif
  /* Zero the second word of the FMAX slot. */
  86. stw r10, 4 + FMAX
  /* f1 = 0.0: this is the value left in f1 by the early exits just below. */
  87. lfs f1, FZERO
  88. #ifdef F_INTERFACE
  /* Under F_INTERFACE, N and INCX hold addresses; replace them with the values they point to. */
  89. LDINT N, 0(N)
  90. LDINT INCX, 0(INCX)
  91. #endif
  /* INCX <<= BASE_SHIFT (presumably element stride to byte stride; confirm in common.h). */
  92. slwi INCX, INCX, BASE_SHIFT
  93. li PREA, L1_PREFETCHSIZE
  /* Leave with f1 = 0.0 when N <= 0 or INCX == 0. */
  94. cmpwi cr0, N, 0
  95. ble- LL(9999)
  96. cmpwi cr0, INCX, 0
  97. beq- LL(9999)
  /* Remember the original count and base pointer for the second pass. */
  98. mr NN, N
  99. mr XX, X
  /* Load x[0], step X past it, and seed all eight max accumulators f0-f7 with |x[0]|. */
  100. LFD f1, 0 * SIZE(X)
  101. add X, X, INCX
  102. fabs f0, f1
  103. fabs f2, f1
  104. fabs f3, f1
  105. fabs f4, f1
  106. fabs f5, f1
  107. fabs f6, f1
  108. fabs f7, f1
  109. fabs f1, f1
  /* N = elements still to scan; if none remain, leave with f1 = |x[0]|. */
  110. subi N, N, 1
  111. cmpwi cr0, N, 0
  112. ble- LL(9999)
  /* A byte stride other than SIZE takes the strided path at LL(1000). */
  113. cmpwi cr0, INCX, SIZE
  114. bne- cr0, LL(1000)
  /* Pass 1, unit stride: running maximum of |x_i| over the remaining N elements. */
  /* f0-f7 are eight independent max accumulators, f8-f15 hold |x| values that    */
  /* are ready to compare, and f24-f31 hold raw values loaded one group ahead.    */
  /* Each compare is the pair                                                     */
  /*     fsub fT, fA, fB ; fsel fA, fT, fA, fB                                    */
  /* which keeps fA when fA - fB >= 0 and takes fB otherwise.                     */
  /* CTR = N / 16 blocks; with no full block, go straight to the tail at LL(50). */
  115. srawi. r0, N, 4
  116. mtspr CTR, r0
  117. beq- cr0, LL(50)
  /* Prime the pipeline: |x| of elements 0-7 into f8-f15, raw elements 8-15 into f24-f31. */
  118. LFD f24, 0 * SIZE(X)
  119. LFD f25, 1 * SIZE(X)
  120. LFD f26, 2 * SIZE(X)
  121. LFD f27, 3 * SIZE(X)
  122. LFD f28, 4 * SIZE(X)
  123. LFD f29, 5 * SIZE(X)
  124. LFD f30, 6 * SIZE(X)
  125. LFD f31, 7 * SIZE(X)
  126. fabs f8, f24
  127. fabs f9, f25
  128. fabs f10, f26
  129. fabs f11, f27
  130. LFD f24, 8 * SIZE(X)
  131. LFD f25, 9 * SIZE(X)
  132. LFD f26, 10 * SIZE(X)
  133. LFD f27, 11 * SIZE(X)
  134. fabs f12, f28
  135. fabs f13, f29
  136. fabs f14, f30
  137. fabs f15, f31
  138. LFD f28, 12 * SIZE(X)
  139. LFD f29, 13 * SIZE(X)
  140. LFD f30, 14 * SIZE(X)
  141. LFD f31, 15 * SIZE(X)
  /* Only one block in total: skip the loop and drain at LL(20). */
  142. bdz LL(20)
  143. .align 4
  /* Steady state: each iteration folds 16 already-loaded elements into f0-f7 */
  /* (two groups of eight) while loading elements 16-31 for the next round.  */
  144. LL(10):
  145. fsub f16, f0, f8
  146. fsub f17, f1, f9
  147. fsub f18, f2, f10
  148. fsub f19, f3, f11
  149. fsub f20, f4, f12
  150. fsub f21, f5, f13
  151. fsub f22, f6, f14
  152. fsub f23, f7, f15
  153. fsel f0, f16, f0, f8
  154. fabs f8, f24
  155. fsel f1, f17, f1, f9
  156. fabs f9, f25
  157. fsel f2, f18, f2, f10
  158. fabs f10, f26
  159. fsel f3, f19, f3, f11
  160. fabs f11, f27
  161. LFD f24, 16 * SIZE(X)
  162. LFD f25, 17 * SIZE(X)
  163. LFD f26, 18 * SIZE(X)
  164. LFD f27, 19 * SIZE(X)
  165. fsel f4, f20, f4, f12
  166. fabs f12, f28
  167. fsel f5, f21, f5, f13
  168. fabs f13, f29
  169. fsel f6, f22, f6, f14
  170. fabs f14, f30
  171. fsel f7, f23, f7, f15
  172. fabs f15, f31
  173. LFD f28, 20 * SIZE(X)
  174. LFD f29, 21 * SIZE(X)
  175. LFD f30, 22 * SIZE(X)
  176. LFD f31, 23 * SIZE(X)
  /* Second group of eight within the same iteration. */
  177. fsub f16, f0, f8
  178. fsub f17, f1, f9
  179. fsub f18, f2, f10
  180. fsub f19, f3, f11
  181. fsub f20, f4, f12
  182. fsub f21, f5, f13
  183. fsub f22, f6, f14
  184. fsub f23, f7, f15
  185. fsel f0, f16, f0, f8
  186. fabs f8, f24
  187. fsel f1, f17, f1, f9
  188. fabs f9, f25
  189. fsel f2, f18, f2, f10
  190. fabs f10, f26
  191. fsel f3, f19, f3, f11
  192. fabs f11, f27
  193. LFD f24, 24 * SIZE(X)
  194. LFD f25, 25 * SIZE(X)
  195. LFD f26, 26 * SIZE(X)
  196. LFD f27, 27 * SIZE(X)
  197. fsel f4, f20, f4, f12
  198. fabs f12, f28
  199. fsel f5, f21, f5, f13
  200. fabs f13, f29
  201. fsel f6, f22, f6, f14
  202. fabs f14, f30
  203. fsel f7, f23, f7, f15
  204. fabs f15, f31
  205. LFD f28, 28 * SIZE(X)
  206. LFD f29, 29 * SIZE(X)
  207. LFD f30, 30 * SIZE(X)
  208. LFD f31, 31 * SIZE(X)
  /* The prefetch is issued before the pointer bump, except on POWER6 where it is issued after. */
  209. #ifndef POWER6
  210. L1_PREFETCH X, PREA
  211. #endif
  212. addi X, X, 16 * SIZE
  213. #ifdef POWER6
  214. L1_PREFETCH X, PREA
  215. #endif
  216. bdnz LL(10)
  217. .align 4
  /* Drain: fold the last 16 elements already held in f8-f15 and f24-f31; no further loads. */
  218. LL(20):
  219. fsub f16, f0, f8
  220. fsub f17, f1, f9
  221. fsub f18, f2, f10
  222. fsub f19, f3, f11
  223. fsub f20, f4, f12
  224. fsub f21, f5, f13
  225. fsub f22, f6, f14
  226. fsub f23, f7, f15
  227. fsel f0, f16, f0, f8
  228. fabs f8, f24
  229. fsel f1, f17, f1, f9
  230. fabs f9, f25
  231. fsel f2, f18, f2, f10
  232. fabs f10, f26
  233. fsel f3, f19, f3, f11
  234. fabs f11, f27
  235. fsel f4, f20, f4, f12
  236. fabs f12, f28
  237. fsel f5, f21, f5, f13
  238. fabs f13, f29
  239. fsel f6, f22, f6, f14
  240. fabs f14, f30
  241. fsel f7, f23, f7, f15
  242. fabs f15, f31
  243. fsub f16, f0, f8
  244. fsub f17, f1, f9
  245. fsub f18, f2, f10
  246. fsub f19, f3, f11
  247. fsub f20, f4, f12
  248. fsub f21, f5, f13
  249. fsub f22, f6, f14
  250. fsub f23, f7, f15
  251. fsel f0, f16, f0, f8
  252. fsel f1, f17, f1, f9
  253. fsel f2, f18, f2, f10
  254. fsel f3, f19, f3, f11
  255. fsel f4, f20, f4, f12
  256. fsel f5, f21, f5, f13
  257. fsel f6, f22, f6, f14
  258. fsel f7, f23, f7, f15
  /* Step X past the block that was just drained. */
  259. addi X, X, 16 * SIZE
  260. .align 4
  /* Tail: the N mod 16 leftover elements, one per iteration, folded into f1 only. */
  261. LL(50):
  262. andi. r0, N, 15
  263. mtspr CTR, r0
  264. beq LL(100)
  265. .align 4
  266. LL(60):
  267. LFD f8, 0 * SIZE(X)
  268. addi X, X, 1 * SIZE
  269. fabs f8, f8
  270. fsub f16, f1, f8
  271. fsel f1, f16, f1, f8
  272. bdnz LL(60)
  273. .align 4
  /* End of pass 1 (unit stride): collapse the eight partial maxima f0-f7 into */
  /* f31 with a pairwise fsub/fsel tree, then derive the scale f30 = 1 / max.  */
  274. LL(100):
  275. fsub f8, f0, f1
  276. fsub f9, f2, f3
  277. fsub f10, f4, f5
  278. fsub f11, f6, f7
  279. fsel f0, f8, f0, f1
  280. fsel f2, f9, f2, f3
  281. fsel f4, f10, f4, f5
  282. fsel f6, f11, f6, f7
  283. fsub f8, f0, f2
  284. fsub f9, f4, f6
  285. fsel f0, f8, f0, f2
  286. fsel f4, f9, f4, f6
  287. fsub f8, f0, f4
  /* f31 = overall max |x_i|. */
  288. fsel f31, f8, f0, f4
  /* f1 = 0.0, f0 = 1.0. If the max equals 0.0, leave with f1 = 0.0. */
  289. lfs f1, FZERO
  290. lfs f0, FONE
  291. fcmpu cr0, f1, f31
  292. nop
  293. beq- cr0, LL(9999)
  /* f30 = 1.0 / max, the factor applied to every element in pass 2. */
  294. fdiv f30, f0, f31
  295. #ifdef DOUBLE
  /* If 1/max compares equal to +Inf, leave with f1 = 0.0. The lfs sits        */
  /* between the compare and the branch; it reloads f1 and leaves CR0 alone.   */
  296. lfd f1, FINF
  297. fcmpu cr0, f1, f30
  298. lfs f1, FZERO
  299. beq- cr0, LL(9999)
  300. #endif
  /* f1 is 0.0 here; clear the other seven sum accumulators from it. */
  301. fmr f0, f1
  302. fmr f2, f1
  303. fmr f3, f1
  304. fmr f4, f1
  305. fmr f5, f1
  306. fmr f6, f1
  307. fmr f7, f1
  /* Pass 2, unit stride: accumulate sum((x_i * f30)^2) over all NN elements,   */
  /* starting again from XX. f0-f7 are eight partial sums, f16-f23 hold scaled  */
  /* values ready to be squared (fmadd acc = s*s + acc), and f8-f15 hold raw    */
  /* values loaded one group ahead.                                             */
  /* CTR = NN / 16 blocks; with no full block, go straight to the tail at LL(250). */
  308. srawi. r0, NN, 4
  309. mtspr CTR, r0
  310. beq- cr0, LL(250)
  /* Prime the pipeline: scaled elements 0-7 into f16-f23, raw elements 8-15 into f8-f15. */
  311. LFD f8, 0 * SIZE(XX)
  312. LFD f9, 1 * SIZE(XX)
  313. LFD f10, 2 * SIZE(XX)
  314. LFD f11, 3 * SIZE(XX)
  315. LFD f12, 4 * SIZE(XX)
  316. LFD f13, 5 * SIZE(XX)
  317. LFD f14, 6 * SIZE(XX)
  318. LFD f15, 7 * SIZE(XX)
  319. fmul f16, f30, f8
  320. fmul f17, f30, f9
  321. fmul f18, f30, f10
  322. fmul f19, f30, f11
  323. LFD f8, 8 * SIZE(XX)
  324. LFD f9, 9 * SIZE(XX)
  325. LFD f10, 10 * SIZE(XX)
  326. LFD f11, 11 * SIZE(XX)
  327. fmul f20, f30, f12
  328. fmul f21, f30, f13
  329. fmul f22, f30, f14
  330. fmul f23, f30, f15
  331. LFD f12, 12 * SIZE(XX)
  332. LFD f13, 13 * SIZE(XX)
  333. LFD f14, 14 * SIZE(XX)
  334. LFD f15, 15 * SIZE(XX)
  /* Only one block in total: skip the loop and drain at LL(220). */
  335. bdz LL(220)
  336. .align 4
  /* Steady state: square-accumulate 16 scaled values per iteration while */
  /* scaling the next group and loading elements 16-31.                   */
  337. LL(210):
  338. fmadd f0, f16, f16, f0
  339. fmul f16, f30, f8
  340. fmadd f1, f17, f17, f1
  341. fmul f17, f30, f9
  342. fmadd f2, f18, f18, f2
  343. fmul f18, f30, f10
  344. fmadd f3, f19, f19, f3
  345. fmul f19, f30, f11
  346. LFD f8, 16 * SIZE(XX)
  347. LFD f9, 17 * SIZE(XX)
  348. LFD f10, 18 * SIZE(XX)
  349. LFD f11, 19 * SIZE(XX)
  350. fmadd f4, f20, f20, f4
  351. fmul f20, f30, f12
  352. fmadd f5, f21, f21, f5
  353. fmul f21, f30, f13
  354. fmadd f6, f22, f22, f6
  355. fmul f22, f30, f14
  356. fmadd f7, f23, f23, f7
  357. fmul f23, f30, f15
  358. LFD f12, 20 * SIZE(XX)
  359. LFD f13, 21 * SIZE(XX)
  360. LFD f14, 22 * SIZE(XX)
  361. LFD f15, 23 * SIZE(XX)
  /* Second group of eight within the same iteration. */
  362. fmadd f0, f16, f16, f0
  363. fmul f16, f30, f8
  364. fmadd f1, f17, f17, f1
  365. fmul f17, f30, f9
  366. fmadd f2, f18, f18, f2
  367. fmul f18, f30, f10
  368. fmadd f3, f19, f19, f3
  369. fmul f19, f30, f11
  370. LFD f8, 24 * SIZE(XX)
  371. LFD f9, 25 * SIZE(XX)
  372. LFD f10, 26 * SIZE(XX)
  373. LFD f11, 27 * SIZE(XX)
  374. fmadd f4, f20, f20, f4
  375. fmul f20, f30, f12
  376. fmadd f5, f21, f21, f5
  377. fmul f21, f30, f13
  378. fmadd f6, f22, f22, f6
  379. fmul f22, f30, f14
  380. fmadd f7, f23, f23, f7
  381. fmul f23, f30, f15
  382. LFD f12, 28 * SIZE(XX)
  383. LFD f13, 29 * SIZE(XX)
  384. LFD f14, 30 * SIZE(XX)
  385. LFD f15, 31 * SIZE(XX)
  /* The prefetch is issued before the pointer bump, except on POWER6 where it is issued after. */
  386. #ifndef POWER6
  387. L1_PREFETCH XX, PREA
  388. #endif
  389. addi XX, XX, 16 * SIZE
  390. #ifdef POWER6
  391. L1_PREFETCH XX, PREA
  392. #endif
  393. bdnz LL(210)
  394. .align 4
  /* Drain: accumulate the last 16 values already held in f16-f23 and f8-f15; no further loads. */
  395. LL(220):
  396. fmadd f0, f16, f16, f0
  397. fmul f16, f30, f8
  398. fmadd f1, f17, f17, f1
  399. fmul f17, f30, f9
  400. fmadd f2, f18, f18, f2
  401. fmul f18, f30, f10
  402. fmadd f3, f19, f19, f3
  403. fmul f19, f30, f11
  404. fmadd f4, f20, f20, f4
  405. fmul f20, f30, f12
  406. fmadd f5, f21, f21, f5
  407. fmul f21, f30, f13
  408. fmadd f6, f22, f22, f6
  409. fmul f22, f30, f14
  410. fmadd f7, f23, f23, f7
  411. fmul f23, f30, f15
  412. fmadd f0, f16, f16, f0
  413. fmadd f1, f17, f17, f1
  414. fmadd f2, f18, f18, f2
  415. fmadd f3, f19, f19, f3
  416. fmadd f4, f20, f20, f4
  417. fmadd f5, f21, f21, f5
  418. fmadd f6, f22, f22, f6
  419. fmadd f7, f23, f23, f7
  /* Step XX past the block that was just drained. */
  420. addi XX, XX, 16 * SIZE
  421. .align 4
  /* Tail: the NN mod 16 leftover elements, one per iteration, accumulated into f0 only. */
  422. LL(250):
  423. andi. r0, NN, 15
  424. mtspr CTR, r0
  425. beq- cr0, LL(270)
  426. .align 4
  427. LL(260):
  428. LFD f8, 0 * SIZE(XX)
  429. addi XX, XX, 1 * SIZE
  430. fmul f16, f30, f8
  431. fmadd f0, f16, f16, f0
  432. bdnz LL(260)
  433. .align 4
  /* Add the eight partial sums into f0, then f1 = max * sqrt(sum). */
  434. LL(270):
  435. fadd f0, f0, f1
  436. fadd f2, f2, f3
  437. fadd f4, f4, f5
  438. fadd f6, f6, f7
  439. fadd f0, f0, f2
  440. fadd f4, f4, f6
  441. fadd f0, f0, f4
  442. fsqrt f0, f0
  443. fmul f1, f31, f0
  444. b LL(9999)
  445. .align 4
  /* Pass 1, strided: running maximum of |x_i| over the remaining N elements    */
  /* when the byte stride INCX differs from SIZE. The register roles match the  */
  /* unit-stride pass: f0-f7 are max accumulators, f8-f15 hold |x| values ready */
  /* to compare, and f24-f31 hold raw values loaded one group ahead.            */
  /* Each fsub/fsel pair keeps the larger of its two operands.                  */
  446. LL(1000):
  /* Back X up by one stride before the first LFDUX. LFDUX presumably expands   */
  /* to an update-form indexed load that adds INCX to X before each access;     */
  /* confirm in common.h.                                                       */
  447. sub X, X, INCX
  /* CTR = N / 16 blocks; with no full block, go straight to the tail at LL(1050). */
  448. srawi. r0, N, 4
  449. mtspr CTR, r0
  450. beq- LL(1050)
  /* Prime the pipeline with the first 16 elements. */
  451. LFDUX f24, X, INCX
  452. LFDUX f25, X, INCX
  453. LFDUX f26, X, INCX
  454. LFDUX f27, X, INCX
  455. LFDUX f28, X, INCX
  456. LFDUX f29, X, INCX
  457. LFDUX f30, X, INCX
  458. LFDUX f31, X, INCX
  459. fabs f8, f24
  460. fabs f9, f25
  461. fabs f10, f26
  462. fabs f11, f27
  463. LFDUX f24, X, INCX
  464. LFDUX f25, X, INCX
  465. LFDUX f26, X, INCX
  466. LFDUX f27, X, INCX
  467. fabs f12, f28
  468. fabs f13, f29
  469. fabs f14, f30
  470. fabs f15, f31
  471. LFDUX f28, X, INCX
  472. LFDUX f29, X, INCX
  473. LFDUX f30, X, INCX
  474. LFDUX f31, X, INCX
  /* Only one block in total: skip the loop and drain at LL(1020). */
  475. bdz LL(1020)
  476. .align 4
  /* Steady state: fold 16 loaded elements per iteration while loading the next 16. */
  477. LL(1010):
  478. fsub f16, f0, f8
  479. fsub f17, f1, f9
  480. fsub f18, f2, f10
  481. fsub f19, f3, f11
  482. fsub f20, f4, f12
  483. fsub f21, f5, f13
  484. fsub f22, f6, f14
  485. fsub f23, f7, f15
  486. fsel f0, f16, f0, f8
  487. fabs f8, f24
  488. fsel f1, f17, f1, f9
  489. fabs f9, f25
  490. fsel f2, f18, f2, f10
  491. fabs f10, f26
  492. fsel f3, f19, f3, f11
  493. fabs f11, f27
  494. LFDUX f24, X, INCX
  495. LFDUX f25, X, INCX
  496. LFDUX f26, X, INCX
  497. LFDUX f27, X, INCX
  498. fsel f4, f20, f4, f12
  499. fabs f12, f28
  500. fsel f5, f21, f5, f13
  501. fabs f13, f29
  502. fsel f6, f22, f6, f14
  503. fabs f14, f30
  504. fsel f7, f23, f7, f15
  505. fabs f15, f31
  506. LFDUX f28, X, INCX
  507. LFDUX f29, X, INCX
  508. LFDUX f30, X, INCX
  509. LFDUX f31, X, INCX
  /* Second group of eight within the same iteration. */
  510. fsub f16, f0, f8
  511. fsub f17, f1, f9
  512. fsub f18, f2, f10
  513. fsub f19, f3, f11
  514. fsub f20, f4, f12
  515. fsub f21, f5, f13
  516. fsub f22, f6, f14
  517. fsub f23, f7, f15
  518. fsel f0, f16, f0, f8
  519. fabs f8, f24
  520. fsel f1, f17, f1, f9
  521. fabs f9, f25
  522. fsel f2, f18, f2, f10
  523. fabs f10, f26
  524. fsel f3, f19, f3, f11
  525. fabs f11, f27
  526. LFDUX f24, X, INCX
  527. LFDUX f25, X, INCX
  528. LFDUX f26, X, INCX
  529. LFDUX f27, X, INCX
  530. fsel f4, f20, f4, f12
  531. fabs f12, f28
  532. fsel f5, f21, f5, f13
  533. fabs f13, f29
  534. fsel f6, f22, f6, f14
  535. fabs f14, f30
  536. fsel f7, f23, f7, f15
  537. fabs f15, f31
  538. LFDUX f28, X, INCX
  539. LFDUX f29, X, INCX
  540. LFDUX f30, X, INCX
  541. LFDUX f31, X, INCX
  542. bdnz LL(1010)
  543. .align 4
  /* Drain: fold the last 16 elements already held in registers; no further loads. */
  544. LL(1020):
  545. fsub f16, f0, f8
  546. fsub f17, f1, f9
  547. fsub f18, f2, f10
  548. fsub f19, f3, f11
  549. fsub f20, f4, f12
  550. fsub f21, f5, f13
  551. fsub f22, f6, f14
  552. fsub f23, f7, f15
  553. fsel f0, f16, f0, f8
  554. fabs f8, f24
  555. fsel f1, f17, f1, f9
  556. fabs f9, f25
  557. fsel f2, f18, f2, f10
  558. fabs f10, f26
  559. fsel f3, f19, f3, f11
  560. fabs f11, f27
  561. fsel f4, f20, f4, f12
  562. fabs f12, f28
  563. fsel f5, f21, f5, f13
  564. fabs f13, f29
  565. fsel f6, f22, f6, f14
  566. fabs f14, f30
  567. fsel f7, f23, f7, f15
  568. fabs f15, f31
  569. fsub f16, f0, f8
  570. fsub f17, f1, f9
  571. fsub f18, f2, f10
  572. fsub f19, f3, f11
  573. fsub f20, f4, f12
  574. fsub f21, f5, f13
  575. fsub f22, f6, f14
  576. fsub f23, f7, f15
  577. fsel f0, f16, f0, f8
  578. fsel f1, f17, f1, f9
  579. fsel f2, f18, f2, f10
  580. fsel f3, f19, f3, f11
  581. fsel f4, f20, f4, f12
  582. fsel f5, f21, f5, f13
  583. fsel f6, f22, f6, f14
  584. fsel f7, f23, f7, f15
  585. .align 4
  /* Tail: the N mod 16 leftover elements, one per iteration, folded into f1 only. */
  586. LL(1050):
  587. andi. r0, N, 15
  588. mtspr CTR, r0
  589. beq LL(1999)
  590. .align 4
  591. LL(1060):
  592. LFDUX f8, X, INCX
  593. fabs f8, f8
  594. fsub f16, f1, f8
  595. fsel f1, f16, f1, f8
  596. bdnz LL(1060)
  597. .align 4
  /* End of pass 1 (strided): collapse the eight partial maxima f0-f7 into f31 */
  /* with a pairwise fsub/fsel tree, then derive the scale f30 = 1 / max.      */
  598. LL(1999):
  599. fsub f8, f0, f1
  600. fsub f9, f2, f3
  601. fsub f10, f4, f5
  602. fsub f11, f6, f7
  603. fsel f0, f8, f0, f1
  604. fsel f2, f9, f2, f3
  605. fsel f4, f10, f4, f5
  606. fsel f6, f11, f6, f7
  607. fsub f8, f0, f2
  608. fsub f9, f4, f6
  609. fsel f0, f8, f0, f2
  610. fsel f4, f9, f4, f6
  611. fsub f8, f0, f4
  /* f31 = overall max |x_i|. */
  612. fsel f31, f8, f0, f4
  /* f1 = 0.0, f0 = 1.0. */
  613. lfs f1, FZERO
  614. lfs f0, FONE
  /* NOTE(review): f2 is not read before "fmr f2, f1" overwrites it below, so  */
  /* this load of FMAX has no effect on the result. The matching position at   */
  /* LL(100) holds a nop instead.                                              */
  615. lfd f2, FMAX
  /* If the max equals 0.0, leave with f1 = 0.0. */
  616. fcmpu cr0, f1, f31
  617. beq- cr0, LL(9999)
  /* f30 = 1.0 / max, the factor applied to every element in pass 2. */
  618. fdiv f30, f0, f31
  619. #ifdef DOUBLE
  /* If 1/max compares equal to +Inf, leave with f1 = 0.0. The lfs sits        */
  /* between the compare and the branch; it reloads f1 and leaves CR0 alone.   */
  620. lfd f1, FINF
  621. fcmpu cr0, f1, f30
  622. lfs f1, FZERO
  623. beq- cr0, LL(9999)
  624. #endif
  /* f1 is 0.0 here; clear the other seven sum accumulators from it. */
  625. fmr f0, f1
  626. fmr f2, f1
  627. fmr f3, f1
  628. fmr f4, f1
  629. fmr f5, f1
  630. fmr f6, f1
  631. fmr f7, f1
  /* Pass 2, strided: accumulate sum((x_i * f30)^2) over all NN elements,       */
  /* walking XX with byte stride INCX. f0-f7 are eight partial sums, f16-f23    */
  /* hold scaled values ready to be squared (fmadd acc = s*s + acc), and        */
  /* f8-f15 hold raw values loaded one group ahead.                             */
  /* Back XX up by one stride before the first LFDUX. LFDUX presumably expands  */
  /* to an update-form indexed load that adds INCX to XX before each access;    */
  /* confirm in common.h.                                                       */
  632. sub XX, XX, INCX
  /* CTR = NN / 16 blocks; with no full block, go straight to the tail at LL(2150). */
  633. srawi. r0, NN, 4
  634. mtspr CTR, r0
  635. beq- cr0, LL(2150)
  /* Prime the pipeline with the first 16 elements. */
  636. LFDUX f8, XX, INCX
  637. LFDUX f9, XX, INCX
  638. LFDUX f10, XX, INCX
  639. LFDUX f11, XX, INCX
  640. LFDUX f12, XX, INCX
  641. LFDUX f13, XX, INCX
  642. LFDUX f14, XX, INCX
  643. LFDUX f15, XX, INCX
  644. fmul f16, f30, f8
  645. fmul f17, f30, f9
  646. fmul f18, f30, f10
  647. fmul f19, f30, f11
  648. LFDUX f8, XX, INCX
  649. LFDUX f9, XX, INCX
  650. LFDUX f10, XX, INCX
  651. LFDUX f11, XX, INCX
  652. fmul f20, f30, f12
  653. fmul f21, f30, f13
  654. fmul f22, f30, f14
  655. fmul f23, f30, f15
  656. LFDUX f12, XX, INCX
  657. LFDUX f13, XX, INCX
  658. LFDUX f14, XX, INCX
  659. LFDUX f15, XX, INCX
  /* Only one block in total: skip the loop and drain at LL(2120). */
  660. bdz LL(2120)
  661. .align 4
  /* Steady state: square-accumulate 16 scaled values per iteration while */
  /* scaling the next group and loading the 16 elements after that.       */
  662. LL(2110):
  663. fmadd f0, f16, f16, f0
  664. fmul f16, f30, f8
  665. fmadd f1, f17, f17, f1
  666. fmul f17, f30, f9
  667. fmadd f2, f18, f18, f2
  668. fmul f18, f30, f10
  669. fmadd f3, f19, f19, f3
  670. fmul f19, f30, f11
  671. LFDUX f8, XX, INCX
  672. LFDUX f9, XX, INCX
  673. LFDUX f10, XX, INCX
  674. LFDUX f11, XX, INCX
  675. fmadd f4, f20, f20, f4
  676. fmul f20, f30, f12
  677. fmadd f5, f21, f21, f5
  678. fmul f21, f30, f13
  679. fmadd f6, f22, f22, f6
  680. fmul f22, f30, f14
  681. fmadd f7, f23, f23, f7
  682. fmul f23, f30, f15
  683. LFDUX f12, XX, INCX
  684. LFDUX f13, XX, INCX
  685. LFDUX f14, XX, INCX
  686. LFDUX f15, XX, INCX
  /* Second group of eight within the same iteration. */
  687. fmadd f0, f16, f16, f0
  688. fmul f16, f30, f8
  689. fmadd f1, f17, f17, f1
  690. fmul f17, f30, f9
  691. fmadd f2, f18, f18, f2
  692. fmul f18, f30, f10
  693. fmadd f3, f19, f19, f3
  694. fmul f19, f30, f11
  695. LFDUX f8, XX, INCX
  696. LFDUX f9, XX, INCX
  697. LFDUX f10, XX, INCX
  698. LFDUX f11, XX, INCX
  699. fmadd f4, f20, f20, f4
  700. fmul f20, f30, f12
  701. fmadd f5, f21, f21, f5
  702. fmul f21, f30, f13
  703. fmadd f6, f22, f22, f6
  704. fmul f22, f30, f14
  705. fmadd f7, f23, f23, f7
  706. fmul f23, f30, f15
  707. LFDUX f12, XX, INCX
  708. LFDUX f13, XX, INCX
  709. LFDUX f14, XX, INCX
  710. LFDUX f15, XX, INCX
  711. bdnz LL(2110)
  712. .align 4
  /* Drain: accumulate the last 16 values already held in registers; no further loads. */
  713. LL(2120):
  714. fmadd f0, f16, f16, f0
  715. fmul f16, f30, f8
  716. fmadd f1, f17, f17, f1
  717. fmul f17, f30, f9
  718. fmadd f2, f18, f18, f2
  719. fmul f18, f30, f10
  720. fmadd f3, f19, f19, f3
  721. fmul f19, f30, f11
  722. fmadd f4, f20, f20, f4
  723. fmul f20, f30, f12
  724. fmadd f5, f21, f21, f5
  725. fmul f21, f30, f13
  726. fmadd f6, f22, f22, f6
  727. fmul f22, f30, f14
  728. fmadd f7, f23, f23, f7
  729. fmul f23, f30, f15
  730. fmadd f0, f16, f16, f0
  731. fmadd f1, f17, f17, f1
  732. fmadd f2, f18, f18, f2
  733. fmadd f3, f19, f19, f3
  734. fmadd f4, f20, f20, f4
  735. fmadd f5, f21, f21, f5
  736. fmadd f6, f22, f22, f6
  737. fmadd f7, f23, f23, f7
  738. .align 4
  /* Tail: the NN mod 16 leftover elements, one per iteration, accumulated into f0 only. */
  739. LL(2150):
  740. andi. r0, NN, 15
  741. mtspr CTR, r0
  742. beq- cr0, LL(2170)
  743. .align 4
  744. LL(2160):
  745. LFDUX f8, XX, INCX
  746. fmul f16, f30, f8
  747. fmadd f0, f16, f16, f0
  748. bdnz LL(2160)
  749. .align 4
  /* Add the eight partial sums into f0, then f1 = max * sqrt(sum); falls through to LL(9999). */
  750. LL(2170):
  751. fadd f0, f0, f1
  752. fadd f2, f2, f3
  753. fadd f4, f4, f5
  754. fadd f6, f6, f7
  755. fadd f0, f0, f2
  756. fadd f4, f4, f6
  757. fadd f0, f0, f4
  758. fsqrt f0, f0
  759. fmul f1, f31, f0
  760. .align 4
  /* Common exit: f1 already holds the result (0.0, |x[0]|, or max * sqrt(sum)). */
  /* Reload f14-f31 from the slots written at entry, release the frame, return.  */
  761. LL(9999):
  762. lfd f14, 0(SP)
  763. lfd f15, 8(SP)
  764. lfd f16, 16(SP)
  765. lfd f17, 24(SP)
  766. lfd f18, 32(SP)
  767. lfd f19, 40(SP)
  768. lfd f20, 48(SP)
  769. lfd f21, 56(SP)
  770. lfd f22, 64(SP)
  771. lfd f23, 72(SP)
  772. lfd f24, 80(SP)
  773. lfd f25, 88(SP)
  774. lfd f26, 96(SP)
  775. lfd f27, 104(SP)
  776. lfd f28, 112(SP)
  777. lfd f29, 120(SP)
  778. lfd f30, 128(SP)
  779. lfd f31, 136(SP)
  780. addi SP, SP, STACKSIZE
  781. blr
  782. EPILOGUE