You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register and stack-slot aliases used by the routine below. */
  /* N, X, INCX: element count, vector base address and increment, taken from r3-r5. */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  /* NN, XX: copies of N and X made before the first pass, reused by the second pass. */
  43. #define NN r6
  44. #define XX r7
  /* PREA: offset operand handed to L1_PREFETCH (loaded with L1_PREFETCHSIZE). */
  45. #define PREA r8
  /* INCXM1: INCX minus SIZE, computed after INCX has been shifted by ZBASE_SHIFT. */
  46. #define INCXM1 r9
  /* Scratch words located just above the f14-f31 save area at 0..136(SP). */
  /* FZERO receives a zero word and FONE 0x3f800000 (1.0 as an IEEE single); both are read with lfs. */
  47. #define FZERO 144(SP)
  48. #define FONE 148(SP)
  /* FMAX receives 0x5fe00000 followed by a zero word; nothing in this routine reads it back. */
  49. #define FMAX 152(SP)
  /* Bytes taken from SP on entry and given back before blr: 144 for FPR saves + 16 scratch. */
  50. #define STACKSIZE 160
  /* Entry: scaled two-pass Euclidean norm of a vector whose elements are pairs of values */
  /* (loaded at 0*SIZE and 1*SIZE; presumably real and imaginary parts - confirm in common.h). */
  /* Pass 1 finds the largest absolute component, pass 2 sums squares of the components */
  /* multiplied by the reciprocal of that maximum; the result is left in f1. */
  51. PROLOGUE
  52. PROFCODE
  /* Reserve the local frame and build the constant words in integer registers. */
  53. addi SP, SP, -STACKSIZE
  54. li r10, 0
  55. lis r11, 0x3f80
  56. lis r12, 0x5fe0
  /* Save f14-f31, all of which are overwritten below. */
  57. stfd f14, 0(SP)
  58. stfd f15, 8(SP)
  59. stfd f16, 16(SP)
  60. stfd f17, 24(SP)
  61. stfd f18, 32(SP)
  62. stfd f19, 40(SP)
  63. stfd f20, 48(SP)
  64. stfd f21, 56(SP)
  65. stfd f22, 64(SP)
  66. stfd f23, 72(SP)
  67. stfd f24, 80(SP)
  68. stfd f25, 88(SP)
  69. stfd f26, 96(SP)
  70. stfd f27, 104(SP)
  71. stfd f28, 112(SP)
  72. stfd f29, 120(SP)
  73. stfd f30, 128(SP)
  74. stfd f31, 136(SP)
  /* Spill the constants so they can be loaded into floating-point registers. */
  75. stw r10, FZERO
  76. stw r11, FONE
  77. stw r12, FMAX
  78. stw r10, 4 + FMAX
  /* f1 = 0.0: this is the value still in f1 if either early exit below is taken. */
  79. lfs f1, FZERO
  /* With F_INTERFACE, N and INCX arrive as addresses and are dereferenced here. */
  80. #ifdef F_INTERFACE
  81. LDINT N, 0(N)
  82. LDINT INCX, 0(INCX)
  83. #endif
  /* Scale INCX by 2^ZBASE_SHIFT (presumably elements -> bytes; ZBASE_SHIFT comes from common.h). */
  84. slwi INCX, INCX, ZBASE_SHIFT
  85. subi INCXM1, INCX, SIZE
  86. li PREA, L1_PREFETCHSIZE
  /* Leave with f1 = 0.0 when N <= 0 or when the increment is zero. */
  87. cmpwi cr0, N, 0
  88. ble- LL(9999)
  89. cmpwi cr0, INCX, 0
  90. beq- LL(9999)
  /* Keep the full count and base address for the second pass. */
  91. mr NN, N
  92. mr XX, X
  /* Consume the first element and seed the eight running maxima f0-f7 with */
  /* the absolute values of its two components (even registers: first, odd: second). */
  93. LFD f0, 0 * SIZE(X)
  94. LFD f1, 1 * SIZE(X)
  95. add X, X, INCX
  96. fabs f2, f0
  97. fabs f3, f1
  98. fabs f4, f0
  99. fabs f5, f1
  100. fabs f6, f0
  101. fabs f7, f1
  102. fabs f0, f0
  103. fabs f1, f1
  /* N now counts the elements that remain for pass 1. */
  104. subi N, N, 1
  /* A stride other than 2 * SIZE bytes is handled by the indexed-load path at LL(1000). */
  105. cmpwi cr0, INCX, 2 * SIZE
  106. bne- cr0, LL(1000)
  /* Pass 1, contiguous data: running maximum of absolute values, 8 elements (16 values) per block. */
  /* CTR = N / 8 blocks; with no full block, go straight to the remainder loop at LL(50). */
  107. srawi. r0, N, 3
  108. mtspr CTR, r0
  109. beq- cr0, LL(50)
  /* Pipeline fill: load the 16 values of the first block; f8-f15 receive the absolute */
  /* values of the first eight while f24-f31 are reloaded with the raw second eight. */
  110. LFD f24, 0 * SIZE(X)
  111. LFD f25, 1 * SIZE(X)
  112. LFD f26, 2 * SIZE(X)
  113. LFD f27, 3 * SIZE(X)
  114. LFD f28, 4 * SIZE(X)
  115. LFD f29, 5 * SIZE(X)
  116. LFD f30, 6 * SIZE(X)
  117. LFD f31, 7 * SIZE(X)
  118. fabs f8, f24
  119. fabs f9, f25
  120. fabs f10, f26
  121. fabs f11, f27
  122. LFD f24, 8 * SIZE(X)
  123. LFD f25, 9 * SIZE(X)
  124. LFD f26, 10 * SIZE(X)
  125. LFD f27, 11 * SIZE(X)
  126. fabs f12, f28
  127. fabs f13, f29
  128. fabs f14, f30
  129. fabs f15, f31
  130. LFD f28, 12 * SIZE(X)
  131. LFD f29, 13 * SIZE(X)
  132. LFD f30, 14 * SIZE(X)
  133. LFD f31, 15 * SIZE(X)
  /* Only one block in total: skip the steady-state loop and drain at LL(20). */
  134. bdz LL(20)
  135. .align 4
  /* Steady state: fsub + fsel implement fN = max(fN, candidate), since fsel picks its */
  /* second source operand when the difference is >= 0 and the third otherwise. */
  /* Each iteration folds the block already in registers while loading the next one. */
  136. LL(10):
  137. fsub f16, f0, f8
  138. fsub f17, f1, f9
  139. fsub f18, f2, f10
  140. fsub f19, f3, f11
  141. fsub f20, f4, f12
  142. fsub f21, f5, f13
  143. fsub f22, f6, f14
  144. fsub f23, f7, f15
  145. fsel f0, f16, f0, f8
  146. fabs f8, f24
  147. fsel f1, f17, f1, f9
  148. fabs f9, f25
  149. fsel f2, f18, f2, f10
  150. fabs f10, f26
  151. fsel f3, f19, f3, f11
  152. fabs f11, f27
  153. LFD f24, 16 * SIZE(X)
  154. LFD f25, 17 * SIZE(X)
  155. LFD f26, 18 * SIZE(X)
  156. LFD f27, 19 * SIZE(X)
  157. fsel f4, f20, f4, f12
  158. fabs f12, f28
  159. fsel f5, f21, f5, f13
  160. fabs f13, f29
  161. fsel f6, f22, f6, f14
  162. fabs f14, f30
  163. fsel f7, f23, f7, f15
  164. fabs f15, f31
  165. LFD f28, 20 * SIZE(X)
  166. LFD f29, 21 * SIZE(X)
  167. LFD f30, 22 * SIZE(X)
  168. LFD f31, 23 * SIZE(X)
  /* Second half of the iteration: fold values 8..15 of the current block. */
  169. fsub f16, f0, f8
  170. fsub f17, f1, f9
  171. fsub f18, f2, f10
  172. fsub f19, f3, f11
  173. fsub f20, f4, f12
  174. fsub f21, f5, f13
  175. fsub f22, f6, f14
  176. fsub f23, f7, f15
  177. fsel f0, f16, f0, f8
  178. fabs f8, f24
  179. fsel f1, f17, f1, f9
  180. fabs f9, f25
  181. fsel f2, f18, f2, f10
  182. fabs f10, f26
  183. fsel f3, f19, f3, f11
  184. fabs f11, f27
  185. LFD f24, 24 * SIZE(X)
  186. LFD f25, 25 * SIZE(X)
  187. LFD f26, 26 * SIZE(X)
  188. LFD f27, 27 * SIZE(X)
  189. fsel f4, f20, f4, f12
  190. fabs f12, f28
  191. fsel f5, f21, f5, f13
  192. fabs f13, f29
  193. fsel f6, f22, f6, f14
  194. fabs f14, f30
  195. fsel f7, f23, f7, f15
  196. fabs f15, f31
  197. LFD f28, 28 * SIZE(X)
  198. LFD f29, 29 * SIZE(X)
  199. LFD f30, 30 * SIZE(X)
  200. LFD f31, 31 * SIZE(X)
  /* L1_PREFETCH (macro from common.h) is issued before the pointer bump, */
  /* except on POWER6 where it is issued after it. */
  201. #ifndef POWER6
  202. L1_PREFETCH X, PREA
  203. #endif
  204. addi X, X, 16 * SIZE
  205. #ifdef POWER6
  206. L1_PREFETCH X, PREA
  207. #endif
  208. bdnz LL(10)
  209. .align 4
  /* Drain: fold the last block, which is already in f8-f15 / f24-f31; no further loads. */
  210. LL(20):
  211. fsub f16, f0, f8
  212. fsub f17, f1, f9
  213. fsub f18, f2, f10
  214. fsub f19, f3, f11
  215. fsub f20, f4, f12
  216. fsub f21, f5, f13
  217. fsub f22, f6, f14
  218. fsub f23, f7, f15
  219. fsel f0, f16, f0, f8
  220. fabs f8, f24
  221. fsel f1, f17, f1, f9
  222. fabs f9, f25
  223. fsel f2, f18, f2, f10
  224. fabs f10, f26
  225. fsel f3, f19, f3, f11
  226. fabs f11, f27
  227. fsel f4, f20, f4, f12
  228. fabs f12, f28
  229. fsel f5, f21, f5, f13
  230. fabs f13, f29
  231. fsel f6, f22, f6, f14
  232. fabs f14, f30
  233. fsel f7, f23, f7, f15
  234. fabs f15, f31
  235. fsub f16, f0, f8
  236. fsub f17, f1, f9
  237. fsub f18, f2, f10
  238. fsub f19, f3, f11
  239. fsub f20, f4, f12
  240. fsub f21, f5, f13
  241. fsub f22, f6, f14
  242. fsub f23, f7, f15
  243. fsel f0, f16, f0, f8
  244. fsel f1, f17, f1, f9
  245. fsel f2, f18, f2, f10
  246. fsel f3, f19, f3, f11
  247. fsel f4, f20, f4, f12
  248. fsel f5, f21, f5, f13
  249. fsel f6, f22, f6, f14
  250. fsel f7, f23, f7, f15
  /* Step X past the block that was just drained. */
  251. addi X, X, 16 * SIZE
  252. .align 4
  /* Pass 1 remainder, contiguous data: the N mod 8 elements left after the blocked loop. */
  253. LL(50):
  254. andi. r0, N, 7
  255. mtspr CTR, r0
  256. beq LL(100)
  257. .align 4
  /* One element per iteration; only the maxima in f0 and f1 are updated here. */
  258. LL(60):
  259. LFD f8, 0 * SIZE(X)
  260. LFD f9, 1 * SIZE(X)
  261. addi X, X, 2 * SIZE
  262. fabs f8, f8
  263. fabs f9, f9
  264. fsub f16, f0, f8
  265. fsub f17, f1, f9
  266. fsel f0, f16, f0, f8
  267. fsel f1, f17, f1, f9
  268. bdnz LL(60)
  269. .align 4
  /* End of pass 1 (contiguous): reduce the eight partial maxima f0-f7 pairwise into f31. */
  270. LL(100):
  271. fsub f8, f0, f1
  272. fsub f9, f2, f3
  273. fsub f10, f4, f5
  274. fsub f11, f6, f7
  275. fsel f0, f8, f0, f1
  276. fsel f2, f9, f2, f3
  277. fsel f4, f10, f4, f5
  278. fsel f6, f11, f6, f7
  279. fsub f8, f0, f2
  280. fsub f9, f4, f6
  281. fsel f0, f8, f0, f2
  282. fsel f4, f9, f4, f6
  283. fsub f8, f0, f4
  284. fsel f31, f8, f0, f4
  /* f1 = 0.0, f0 = 1.0. If the maximum equals zero, exit with f1 = 0.0 (also avoids 1/0). */
  285. lfs f1, FZERO
  286. lfs f0, FONE
  287. fcmpu cr0, f1, f31
  288. beq- cr0, LL(9999)
  /* f30 = 1 / max: the scale factor applied to every component in pass 2. */
  289. fdiv f30, f0, f31
  /* Clear the sum-of-squares accumulators f0 and f2-f7; f1 is already 0.0. */
  290. fmr f0, f1
  291. fmr f2, f1
  292. fmr f3, f1
  293. fmr f4, f1
  294. fmr f5, f1
  295. fmr f6, f1
  296. fmr f7, f1
  /* Pass 2 covers all NN elements starting from XX, again in blocks of 8. */
  297. srawi. r0, NN, 3
  298. mtspr CTR, r0
  299. beq- cr0, LL(150)
  /* Pass 2, contiguous data: accumulate (f30 * value)^2 into f0-f7, 16 values per block. */
  /* Pipeline fill: load the first block and pre-scale its first eight values into f16-f23. */
  300. LFD f8, 0 * SIZE(XX)
  301. LFD f9, 1 * SIZE(XX)
  302. LFD f10, 2 * SIZE(XX)
  303. LFD f11, 3 * SIZE(XX)
  304. LFD f12, 4 * SIZE(XX)
  305. LFD f13, 5 * SIZE(XX)
  306. LFD f14, 6 * SIZE(XX)
  307. LFD f15, 7 * SIZE(XX)
  308. fmul f16, f30, f8
  309. fmul f17, f30, f9
  310. fmul f18, f30, f10
  311. fmul f19, f30, f11
  312. LFD f8, 8 * SIZE(XX)
  313. LFD f9, 9 * SIZE(XX)
  314. LFD f10, 10 * SIZE(XX)
  315. LFD f11, 11 * SIZE(XX)
  316. fmul f20, f30, f12
  317. fmul f21, f30, f13
  318. fmul f22, f30, f14
  319. fmul f23, f30, f15
  320. LFD f12, 12 * SIZE(XX)
  321. LFD f13, 13 * SIZE(XX)
  322. LFD f14, 14 * SIZE(XX)
  323. LFD f15, 15 * SIZE(XX)
  /* Only one block in total: skip the steady-state loop and drain at LL(120). */
  324. bdz LL(120)
  325. .align 4
  /* Steady state: each fmadd adds the square of a scaled value to its accumulator, */
  /* the paired fmul scales the next value, and the loads fetch the following block. */
  326. LL(110):
  327. fmadd f0, f16, f16, f0
  328. fmul f16, f30, f8
  329. fmadd f1, f17, f17, f1
  330. fmul f17, f30, f9
  331. fmadd f2, f18, f18, f2
  332. fmul f18, f30, f10
  333. fmadd f3, f19, f19, f3
  334. fmul f19, f30, f11
  335. LFD f8, 16 * SIZE(XX)
  336. LFD f9, 17 * SIZE(XX)
  337. LFD f10, 18 * SIZE(XX)
  338. LFD f11, 19 * SIZE(XX)
  339. fmadd f4, f20, f20, f4
  340. fmul f20, f30, f12
  341. fmadd f5, f21, f21, f5
  342. fmul f21, f30, f13
  343. fmadd f6, f22, f22, f6
  344. fmul f22, f30, f14
  345. fmadd f7, f23, f23, f7
  346. fmul f23, f30, f15
  347. LFD f12, 20 * SIZE(XX)
  348. LFD f13, 21 * SIZE(XX)
  349. LFD f14, 22 * SIZE(XX)
  350. LFD f15, 23 * SIZE(XX)
  /* Second half of the iteration: values 8..15 of the current block. */
  351. fmadd f0, f16, f16, f0
  352. fmul f16, f30, f8
  353. fmadd f1, f17, f17, f1
  354. fmul f17, f30, f9
  355. fmadd f2, f18, f18, f2
  356. fmul f18, f30, f10
  357. fmadd f3, f19, f19, f3
  358. fmul f19, f30, f11
  359. LFD f8, 24 * SIZE(XX)
  360. LFD f9, 25 * SIZE(XX)
  361. LFD f10, 26 * SIZE(XX)
  362. LFD f11, 27 * SIZE(XX)
  363. fmadd f4, f20, f20, f4
  364. fmul f20, f30, f12
  365. fmadd f5, f21, f21, f5
  366. fmul f21, f30, f13
  367. fmadd f6, f22, f22, f6
  368. fmul f22, f30, f14
  369. fmadd f7, f23, f23, f7
  370. fmul f23, f30, f15
  371. LFD f12, 28 * SIZE(XX)
  372. LFD f13, 29 * SIZE(XX)
  373. LFD f14, 30 * SIZE(XX)
  374. LFD f15, 31 * SIZE(XX)
  /* L1_PREFETCH (macro from common.h) is issued before the pointer bump, */
  /* except on POWER6 where it is issued after it. */
  375. #ifndef POWER6
  376. L1_PREFETCH XX, PREA
  377. #endif
  378. addi XX, XX, 16 * SIZE
  379. #ifdef POWER6
  380. L1_PREFETCH XX, PREA
  381. #endif
  382. bdnz LL(110)
  383. .align 4
  /* Drain: accumulate the last block, which is already in registers; no further loads. */
  384. LL(120):
  385. fmadd f0, f16, f16, f0
  386. fmul f16, f30, f8
  387. fmadd f1, f17, f17, f1
  388. fmul f17, f30, f9
  389. fmadd f2, f18, f18, f2
  390. fmul f18, f30, f10
  391. fmadd f3, f19, f19, f3
  392. fmul f19, f30, f11
  393. fmadd f4, f20, f20, f4
  394. fmul f20, f30, f12
  395. fmadd f5, f21, f21, f5
  396. fmul f21, f30, f13
  397. fmadd f6, f22, f22, f6
  398. fmul f22, f30, f14
  399. fmadd f7, f23, f23, f7
  400. fmul f23, f30, f15
  401. fmadd f0, f16, f16, f0
  402. fmadd f1, f17, f17, f1
  403. fmadd f2, f18, f18, f2
  404. fmadd f3, f19, f19, f3
  405. fmadd f4, f20, f20, f4
  406. fmadd f5, f21, f21, f5
  407. fmadd f6, f22, f22, f6
  408. fmadd f7, f23, f23, f7
  /* Step XX past the block that was just drained. */
  409. addi XX, XX, 16 * SIZE
  410. .align 4
  /* Pass 2 remainder, contiguous data: the NN mod 8 elements left after the blocked loop. */
  411. LL(150):
  412. andi. r0, NN, 7
  413. mtspr CTR, r0
  414. beq- cr0, LL(170)
  415. .align 4
  /* One element per iteration: scale both components by f30 and add their squares to f0 / f1. */
  416. LL(160):
  417. LFD f8, 0 * SIZE(XX)
  418. LFD f9, 1 * SIZE(XX)
  419. addi XX, XX, 2 * SIZE
  420. fmul f16, f30, f8
  421. fmul f17, f30, f9
  422. fmadd f0, f16, f16, f0
  423. fmadd f1, f17, f17, f1
  424. bdnz LL(160)
  425. .align 4
  /* Finish (contiguous): add the eight partial sums pairwise into f0. */
  426. LL(170):
  427. fadd f0, f0, f1
  428. fadd f2, f2, f3
  429. fadd f4, f4, f5
  430. fadd f6, f6, f7
  431. fadd f0, f0, f2
  432. fadd f4, f4, f6
  433. fadd f0, f0, f4
  /* Result f1 = max * sqrt(sum of scaled squares); f31 still holds the pass 1 maximum. */
  434. fsqrt f0, f0
  435. fmul f1, f31, f0
  436. b LL(9999)
  437. .align 4
  /* Pass 1, general stride: same running-maximum scheme as the contiguous path, */
  /* but values are fetched with the LFDX / LFDUX macros from common.h */
  /* (presumably indexed load and indexed load-with-update - confirm there). */
  /* X is biased by -INCXM1 so that X + INCXM1 addresses the first component of the */
  /* next element and X + INCX the second, with X updated by the second load. */
  438. LL(1000):
  439. sub X, X, INCXM1
  /* CTR = N / 8 blocks; with no full block, go straight to the remainder loop at LL(1050). */
  440. srawi. r0, N, 3
  441. mtspr CTR, r0
  442. beq- LL(1050)
  /* Pipeline fill: load the 8 elements of the first block; f8-f15 receive the absolute */
  /* values of the first four while f24-f31 are reloaded with the raw last four. */
  443. LFDX f24, X, INCXM1
  444. LFDUX f25, X, INCX
  445. LFDX f26, X, INCXM1
  446. LFDUX f27, X, INCX
  447. LFDX f28, X, INCXM1
  448. LFDUX f29, X, INCX
  449. LFDX f30, X, INCXM1
  450. LFDUX f31, X, INCX
  451. fabs f8, f24
  452. fabs f9, f25
  453. fabs f10, f26
  454. fabs f11, f27
  455. LFDX f24, X, INCXM1
  456. LFDUX f25, X, INCX
  457. LFDX f26, X, INCXM1
  458. LFDUX f27, X, INCX
  459. fabs f12, f28
  460. fabs f13, f29
  461. fabs f14, f30
  462. fabs f15, f31
  463. LFDX f28, X, INCXM1
  464. LFDUX f29, X, INCX
  465. LFDX f30, X, INCXM1
  466. LFDUX f31, X, INCX
  /* Only one block in total: skip the steady-state loop and drain at LL(1020). */
  467. bdz LL(1020)
  468. .align 4
  /* Steady state: fsub + fsel implement fN = max(fN, candidate); each iteration folds */
  /* the 8 elements already in registers while loading the next 8. */
  469. LL(1010):
  470. fsub f16, f0, f8
  471. fsub f17, f1, f9
  472. fsub f18, f2, f10
  473. fsub f19, f3, f11
  474. fsub f20, f4, f12
  475. fsub f21, f5, f13
  476. fsub f22, f6, f14
  477. fsub f23, f7, f15
  478. fsel f0, f16, f0, f8
  479. fabs f8, f24
  480. fsel f1, f17, f1, f9
  481. fabs f9, f25
  482. fsel f2, f18, f2, f10
  483. fabs f10, f26
  484. fsel f3, f19, f3, f11
  485. fabs f11, f27
  486. LFDX f24, X, INCXM1
  487. LFDUX f25, X, INCX
  488. LFDX f26, X, INCXM1
  489. LFDUX f27, X, INCX
  490. fsel f4, f20, f4, f12
  491. fabs f12, f28
  492. fsel f5, f21, f5, f13
  493. fabs f13, f29
  494. fsel f6, f22, f6, f14
  495. fabs f14, f30
  496. fsel f7, f23, f7, f15
  497. fabs f15, f31
  498. LFDX f28, X, INCXM1
  499. LFDUX f29, X, INCX
  500. LFDX f30, X, INCXM1
  501. LFDUX f31, X, INCX
  /* Second half of the iteration: fold the last four elements of the current block. */
  502. fsub f16, f0, f8
  503. fsub f17, f1, f9
  504. fsub f18, f2, f10
  505. fsub f19, f3, f11
  506. fsub f20, f4, f12
  507. fsub f21, f5, f13
  508. fsub f22, f6, f14
  509. fsub f23, f7, f15
  510. fsel f0, f16, f0, f8
  511. fabs f8, f24
  512. fsel f1, f17, f1, f9
  513. fabs f9, f25
  514. fsel f2, f18, f2, f10
  515. fabs f10, f26
  516. fsel f3, f19, f3, f11
  517. fabs f11, f27
  518. LFDX f24, X, INCXM1
  519. LFDUX f25, X, INCX
  520. LFDX f26, X, INCXM1
  521. LFDUX f27, X, INCX
  522. fsel f4, f20, f4, f12
  523. fabs f12, f28
  524. fsel f5, f21, f5, f13
  525. fabs f13, f29
  526. fsel f6, f22, f6, f14
  527. fabs f14, f30
  528. fsel f7, f23, f7, f15
  529. fabs f15, f31
  530. LFDX f28, X, INCXM1
  531. LFDUX f29, X, INCX
  532. LFDX f30, X, INCXM1
  533. LFDUX f31, X, INCX
  534. bdnz LL(1010)
  535. .align 4
  /* Drain: fold the last block, which is already in f8-f15 / f24-f31; no further loads. */
  536. LL(1020):
  537. fsub f16, f0, f8
  538. fsub f17, f1, f9
  539. fsub f18, f2, f10
  540. fsub f19, f3, f11
  541. fsub f20, f4, f12
  542. fsub f21, f5, f13
  543. fsub f22, f6, f14
  544. fsub f23, f7, f15
  545. fsel f0, f16, f0, f8
  546. fabs f8, f24
  547. fsel f1, f17, f1, f9
  548. fabs f9, f25
  549. fsel f2, f18, f2, f10
  550. fabs f10, f26
  551. fsel f3, f19, f3, f11
  552. fabs f11, f27
  553. fsel f4, f20, f4, f12
  554. fabs f12, f28
  555. fsel f5, f21, f5, f13
  556. fabs f13, f29
  557. fsel f6, f22, f6, f14
  558. fabs f14, f30
  559. fsel f7, f23, f7, f15
  560. fabs f15, f31
  561. fsub f16, f0, f8
  562. fsub f17, f1, f9
  563. fsub f18, f2, f10
  564. fsub f19, f3, f11
  565. fsub f20, f4, f12
  566. fsub f21, f5, f13
  567. fsub f22, f6, f14
  568. fsub f23, f7, f15
  569. fsel f0, f16, f0, f8
  570. fsel f1, f17, f1, f9
  571. fsel f2, f18, f2, f10
  572. fsel f3, f19, f3, f11
  573. fsel f4, f20, f4, f12
  574. fsel f5, f21, f5, f13
  575. fsel f6, f22, f6, f14
  576. fsel f7, f23, f7, f15
  577. .align 4
  /* Pass 1 remainder, general stride: the N mod 8 elements left after the blocked loop. */
  578. LL(1050):
  579. andi. r0, N, 7
  580. mtspr CTR, r0
  581. beq LL(1999)
  582. .align 4
  /* One element per iteration; only the maxima in f0 and f1 are updated here. */
  583. LL(1060):
  584. LFDX f8, X, INCXM1
  585. LFDUX f9, X, INCX
  586. fabs f8, f8
  587. fabs f9, f9
  588. fsub f16, f0, f8
  589. fsub f17, f1, f9
  590. fsel f0, f16, f0, f8
  591. fsel f1, f17, f1, f9
  592. bdnz LL(1060)
  593. .align 4
  /* End of pass 1 (general stride): reduce the eight partial maxima f0-f7 pairwise into f31. */
  594. LL(1999):
  595. fsub f8, f0, f1
  596. fsub f9, f2, f3
  597. fsub f10, f4, f5
  598. fsub f11, f6, f7
  599. fsel f0, f8, f0, f1
  600. fsel f2, f9, f2, f3
  601. fsel f4, f10, f4, f5
  602. fsel f6, f11, f6, f7
  603. fsub f8, f0, f2
  604. fsub f9, f4, f6
  605. fsel f0, f8, f0, f2
  606. fsel f4, f9, f4, f6
  607. fsub f8, f0, f4
  608. fsel f31, f8, f0, f4
  /* f1 = 0.0, f0 = 1.0. If the maximum equals zero, exit with f1 = 0.0 (also avoids 1/0). */
  609. lfs f1, FZERO
  610. lfs f0, FONE
  611. fcmpu cr0, f1, f31
  612. beq- cr0, LL(9999)
  /* f30 = 1 / max: the scale factor applied to every component in pass 2. */
  613. fdiv f30, f0, f31
  /* Clear the sum-of-squares accumulators f0 and f2-f7; f1 is already 0.0. */
  614. fmr f0, f1
  615. fmr f2, f1
  616. fmr f3, f1
  617. fmr f4, f1
  618. fmr f5, f1
  619. fmr f6, f1
  620. fmr f7, f1
  /* Bias XX the same way X was biased at LL(1000), then count blocks of 8 over all NN elements. */
  621. sub XX, XX, INCXM1
  622. srawi. r0, NN, 3
  623. mtspr CTR, r0
  624. beq- cr0, LL(1150)
  /* Pass 2, general stride: accumulate (f30 * value)^2 into f0-f7, 8 elements per block. */
  /* Pipeline fill: load the first block and pre-scale its first four elements into f16-f23. */
  625. LFDX f8, XX, INCXM1
  626. LFDUX f9, XX, INCX
  627. LFDX f10, XX, INCXM1
  628. LFDUX f11, XX, INCX
  629. LFDX f12, XX, INCXM1
  630. LFDUX f13, XX, INCX
  631. LFDX f14, XX, INCXM1
  632. LFDUX f15, XX, INCX
  633. fmul f16, f30, f8
  634. fmul f17, f30, f9
  635. fmul f18, f30, f10
  636. fmul f19, f30, f11
  637. LFDX f8, XX, INCXM1
  638. LFDUX f9, XX, INCX
  639. LFDX f10, XX, INCXM1
  640. LFDUX f11, XX, INCX
  641. fmul f20, f30, f12
  642. fmul f21, f30, f13
  643. fmul f22, f30, f14
  644. fmul f23, f30, f15
  645. LFDX f12, XX, INCXM1
  646. LFDUX f13, XX, INCX
  647. LFDX f14, XX, INCXM1
  648. LFDUX f15, XX, INCX
  /* Only one block in total: skip the steady-state loop and drain at LL(1120). */
  649. bdz LL(1120)
  650. .align 4
  /* Steady state: each fmadd adds the square of a scaled value to its accumulator, */
  /* the paired fmul scales the next value, and the loads fetch the following block. */
  651. LL(1110):
  652. fmadd f0, f16, f16, f0
  653. fmul f16, f30, f8
  654. fmadd f1, f17, f17, f1
  655. fmul f17, f30, f9
  656. fmadd f2, f18, f18, f2
  657. fmul f18, f30, f10
  658. fmadd f3, f19, f19, f3
  659. fmul f19, f30, f11
  660. LFDX f8, XX, INCXM1
  661. LFDUX f9, XX, INCX
  662. LFDX f10, XX, INCXM1
  663. LFDUX f11, XX, INCX
  664. fmadd f4, f20, f20, f4
  665. fmul f20, f30, f12
  666. fmadd f5, f21, f21, f5
  667. fmul f21, f30, f13
  668. fmadd f6, f22, f22, f6
  669. fmul f22, f30, f14
  670. fmadd f7, f23, f23, f7
  671. fmul f23, f30, f15
  672. LFDX f12, XX, INCXM1
  673. LFDUX f13, XX, INCX
  674. LFDX f14, XX, INCXM1
  675. LFDUX f15, XX, INCX
  /* Second half of the iteration: the last four elements of the current block. */
  676. fmadd f0, f16, f16, f0
  677. fmul f16, f30, f8
  678. fmadd f1, f17, f17, f1
  679. fmul f17, f30, f9
  680. fmadd f2, f18, f18, f2
  681. fmul f18, f30, f10
  682. fmadd f3, f19, f19, f3
  683. fmul f19, f30, f11
  684. LFDX f8, XX, INCXM1
  685. LFDUX f9, XX, INCX
  686. LFDX f10, XX, INCXM1
  687. LFDUX f11, XX, INCX
  688. fmadd f4, f20, f20, f4
  689. fmul f20, f30, f12
  690. fmadd f5, f21, f21, f5
  691. fmul f21, f30, f13
  692. fmadd f6, f22, f22, f6
  693. fmul f22, f30, f14
  694. fmadd f7, f23, f23, f7
  695. fmul f23, f30, f15
  696. LFDX f12, XX, INCXM1
  697. LFDUX f13, XX, INCX
  698. LFDX f14, XX, INCXM1
  699. LFDUX f15, XX, INCX
  700. bdnz LL(1110)
  701. .align 4
  /* Drain: accumulate the last block, which is already in registers; no further loads. */
  702. LL(1120):
  703. fmadd f0, f16, f16, f0
  704. fmul f16, f30, f8
  705. fmadd f1, f17, f17, f1
  706. fmul f17, f30, f9
  707. fmadd f2, f18, f18, f2
  708. fmul f18, f30, f10
  709. fmadd f3, f19, f19, f3
  710. fmul f19, f30, f11
  711. fmadd f4, f20, f20, f4
  712. fmul f20, f30, f12
  713. fmadd f5, f21, f21, f5
  714. fmul f21, f30, f13
  715. fmadd f6, f22, f22, f6
  716. fmul f22, f30, f14
  717. fmadd f7, f23, f23, f7
  718. fmul f23, f30, f15
  719. fmadd f0, f16, f16, f0
  720. fmadd f1, f17, f17, f1
  721. fmadd f2, f18, f18, f2
  722. fmadd f3, f19, f19, f3
  723. fmadd f4, f20, f20, f4
  724. fmadd f5, f21, f21, f5
  725. fmadd f6, f22, f22, f6
  726. fmadd f7, f23, f23, f7
  727. .align 4
  /* Pass 2 remainder, general stride: the NN mod 8 elements left after the blocked loop. */
  728. LL(1150):
  729. andi. r0, NN, 7
  730. mtspr CTR, r0
  731. beq- cr0, LL(1170)
  732. .align 4
  /* One element per iteration: scale both components by f30 and add their squares to f0 / f1. */
  733. LL(1160):
  734. LFDX f8, XX, INCXM1
  735. LFDUX f9, XX, INCX
  736. fmul f16, f30, f8
  737. fmul f17, f30, f9
  738. fmadd f0, f16, f16, f0
  739. fmadd f1, f17, f17, f1
  740. bdnz LL(1160)
  741. .align 4
  /* Finish (general stride): add the eight partial sums pairwise into f0. */
  742. LL(1170):
  743. fadd f0, f0, f1
  744. fadd f2, f2, f3
  745. fadd f4, f4, f5
  746. fadd f6, f6, f7
  747. fadd f0, f0, f2
  748. fadd f4, f4, f6
  749. fadd f0, f0, f4
  /* Result f1 = max * sqrt(sum of scaled squares); execution falls through to the exit code. */
  750. fsqrt f0, f0
  751. fmul f1, f31, f0
  752. .align 4
  /* Common exit: f1 holds the result (0.0 on the early-exit paths). */
  /* Reload f14-f31 from the slots written on entry, release the frame and return. */
  753. LL(9999):
  754. lfd f14, 0(SP)
  755. lfd f15, 8(SP)
  756. lfd f16, 16(SP)
  757. lfd f17, 24(SP)
  758. lfd f18, 32(SP)
  759. lfd f19, 40(SP)
  760. lfd f20, 48(SP)
  761. lfd f21, 56(SP)
  762. lfd f22, 64(SP)
  763. lfd f23, 72(SP)
  764. lfd f24, 80(SP)
  765. lfd f25, 88(SP)
  766. lfd f26, 96(SP)
  767. lfd f27, 104(SP)
  768. lfd f28, 112(SP)
  769. lfd f29, 120(SP)
  770. lfd f30, 128(SP)
  771. lfd f31, 136(SP)
  772. addi SP, SP, STACKSIZE
  773. blr
  774. EPILOGUE