You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cnrm2_hummer.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases used by the routine below. */
  /* r3..r5 hold the values the routine starts with; r6 and r7 are derived. */
  40. #define N r3 /* count: tested against 0 and shifted to form the loop counts */
  41. #define X r4 /* base address for the vector loads */
  42. #define INCX r5 /* stride; shifted left by BASE_SHIFT at entry */
  43. #define INCX2 r6 /* set to INCX + INCX at entry; step used by every load */
  44. #define X2 r7 /* second load pointer, X + SIZE, used on the LL(200) path */
  /* C1..C8: sum-of-squares accumulators. C1 is f1, the register the */
  /* square-root code at the end of each path reads and writes. */
  45. #define C1 f1
  46. #define C2 f0
  47. #define C3 f2
  48. #define C4 f3
  49. #define C5 f4
  50. #define C6 f5
  51. #define C7 f6
  52. #define C8 f7
  /* A1..A16: destinations of the vector loads. A7..A16 map onto f14..f23, */
  /* the registers the entry code pushes on the stack and every exit reloads. */
  53. #define A1 f8
  54. #define A2 f9
  55. #define A3 f10
  56. #define A4 f11
  57. #define A5 f12
  58. #define A6 f13
  59. #define A7 f14
  60. #define A8 f15
  61. #define A9 f16
  62. #define A10 f17
  63. #define A11 f18
  64. #define A12 f19
  65. #define A13 f20
  66. #define A14 f21
  67. #define A15 f22
  68. #define A16 f23
  /* Entry: push f14..f23, build a 16-byte zero scratch area on the stack, */
  /* clear the eight accumulators and choose the code path for this X/INCX. */
  69. PROLOGUE
  70. PROFCODE
  /* Ten stores with r10 = -16 as the index; presumably update-form, so SP */
  /* moves down 16 bytes per store (the exits undo this in 16-byte steps). */
  71. li r10, -16
  72. stfpdux f14, SP, r10
  73. stfpdux f15, SP, r10
  74. stfpdux f16, SP, r10
  75. stfpdux f17, SP, r10
  76. stfpdux f18, SP, r10
  77. stfpdux f19, SP, r10
  78. stfpdux f20, SP, r10
  79. stfpdux f21, SP, r10
  80. stfpdux f22, SP, r10
  81. stfpdux f23, SP, r10
  /* Push four zero words: 0(SP)..12(SP) are all zero afterwards. The area */
  /* feeds the zero-clear load below; the reduction code later stores its */
  /* constants at 4(SP) and 8(SP) and reads 0(SP) as zero. */
  82. li r10, 0
  83. stwu r10, -4(SP)
  84. stwu r10, -4(SP)
  85. stwu r10, -4(SP)
  86. stwu r10, -4(SP)
  87. #ifdef F_INTERFACE
  /* With F_INTERFACE, r3 and r5 hold addresses: fetch the values through them. */
  88. LDINT N, 0(N)
  89. LDINT INCX, 0(INCX)
  90. #endif
  91. lfpdx C1, SP, r10 # Zero clear
  92. slwi INCX, INCX, BASE_SHIFT /* INCX <<= BASE_SHIFT */
  93. add INCX2, INCX, INCX /* INCX2 = 2 * INCX */
  /* Copy the cleared C1 into the other seven accumulators. */
  94. fpmr C2, C1
  95. fpmr C3, C1
  96. fpmr C4, C1
  97. fpmr C5, C1
  98. fpmr C6, C1
  99. fpmr C7, C1
  100. fpmr C8, C1
  101. cmpwi cr0, N, 0
  102. ble LL(99) /* N <= 0: exit with f1 (C1) still zero */
  103. cmpwi cr0, INCX, 0
  104. beq LL(99) /* INCX == 0: same early exit; a negative INCX is not rejected here */
  105. andi. r0, X, 2 * SIZE - 1
  106. bne LL(100) /* X is not a multiple of 2*SIZE: take the LL(100) path */
  /* Path for X aligned to 2*SIZE. Uses LFPDUX loads and fpmadd accumulation */
  /* (presumably the paired forms acting on both halves of each FP register - */
  /* confirm against common.h and the FPU manual). 16 loads per loop */
  /* iteration, N >> 4 iterations, then a tail for N & 15. */
  107. srawi. r0, N, 4 /* r0 = N >> 4, cr0 reflects the result */
  108. sub X, X, INCX2 /* pre-bias X: each load below steps X by INCX2 (presumably before the access) */
  109. mtspr CTR, r0
  110. beq- LL(15) /* N >> 4 == 0: only the tail code runs */
  /* Prime the pipeline with the first 16 loads. */
  111. LFPDUX A1, X, INCX2
  112. LFPDUX A2, X, INCX2
  113. LFPDUX A3, X, INCX2
  114. LFPDUX A4, X, INCX2
  115. LFPDUX A5, X, INCX2
  116. LFPDUX A6, X, INCX2
  117. LFPDUX A7, X, INCX2
  118. LFPDUX A8, X, INCX2
  119. LFPDUX A9, X, INCX2
  120. LFPDUX A10, X, INCX2
  121. LFPDUX A11, X, INCX2
  122. LFPDUX A12, X, INCX2
  123. LFPDUX A13, X, INCX2
  124. LFPDUX A14, X, INCX2
  125. LFPDUX A15, X, INCX2
  126. LFPDUX A16, X, INCX2
  127. bdz LL(13) /* CTR reaches zero: no refill needed, go to the drain */
  128. .align 4
  129. LL(12): /* steady state: Ck += Aj*Aj on the previous load, then refill Aj */
  130. fpmadd C1, A1, A1, C1
  131. LFPDUX A1, X, INCX2
  132. fpmadd C2, A2, A2, C2
  133. LFPDUX A2, X, INCX2
  134. fpmadd C3, A3, A3, C3
  135. LFPDUX A3, X, INCX2
  136. fpmadd C4, A4, A4, C4
  137. LFPDUX A4, X, INCX2
  138. fpmadd C5, A5, A5, C5
  139. LFPDUX A5, X, INCX2
  140. fpmadd C6, A6, A6, C6
  141. LFPDUX A6, X, INCX2
  142. fpmadd C7, A7, A7, C7
  143. LFPDUX A7, X, INCX2
  144. fpmadd C8, A8, A8, C8
  145. LFPDUX A8, X, INCX2
  146. fpmadd C1, A9, A9, C1
  147. LFPDUX A9, X, INCX2
  148. fpmadd C2, A10, A10, C2
  149. LFPDUX A10, X, INCX2
  150. fpmadd C3, A11, A11, C3
  151. LFPDUX A11, X, INCX2
  152. fpmadd C4, A12, A12, C4
  153. LFPDUX A12, X, INCX2
  154. fpmadd C5, A13, A13, C5
  155. LFPDUX A13, X, INCX2
  156. fpmadd C6, A14, A14, C6
  157. LFPDUX A14, X, INCX2
  158. fpmadd C7, A15, A15, C7
  159. LFPDUX A15, X, INCX2
  160. fpmadd C8, A16, A16, C8
  161. LFPDUX A16, X, INCX2
  162. bdnz LL(12)
  163. .align 4
  164. LL(13): /* drain: accumulate the last 16 loads without refilling */
  165. fpmadd C1, A1, A1, C1
  166. fpmadd C2, A2, A2, C2
  167. fpmadd C3, A3, A3, C3
  168. fpmadd C4, A4, A4, C4
  169. fpmadd C5, A5, A5, C5
  170. fpmadd C6, A6, A6, C6
  171. fpmadd C7, A7, A7, C7
  172. fpmadd C8, A8, A8, C8
  173. fpmadd C1, A9, A9, C1
  174. fpmadd C2, A10, A10, C2
  175. fpmadd C3, A11, A11, C3
  176. fpmadd C4, A12, A12, C4
  177. fpmadd C5, A13, A13, C5
  178. fpmadd C6, A14, A14, C6
  179. fpmadd C7, A15, A15, C7
  180. fpmadd C8, A16, A16, C8
  181. .align 4
  182. LL(15): /* tail: handle N & 15 one bit at a time (8, 4, 2, 1) */
  183. andi. r0, N, 15
  184. beq LL(98) /* no remainder: go reduce the accumulators */
  185. andi. r0, N, 8
  186. beq LL(16)
  /* Bit 8 set: eight more loads, one per accumulator. */
  187. LFPDUX A1, X, INCX2
  188. LFPDUX A2, X, INCX2
  189. LFPDUX A3, X, INCX2
  190. LFPDUX A4, X, INCX2
  191. LFPDUX A5, X, INCX2
  192. LFPDUX A6, X, INCX2
  193. LFPDUX A7, X, INCX2
  194. LFPDUX A8, X, INCX2
  195. fpmadd C1, A1, A1, C1
  196. fpmadd C2, A2, A2, C2
  197. fpmadd C3, A3, A3, C3
  198. fpmadd C4, A4, A4, C4
  199. fpmadd C5, A5, A5, C5
  200. fpmadd C6, A6, A6, C6
  201. fpmadd C7, A7, A7, C7
  202. fpmadd C8, A8, A8, C8
  203. .align 4
  204. LL(16): /* bit 4 of N: four more loads */
  205. andi. r0, N, 4
  206. beq LL(17)
  207. LFPDUX A1, X, INCX2
  208. LFPDUX A2, X, INCX2
  209. LFPDUX A3, X, INCX2
  210. LFPDUX A4, X, INCX2
  211. fpmadd C1, A1, A1, C1
  212. fpmadd C2, A2, A2, C2
  213. fpmadd C3, A3, A3, C3
  214. fpmadd C4, A4, A4, C4
  215. .align 4
  216. LL(17): /* bit 2 of N: two more loads */
  217. andi. r0, N, 2
  218. beq LL(18)
  219. LFPDUX A1, X, INCX2
  220. LFPDUX A2, X, INCX2
  221. fpmadd C1, A1, A1, C1
  222. fpmadd C2, A2, A2, C2
  223. .align 4
  224. LL(18): /* bit 1 of N: one last load, accumulated into C3 */
  225. andi. r0, N, 1
  226. beq LL(98)
  227. LFPDUX A1, X, INCX2
  228. fpmadd C3, A1, A1, C3
  229. .align 4
  230. LL(98): /* reduce C1..C8 to a single sum in f1, then return its square root */
  /* The integer instructions interleaved with the adds place the */
  /* single-precision bit patterns 0x3f000000 (0.5) and 0x40400000 (3.0) at */
  /* 4(SP) and 8(SP). 0(SP) still holds a zero word pushed by the entry code. */
  231. fpadd C1, C1, C5
  232. lis r3, 0x3f00 /* r3 = 0x3f000000 (N is no longer needed) */
  233. fpadd C2, C2, C6
  234. lis r4, 0x4040 /* r4 = 0x40400000 (X is no longer needed) */
  235. fpadd C3, C3, C7
  236. stw r3, 4(SP)
  237. fpadd C4, C4, C8
  238. stw r4, 8(SP)
  239. fpadd C1, C1, C2
  240. lfs f10, 0(SP) /* f10 = 0.0 */
  241. fpadd C3, C3, C4
  242. lfs f11, 4(SP) /* f11 = 0.5 */
  243. fpadd C1, C1, C3
  244. lfs f12, 8(SP) /* f12 = 3.0 */
  245. fsmtp C2, C1 /* presumably copies the secondary half of C1 into C2 - confirm against the FPU manual */
  246. fadd C1, C2, C1 /* f1 = sum of both halves */
  247. fcmpu cr0, f10, C1
  248. beq cr0, LL(99) /* sum == 0.0: exit without the square-root step, f1 stays zero */
  249. #ifndef HUMMER_EMULATOR
  /* Square root of s = f1 from the reciprocal-sqrt estimate, with the */
  /* register reloads interleaved between the arithmetic: */
  /*   y0 = frsqrte(s);  y1 = 0.5*y0*(3 - s*y0*y0);  r = s*y1; */
  /*   result = r + 0.5*r*(1 - r*y1) */
  250. frsqrte f9, f1 /* f9 = y0 */
  251. li r10, 16 /* index for the reloads; the first one also steps over the 16-byte scratch area */
  252. fmul f2, f1, f9 /* f2 = s*y0 */
  253. lfpdux f23, SP, r10
  254. fmul f3, f9, f11 /* f3 = 0.5*y0 */
  255. lfpdux f22, SP, r10
  256. fnmsub f4, f2, f9, f12 /* f4 = 3 - s*y0*y0 */
  257. lfpdux f21, SP, r10
  258. fmul f9, f3, f4 /* f9 = y1 */
  259. lfpdux f20, SP, r10
  260. fadd f13, f11, f11 /* f13 = 1.0 */
  261. lfpdux f19, SP, r10
  262. fmul f12, f1, f9 /* f12 = r = s*y1 */
  263. lfpdux f18, SP, r10
  264. fmul f11, f12, f11 /* f11 = 0.5*r */
  265. lfpdux f17, SP, r10
  266. fnmsub f1, f12, f9, f13 /* f1 = 1 - r*y1 */
  267. lfpdux f16, SP, r10
  268. lfpdux f15, SP, r10
  269. lfpdux f14, SP, r10
  270. addi SP, SP, 16 /* step over the last save slot */
  271. fmadd f1, f11, f1, f12 /* f1 = r + 0.5*r*(1 - r*y1) */
  272. blr
  273. #else
  /* HUMMER_EMULATOR build: use fsqrt directly, then reload f23..f14. */
  274. fsqrt f1, f1
  275. li r10, 16
  276. lfpdux f23, SP, r10
  277. lfpdux f22, SP, r10
  278. lfpdux f21, SP, r10
  279. lfpdux f20, SP, r10
  280. lfpdux f19, SP, r10
  281. lfpdux f18, SP, r10
  282. lfpdux f17, SP, r10
  283. lfpdux f16, SP, r10
  284. lfpdux f15, SP, r10
  285. lfpdux f14, SP, r10
  286. addi SP, SP, 16
  287. blr
  288. #endif
  289. .align 4
  290. LL(99): /* exit without a square root: reload f23..f14 and return; f1 is not touched */
  291. li r10, 16
  292. lfpdux f23, SP, r10
  293. lfpdux f22, SP, r10
  294. lfpdux f21, SP, r10
  295. lfpdux f20, SP, r10
  296. lfpdux f19, SP, r10
  297. lfpdux f18, SP, r10
  298. lfpdux f17, SP, r10
  299. lfpdux f16, SP, r10
  300. lfpdux f15, SP, r10
  301. lfpdux f14, SP, r10
  302. addi SP, SP, 16
  303. blr
  304. .align 4
  305. LL(100): /* reached when X is not a multiple of 2*SIZE */
  306. cmpwi cr0, INCX, SIZE
  307. bne LL(200) /* scaled stride != SIZE: use the LL(200) scalar-load path */
  /* Scaled stride == SIZE: square the first scalar on its own, advance X by */
  /* SIZE and reduce N by one, then run the LFPDUX loop from the new base. */
  /* The scalar left over at the far end is picked up at LL(198). */
  308. LFD C1, 0(X)
  309. addi X, X, 1 * SIZE
  310. addi N, N, -1
  311. cmpwi cr0, N, 0 /* cr0 is consumed by the ble three lines below */
  312. fmul C1, C1, C1 /* C1 = first value squared */
  313. sub X, X, INCX2 /* pre-bias X: each LFPDUX below steps X by INCX2 (presumably before the access) */
  314. ble LL(198) /* reduced N <= 0: skip the LFPDUX code entirely */
  315. srawi. r0, N, 4 /* loop count = N >> 4 */
  316. mtspr CTR, r0
  317. beq- LL(115) /* N >> 4 == 0: only the tail code runs */
  /* Prime the pipeline with the first 16 loads. */
  318. LFPDUX A1, X, INCX2
  319. LFPDUX A2, X, INCX2
  320. LFPDUX A3, X, INCX2
  321. LFPDUX A4, X, INCX2
  322. LFPDUX A5, X, INCX2
  323. LFPDUX A6, X, INCX2
  324. LFPDUX A7, X, INCX2
  325. LFPDUX A8, X, INCX2
  326. LFPDUX A9, X, INCX2
  327. LFPDUX A10, X, INCX2
  328. LFPDUX A11, X, INCX2
  329. LFPDUX A12, X, INCX2
  330. LFPDUX A13, X, INCX2
  331. LFPDUX A14, X, INCX2
  332. LFPDUX A15, X, INCX2
  333. LFPDUX A16, X, INCX2
  334. bdz LL(113) /* CTR reaches zero: no refill needed, go to the drain */
  335. .align 4
  336. LL(112): /* steady state: Ck += Aj*Aj on the previous load, then refill Aj */
  337. fpmadd C1, A1, A1, C1
  338. LFPDUX A1, X, INCX2
  339. fpmadd C2, A2, A2, C2
  340. LFPDUX A2, X, INCX2
  341. fpmadd C3, A3, A3, C3
  342. LFPDUX A3, X, INCX2
  343. fpmadd C4, A4, A4, C4
  344. LFPDUX A4, X, INCX2
  345. fpmadd C5, A5, A5, C5
  346. LFPDUX A5, X, INCX2
  347. fpmadd C6, A6, A6, C6
  348. LFPDUX A6, X, INCX2
  349. fpmadd C7, A7, A7, C7
  350. LFPDUX A7, X, INCX2
  351. fpmadd C8, A8, A8, C8
  352. LFPDUX A8, X, INCX2
  353. fpmadd C1, A9, A9, C1
  354. LFPDUX A9, X, INCX2
  355. fpmadd C2, A10, A10, C2
  356. LFPDUX A10, X, INCX2
  357. fpmadd C3, A11, A11, C3
  358. LFPDUX A11, X, INCX2
  359. fpmadd C4, A12, A12, C4
  360. LFPDUX A12, X, INCX2
  361. fpmadd C5, A13, A13, C5
  362. LFPDUX A13, X, INCX2
  363. fpmadd C6, A14, A14, C6
  364. LFPDUX A14, X, INCX2
  365. fpmadd C7, A15, A15, C7
  366. LFPDUX A15, X, INCX2
  367. fpmadd C8, A16, A16, C8
  368. LFPDUX A16, X, INCX2
  369. bdnz LL(112)
  370. .align 4
  371. LL(113): /* drain: accumulate the last 16 loads without refilling */
  372. fpmadd C1, A1, A1, C1
  373. fpmadd C2, A2, A2, C2
  374. fpmadd C3, A3, A3, C3
  375. fpmadd C4, A4, A4, C4
  376. fpmadd C5, A5, A5, C5
  377. fpmadd C6, A6, A6, C6
  378. fpmadd C7, A7, A7, C7
  379. fpmadd C8, A8, A8, C8
  380. fpmadd C1, A9, A9, C1
  381. fpmadd C2, A10, A10, C2
  382. fpmadd C3, A11, A11, C3
  383. fpmadd C4, A12, A12, C4
  384. fpmadd C5, A13, A13, C5
  385. fpmadd C6, A14, A14, C6
  386. fpmadd C7, A15, A15, C7
  387. fpmadd C8, A16, A16, C8
  388. .align 4
  389. LL(115): /* tail: handle (reduced) N & 15 one bit at a time (8, 4, 2, 1) */
  390. andi. r0, N, 15
  391. beq LL(198)
  392. andi. r0, N, 8
  393. beq LL(116)
  /* Bit 8 set: eight more loads, one per accumulator. */
  394. LFPDUX A1, X, INCX2
  395. LFPDUX A2, X, INCX2
  396. LFPDUX A3, X, INCX2
  397. LFPDUX A4, X, INCX2
  398. LFPDUX A5, X, INCX2
  399. LFPDUX A6, X, INCX2
  400. LFPDUX A7, X, INCX2
  401. LFPDUX A8, X, INCX2
  402. fpmadd C1, A1, A1, C1
  403. fpmadd C2, A2, A2, C2
  404. fpmadd C3, A3, A3, C3
  405. fpmadd C4, A4, A4, C4
  406. fpmadd C5, A5, A5, C5
  407. fpmadd C6, A6, A6, C6
  408. fpmadd C7, A7, A7, C7
  409. fpmadd C8, A8, A8, C8
  410. .align 4
  411. LL(116): /* bit 4 of N: four more loads */
  412. andi. r0, N, 4
  413. beq LL(117)
  414. LFPDUX A1, X, INCX2
  415. LFPDUX A2, X, INCX2
  416. LFPDUX A3, X, INCX2
  417. LFPDUX A4, X, INCX2
  418. fpmadd C1, A1, A1, C1
  419. fpmadd C2, A2, A2, C2
  420. fpmadd C3, A3, A3, C3
  421. fpmadd C4, A4, A4, C4
  422. .align 4
  423. LL(117): /* bit 2 of N: two more loads */
  424. andi. r0, N, 2
  425. beq LL(118)
  426. LFPDUX A1, X, INCX2
  427. LFPDUX A2, X, INCX2
  428. fpmadd C1, A1, A1, C1
  429. fpmadd C2, A2, A2, C2
  430. .align 4
  431. LL(118): /* bit 1 of N: one last load, accumulated into C3 */
  432. andi. r0, N, 1
  433. beq LL(198)
  434. LFPDUX A1, X, INCX2
  435. fpmadd C3, A1, A1, C3
  436. .align 4
  437. LL(198): /* end of the LL(100) path: add one trailing scalar, reduce, take the square root */
  438. LFDX A1, X, INCX2 /* scalar load addressed by X + INCX2 */
  439. fmadd C4, A1, A1, C4 /* C4 += A1*A1 (scalar multiply-add) */
  /* The integer instructions interleaved with the adds place the */
  /* single-precision bit patterns 0x3f000000 (0.5) and 0x40400000 (3.0) at */
  /* 4(SP) and 8(SP). 0(SP) still holds a zero word pushed by the entry code. */
  440. fpadd C1, C1, C5
  441. lis r3, 0x3f00 /* r3 = 0x3f000000 (N is no longer needed) */
  442. fpadd C2, C2, C6
  443. lis r4, 0x4040 /* r4 = 0x40400000 (X is no longer needed) */
  444. fpadd C3, C3, C7
  445. stw r3, 4(SP)
  446. fpadd C4, C4, C8
  447. stw r4, 8(SP)
  448. fpadd C1, C1, C2
  449. lfs f10, 0(SP) /* f10 = 0.0 */
  450. fpadd C3, C3, C4
  451. lfs f11, 4(SP) /* f11 = 0.5 */
  452. fpadd C1, C1, C3
  453. lfs f12, 8(SP) /* f12 = 3.0 */
  454. fsmtp C2, C1 /* presumably copies the secondary half of C1 into C2 - confirm against the FPU manual */
  455. fadd C1, C2, C1 /* f1 = sum of both halves */
  456. fcmpu cr0, f10, C1
  457. beq cr0, LL(199) /* sum == 0.0: exit without the square-root step, f1 stays zero */
  458. #ifndef HUMMER_EMULATOR
  /* Square root of s = f1 from the reciprocal-sqrt estimate, with the */
  /* register reloads interleaved between the arithmetic: */
  /*   y0 = frsqrte(s);  y1 = 0.5*y0*(3 - s*y0*y0);  r = s*y1; */
  /*   result = r + 0.5*r*(1 - r*y1) */
  459. frsqrte f9, f1 /* f9 = y0 */
  460. li r10, 16 /* index for the reloads; the first one also steps over the 16-byte scratch area */
  461. fmul f2, f1, f9 /* f2 = s*y0 */
  462. lfpdux f23, SP, r10
  463. fmul f3, f9, f11 /* f3 = 0.5*y0 */
  464. lfpdux f22, SP, r10
  465. fnmsub f4, f2, f9, f12 /* f4 = 3 - s*y0*y0 */
  466. lfpdux f21, SP, r10
  467. fmul f9, f3, f4 /* f9 = y1 */
  468. lfpdux f20, SP, r10
  469. fadd f13, f11, f11 /* f13 = 1.0 */
  470. lfpdux f19, SP, r10
  471. fmul f12, f1, f9 /* f12 = r = s*y1 */
  472. lfpdux f18, SP, r10
  473. fmul f11, f12, f11 /* f11 = 0.5*r */
  474. lfpdux f17, SP, r10
  475. fnmsub f1, f12, f9, f13 /* f1 = 1 - r*y1 */
  476. lfpdux f16, SP, r10
  477. lfpdux f15, SP, r10
  478. lfpdux f14, SP, r10
  479. addi SP, SP, 16 /* step over the last save slot */
  480. fmadd f1, f11, f1, f12 /* f1 = r + 0.5*r*(1 - r*y1) */
  481. blr
  482. #else
  /* HUMMER_EMULATOR build: use fsqrt directly, then reload f23..f14. */
  483. fsqrt f1, f1
  484. li r10, 16
  485. lfpdux f23, SP, r10
  486. lfpdux f22, SP, r10
  487. lfpdux f21, SP, r10
  488. lfpdux f20, SP, r10
  489. lfpdux f19, SP, r10
  490. lfpdux f18, SP, r10
  491. lfpdux f17, SP, r10
  492. lfpdux f16, SP, r10
  493. lfpdux f15, SP, r10
  494. lfpdux f14, SP, r10
  495. addi SP, SP, 16
  496. blr
  497. #endif
  498. .align 4
  499. LL(199): /* exit without a square root: reload f23..f14 and return; f1 is not touched */
  500. li r10, 16
  501. lfpdux f23, SP, r10
  502. lfpdux f22, SP, r10
  503. lfpdux f21, SP, r10
  504. lfpdux f20, SP, r10
  505. lfpdux f19, SP, r10
  506. lfpdux f18, SP, r10
  507. lfpdux f17, SP, r10
  508. lfpdux f16, SP, r10
  509. lfpdux f15, SP, r10
  510. lfpdux f14, SP, r10
  511. addi SP, SP, 16
  512. blr
  513. .align 4
  514. LL(200): /* X not a multiple of 2*SIZE and scaled stride != SIZE: scalar loads via two pointers */
  /* X and X2 = X + SIZE both advance by INCX2 per load, so each X/X2 load */
  /* pair reads two values SIZE bytes apart. Accumulation uses the scalar */
  /* fmadd. 16 loads (8 pairs) per loop iteration, N >> 3 iterations. */
  515. sub X, X, INCX2 /* pre-bias X: each LFDUX steps its pointer by INCX2 (presumably before the access) */
  516. addi X2, X, SIZE /* X2 = biased X + SIZE */
  517. srawi. r0, N, 3 /* loop count = N >> 3 */
  518. mtspr CTR, r0
  519. beq- LL(215) /* N >> 3 == 0: only the tail code runs */
  /* Prime the pipeline with the first 16 loads. */
  520. LFDUX A1, X, INCX2
  521. LFDUX A2, X2, INCX2
  522. LFDUX A3, X, INCX2
  523. LFDUX A4, X2, INCX2
  524. LFDUX A5, X, INCX2
  525. LFDUX A6, X2, INCX2
  526. LFDUX A7, X, INCX2
  527. LFDUX A8, X2, INCX2
  528. LFDUX A9, X, INCX2
  529. LFDUX A10, X2, INCX2
  530. LFDUX A11, X, INCX2
  531. LFDUX A12, X2, INCX2
  532. LFDUX A13, X, INCX2
  533. LFDUX A14, X2, INCX2
  534. LFDUX A15, X, INCX2
  535. LFDUX A16, X2, INCX2
  536. bdz LL(213) /* CTR reaches zero: no refill needed, go to the drain */
  537. .align 4
  538. LL(212): /* steady state: Ck += Aj*Aj on the previous load, then refill Aj */
  539. fmadd C1, A1, A1, C1
  540. LFDUX A1, X, INCX2
  541. fmadd C2, A2, A2, C2
  542. LFDUX A2, X2, INCX2
  543. fmadd C3, A3, A3, C3
  544. LFDUX A3, X, INCX2
  545. fmadd C4, A4, A4, C4
  546. LFDUX A4, X2, INCX2
  547. fmadd C5, A5, A5, C5
  548. LFDUX A5, X, INCX2
  549. fmadd C6, A6, A6, C6
  550. LFDUX A6, X2, INCX2
  551. fmadd C7, A7, A7, C7
  552. LFDUX A7, X, INCX2
  553. fmadd C8, A8, A8, C8
  554. LFDUX A8, X2, INCX2
  555. fmadd C1, A9, A9, C1
  556. LFDUX A9, X, INCX2
  557. fmadd C2, A10, A10, C2
  558. LFDUX A10, X2, INCX2
  559. fmadd C3, A11, A11, C3
  560. LFDUX A11, X, INCX2
  561. fmadd C4, A12, A12, C4
  562. LFDUX A12, X2, INCX2
  563. fmadd C5, A13, A13, C5
  564. LFDUX A13, X, INCX2
  565. fmadd C6, A14, A14, C6
  566. LFDUX A14, X2, INCX2
  567. fmadd C7, A15, A15, C7
  568. LFDUX A15, X, INCX2
  569. fmadd C8, A16, A16, C8
  570. LFDUX A16, X2, INCX2
  571. bdnz LL(212)
  572. .align 4
  573. LL(213): /* drain: accumulate the last 16 loads without refilling */
  574. fmadd C1, A1, A1, C1
  575. fmadd C2, A2, A2, C2
  576. fmadd C3, A3, A3, C3
  577. fmadd C4, A4, A4, C4
  578. fmadd C5, A5, A5, C5
  579. fmadd C6, A6, A6, C6
  580. fmadd C7, A7, A7, C7
  581. fmadd C8, A8, A8, C8
  582. fmadd C1, A9, A9, C1
  583. fmadd C2, A10, A10, C2
  584. fmadd C3, A11, A11, C3
  585. fmadd C4, A12, A12, C4
  586. fmadd C5, A13, A13, C5
  587. fmadd C6, A14, A14, C6
  588. fmadd C7, A15, A15, C7
  589. fmadd C8, A16, A16, C8
  590. .align 4
  591. LL(215): /* tail: handle N & 7 one bit at a time (4, 2, 1) */
  592. andi. r0, N, 7
  593. beq LL(998) /* no remainder: go reduce the accumulators */
  594. andi. r0, N, 4
  595. beq LL(216)
  /* Bit 4 set: four X/X2 load pairs. */
  596. LFDUX A1, X, INCX2
  597. LFDUX A2, X2, INCX2
  598. LFDUX A3, X, INCX2
  599. LFDUX A4, X2, INCX2
  600. LFDUX A5, X, INCX2
  601. LFDUX A6, X2, INCX2
  602. LFDUX A7, X, INCX2
  603. LFDUX A8, X2, INCX2
  604. fmadd C1, A1, A1, C1
  605. fmadd C2, A2, A2, C2
  606. fmadd C3, A3, A3, C3
  607. fmadd C4, A4, A4, C4
  608. fmadd C5, A5, A5, C5
  609. fmadd C6, A6, A6, C6
  610. fmadd C7, A7, A7, C7
  611. fmadd C8, A8, A8, C8
  612. .align 4
  613. LL(216): /* bit 2 of N: two X/X2 load pairs */
  614. andi. r0, N, 2
  615. beq LL(217)
  616. LFDUX A1, X, INCX2
  617. LFDUX A2, X2, INCX2
  618. LFDUX A3, X, INCX2
  619. LFDUX A4, X2, INCX2
  620. fmadd C1, A1, A1, C1
  621. fmadd C2, A2, A2, C2
  622. fmadd C3, A3, A3, C3
  623. fmadd C4, A4, A4, C4
  624. .align 4
  625. LL(217): /* bit 1 of N: one last X/X2 load pair */
  626. andi. r0, N, 1
  627. beq LL(998)
  628. LFDUX A1, X, INCX2
  629. LFDUX A2, X2, INCX2
  630. fmadd C1, A1, A1, C1
  631. fmadd C2, A2, A2, C2
  632. .align 4
  633. LL(998): /* end of the LL(200) path: scalar reduction of C1..C8 into f1, then square root */
  /* Only scalar fadd is used here and there is no fsmtp step, matching the */
  /* scalar fmadd accumulation on this path. The interleaved integer */
  /* instructions place the single-precision bit patterns 0x3f000000 (0.5) */
  /* and 0x40400000 (3.0) at 4(SP) and 8(SP); 0(SP) still holds a zero word */
  /* pushed by the entry code. */
  634. fadd C1, C1, C5
  635. lis r3, 0x3f00 /* r3 = 0x3f000000 (N is no longer needed) */
  636. fadd C2, C2, C6
  637. lis r4, 0x4040 /* r4 = 0x40400000 (X is no longer needed) */
  638. fadd C3, C3, C7
  639. stw r3, 4(SP)
  640. fadd C4, C4, C8
  641. stw r4, 8(SP)
  642. fadd C1, C1, C2
  643. lfs f10, 0(SP) /* f10 = 0.0 */
  644. fadd C3, C3, C4
  645. lfs f11, 4(SP) /* f11 = 0.5 */
  646. fadd C1, C1, C3 /* f1 = total sum of squares */
  647. lfs f12, 8(SP) /* f12 = 3.0 */
  648. fcmpu cr0, f10, C1
  649. beq cr0, LL(99) /* sum == 0.0: leave through LL(99) (not LL(999)) with f1 still zero */
  /* NOTE(review): the LL(98) and LL(198) paths wrap this sequence in */
  /* #ifndef HUMMER_EMULATOR and fall back to fsqrt otherwise; this path */
  /* always uses frsqrte. Confirm whether that difference is intended. */
  /* Square root of s = f1 from the reciprocal-sqrt estimate, with the */
  /* register reloads interleaved between the arithmetic: */
  /*   y0 = frsqrte(s);  y1 = 0.5*y0*(3 - s*y0*y0);  r = s*y1; */
  /*   result = r + 0.5*r*(1 - r*y1) */
  650. frsqrte f9, f1 /* f9 = y0 */
  651. li r10, 16 /* index for the reloads; the first one also steps over the 16-byte scratch area */
  652. fmul f2, f1, f9 /* f2 = s*y0 */
  653. lfpdux f23, SP, r10
  654. fmul f3, f9, f11 /* f3 = 0.5*y0 */
  655. lfpdux f22, SP, r10
  656. fnmsub f4, f2, f9, f12 /* f4 = 3 - s*y0*y0 */
  657. lfpdux f21, SP, r10
  658. fmul f9, f3, f4 /* f9 = y1 */
  659. lfpdux f20, SP, r10
  660. fadd f13, f11, f11 /* f13 = 1.0 */
  661. lfpdux f19, SP, r10
  662. fmul f12, f1, f9 /* f12 = r = s*y1 */
  663. lfpdux f18, SP, r10
  664. fmul f11, f12, f11 /* f11 = 0.5*r */
  665. lfpdux f17, SP, r10
  666. fnmsub f1, f12, f9, f13 /* f1 = 1 - r*y1 */
  667. lfpdux f16, SP, r10
  668. lfpdux f15, SP, r10
  669. lfpdux f14, SP, r10
  670. addi SP, SP, 16 /* step over the last save slot */
  671. fmadd f1, f11, f1, f12 /* f1 = r + 0.5*r*(1 - r*y1) */
  672. blr
  /* NOTE(review): no branch in the visible source targets LL(999); it is the */
  /* same reload-and-return sequence as LL(99) and looks unreachable. */
  673. LL(999):
  674. li r10, 16
  675. lfpdux f23, SP, r10
  676. lfpdux f22, SP, r10
  677. lfpdux f21, SP, r10
  678. lfpdux f20, SP, r10
  679. lfpdux f19, SP, r10
  680. lfpdux f18, SP, r10
  681. lfpdux f17, SP, r10
  682. lfpdux f16, SP, r10
  683. lfpdux f15, SP, r10
  684. lfpdux f14, SP, r10
  685. addi SP, SP, 16
  686. blr
  687. EPILOGUE