
trsm_kernel_RT_2x2.S 19 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
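
/*********************************************************************/
/* Overview (inferred from the code below):                          */
/* 2x2-unrolled TRSM micro-kernel for 32-bit x86 using the x87 FPU.  */
/* The LN/LT/RN/RT and DOUBLE preprocessor macros select which       */
/* side/transpose and precision variant gets compiled; the file name */
/* suggests this object is built as the RT instance.  Arguments      */
/* (M, N, K, ALPHA, A, B, C, LDC, OFFSET) are read from the stack.   */
/* AA (%edx) and BB (%ecx) walk the packed A and B panels, %ebx      */
/* marks the start of the current B panel, %edi the current column   */
/* block of C, %ebp holds ldc scaled to element size, and KK carries */
/* the running offset that determines how much of the k loop each    */
/* block accumulates before its solve.                               */
/*********************************************************************/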
#define ASSEMBLER
#include "common.h"
#define STACK 16
#define ARGS 16
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define ALPHA 16 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define A 24 + STACK + ARGS(%esp)
#define B 28 + STACK + ARGS(%esp)
#define C 32 + STACK + ARGS(%esp)
#define LDC 36 + STACK + ARGS(%esp)
#define OFFSET 40 + STACK + ARGS(%esp)
#else
#define A 20 + STACK + ARGS(%esp)
#define B 24 + STACK + ARGS(%esp)
#define C 28 + STACK + ARGS(%esp)
#define LDC 32 + STACK + ARGS(%esp)
#define OFFSET 36 + STACK + ARGS(%esp)
#endif
#define PREFETCH_OFFSET 48
#if defined(PENTIUM3) || defined(PENTIUMM)
#define REP rep
#else
#define REP rep
#endif
#define AA %edx
#define BB %ecx
        PROLOGUE
        subl $ARGS, %esp # Generate Stack Frame
        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx
        PROFCODE
        movl LDC, %ebp # ldc # MEMORY
        movl B, %ebx
        leal (, %ebp, SIZE), %ebp
#ifdef LN
        movl M, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, C
        imull K, %eax
        addl %eax, A
#endif
#ifdef RT
        movl N, %eax
        leal (, %eax, SIZE), %eax
        imull K, %eax
        addl %eax, %ebx
        movl N, %eax
        imull %ebp, %eax
        addl %eax, C
#endif
#ifdef RN
        negl KK
#endif
#ifdef RT
        movl N, %eax
        subl OFFSET, %eax
        movl %eax, KK
#endif
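/* When N is odd, the single leftover column of B/C is handled first. */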
        movl N, %eax # n # MEMORY
        andl $1, %eax
        je .L8
#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif
#ifdef RT
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        subl %eax, %ebx
#endif
#ifdef RT
        subl %ebp, C
#endif
        movl C, %edi # c # MEMORY
#ifndef RT
        addl %ebp, C
#endif
#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif
#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif
        movl M, %esi # m # MEMORY
        sarl $1, %esi # m >> 1
        je .L36
        ALIGN_4
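/* .L46: loop over row pairs (i = m >> 1) for the leftover column.    */
/* A 2x1 product is accumulated over k (.L57 plus the k & 1 tail),    */
/* then the 2x1 block is solved and stored back to the packed panel   */
/* and to C.                                                          */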
.L46:
#ifdef LN
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 2), AA
        leal (%ebx, %eax, 1), BB
#else
        movl %ebx, BB
#endif
        fldz
        fldz
        FLD 0 * SIZE(BB) # temp1 = *(boffset + 0)
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $1, %eax
        je .L56
        ALIGN_4
.L57:
        FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0)
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0)
        faddp %st, %st(2)
        FLD 1 * SIZE(BB) # temp1 = *(boffset + 0)
        FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0)
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0)
        faddp %st, %st(2)
        FLD 2 * SIZE(BB) # temp1 = *(boffset + 0)
        addl $4 * SIZE,AA
        addl $2 * SIZE,BB
        dec %eax
        jne .L57
        ALIGN_4
.L56:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $1, %eax
        je .L45
        ALIGN_4
        FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0)
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0)
        faddp %st, %st(2)
        FLD 3 * SIZE(BB) # temp1 = *(boffset + 0)
        addl $2 * SIZE,AA
        addl $1 * SIZE,BB
        ALIGN_4
.L45:
        ffreep %st(0)
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $1, %eax
#endif
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 2), AA
        leal (%ebx, %eax, 1), BB
#endif
#if defined(LN) || defined(LT)
        FLD 0 * SIZE(BB)
        fsubp %st, %st(1)
        FLD 1 * SIZE(BB)
        fsubp %st, %st(2)
#else
        FLD 0 * SIZE(AA)
        fsubp %st, %st(1)
        FLD 1 * SIZE(AA)
        fsubp %st, %st(2)
#endif
#ifdef LN
        FLD 3 * SIZE(AA)
        fmulp %st, %st(2)
        FLD 2 * SIZE(AA)
        fmul %st(2), %st
        fsubrp %st, %st(1)
        FLD 0 * SIZE(AA)
        fmulp %st, %st(1)
#endif
#ifdef LT
        FLD 0 * SIZE(AA)
        fmulp %st, %st(1)
        FLD 1 * SIZE(AA)
        fmul %st(1), %st
        fsubrp %st, %st(2)
        FLD 3 * SIZE(AA)
        fmulp %st, %st(2)
#endif
#ifdef RN
        FLD 0 * SIZE(BB)
        fmul %st, %st(1)
        fmulp %st, %st(2)
#endif
#ifdef RT
        FLD 0 * SIZE(BB)
        fmul %st, %st(1)
        fmulp %st, %st(2)
#endif
#ifdef LN
        subl $2 * SIZE, %edi
#endif
#if defined(LN) || defined(LT)
        FSTU 0 * SIZE(BB)
        fxch %st(1)
        FSTU 1 * SIZE(BB)
#else
        FSTU 0 * SIZE(AA)
        fxch %st(1)
        FSTU 1 * SIZE(AA)
#endif
        FST 1 * SIZE(%edi)
        FST 0 * SIZE(%edi)
#ifndef LN
        addl $2 * SIZE, %edi
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 1), BB
#endif
#ifdef LN
        subl $2, KK
#endif
#ifdef LT
        addl $2, KK
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        decl %esi # i --
        jne .L46
        ALIGN_4
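/* .L36: M odd -- handle the last single row against the leftover column. */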
.L36:
        movl M, %eax # m # MEMORY
        andl $1, %eax # m & 1
        je .L99
#ifdef LN
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 1), AA
        leal (%ebx, %eax, 1), BB
#else
        movl %ebx, BB
#endif
        fldz
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        test %eax, %eax
        jle .L52
        ALIGN_3
.L51:
        FLD (AA)
        FMUL (BB)
        addl $1 * SIZE,AA
        addl $1 * SIZE,BB
        faddp %st,%st(1)
        decl %eax
        jne .L51
        ALIGN_4
.L52:
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $1, %eax
#endif
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 1), AA
        leal (%ebx, %eax, 1), BB
#endif
#if defined(LN) || defined(LT)
        FLD 0 * SIZE(BB)
        fsubp %st, %st(1)
#else
        FLD 0 * SIZE(AA)
        fsubp %st, %st(1)
#endif
#if defined(LN) || defined(LT)
        FMUL 0 * SIZE(AA)
#else
        FMUL 0 * SIZE(BB)
#endif
#ifdef LN
        subl $1 * SIZE, %edi
#endif
#if defined(LN) || defined(LT)
        FSTU 0 * SIZE(BB)
#else
        FSTU 0 * SIZE(AA)
#endif
        FST 0 * SIZE(%edi)
#ifndef LN
        addl $1 * SIZE, %edi
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 1), BB
#endif
#ifdef LN
        subl $1, KK
#endif
#ifdef LT
        addl $1, KK
#endif
#ifdef RT
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4
.L99:
#ifdef LN
        movl K, %eax
        leal (%ebx, %eax, SIZE), %ebx
#endif
#if defined(LT) || defined(RN)
        movl BB, %ebx
#endif
#ifdef RN
        addl $1, KK
#endif
#ifdef RT
        subl $1, KK
#endif
        ALIGN_4
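/* .L8: main loop over pairs of columns (J = n >> 1). */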
.L8:
        movl N, %eax # j = (n >> 1) # MEMORY
        sarl $1, %eax
        movl %eax, J # j = (n >> 1) # MEMORY
        je .End
        ALIGN_4
.L34:
#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, %ebx
#endif
        lea (, %ebp, 2), %eax
#ifdef RT
        subl %eax, C
#endif
        movl C, %edi
#ifndef RT
        addl %eax, C
#endif
#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif
#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif
        movl M, %esi
        sarl $1, %esi
        je .L12
        ALIGN_4
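/* .MainHead: one 2x2 block of C per iteration.  The block is first   */
/* accumulated as a small matrix product over k (.MainLoop/.SubLoop), */
/* then solved in place starting at .L21.                             */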
.MainHead:
#ifdef LN
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 2), AA
        leal (%ebx, %eax, 2), BB
#else
        movl %ebx, BB
#endif
        fldz
        fldz
        fldz
        fldz
        FLD 4 * SIZE(BB) # b5
        FLD 4 * SIZE(AA) # a5
        FLD 0 * SIZE(BB) # b1
        FLD 0 * SIZE(AA) # a1
#if defined(HAVE_3DNOW)
        prefetchw 2 * SIZE(%edi)
        prefetchw 2 * SIZE(%edi, %ebp, 1)
#elif defined(HAVE_SSE)
        prefetchnta 2 * SIZE(%edi)
        prefetchnta 2 * SIZE(%edi, %ebp, 1)
#endif
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $2, %eax
        je .L16
        ALIGN_4
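/* .MainLoop: inner update loop, unrolled four times in k.  The four  */
/* accumulators plus the current a/b operands occupy the x87 register */
/* stack; prefetches target upcoming packed B (and optionally A)      */
/* cache lines.  .SubLoop handles the k mod 4 remainder one step at   */
/* a time.                                                            */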
.MainLoop:
#if defined(HAVE_3DNOW)
        prefetch (PREFETCH_OFFSET) * SIZE(BB)
        nop
#elif defined(HAVE_SSE)
        prefetchnta (PREFETCH_OFFSET) * SIZE(BB)
#if (L2_SIZE == 524288)
        prefetcht0 (PREFETCH_OFFSET) * SIZE(AA)
#endif
#endif
        fmul %st, %st(1)
        FMUL 1 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(4)
        FLD 0 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(5)
        FLD 1 * SIZE(AA)
        fmul %st, %st(1)
        FMUL 1 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(6)
        FLD 2 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(7)
        FLD 2 * SIZE(AA)
        fmul %st, %st(1)
        FMUL 3 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(4)
        FLD 2 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(5)
        FLD 3 * SIZE(AA)
        fmul %st, %st(1)
        FMUL 3 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(6)
        FLD 8 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(7)
        FLD 8 * SIZE(AA)
        fxch %st(2)
#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE)
        prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB)
#if (L2_SIZE == 524288)
        prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA)
#endif
#endif
        fmul %st, %st(3)
        FMUL 5 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(4)
        FLD 4 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(5)
        FLD 5 * SIZE(AA)
        fmul %st, %st(3)
        FMUL 5 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(6)
        FLD 6 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(7)
        FLD 6 * SIZE(AA)
        fmul %st, %st(3)
        FMUL 7 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(4)
        FLD 6 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(5)
        FLD 7 * SIZE(AA)
        fmul %st, %st(3)
        FMUL 7 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(6)
        FLD 12 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(7)
        FLD 12 * SIZE(AA)
        fxch %st(2)
        subl $-8 * SIZE, BB
        subl $-8 * SIZE, AA
        decl %eax # l --
        jne .MainLoop
        ALIGN_4
.L16:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        and $3, %eax
        je .L21
        ALIGN_4
.SubLoop:
        fmul %st, %st(1)
        FMUL 1 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(4)
        FLD 0 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(5)
        FLD 1 * SIZE(AA)
        fmul %st, %st(1)
        FMUL 1 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(6)
        FLD 2 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(7)
        FLD 2 * SIZE(AA)
        addl $2 * SIZE,BB
        addl $2 * SIZE,AA
        decl %eax
        jne .SubLoop
        ALIGN_4
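/* .L21: solve phase for the 2x2 block.  The preloaded a/b operands   */
/* are dropped, the stored right-hand sides are reloaded from the     */
/* packed panel and combined with the accumulated products, and the   */
/* block is solved against the 2x2 triangle.  The diagonal entries    */
/* are multiplied rather than divided, which suggests the packing     */
/* routines store them already inverted.  Results are written back    */
/* to the packed panel (FSTU) and to both columns of C (FST).         */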
.L21:
        ffreep %st(0)
        ffreep %st(0)
        ffreep %st(0)
        ffreep %st(0)
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $2, %eax
#endif
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 2), AA
        leal (%ebx, %eax, 2), BB
#endif
#if defined(LN) || defined(LT)
        FLD 0 * SIZE(BB)
        fsubp %st, %st(1)
        FLD 1 * SIZE(BB)
        fsubp %st, %st(2)
        FLD 2 * SIZE(BB)
        fsubp %st, %st(3)
        FLD 3 * SIZE(BB)
        fsubp %st, %st(4)
#else
        FLD 0 * SIZE(AA)
        fsubp %st, %st(1)
        FLD 1 * SIZE(AA)
        fsubp %st, %st(3)
        FLD 2 * SIZE(AA)
        fsubp %st, %st(2)
        FLD 3 * SIZE(AA)
        fsubp %st, %st(4)
#endif
#ifdef LN
        FLD 3 * SIZE(AA)
        fmul %st, %st(3)
        fmulp %st, %st(4)
        FLD 2 * SIZE(AA)
        fmul %st(3), %st
        FLD 2 * SIZE(AA)
        fmul %st(5), %st
        fsubrp %st, %st(3)
        fsubrp %st, %st(1)
        FLD 0 * SIZE(AA)
        fmul %st, %st(1)
        fmulp %st, %st(2)
#endif
#ifdef LT
        FLD 0 * SIZE(AA)
        fmul %st, %st(1)
        fmulp %st, %st(2)
        FLD 1 * SIZE(AA)
        fmul %st(1), %st
        FLD 1 * SIZE(AA)
        fmul %st(3), %st
        fsubrp %st, %st(5)
        fsubrp %st, %st(3)
        FLD 3 * SIZE(AA)
        fmul %st, %st(3)
        fmulp %st, %st(4)
#endif
#ifdef RN
        FLD 0 * SIZE(BB)
        fmul %st, %st(1)
        fmulp %st, %st(3)
        FLD 1 * SIZE(BB)
        fmul %st(1), %st
        FLD 1 * SIZE(BB)
        fmul %st(4), %st
        fsubrp %st, %st(5)
        fsubrp %st, %st(2)
        FLD 3 * SIZE(BB)
        fmul %st, %st(2)
        fmulp %st, %st(4)
#endif
#ifdef RT
        FLD 3 * SIZE(BB)
        fmul %st, %st(2)
        fmulp %st, %st(4)
        FLD 2 * SIZE(BB)
        fmul %st(2), %st
        FLD 2 * SIZE(BB)
        fmul %st(5), %st
        fsubrp %st, %st(4)
        fsubrp %st, %st(1)
        FLD 0 * SIZE(BB)
        fmul %st, %st(1)
        fmulp %st, %st(3)
#endif
#ifdef LN
        subl $2 * SIZE, %edi
#endif
#if defined(LN) || defined(LT)
        FSTU 0 * SIZE(BB)
        fxch %st(1)
        FSTU 1 * SIZE(BB)
        fxch %st(2)
        FSTU 2 * SIZE(BB)
        fxch %st(3)
        FSTU 3 * SIZE(BB)
        FST 1 * SIZE(%edi,%ebp)
        FST 0 * SIZE(%edi)
        FST 0 * SIZE(%edi,%ebp)
        FST 1 * SIZE(%edi)
#else
        FSTU 0 * SIZE(AA)
        fxch %st(2)
        FSTU 1 * SIZE(AA)
        fxch %st(1)
        FSTU 2 * SIZE(AA)
        fxch %st(3)
        FSTU 3 * SIZE(AA)
        FST 1 * SIZE(%edi,%ebp)
        FST 1 * SIZE(%edi)
        FST 0 * SIZE(%edi)
        FST 0 * SIZE(%edi,%ebp)
#endif
#ifndef LN
        addl $2 * SIZE, %edi
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 2), BB
#endif
#ifdef LN
        subl $2, KK
#endif
#ifdef LT
        addl $2, KK
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        decl %esi # i --
        jne .MainHead
        ALIGN_4
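/* .L12: M odd -- remaining single row against the current pair of columns. */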
.L12:
        movl M, %eax # m # MEMORY
        andl $1, %eax
        je .L27
#ifdef LN
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 1), AA
        leal (%ebx, %eax, 2), BB
#else
        movl %ebx, BB
#endif
        fldz
        fldz
        FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0)
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $1,%eax # k >> 1 # MEMORY
        je .L54
        ALIGN_4
.L55:
        FLD 0 * SIZE(BB) # temp2 = *(boffset + 0)
        rep
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0)
        faddp %st, %st(2)
        FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0)
        FLD 2 * SIZE(BB) # temp2 = *(boffset + 0)
        rep
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0)
        faddp %st, %st(2)
        FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0)
        addl $2 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jne .L55
        ALIGN_4
.L54:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $1,%eax # k & 1
        je .L33
        ALIGN_4
        FLD 0 * SIZE(BB) # temp2 = *(boffset + 0)
        rep
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0)
        faddp %st, %st(2)
        FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0)
        addl $1 * SIZE, AA
        addl $2 * SIZE, BB
        ALIGN_4
.L33:
        ffreep %st(0)
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $2, %eax
#endif
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 1), AA
        leal (%ebx, %eax, 2), BB
#endif
#if defined(LN) || defined(LT)
        FLD 0 * SIZE(BB)
        fsubp %st, %st(1)
        FLD 1 * SIZE(BB)
        fsubp %st, %st(2)
#else
        FLD 0 * SIZE(AA)
        fsubp %st, %st(1)
        FLD 1 * SIZE(AA)
        fsubp %st, %st(2)
#endif
#if defined(LN) || defined(LT)
        FLD 0 * SIZE(AA)
        fmul %st, %st(1)
        fmulp %st, %st(2)
#endif
#ifdef RN
        FLD 0 * SIZE(BB)
        fmulp %st, %st(1)
        FLD 1 * SIZE(BB)
        fmul %st(1), %st
        fsubrp %st, %st(2)
        FLD 3 * SIZE(BB)
        fmulp %st, %st(2)
#endif
#ifdef RT
        FLD 3 * SIZE(BB)
        fmulp %st, %st(2)
        FLD 2 * SIZE(BB)
        fmul %st(2), %st
        fsubrp %st, %st(1)
        FLD 0 * SIZE(BB)
        fmulp %st, %st(1)
#endif
#ifdef LN
        subl $1 * SIZE, %edi
#endif
#if defined(LN) || defined(LT)
        FSTU 0 * SIZE(BB)
        fxch %st(1)
        FSTU 1 * SIZE(BB)
#else
        FSTU 0 * SIZE(AA)
        fxch %st(1)
        FSTU 1 * SIZE(AA)
#endif
        FST 0 * SIZE(%edi,%ebp)
        FST 0 * SIZE(%edi)
#ifndef LN
        addl $1 * SIZE, %edi
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 2), BB
#endif
#ifdef LN
        subl $1, KK
#endif
#ifdef LT
        addl $1, KK
#endif
#ifdef RT
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4
.L27:
#ifdef LN
        movl K, %eax
        leal ( , %eax, SIZE), %eax
        leal (%ebx, %eax, 2), %ebx
#endif
#if defined(LT) || defined(RN)
        movl BB, %ebx
#endif
#ifdef RN
        addl $2, KK
#endif
#ifdef RT
        subl $2, KK
#endif
        decl J # j-- # MEMORY
        jne .L34
        ALIGN_4
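/* .End: restore callee-saved registers, release the stack frame, and return. */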
.End:
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        addl $ARGS, %esp
        ret
        EPILOGUE