
trsm_kernel_LT_2x2.S 19 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
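
/*
 * TRSM kernel with 2x2 register blocking for 32-bit x86, written for the
 * x87 FPU stack.  The LN/LT/RN/RT preprocessor symbols select which of the
 * four triangular-solve variants is assembled from this single source.
 * Register usage: AA (%edx) and BB (%ecx) walk the packed A and B panels,
 * %ebx holds the B panel pointer, %edi the current C tile, %ebp the leading
 * dimension of C scaled by SIZE, and KK the running diagonal offset.
 */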
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	16

#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)
#define AORIG	12 + STACK(%esp)

#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define A	24 + STACK + ARGS(%esp)
#define B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)
#else
#define A	20 + STACK + ARGS(%esp)
#define B	24 + STACK + ARGS(%esp)
#define C	28 + STACK + ARGS(%esp)
#define LDC	32 + STACK + ARGS(%esp)
#define OFFSET	36 + STACK + ARGS(%esp)
#endif

#define PREFETCH_OFFSET 48

#if defined(PENTIUM3) || defined(PENTIUMM)
#define REP rep
#else
#define REP rep
#endif

#define AA %edx
#define BB %ecx

PROLOGUE

        subl $ARGS, %esp	# Generate Stack Frame
        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl LDC, %ebp		# ldc # MEMORY
        movl B, %ebx
        leal (, %ebp, SIZE), %ebp

#ifdef LN
        movl M, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, C
        imull K, %eax
        addl %eax, A
#endif

#ifdef RT
        movl N, %eax
        leal (, %eax, SIZE), %eax
        imull K, %eax
        addl %eax, %ebx
        movl N, %eax
        imull %ebp, %eax
        addl %eax, C
#endif

#ifdef RN
        movl OFFSET, %eax
        negl %eax
        movl %eax, KK
#endif

#ifdef RT
        movl N, %eax
        subl OFFSET, %eax
        movl %eax, KK
#endif

        movl N, %eax		# j = (n >> 1) # MEMORY
        sarl $1, %eax
        movl %eax, J		# j = (n >> 1) # MEMORY
        je .L8
        ALIGN_4
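
# .L34: outer loop over pairs of columns of B/C (j = n >> 1).  Each
# iteration solves a two-column strip of C; %edi points at the strip.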
.L34:
#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, %ebx
#endif
        lea (, %ebp, 2), %eax
#ifdef RT
        subl %eax, C
#endif
        movl C, %edi
#ifndef RT
        addl %eax, C
#endif
#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif
#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif
        movl M, %esi
        sarl $1, %esi
        je .L12
        ALIGN_4
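
# .MainHead: loop over pairs of rows (i = m >> 1).  Each iteration
# accumulates one 2x2 block on the x87 stack, then back-solves and
# stores it.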
.MainHead:
#ifdef LN
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 2), AA
        leal (%ebx, %eax, 2), BB
#else
        movl %ebx, BB
#endif
        fldz
        fldz
        fldz
        fldz

        FLD 4 * SIZE(BB)	# b5
        FLD 4 * SIZE(AA)	# a5
        FLD 0 * SIZE(BB)	# b1
        FLD 0 * SIZE(AA)	# a1

#if defined(HAVE_3DNOW)
        prefetchw 2 * SIZE(%edi)
        prefetchw 2 * SIZE(%edi, %ebp, 1)
#elif defined(HAVE_SSE)
        prefetchnta 2 * SIZE(%edi)
        prefetchnta 2 * SIZE(%edi, %ebp, 1)
#endif

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $2, %eax
        je .L16
        ALIGN_4
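
# .MainLoop: inner product over k, unrolled four times; each pass
# consumes eight elements of A and eight of B (four rank-1 updates of
# the 2x2 block).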
.MainLoop:
#if defined(HAVE_3DNOW)
        prefetch (PREFETCH_OFFSET) * SIZE(BB)
        nop
#elif defined(HAVE_SSE)
        prefetchnta (PREFETCH_OFFSET) * SIZE(BB)
#if (L2_SIZE == 524288)
        prefetcht0 (PREFETCH_OFFSET) * SIZE(AA)
#endif
#endif
        fmul %st, %st(1)
        FMUL 1 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(4)
        FLD 0 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(5)
        FLD 1 * SIZE(AA)
        fmul %st, %st(1)
        FMUL 1 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(6)
        FLD 2 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(7)
        FLD 2 * SIZE(AA)
        fmul %st, %st(1)
        FMUL 3 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(4)
        FLD 2 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(5)
        FLD 3 * SIZE(AA)
        fmul %st, %st(1)
        FMUL 3 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(6)
        FLD 8 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(7)
        FLD 8 * SIZE(AA)
        fxch %st(2)
#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE)
        prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB)
#if (L2_SIZE == 524288)
        prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA)
#endif
#endif
        fmul %st, %st(3)
        FMUL 5 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(4)
        FLD 4 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(5)
        FLD 5 * SIZE(AA)
        fmul %st, %st(3)
        FMUL 5 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(6)
        FLD 6 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(7)
        FLD 6 * SIZE(AA)
        fmul %st, %st(3)
        FMUL 7 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(4)
        FLD 6 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(5)
        FLD 7 * SIZE(AA)
        fmul %st, %st(3)
        FMUL 7 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(6)
        FLD 12 * SIZE(BB)
        fxch %st(3)
        faddp %st, %st(7)
        FLD 12 * SIZE(AA)
        fxch %st(2)
        subl $-8 * SIZE, BB
        subl $-8 * SIZE, AA
        decl %eax		# l --
        jne .MainLoop
        ALIGN_4

.L16:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        and $3, %eax
        je .L21
        ALIGN_4
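
# .SubLoop: remaining k iterations (k & 3), one rank-1 update of the
# 2x2 block per pass.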
.SubLoop:
        fmul %st, %st(1)
        FMUL 1 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(4)
        FLD 0 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(5)
        FLD 1 * SIZE(AA)
        fmul %st, %st(1)
        FMUL 1 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(6)
        FLD 2 * SIZE(BB)
        fxch %st(1)
        faddp %st, %st(7)
        FLD 2 * SIZE(AA)
        addl $2 * SIZE, BB
        addl $2 * SIZE, AA
        decl %eax
        jne .SubLoop
        ALIGN_4
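
# .L21: back-solve for the 2x2 block.  The accumulated products are
# subtracted from the packed panel values, the triangular factor for the
# selected variant (LN/LT/RN/RT) is applied, and the results are written
# back to the packed panel and to C.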
.L21:
        ffreep %st(0)
        ffreep %st(0)
        ffreep %st(0)
        ffreep %st(0)

#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $2, %eax
#endif
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 2), AA
        leal (%ebx, %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
        FLD 0 * SIZE(BB)
        fsubp %st, %st(1)
        FLD 1 * SIZE(BB)
        fsubp %st, %st(2)
        FLD 2 * SIZE(BB)
        fsubp %st, %st(3)
        FLD 3 * SIZE(BB)
        fsubp %st, %st(4)
#else
        FLD 0 * SIZE(AA)
        fsubp %st, %st(1)
        FLD 1 * SIZE(AA)
        fsubp %st, %st(3)
        FLD 2 * SIZE(AA)
        fsubp %st, %st(2)
        FLD 3 * SIZE(AA)
        fsubp %st, %st(4)
#endif

#ifdef LN
        FLD 3 * SIZE(AA)
        fmul %st, %st(3)
        fmulp %st, %st(4)
        FLD 2 * SIZE(AA)
        fmul %st(3), %st
        FLD 2 * SIZE(AA)
        fmul %st(5), %st
        fsubrp %st, %st(3)
        fsubrp %st, %st(1)
        FLD 0 * SIZE(AA)
        fmul %st, %st(1)
        fmulp %st, %st(2)
#endif

#ifdef LT
        FLD 0 * SIZE(AA)
        fmul %st, %st(1)
        fmulp %st, %st(2)
        FLD 1 * SIZE(AA)
        fmul %st(1), %st
        FLD 1 * SIZE(AA)
        fmul %st(3), %st
        fsubrp %st, %st(5)
        fsubrp %st, %st(3)
        FLD 3 * SIZE(AA)
        fmul %st, %st(3)
        fmulp %st, %st(4)
#endif

#ifdef RN
        FLD 0 * SIZE(BB)
        fmul %st, %st(1)
        fmulp %st, %st(3)
        FLD 1 * SIZE(BB)
        fmul %st(1), %st
        FLD 1 * SIZE(BB)
        fmul %st(4), %st
        fsubrp %st, %st(5)
        fsubrp %st, %st(2)
        FLD 3 * SIZE(BB)
        fmul %st, %st(2)
        fmulp %st, %st(4)
#endif

#ifdef RT
        FLD 3 * SIZE(BB)
        fmul %st, %st(2)
        fmulp %st, %st(4)
        FLD 2 * SIZE(BB)
        fmul %st(2), %st
        FLD 2 * SIZE(BB)
        fmul %st(5), %st
        fsubrp %st, %st(4)
        fsubrp %st, %st(1)
        FLD 0 * SIZE(BB)
        fmul %st, %st(1)
        fmulp %st, %st(3)
#endif

#ifdef LN
        subl $2 * SIZE, %edi
#endif

#if defined(LN) || defined(LT)
        FSTU 0 * SIZE(BB)
        fxch %st(1)
        FSTU 1 * SIZE(BB)
        fxch %st(2)
        FSTU 2 * SIZE(BB)
        fxch %st(3)
        FSTU 3 * SIZE(BB)

        FST 1 * SIZE(%edi,%ebp)
        FST 0 * SIZE(%edi)
        FST 0 * SIZE(%edi,%ebp)
        FST 1 * SIZE(%edi)
#else
        FSTU 0 * SIZE(AA)
        fxch %st(2)
        FSTU 1 * SIZE(AA)
        fxch %st(1)
        FSTU 2 * SIZE(AA)
        fxch %st(3)
        FSTU 3 * SIZE(AA)

        FST 1 * SIZE(%edi,%ebp)
        FST 1 * SIZE(%edi)
        FST 0 * SIZE(%edi)
        FST 0 * SIZE(%edi,%ebp)
#endif

#ifndef LN
        addl $2 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 2), BB
#endif

#ifdef LN
        subl $2, KK
#endif
#ifdef LT
        addl $2, KK
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        decl %esi		# i --
        jne .MainHead
        ALIGN_4
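
# .L12: handle the last row when M is odd (a 1x2 block against the
# current two columns).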
.L12:
        movl M, %eax		# m # MEMORY
        andl $1, %eax
        je .L27

#ifdef LN
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 1), AA
        leal (%ebx, %eax, 2), BB
#else
        movl %ebx, BB
#endif
        fldz
        fldz

        FLD 0 * SIZE(AA)	# temp1 = *(aoffset + 0)

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $1, %eax		# k >> 1 # MEMORY
        je .L54
        ALIGN_4

.L55:
        FLD 0 * SIZE(BB)	# temp2 = *(boffset + 0)
        rep
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 1 * SIZE(BB)	# temp2 = *(boffset + 0)
        faddp %st, %st(2)
        FLD 1 * SIZE(AA)	# temp1 = *(aoffset + 0)
        FLD 2 * SIZE(BB)	# temp2 = *(boffset + 0)
        rep
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 3 * SIZE(BB)	# temp2 = *(boffset + 0)
        faddp %st, %st(2)
        FLD 2 * SIZE(AA)	# temp1 = *(aoffset + 0)
        addl $2 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jne .L55
        ALIGN_4

.L54:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $1, %eax		# k & 1
        je .L33
        ALIGN_4

        FLD 0 * SIZE(BB)	# temp2 = *(boffset + 0)
        rep
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 1 * SIZE(BB)	# temp2 = *(boffset + 0)
        faddp %st, %st(2)
        FLD 1 * SIZE(AA)	# temp1 = *(aoffset + 0)
        addl $1 * SIZE, AA
        addl $2 * SIZE, BB
        ALIGN_4

.L33:
        ffreep %st(0)

#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $2, %eax
#endif
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 1), AA
        leal (%ebx, %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
        FLD 0 * SIZE(BB)
        fsubp %st, %st(1)
        FLD 1 * SIZE(BB)
        fsubp %st, %st(2)
#else
        FLD 0 * SIZE(AA)
        fsubp %st, %st(1)
        FLD 1 * SIZE(AA)
        fsubp %st, %st(2)
#endif

#if defined(LN) || defined(LT)
        FLD 0 * SIZE(AA)
        fmul %st, %st(1)
        fmulp %st, %st(2)
#endif

#ifdef RN
        FLD 0 * SIZE(BB)
        fmulp %st, %st(1)
        FLD 1 * SIZE(BB)
        fmul %st(1), %st
        fsubrp %st, %st(2)
        FLD 3 * SIZE(BB)
        fmulp %st, %st(2)
#endif

#ifdef RT
        FLD 3 * SIZE(BB)
        fmulp %st, %st(2)
        FLD 2 * SIZE(BB)
        fmul %st(2), %st
        fsubrp %st, %st(1)
        FLD 0 * SIZE(BB)
        fmulp %st, %st(1)
#endif

#ifdef LN
        subl $1 * SIZE, %edi
#endif

#if defined(LN) || defined(LT)
        FSTU 0 * SIZE(BB)
        fxch %st(1)
        FSTU 1 * SIZE(BB)
#else
        FSTU 0 * SIZE(AA)
        fxch %st(1)
        FSTU 1 * SIZE(AA)
#endif
        FST 0 * SIZE(%edi,%ebp)
        FST 0 * SIZE(%edi)

#ifndef LN
        addl $1 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 2), BB
#endif

#ifdef LN
        subl $1, KK
#endif
#ifdef LT
        addl $1, KK
#endif
#ifdef RT
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4
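
# .L27: end of the column-pair iteration; advance %ebx past the two
# processed columns of the packed B panel and update KK for the next pair.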
.L27:
#ifdef LN
        movl K, %eax
        leal (, %eax, SIZE), %eax
        leal (%ebx, %eax, 2), %ebx
#endif
#if defined(LT) || defined(RN)
        movl BB, %ebx
#endif
#ifdef RN
        addl $2, KK
#endif
#ifdef RT
        subl $2, KK
#endif
        decl J			# j-- # MEMORY
        jne .L34
        ALIGN_4
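
# .L8: handle the last column when N is odd.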
.L8:
        movl N, %eax		# n # MEMORY
        andl $1, %eax
        je .End

#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif
#ifdef RT
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        subl %eax, %ebx
#endif
#ifdef RT
        subl %ebp, C
#endif
        movl C, %edi		# c # MEMORY
#ifndef RT
        addl %ebp, C
#endif
#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif
#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif
        movl M, %esi		# m # MEMORY
        sarl $1, %esi		# m >> 1
        je .L36
        ALIGN_4
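
# .L46: loop over pairs of rows for the single remaining column
# (2x1 blocks).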
.L46:
#ifdef LN
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 2), AA
        leal (%ebx, %eax, 1), BB
#else
        movl %ebx, BB
#endif
        fldz
        fldz

        FLD 0 * SIZE(BB)	# temp1 = *(boffset + 0)

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $1, %eax
        je .L56
        ALIGN_4

.L57:
        FLD 0 * SIZE(AA)	# temp2 = *(aoffset + 0)
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 1 * SIZE(AA)	# temp2 = *(aoffset + 0)
        faddp %st, %st(2)
        FLD 1 * SIZE(BB)	# temp1 = *(boffset + 0)
        FLD 2 * SIZE(AA)	# temp2 = *(aoffset + 0)
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 3 * SIZE(AA)	# temp2 = *(aoffset + 0)
        faddp %st, %st(2)
        FLD 2 * SIZE(BB)	# temp1 = *(boffset + 0)
        addl $4 * SIZE, AA
        addl $2 * SIZE, BB
        dec %eax
        jne .L57
        ALIGN_4

.L56:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $1, %eax
        je .L45
        ALIGN_4

        FLD 0 * SIZE(AA)	# temp2 = *(aoffset + 0)
        fmul %st(1), %st
        faddp %st, %st(2)
        FMUL 1 * SIZE(AA)	# temp2 = *(aoffset + 0)
        faddp %st, %st(2)
        FLD 3 * SIZE(BB)	# temp1 = *(boffset + 0)
        addl $2 * SIZE, AA
        addl $1 * SIZE, BB
        ALIGN_4

.L45:
        ffreep %st(0)

#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $1, %eax
#endif
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 2), AA
        leal (%ebx, %eax, 1), BB
#endif

#if defined(LN) || defined(LT)
        FLD 0 * SIZE(BB)
        fsubp %st, %st(1)
        FLD 1 * SIZE(BB)
        fsubp %st, %st(2)
#else
        FLD 0 * SIZE(AA)
        fsubp %st, %st(1)
        FLD 1 * SIZE(AA)
        fsubp %st, %st(2)
#endif

#ifdef LN
        FLD 3 * SIZE(AA)
        fmulp %st, %st(2)
        FLD 2 * SIZE(AA)
        fmul %st(2), %st
        fsubrp %st, %st(1)
        FLD 0 * SIZE(AA)
        fmulp %st, %st(1)
#endif

#ifdef LT
        FLD 0 * SIZE(AA)
        fmulp %st, %st(1)
        FLD 1 * SIZE(AA)
        fmul %st(1), %st
        fsubrp %st, %st(2)
        FLD 3 * SIZE(AA)
        fmulp %st, %st(2)
#endif

#ifdef RN
        FLD 0 * SIZE(BB)
        fmul %st, %st(1)
        fmulp %st, %st(2)
#endif

#ifdef RT
        FLD 0 * SIZE(BB)
        fmul %st, %st(1)
        fmulp %st, %st(2)
#endif

#ifdef LN
        subl $2 * SIZE, %edi
#endif

#if defined(LN) || defined(LT)
        FSTU 0 * SIZE(BB)
        fxch %st(1)
        FSTU 1 * SIZE(BB)
#else
        FSTU 0 * SIZE(AA)
        fxch %st(1)
        FSTU 1 * SIZE(AA)
#endif
        FST 1 * SIZE(%edi)
        FST 0 * SIZE(%edi)

#ifndef LN
        addl $2 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 1), BB
#endif

#ifdef LN
        subl $2, KK
#endif
#ifdef LT
        addl $2, KK
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        decl %esi		# i --
        jne .L46
        ALIGN_4
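
# .L36: final 1x1 element when both M and N are odd.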
.L36:
        movl M, %eax		# m # MEMORY
        andl $1, %eax		# m & 1
        je .L99

#ifdef LN
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 1), AA
        leal (%ebx, %eax, 1), BB
#else
        movl %ebx, BB
#endif
        fldz

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        test %eax, %eax
        jle .L52
        ALIGN_3

.L51:
        FLD (AA)
        FMUL (BB)
        addl $1 * SIZE, AA
        addl $1 * SIZE, BB
        faddp %st, %st(1)
        decl %eax
        jne .L51
        ALIGN_4

.L52:
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $1, %eax
#endif
        leal (, %eax, SIZE), %eax
        movl AORIG, AA
        leal (AA, %eax, 1), AA
        leal (%ebx, %eax, 1), BB
#endif

#if defined(LN) || defined(LT)
        FLD 0 * SIZE(BB)
        fsubp %st, %st(1)
#else
        FLD 0 * SIZE(AA)
        fsubp %st, %st(1)
#endif

#if defined(LN) || defined(LT)
        FMUL 0 * SIZE(AA)
#else
        FMUL 0 * SIZE(BB)
#endif

#ifdef LN
        subl $1 * SIZE, %edi
#endif

#if defined(LN) || defined(LT)
        FSTU 0 * SIZE(BB)
#else
        FSTU 0 * SIZE(AA)
#endif
        FST 0 * SIZE(%edi)

#ifndef LN
        addl $1 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 1), BB
#endif

#ifdef LN
        subl $1, KK
#endif
#ifdef LT
        addl $1, KK
#endif
#ifdef RT
        movl K, %eax
        sall $0 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4

.L99:
#ifdef LN
        movl K, %eax
        leal (%ebx, %eax, SIZE), %ebx
#endif
#if defined(LT) || defined(RN)
        movl BB, %ebx
#endif
#ifdef RN
        addl $1, KK
#endif
#ifdef RT
        subl $1, KK
#endif
        ALIGN_4

.End:
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        addl $ARGS, %esp
        ret

EPILOGUE