You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qtrsm_kernel_LN_2x2.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time configuration, stack-frame layout and register aliases used by the routine below. */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifdef OPTERON
  41. #define PREFETCH prefetch
  42. #define PREFETCHW prefetchw
  43. #else
  44. #define PREFETCH prefetcht0
  45. #define PREFETCHW prefetcht0
  46. #endif
  47. #define PREFETCHSIZE (5 + 4 * 10) /* prefetch distance in elements (45); scaled by SIZE at each use */
  48. #define STACK 16 /* bytes taken by the four registers pushed in the prologue */
  49. #define ARGS 16 /* bytes of local scratch reserved by "subl $ARGS, %esp" */
  /* Locals: they live in the ARGS scratch area, just above the four saved registers. */
  50. #define J 0 + STACK(%esp) /* countdown of remaining column pairs */
  51. #define KK 4 + STACK(%esp) /* running offset, maintained differently per LN/LT/RN/RT */
  52. #define KKK 8 + STACK(%esp) /* not referenced in the visible code */
  53. #define AORIG 12 + STACK(%esp) /* saved base pointer into A (LN/RT paths) */
  /* Incoming stack arguments; the extra 4 presumably skips the return address - confirm against the caller. */
  54. #define M 4 + STACK + ARGS(%esp)
  55. #define N 8 + STACK + ARGS(%esp)
  56. #define K 12 + STACK + ARGS(%esp)
  57. #define ALPHA 16 + STACK + ARGS(%esp) /* not referenced in the visible code */
  58. #define A 32 + STACK + ARGS(%esp)
  59. #define ARG_B 36 + STACK + ARGS(%esp)
  60. #define C 40 + STACK + ARGS(%esp)
  61. #define ARG_LDC 44 + STACK + ARGS(%esp)
  62. #define OFFSET 48 + STACK + ARGS(%esp)
  /* Register roles. %eax is used as general scratch throughout. */
  63. #define I %esi /* countdown of remaining row pairs */
  64. #define B %ebx /* base pointer into B for the current column group */
  65. #define CO %edi /* write cursor into C */
  66. #define AO %edx /* read cursor into A */
  67. #define BO %ecx /* read cursor into B */
  68. #define LDC %ebp /* column stride of C, converted to bytes in the prologue */
  69. #define PREFETCH_OFFSET 48 /* not referenced in the visible code */
  /* Entry: build the frame, save callee-saved registers, bias the A/B pointers and */
  /* initialise C, A, B and KK according to the LN / LT / RN / RT build variant. */
  /* PROLOGUE, PROFCODE, BASE_SHIFT, SIZE and ALIGN_4 are not defined in this view (presumably common.h). */
  70. PROLOGUE
  71. subl $ARGS, %esp # Generate Stack Frame
  72. pushl %ebp
  73. pushl %edi
  74. pushl %esi
  75. pushl %ebx
  76. PROFCODE
  77. movl ARG_LDC, LDC
  78. movl ARG_B, B
  79. sall $BASE_SHIFT, LDC /* LDC <<= BASE_SHIFT: presumably elements -> bytes */
  80. addl $8 * SIZE, A /* bias A by 8 elements; all loads below use -8*SIZE.. displacements */
  81. addl $8 * SIZE, B /* same bias for B */
  82. #ifdef LN
  83. movl M, %eax
  84. sall $BASE_SHIFT, %eax
  85. addl %eax, C /* C += M elements: LN starts at the far end of each column */
  86. imull K, %eax
  87. addl %eax, A /* A += M * K elements: LN walks A backwards */
  88. #endif
  89. #ifdef RT
  90. movl N, %eax
  91. sall $BASE_SHIFT, %eax
  92. imull K, %eax
  93. addl %eax, B /* B += N * K elements: RT walks B backwards */
  94. movl N, %eax
  95. imull %ebp, %eax /* %ebp is LDC (already in bytes) */
  96. addl %eax, C /* C += N * LDC: RT starts past the last column */
  97. #endif
  98. #ifdef RN
  99. movl OFFSET, %eax
  100. negl %eax
  101. movl %eax, KK /* RN: KK = -OFFSET */
  102. #endif
  103. #ifdef RT
  104. movl N, %eax
  105. subl OFFSET, %eax
  106. movl %eax, KK /* RT: KK = N - OFFSET */
  107. #endif
  108. movl N, %eax
  109. sarl $1, %eax /* eax = N / 2 = number of column pairs */
  110. movl %eax, J /* movl leaves the flags from sarl intact for the je below */
  111. je .L30 /* no column pairs: go straight to the odd-column handling */
  112. ALIGN_4
  /* .L01: top of the loop over column pairs (J iterations). Sets up A, B, C and KK for one pair. */
  113. .L01:
  114. #if defined(LT) || defined(RN)
  115. movl A, AO /* forward variants read A from its start for every column pair */
  116. #else
  117. movl A, %eax
  118. movl %eax, AORIG /* LN/RT: remember the A base; AO is derived from it per tile */
  119. #endif
  120. #ifdef RT
  121. movl K, %eax
  122. sall $1 + BASE_SHIFT, %eax
  123. subl %eax, B /* RT: B -= 2 * K elements */
  124. #endif
  125. lea (, LDC, 2), %eax /* eax = 2 * LDC = byte width of two columns of C */
  126. #ifdef RT
  127. subl %eax, C /* RT: step C back before taking the column pointer */
  128. #endif
  129. movl C, CO
  130. #ifndef RT
  131. addl %eax, C /* other variants: advance C for the next pair after taking CO */
  132. #endif
  133. #ifdef LN
  134. movl OFFSET, %eax
  135. addl M, %eax
  136. movl %eax, KK /* LN: KK = OFFSET + M */
  137. #endif
  138. #ifdef LT
  139. movl OFFSET, %eax
  140. movl %eax, KK /* LT: KK = OFFSET */
  141. #endif
  142. movl M, %eax
  143. andl $1, %eax
  144. je .L20 /* M even: no single leftover row, skip the 1x2 tile */
  145. ALIGN_4
  /* .L21: 1x2 tile - the single leftover row (M odd) against the current two columns. */
  /* Phases: position AO/BO, accumulate a dot product over k, solve, store, advance. */
  /* FLD / FST are macros defined outside this view; the x87 stack only balances if FST */
  /* is a store-and-pop - presumably it is, confirm in common.h. */
  146. .L21:
  147. #ifdef LN
  148. movl K, %eax
  149. sall $0 + BASE_SHIFT, %eax
  150. subl %eax, AORIG /* LN: AORIG -= 1 * K elements */
  151. #endif
  152. #if defined(LN) || defined(RT)
  153. movl KK, %eax
  154. sall $BASE_SHIFT, %eax
  155. movl AORIG, AO
  156. leal (AO, %eax, 1), AO /* AO = AORIG + KK * 1 element */
  157. leal (B, %eax, 2), BO /* BO = B + KK * 2 elements */
  158. #else
  159. movl B, BO
  160. #endif
  161. fldz /* two zeroed accumulators: st(0) for column 0, st(1) for column 1 */
  162. fldz
  163. #if defined(LT) || defined(RN)
  164. movl KK, %eax /* trip count = KK */
  165. #else
  166. movl K, %eax
  167. subl KK, %eax /* trip count = K - KK */
  168. #endif
  169. sarl $2, %eax /* main loop is unrolled 4x over k */
  170. je .L25
  171. ALIGN_4
  /* .L22: 4 k-steps per iteration; each step loads one A element and two B elements. */
  172. .L22:
  173. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  174. FLD -8 * SIZE(AO) /* a */
  175. FLD -8 * SIZE(BO) /* b0 */
  176. fmul %st(1), %st /* st(0) = a * b0, a stays in st(1) */
  177. faddp %st, %st(2) /* column-0 accumulator += a * b0 */
  178. FLD -7 * SIZE(BO) /* b1 */
  179. fmulp %st, %st(1) /* a * b1, consumes a */
  180. faddp %st, %st(2) /* column-1 accumulator += a * b1 */
  181. FLD -7 * SIZE(AO)
  182. FLD -6 * SIZE(BO)
  183. fmul %st(1), %st
  184. faddp %st, %st(2)
  185. FLD -5 * SIZE(BO)
  186. fmulp %st, %st(1)
  187. faddp %st, %st(2)
  188. FLD -6 * SIZE(AO)
  189. FLD -4 * SIZE(BO)
  190. fmul %st(1), %st
  191. faddp %st, %st(2)
  192. FLD -3 * SIZE(BO)
  193. fmulp %st, %st(1)
  194. faddp %st, %st(2)
  195. FLD -5 * SIZE(AO)
  196. FLD -2 * SIZE(BO)
  197. fmul %st(1), %st
  198. faddp %st, %st(2)
  199. FLD -1 * SIZE(BO)
  200. fmulp %st, %st(1)
  201. faddp %st, %st(2)
  202. addl $4 * SIZE,AO /* 4 k-steps * 1 element of A */
  203. addl $8 * SIZE,BO /* 4 k-steps * 2 elements of B */
  204. decl %eax
  205. jne .L22
  206. ALIGN_4
  /* .L25: remaining (trip count mod 4) k-steps, one per .L26 iteration. */
  207. .L25:
  208. #if defined(LT) || defined(RN)
  209. movl KK, %eax
  210. #else
  211. movl K, %eax
  212. subl KK, %eax
  213. #endif
  214. and $3, %eax
  215. je .L28
  216. ALIGN_4
  217. .L26:
  218. FLD -8 * SIZE(AO)
  219. FLD -8 * SIZE(BO)
  220. fmul %st(1), %st
  221. faddp %st, %st(2)
  222. FLD -7 * SIZE(BO)
  223. fmulp %st, %st(1)
  224. faddp %st, %st(2)
  225. addl $1 * SIZE,AO
  226. addl $2 * SIZE,BO
  227. decl %eax
  228. jne .L26
  229. ALIGN_4
  /* .L28: solve step. LN/RT first re-point AO/BO at the tile being solved. */
  230. .L28:
  231. #if defined(LN) || defined(RT)
  232. movl KK, %eax
  233. #ifdef LN
  234. subl $1, %eax /* LN: KK - 1 (tile has 1 row) */
  235. #else
  236. subl $2, %eax /* RT: KK - 2 (tile has 2 columns) */
  237. #endif
  238. sall $BASE_SHIFT, %eax
  239. movl AORIG, AO
  240. leal (AO, %eax, 1), AO
  241. leal (B, %eax, 2), BO
  242. #endif
  /* Combine the stored values with the accumulators. With GNU as, the AT&T fsubp form is */
  /* presumably (loaded value - accumulator) because of the fsub/fsubr naming quirk - confirm. */
  243. #if defined(LN) || defined(LT)
  244. FLD -8 * SIZE(BO)
  245. fsubp %st, %st(1)
  246. FLD -7 * SIZE(BO)
  247. fsubp %st, %st(2)
  248. #else
  249. FLD -8 * SIZE(AO)
  250. fsubp %st, %st(1)
  251. FLD -7 * SIZE(AO)
  252. fsubp %st, %st(3) /* NOTE(review): only two accumulators are live here, so %st(3) looks out of range (LN/LT uses %st(2)); this path may be unused in this file - confirm */
  253. #endif
  254. #if defined(LN) || defined(LT)
  255. FLD -8 * SIZE(AO) /* single A entry scales both results; presumably a pre-inverted diagonal - confirm */
  256. fmul %st, %st(1)
  257. fmulp %st, %st(2)
  258. #endif
  259. #ifdef RN
  260. FLD -8 * SIZE(BO) /* RN reads B entries -8, -7, -5 (entry -6 is not read) */
  261. fmulp %st, %st(1)
  262. FLD -7 * SIZE(BO)
  263. fmul %st(1), %st
  264. fsubrp %st, %st(2)
  265. FLD -5 * SIZE(BO)
  266. fmulp %st, %st(2)
  267. #endif
  268. #ifdef RT
  269. FLD -5 * SIZE(BO) /* RT reads B entries -5, -6, -8 (entry -7 is not read) */
  270. fmulp %st, %st(2)
  271. FLD -6 * SIZE(BO)
  272. fmul %st(2), %st
  273. fsubrp %st, %st(1)
  274. FLD -8 * SIZE(BO)
  275. fmulp %st, %st(1)
  276. #endif
  277. #ifdef LN
  278. subl $1 * SIZE, CO /* LN writes C backwards: step back before storing */
  279. #endif
  /* Write each result twice: back into the B (LN/LT) or A (RN/RT) buffer, and into C. */
  280. #if defined(LN) || defined(LT)
  281. fld %st /* duplicate so one copy survives the buffer store */
  282. FST -8 * SIZE(BO)
  283. fxch %st(1)
  284. fld %st
  285. FST -7 * SIZE(BO)
  286. #else
  287. fld %st
  288. FST -8 * SIZE(AO)
  289. fxch %st(1)
  290. fld %st
  291. FST -7 * SIZE(AO)
  292. #endif
  293. FST 0 * SIZE(CO, LDC) /* second column of C */
  294. FST 0 * SIZE(CO) /* first column of C */
  295. #ifndef LN
  296. addl $1 * SIZE, CO
  297. #endif
  298. #if defined(LT) || defined(RN)
  299. movl K, %eax
  300. subl KK, %eax
  301. sall $BASE_SHIFT, %eax
  302. leal (AO, %eax, 1), AO /* skip the K - KK elements of A that were not consumed */
  303. leal (BO, %eax, 2), BO /* and the matching 2 * (K - KK) elements of B */
  304. #endif
  305. #ifdef LN
  306. subl $1, KK
  307. #endif
  308. #ifdef LT
  309. addl $1, KK
  310. #endif
  311. #ifdef RT
  312. movl K, %eax
  313. sall $0 + BASE_SHIFT, %eax
  314. addl %eax, AORIG /* RT: AORIG += 1 * K elements */
  315. #endif
  316. ALIGN_4
  /* .L20 / .L11: loop over row pairs (I = M / 2) for the current two columns: 2x2 tiles. */
  /* Same phases as the 1x2 tile, with four accumulators. FLD / FST are macros defined */
  /* outside this view; FST is presumably a store-and-pop - confirm in common.h. */
  317. .L20:
  318. movl M, I
  319. sarl $1, I
  320. je .L29 /* fewer than two rows: nothing to do here */
  321. ALIGN_4
  322. .L11:
  323. #ifdef LN
  324. movl K, %eax
  325. sall $1 + BASE_SHIFT, %eax
  326. subl %eax, AORIG /* LN: AORIG -= 2 * K elements */
  327. #endif
  328. #if defined(LN) || defined(RT)
  329. movl KK, %eax
  330. sall $BASE_SHIFT, %eax
  331. movl AORIG, AO
  332. leal (AO, %eax, 2), AO /* AO = AORIG + KK * 2 elements */
  333. leal (B, %eax, 2), BO /* BO = B + KK * 2 elements */
  334. #else
  335. movl B, BO
  336. #endif
  337. fldz /* four zeroed accumulators, one per a_i * b_j pairing */
  338. fldz
  339. fldz
  340. fldz
  341. #if defined(HAVE_3DNOW)
  342. prefetchw 2 * SIZE(CO)
  343. prefetchw 2 * SIZE(CO, LDC, 1)
  344. #elif defined(HAVE_SSE)
  345. prefetchnta 2 * SIZE(CO)
  346. prefetchnta 2 * SIZE(CO, LDC, 1)
  347. #endif
  348. #if defined(LT) || defined(RN)
  349. movl KK, %eax /* trip count = KK */
  350. #else
  351. movl K, %eax
  352. subl KK, %eax /* trip count = K - KK */
  353. #endif
  354. sarl $2, %eax /* main loop is unrolled 4x over k */
  355. je .L15
  356. ALIGN_4
  /* .L12: 4 k-steps per iteration; each step loads a0, a1, b0, b1 and adds the four */
  /* products a0*b0, a0*b1, a1*b0, a1*b1 to st(0)..st(3) (as numbered between steps). */
  /* The step peaks at 8 live x87 registers, so the instruction order is load-bearing. */
  357. .L12:
  358. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  359. FLD -8 * SIZE(AO) /* a0 */
  360. FLD -8 * SIZE(BO) /* b0 */
  361. fld %st(1) /* copy of a0 */
  362. fmul %st(1), %st /* a0 * b0 */
  363. faddp %st, %st(3) /* accumulate a0 * b0 */
  364. FLD -7 * SIZE(BO) /* b1 */
  365. fmul %st, %st(2) /* a0 slot becomes a0 * b1 */
  366. FLD -7 * SIZE(AO) /* a1 - the stack is now 8 deep */
  367. fmul %st, %st(2) /* b0 slot becomes a1 * b0 */
  368. fmulp %st, %st(1) /* b1 slot becomes a1 * b1 */
  369. faddp %st, %st(6) /* accumulate a1 * b1 */
  370. faddp %st, %st(4) /* accumulate a1 * b0 */
  371. faddp %st, %st(2) /* accumulate a0 * b1 */
  372. FLD -6 * SIZE(AO)
  373. FLD -6 * SIZE(BO)
  374. fld %st(1)
  375. fmul %st(1), %st
  376. faddp %st, %st(3)
  377. FLD -5 * SIZE(BO)
  378. fmul %st, %st(2)
  379. FLD -5 * SIZE(AO)
  380. fmul %st, %st(2)
  381. fmulp %st, %st(1)
  382. faddp %st, %st(6)
  383. faddp %st, %st(4)
  384. faddp %st, %st(2)
  385. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  386. FLD -4 * SIZE(AO)
  387. FLD -4 * SIZE(BO)
  388. fld %st(1)
  389. fmul %st(1), %st
  390. faddp %st, %st(3)
  391. FLD -3 * SIZE(BO)
  392. fmul %st, %st(2)
  393. FLD -3 * SIZE(AO)
  394. fmul %st, %st(2)
  395. fmulp %st, %st(1)
  396. faddp %st, %st(6)
  397. faddp %st, %st(4)
  398. faddp %st, %st(2)
  399. FLD -2 * SIZE(AO)
  400. FLD -2 * SIZE(BO)
  401. fld %st(1)
  402. fmul %st(1), %st
  403. faddp %st, %st(3)
  404. FLD -1 * SIZE(BO)
  405. fmul %st, %st(2)
  406. FLD -1 * SIZE(AO)
  407. fmul %st, %st(2)
  408. fmulp %st, %st(1)
  409. faddp %st, %st(6)
  410. faddp %st, %st(4)
  411. faddp %st, %st(2)
  412. addl $8 * SIZE,AO /* 4 k-steps * 2 elements of A */
  413. addl $8 * SIZE,BO /* 4 k-steps * 2 elements of B */
  414. decl %eax
  415. jne .L12
  416. ALIGN_4
  /* .L15: remaining (trip count mod 4) k-steps, one per .L16 iteration. */
  417. .L15:
  418. #if defined(LT) || defined(RN)
  419. movl KK, %eax
  420. #else
  421. movl K, %eax
  422. subl KK, %eax
  423. #endif
  424. and $3, %eax
  425. je .L18
  426. ALIGN_4
  427. .L16:
  428. FLD -8 * SIZE(AO)
  429. FLD -8 * SIZE(BO)
  430. fld %st(1)
  431. fmul %st(1), %st
  432. faddp %st, %st(3)
  433. FLD -7 * SIZE(BO)
  434. fmul %st, %st(2)
  435. FLD -7 * SIZE(AO)
  436. fmul %st, %st(2)
  437. fmulp %st, %st(1)
  438. faddp %st, %st(6)
  439. faddp %st, %st(4)
  440. faddp %st, %st(2)
  441. addl $2 * SIZE,AO
  442. addl $2 * SIZE,BO
  443. decl %eax
  444. jne .L16
  445. ALIGN_4
  /* .L18: solve step. LN/RT first re-point AO/BO at the tile being solved. */
  446. .L18:
  447. #if defined(LN) || defined(RT)
  448. movl KK, %eax
  449. #ifdef LN
  450. subl $2, %eax /* tile is 2x2, so both variants step back by 2 */
  451. #else
  452. subl $2, %eax
  453. #endif
  454. sall $BASE_SHIFT, %eax
  455. movl AORIG, AO
  456. leal (AO, %eax, 2), AO
  457. leal (B, %eax, 2), BO
  458. #endif
  /* Combine four stored values with the four accumulators. With GNU as, the AT&T fsubp form */
  /* is presumably (loaded value - accumulator) because of the fsub/fsubr naming quirk - confirm. */
  459. #if defined(LN) || defined(LT)
  460. FLD -8 * SIZE(BO)
  461. fsubp %st, %st(1)
  462. FLD -7 * SIZE(BO)
  463. fsubp %st, %st(2)
  464. FLD -6 * SIZE(BO)
  465. fsubp %st, %st(3)
  466. FLD -5 * SIZE(BO)
  467. fsubp %st, %st(4)
  468. #else
  469. FLD -8 * SIZE(AO)
  470. fsubp %st, %st(1)
  471. FLD -7 * SIZE(AO)
  472. fsubp %st, %st(3) /* note the 1,3,2,4 pairing here versus 1,2,3,4 in the LN/LT branch */
  473. FLD -6 * SIZE(AO)
  474. fsubp %st, %st(2)
  475. FLD -5 * SIZE(AO)
  476. fsubp %st, %st(4)
  477. #endif
  /* 2x2 solve. Each variant reads three of the four entries of the 2x2 block: */
  /* LN: A at -5, -6, -8; LT: A at -8, -7, -5; RN: B at -8, -7, -5; RT: B at -5, -6, -8. */
  /* The entries used as plain multipliers are presumably pre-inverted diagonals - confirm. */
  478. #ifdef LN
  479. FLD -5 * SIZE(AO)
  480. fmul %st, %st(3)
  481. fmulp %st, %st(4)
  482. FLD -6 * SIZE(AO)
  483. fmul %st(3), %st
  484. FLD -6 * SIZE(AO)
  485. fmul %st(5), %st
  486. fsubrp %st, %st(3)
  487. fsubrp %st, %st(1)
  488. FLD -8 * SIZE(AO)
  489. fmul %st, %st(1)
  490. fmulp %st, %st(2)
  491. #endif
  492. #ifdef LT
  493. FLD -8 * SIZE(AO)
  494. fmul %st, %st(1)
  495. fmulp %st, %st(2)
  496. FLD -7 * SIZE(AO)
  497. fmul %st(1), %st
  498. FLD -7 * SIZE(AO)
  499. fmul %st(3), %st
  500. fsubrp %st, %st(5)
  501. fsubrp %st, %st(3)
  502. FLD -5 * SIZE(AO)
  503. fmul %st, %st(3)
  504. fmulp %st, %st(4)
  505. #endif
  506. #ifdef RN
  507. FLD -8 * SIZE(BO)
  508. fmul %st, %st(1)
  509. fmulp %st, %st(3)
  510. FLD -7 * SIZE(BO)
  511. fmul %st(1), %st
  512. FLD -7 * SIZE(BO)
  513. fmul %st(4), %st
  514. fsubrp %st, %st(5)
  515. fsubrp %st, %st(2)
  516. FLD -5 * SIZE(BO)
  517. fmul %st, %st(2)
  518. fmulp %st, %st(4)
  519. #endif
  520. #ifdef RT
  521. FLD -5 * SIZE(BO)
  522. fmul %st, %st(2)
  523. fmulp %st, %st(4)
  524. FLD -6 * SIZE(BO)
  525. fmul %st(2), %st
  526. FLD -6 * SIZE(BO)
  527. fmul %st(5), %st
  528. fsubrp %st, %st(4)
  529. fsubrp %st, %st(1)
  530. FLD -8 * SIZE(BO)
  531. fmul %st, %st(1)
  532. fmulp %st, %st(3)
  533. #endif
  534. #ifdef LN
  535. subl $2 * SIZE, CO /* LN writes C backwards: step back before storing */
  536. #endif
  /* Write each result twice: back into the B (LN/LT) or A (RN/RT) buffer, and into C. */
  /* The fxch sequence decides which value each later FST to C stores, so the order of */
  /* the four C stores differs between the two branches. */
  537. #if defined(LN) || defined(LT)
  538. fld %st
  539. FST -8 * SIZE(BO)
  540. fxch %st(1)
  541. fld %st
  542. FST -7 * SIZE(BO)
  543. fxch %st(2)
  544. fld %st
  545. FST -6 * SIZE(BO)
  546. fxch %st(3)
  547. fld %st
  548. FST -5 * SIZE(BO)
  549. FST 1 * SIZE(CO, LDC)
  550. FST 0 * SIZE(CO)
  551. FST 0 * SIZE(CO, LDC)
  552. FST 1 * SIZE(CO)
  553. #else
  554. fld %st
  555. FST -8 * SIZE(AO)
  556. fxch %st(2)
  557. fld %st
  558. FST -7 * SIZE(AO)
  559. fxch %st(1)
  560. fld %st
  561. FST -6 * SIZE(AO)
  562. fxch %st(3)
  563. fld %st
  564. FST -5 * SIZE(AO)
  565. FST 1 * SIZE(CO, LDC)
  566. FST 1 * SIZE(CO)
  567. FST 0 * SIZE(CO)
  568. FST 0 * SIZE(CO, LDC)
  569. #endif
  570. #ifndef LN
  571. addl $2 * SIZE, CO
  572. #endif
  573. #if defined(LT) || defined(RN)
  574. movl K, %eax
  575. subl KK, %eax
  576. sall $BASE_SHIFT, %eax
  577. leal (AO, %eax, 2), AO /* skip the 2 * (K - KK) elements of A that were not consumed */
  578. leal (BO, %eax, 2), BO /* and the matching 2 * (K - KK) elements of B */
  579. #endif
  580. #ifdef LN
  581. subl $2, KK
  582. #endif
  583. #ifdef LT
  584. addl $2, KK
  585. #endif
  586. #ifdef RT
  587. movl K, %eax
  588. sall $1 + BASE_SHIFT, %eax
  589. addl %eax, AORIG /* RT: AORIG += 2 * K elements */
  590. #endif
  591. decl I
  592. jne .L11 /* next pair of rows */
  593. ALIGN_4
  /* .L29: bottom of the column-pair loop - advance B and KK for the next pair, then loop on J. */
  594. .L29:
  595. #ifdef LN
  596. movl K, %eax
  597. sall $BASE_SHIFT, %eax
  598. leal (B, %eax, 2), B /* LN: B += 2 * K elements */
  599. #endif
  600. #if defined(LT) || defined(RN)
  601. movl BO, B /* forward variants: BO already points past this pair's B data */
  602. #endif
  603. #ifdef RN
  604. addl $2, KK
  605. #endif
  606. #ifdef RT
  607. subl $2, KK
  608. #endif
  609. decl J
  610. jne .L01 /* next pair of columns */
  611. ALIGN_4
  /* .L30: odd-N tail - set up A, B, C and KK for the single remaining column, */
  /* mirroring what .L01 does for a column pair. Skipped entirely when N is even. */
  612. .L30:
  613. movl N, %eax
  614. testl $1, %eax
  615. je .L999 /* N even: no leftover column, go to the epilogue */
  616. #if defined(LT) || defined(RN)
  617. movl A, AO /* forward variants read A from its start */
  618. #else
  619. movl A, %eax
  620. movl %eax, AORIG /* LN/RT: remember the A base; AO is derived from it per tile */
  621. #endif
  622. #ifdef RT
  623. movl K, %eax
  624. sall $0 + BASE_SHIFT, %eax
  625. subl %eax, B /* RT: B -= 1 * K elements */
  626. #endif
  627. #ifdef RT
  628. subl LDC, C /* RT: step C back one column before taking the column pointer */
  629. #endif
  630. movl C, CO
  631. #ifndef RT
  /* Advance C by exactly one column. This used to add %eax, but unlike .L01 no column */
  /* stride is loaded into %eax on this path: it still holds N (LT/RN) or A (LN). */
  632. addl LDC, C
  633. #endif
  634. #ifdef LN
  635. movl OFFSET, %eax
  636. addl M, %eax
  637. movl %eax, KK /* LN: KK = OFFSET + M */
  638. #endif
  639. #ifdef LT
  640. movl OFFSET, %eax
  641. movl %eax, KK /* LT: KK = OFFSET */
  642. #endif
  643. movl M, %eax
  644. andl $1, %eax
  645. je .L40 /* M even: no single leftover row, skip the 1x1 tile */
  646. ALIGN_4
  /* .L41: 1x1 tile - the single leftover row (M odd) against the single leftover column. */
  /* FLD / FST are macros defined outside this view; FST is presumably a store-and-pop - confirm. */
  647. .L41:
  648. #ifdef LN
  649. movl K, %eax
  650. sall $0 + BASE_SHIFT, %eax
  651. subl %eax, AORIG /* LN: AORIG -= 1 * K elements */
  652. #endif
  653. #if defined(LN) || defined(RT)
  654. movl KK, %eax
  655. sall $BASE_SHIFT, %eax
  656. movl AORIG, AO
  657. leal (AO, %eax, 1), AO /* AO = AORIG + KK * 1 element */
  658. leal (B, %eax, 1), BO /* BO = B + KK * 1 element */
  659. #else
  660. movl B, BO
  661. #endif
  662. fldz /* single zeroed accumulator */
  663. #if defined(LT) || defined(RN)
  664. movl KK, %eax /* trip count = KK */
  665. #else
  666. movl K, %eax
  667. subl KK, %eax /* trip count = K - KK */
  668. #endif
  669. sarl $2, %eax /* main loop is unrolled 4x over k */
  670. je .L45
  671. ALIGN_4
  /* .L42: 4 k-steps per iteration; each step adds one a * b product to the accumulator. */
  672. .L42:
  673. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  674. FLD -8 * SIZE(AO)
  675. FLD -8 * SIZE(BO)
  676. fmulp %st, %st(1) /* a * b */
  677. faddp %st, %st(1) /* accumulator += a * b */
  678. FLD -7 * SIZE(AO)
  679. FLD -7 * SIZE(BO)
  680. fmulp %st, %st(1)
  681. faddp %st, %st(1)
  682. FLD -6 * SIZE(AO)
  683. FLD -6 * SIZE(BO)
  684. fmulp %st, %st(1)
  685. faddp %st, %st(1)
  686. FLD -5 * SIZE(AO)
  687. FLD -5 * SIZE(BO)
  688. fmulp %st, %st(1)
  689. faddp %st, %st(1)
  690. addl $4 * SIZE,AO
  691. addl $4 * SIZE,BO
  692. decl %eax
  693. jne .L42
  694. ALIGN_4
  /* .L45: remaining (trip count mod 4) k-steps, one per .L46 iteration. */
  695. .L45:
  696. #if defined(LT) || defined(RN)
  697. movl KK, %eax
  698. #else
  699. movl K, %eax
  700. subl KK, %eax
  701. #endif
  702. and $3, %eax
  703. je .L48
  704. ALIGN_4
  705. .L46:
  706. FLD -8 * SIZE(AO)
  707. FLD -8 * SIZE(BO)
  708. fmulp %st, %st(1)
  709. faddp %st, %st(1)
  710. addl $1 * SIZE,AO
  711. addl $1 * SIZE,BO
  712. decl %eax
  713. jne .L46
  714. ALIGN_4
  /* .L48: solve step. LN/RT first re-point AO/BO at the element being solved. */
  715. .L48:
  716. #if defined(LN) || defined(RT)
  717. movl KK, %eax
  718. #ifdef LN
  719. subl $1, %eax /* tile is 1x1, so both variants step back by 1 */
  720. #else
  721. subl $1, %eax
  722. #endif
  723. sall $BASE_SHIFT, %eax
  724. movl AORIG, AO
  725. leal (AO, %eax, 1), AO
  726. leal (B, %eax, 1), BO
  727. #endif
  /* With GNU as, the AT&T fsubp form is presumably (loaded value - accumulator) - confirm. */
  728. #if defined(LN) || defined(LT)
  729. FLD -8 * SIZE(BO)
  730. fsubp %st, %st(1)
  731. #else
  732. FLD -8 * SIZE(AO)
  733. fsubp %st, %st(1)
  734. #endif
  /* Scale by one entry of A (LN/LT) or B (RN/RT); presumably a pre-inverted diagonal - confirm. */
  735. #ifdef LN
  736. FLD -8 * SIZE(AO)
  737. fmulp %st, %st(1)
  738. #endif
  739. #ifdef LT
  740. FLD -8 * SIZE(AO)
  741. fmulp %st, %st(1)
  742. #endif
  743. #ifdef RN
  744. FLD -8 * SIZE(BO)
  745. fmulp %st, %st(1)
  746. #endif
  747. #ifdef RT
  748. FLD -8 * SIZE(BO)
  749. fmulp %st, %st(1)
  750. #endif
  751. #ifdef LN
  752. subl $1 * SIZE, CO /* LN writes C backwards: step back before storing */
  753. #endif
  /* Write the result twice: back into the B (LN/LT) or A (RN/RT) buffer, and into C. */
  754. #if defined(LN) || defined(LT)
  755. fld %st
  756. FST -8 * SIZE(BO)
  757. #else
  758. fld %st
  759. FST -8 * SIZE(AO)
  760. #endif
  761. FST 0 * SIZE(CO)
  762. #ifndef LN
  763. addl $1 * SIZE, CO
  764. #endif
  765. #if defined(LT) || defined(RN)
  766. movl K, %eax
  767. subl KK, %eax
  768. sall $BASE_SHIFT, %eax
  769. leal (AO, %eax, 1), AO /* skip the K - KK elements of A that were not consumed */
  770. leal (BO, %eax, 1), BO /* and the matching K - KK elements of B */
  771. #endif
  772. #ifdef LN
  773. subl $1, KK
  774. #endif
  775. #ifdef LT
  776. addl $1, KK
  777. #endif
  778. #ifdef RT
  779. movl K, %eax
  780. sall $0 + BASE_SHIFT, %eax
  781. addl %eax, AORIG /* RT: AORIG += 1 * K elements */
  782. #endif
  783. ALIGN_4
  /* .L40 / .L31: loop over row pairs (I = M / 2) for the single leftover column: 2x1 tiles. */
  /* FLD / FST are macros defined outside this view; FST is presumably a store-and-pop - confirm. */
  784. .L40:
  785. movl M, I
  786. sarl $1, I
  787. je .L49 /* fewer than two rows: nothing to do here */
  788. ALIGN_4
  789. .L31:
  790. #ifdef LN
  791. movl K, %eax
  792. sall $1 + BASE_SHIFT, %eax
  793. subl %eax, AORIG /* LN: AORIG -= 2 * K elements */
  794. #endif
  795. #if defined(LN) || defined(RT)
  796. movl KK, %eax
  797. sall $BASE_SHIFT, %eax
  798. movl AORIG, AO
  799. leal (AO, %eax, 2), AO /* AO = AORIG + KK * 2 elements */
  800. leal (B, %eax, 1), BO /* BO = B + KK * 1 element */
  801. #else
  802. movl B, BO
  803. #endif
  804. fldz /* two zeroed accumulators: st(0) for row 0, st(1) for row 1 */
  805. fldz
  806. #if defined(HAVE_3DNOW)
  807. prefetchw 2 * SIZE(CO)
  808. #elif defined(HAVE_SSE)
  809. prefetchnta 2 * SIZE(CO)
  810. #endif
  811. #if defined(LT) || defined(RN)
  812. movl KK, %eax /* trip count = KK */
  813. #else
  814. movl K, %eax
  815. subl KK, %eax /* trip count = K - KK */
  816. #endif
  817. sarl $2, %eax /* main loop is unrolled 4x over k */
  818. je .L35
  819. ALIGN_4
  /* .L32: 4 k-steps per iteration; each step loads one B element and two A elements. */
  820. .L32:
  821. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  822. FLD -8 * SIZE(BO) /* b */
  823. FLD -8 * SIZE(AO) /* a0 */
  824. fmul %st(1), %st /* a0 * b, b stays in st(1) */
  825. faddp %st, %st(2) /* row-0 accumulator += a0 * b */
  826. FLD -7 * SIZE(AO) /* a1 */
  827. fmulp %st, %st(1) /* a1 * b, consumes b */
  828. faddp %st, %st(2) /* row-1 accumulator += a1 * b */
  829. FLD -7 * SIZE(BO)
  830. FLD -6 * SIZE(AO)
  831. fmul %st(1), %st
  832. faddp %st, %st(2)
  833. FLD -5 * SIZE(AO)
  834. fmulp %st, %st(1)
  835. faddp %st, %st(2)
  836. FLD -6 * SIZE(BO)
  837. FLD -4 * SIZE(AO)
  838. fmul %st(1), %st
  839. faddp %st, %st(2)
  840. FLD -3 * SIZE(AO)
  841. fmulp %st, %st(1)
  842. faddp %st, %st(2)
  843. FLD -5 * SIZE(BO)
  844. FLD -2 * SIZE(AO)
  845. fmul %st(1), %st
  846. faddp %st, %st(2)
  847. FLD -1 * SIZE(AO)
  848. fmulp %st, %st(1)
  849. faddp %st, %st(2)
  850. addl $8 * SIZE,AO /* 4 k-steps * 2 elements of A */
  851. addl $4 * SIZE,BO /* 4 k-steps * 1 element of B */
  852. decl %eax
  853. jne .L32
  854. ALIGN_4
  /* .L35: remaining (trip count mod 4) k-steps, one per .L36 iteration. */
  855. .L35:
  856. #if defined(LT) || defined(RN)
  857. movl KK, %eax
  858. #else
  859. movl K, %eax
  860. subl KK, %eax
  861. #endif
  862. and $3, %eax
  863. je .L38
  864. ALIGN_4
  865. .L36:
  866. FLD -8 * SIZE(BO)
  867. FLD -8 * SIZE(AO)
  868. fmul %st(1), %st
  869. faddp %st, %st(2)
  870. FLD -7 * SIZE(AO)
  871. fmulp %st, %st(1)
  872. faddp %st, %st(2)
  873. addl $2 * SIZE,AO
  874. addl $1 * SIZE,BO
  875. decl %eax
  876. jne .L36
  877. ALIGN_4
  /* .L38: solve step. LN/RT first re-point AO/BO at the tile being solved. */
  878. .L38:
  879. #if defined(LN) || defined(RT)
  880. movl KK, %eax
  881. #ifdef LN
  882. subl $2, %eax /* LN: KK - 2 (tile has 2 rows) */
  883. #else
  884. subl $1, %eax /* RT: KK - 1 (tile has 1 column) */
  885. #endif
  886. sall $BASE_SHIFT, %eax
  887. movl AORIG, AO
  888. leal (AO, %eax, 2), AO
  889. leal (B, %eax, 1), BO
  890. #endif
  /* With GNU as, the AT&T fsubp form is presumably (loaded value - accumulator) - confirm. */
  891. #if defined(LN) || defined(LT)
  892. FLD -8 * SIZE(BO)
  893. fsubp %st, %st(1)
  894. FLD -7 * SIZE(BO)
  895. fsubp %st, %st(2)
  896. #else
  897. FLD -8 * SIZE(AO)
  898. fsubp %st, %st(1)
  899. FLD -7 * SIZE(AO)
  900. fsubp %st, %st(3) /* NOTE(review): only two accumulators are live here, so %st(3) looks out of range (LN/LT uses %st(2)); this path may be unused in this file - confirm */
  901. #endif
  /* LN reads A entries -5, -6, -8; LT reads A entries -8, -7, -5; RN/RT scale both results */
  /* by the single B entry at -8. The plain multipliers are presumably pre-inverted diagonals - confirm. */
  902. #ifdef LN
  903. FLD -5 * SIZE(AO)
  904. fmulp %st, %st(2)
  905. FLD -6 * SIZE(AO)
  906. fmul %st(2), %st
  907. fsubrp %st, %st(1)
  908. FLD -8 * SIZE(AO)
  909. fmulp %st, %st(1)
  910. #endif
  911. #ifdef LT
  912. FLD -8 * SIZE(AO)
  913. fmulp %st, %st(1)
  914. FLD -7 * SIZE(AO)
  915. fmul %st(1), %st
  916. fsubrp %st, %st(2)
  917. FLD -5 * SIZE(AO)
  918. fmulp %st, %st(2)
  919. #endif
  920. #ifdef RN
  921. FLD -8 * SIZE(BO)
  922. fmul %st, %st(1)
  923. fmulp %st, %st(2)
  924. #endif
  925. #ifdef RT
  926. FLD -8 * SIZE(BO)
  927. fmul %st, %st(1)
  928. fmulp %st, %st(2)
  929. #endif
  930. #ifdef LN
  931. subl $2 * SIZE, CO /* LN writes C backwards: step back before storing */
  932. #endif
  /* Write each result twice: back into the B (LN/LT) or A (RN/RT) buffer, and into C. */
  933. #if defined(LN) || defined(LT)
  934. fld %st
  935. FST -8 * SIZE(BO)
  936. fxch %st(1)
  937. fld %st
  938. FST -7 * SIZE(BO)
  939. #else
  940. fld %st
  941. FST -8 * SIZE(AO)
  942. fxch %st(1)
  943. fld %st
  944. FST -7 * SIZE(AO)
  945. #endif
  946. FST 1 * SIZE(CO) /* second row of the column */
  947. FST 0 * SIZE(CO) /* first row of the column */
  948. #ifndef LN
  949. addl $2 * SIZE, CO
  950. #endif
  951. #if defined(LT) || defined(RN)
  952. movl K, %eax
  953. subl KK, %eax
  954. sall $BASE_SHIFT, %eax
  955. leal (AO, %eax, 2), AO /* skip the 2 * (K - KK) elements of A that were not consumed */
  956. leal (BO, %eax, 1), BO /* and the matching K - KK elements of B */
  957. #endif
  958. #ifdef LN
  959. subl $2, KK
  960. #endif
  961. #ifdef LT
  962. addl $2, KK
  963. #endif
  964. #ifdef RT
  965. movl K, %eax
  966. sall $1 + BASE_SHIFT, %eax
  967. addl %eax, AORIG /* RT: AORIG += 2 * K elements */
  968. #endif
  969. decl I
  970. jne .L31 /* next pair of rows */
  971. ALIGN_4
  /* .L49: bookkeeping after the single leftover column (B and KK updated for symmetry with .L29). */
  972. .L49:
  973. #ifdef LN
  974. movl K, %eax
  975. sall $BASE_SHIFT, %eax
  976. leal (B, %eax, 1), B /* LN: B += 1 * K elements */
  977. #endif
  978. #if defined(LT) || defined(RN)
  979. movl BO, B
  980. #endif
  981. #ifdef RN
  982. addl $1, KK
  983. #endif
  984. #ifdef RT
  985. subl $1, KK
  986. #endif
  987. ALIGN_4
  /* .L999: common exit - restore the saved registers in reverse push order, drop the */
  /* ARGS scratch area and return. EPILOGUE is a macro defined outside this view. */
  988. .L999:
  989. popl %ebx
  990. popl %esi
  991. popl %edi
  992. popl %ebp
  993. addl $ARGS, %esp
  994. ret
  995. EPILOGUE