You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_RT_8x2_sse.S 65 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
/* This kernel is written for SSE (and uses MMX-era prefetch hints);
   refuse to build under a configuration that lacks either. */
  40. #if !defined(HAVE_SSE) || !defined(HAVE_MMX)
  41. #error You have to check your configuration.
  42. #endif
/* ---- Incoming arguments -------------------------------------------------
   i386 cdecl: all arguments live on the caller's stack.  The prologue saves
   the caller's %esp in %esi before realigning the stack, so every STACK_*
   macro addresses the original frame through %esi.  STACK (=16) skips the
   four registers pushed by the prologue (%ebp, %edi, %esi, %ebx); ARGS is a
   hook for extra padding (0 here). */
  43. #define STACK 16
  44. #define ARGS 0
  45. #define STACK_M 4 + STACK + ARGS(%esi)
  46. #define STACK_N 8 + STACK + ARGS(%esi)
  47. #define STACK_K 12 + STACK + ARGS(%esi)
/* NOTE(review): offset 16 is skipped between K and A — presumably the alpha
   scalar slot, unused by TRSM; confirm against the caller's interface. */
  48. #define STACK_A 20 + STACK + ARGS(%esi)
  49. #define STACK_B 24 + STACK + ARGS(%esi)
  50. #define STACK_C 28 + STACK + ARGS(%esi)
  51. #define STACK_LDC 32 + STACK + ARGS(%esi)
  52. #define STACK_OFFT 36 + STACK + ARGS(%esi)
/* ---- Local frame (relative to the realigned %esp) -----------------------
   TRMASK occupies 0..15: a 16-byte {1.0, 0.0, 1.0, 0.0} mask written by the
   prologue for the LN/LT triangular-solve shuffles.  The scalars below are
   spilled copies of the arguments plus loop state. */
  53. #define TRMASK 0(%esp)
  54. #define K 16(%esp)
  55. #define N 20(%esp)
  56. #define M 24(%esp)
  57. #define A 28(%esp)
  58. #define C 32(%esp)
  59. #define J 36(%esp)
/* OLD_STACK holds the caller's %esp so the epilogue can restore it. */
  60. #define OLD_STACK 40(%esp)
  61. #define OFFSET 44(%esp)
/* KK/KKK: running offset counters for the LN/LT/RN/RT partial-K loops. */
  62. #define KK 48(%esp)
  63. #define KKK 52(%esp)
/* AORIG/BORIG: saved base pointers of the A/B panels, needed because the
   AA/B cursors are rewound per tile in the LN/RT (backward) variants. */
  64. #define AORIG 56(%esp)
  65. #define BORIG 60(%esp)
/* BUFFER: start of the local workspace where B is expanded 4x (each scalar
   broadcast to a full 4-float vector) for aligned movaps loads. */
  66. #define BUFFER 128(%esp)
/* Prefetch flavor: 3DNow! cores use prefetch/prefetchw; otherwise fall back
   to prefetcht0 for both read and write streams. */
  67. #ifdef HAVE_3DNOW
  68. #define PREFETCH prefetch
  69. #define PREFETCHW prefetchw
  70. #define PREFETCHSIZE (16 * 10 + 8)
  71. #else
  72. #define PREFETCH prefetcht0
  73. #define PREFETCHW prefetcht0
  74. #define PREFETCHSIZE 96
  75. #endif
/* ---- Register roles -----------------------------------------------------
   B   = %edi : cursor into the packed B panel
   AA  = %edx : cursor into the current A tile
   BB  = %ecx : cursor into the expanded-B BUFFER
   LDC = %ebp : row stride of C, scaled to bytes
   CO1 = %esi : current C column pointer (reuses %esi after the argument
                loads are done — STACK_* macros are dead by then) */
  76. #define B %edi
  77. #define AA %edx
  78. #define BB %ecx
  79. #define LDC %ebp
  80. #define CO1 %esi
  81. #define STACK_ALIGN 4096
  82. #define STACK_OFFSET 1024
/* Pre-SSE2 parts (and Opteron, where movlps is faster) have no scalar
   movsd; alias it to movlps, which matches the 64-bit-load usage here. */
  83. #if !defined(HAVE_SSE2) || defined(OPTERON)
  84. #define movsd movlps
  85. #endif
/* With SSE2, prefer pxor for register zeroing (integer domain, breaks
   dependencies) over xorps. */
  86. #ifdef HAVE_SSE2
  87. #define xorps pxor
  88. #endif
  89. PROLOGUE
  90. pushl %ebp
  91. pushl %edi
  92. pushl %esi
  93. pushl %ebx
  94. PROFCODE
  95. movl %esp, %esi # save old stack
  96. subl $128 + LOCAL_BUFFER_SIZE, %esp
  97. andl $-STACK_ALIGN, %esp
  98. STACK_TOUCHING
  99. movss STACK_M, %xmm0
  100. movl STACK_N, %eax
  101. movss STACK_K, %xmm1
  102. movss STACK_A, %xmm2
  103. movl STACK_B, B
  104. movss STACK_C, %xmm3
  105. movl STACK_LDC, LDC
  106. movss STACK_OFFT, %xmm4
  107. movss %xmm1, K
  108. movl %eax, N
  109. movss %xmm0, M
  110. movss %xmm2, A
  111. movss %xmm3, C
  112. movl %esi, OLD_STACK
  113. movss %xmm4, OFFSET
  114. movss %xmm4, KK
  115. leal (, LDC, SIZE), LDC
  116. #ifdef LN
  117. movl M, %eax
  118. leal (, %eax, SIZE), %eax
  119. addl %eax, C
  120. imull K, %eax
  121. addl %eax, A
  122. #endif
  123. #ifdef RT
  124. movl N, %eax
  125. leal (, %eax, SIZE), %eax
  126. imull K, %eax
  127. addl %eax, B
  128. movl N, %eax
  129. imull LDC, %eax
  130. addl %eax, C
  131. #endif
  132. #ifdef RN
  133. negl KK
  134. #endif
  135. #ifdef RT
  136. movl N, %eax
  137. subl OFFSET, %eax
  138. movl %eax, KK
  139. #endif
  140. #if defined(LN) || defined(LT)
  141. movl $0x3f800000, 0 + TRMASK # 1.0
  142. movl $0x00000000, 4 + TRMASK # 0.0
  143. movl $0x3f800000, 8 + TRMASK # 1.0
  144. movl $0x00000000, 12 + TRMASK # 0.0
  145. #endif
  146. testl $1, N
  147. jle .L100
  148. #ifdef LN
  149. movl OFFSET, %eax
  150. addl M, %eax
  151. movl %eax, KK
  152. #endif
  153. leal BUFFER, BB
  154. #ifdef RT
  155. movl K, %eax
  156. sall $BASE_SHIFT, %eax
  157. subl %eax, B
  158. #endif
  159. #if defined(LN) || defined(RT)
  160. movl KK, %eax
  161. movl B, BORIG
  162. sall $BASE_SHIFT, %eax
  163. leal (B, %eax, 1), B
  164. leal (BB, %eax, 4), BB
  165. #endif
  166. #ifdef LT
  167. movl OFFSET, %eax
  168. movl %eax, KK
  169. #endif
  170. #if defined(LT) || defined(RN)
  171. movl KK, %eax
  172. #else
  173. movl K, %eax
  174. subl KK, %eax
  175. #endif
  176. sarl $3, %eax
  177. jle .L103
  178. ALIGN_4
  179. .L102:
  180. movsd 0 * SIZE(B), %xmm3
  181. movhps 2 * SIZE(B), %xmm3
  182. movsd 4 * SIZE(B), %xmm7
  183. movhps 6 * SIZE(B), %xmm7
  184. #ifdef HAVE_SSE2
  185. pshufd $0x00, %xmm3, %xmm0
  186. pshufd $0x55, %xmm3, %xmm1
  187. pshufd $0xaa, %xmm3, %xmm2
  188. pshufd $0xff, %xmm3, %xmm3
  189. pshufd $0x00, %xmm7, %xmm4
  190. pshufd $0x55, %xmm7, %xmm5
  191. pshufd $0xaa, %xmm7, %xmm6
  192. pshufd $0xff, %xmm7, %xmm7
  193. #else
  194. movaps %xmm3, %xmm0
  195. shufps $0x00, %xmm0, %xmm0
  196. movaps %xmm3, %xmm1
  197. shufps $0x55, %xmm1, %xmm1
  198. movaps %xmm3, %xmm2
  199. shufps $0xaa, %xmm2, %xmm2
  200. shufps $0xff, %xmm3, %xmm3
  201. movaps %xmm7, %xmm4
  202. shufps $0x00, %xmm4, %xmm4
  203. movaps %xmm7, %xmm5
  204. shufps $0x55, %xmm5, %xmm5
  205. movaps %xmm7, %xmm6
  206. shufps $0xaa, %xmm6, %xmm6
  207. shufps $0xff, %xmm7, %xmm7
  208. #endif
  209. movaps %xmm0, 0 * SIZE(BB)
  210. movaps %xmm1, 4 * SIZE(BB)
  211. movaps %xmm2, 8 * SIZE(BB)
  212. movaps %xmm3, 12 * SIZE(BB)
  213. movaps %xmm4, 16 * SIZE(BB)
  214. movaps %xmm5, 20 * SIZE(BB)
  215. movaps %xmm6, 24 * SIZE(BB)
  216. movaps %xmm7, 28 * SIZE(BB)
  217. addl $ 8 * SIZE, B
  218. addl $32 * SIZE, BB
  219. decl %eax
  220. BRANCH
  221. jne .L102
  222. ALIGN_2
  223. .L103:
  224. #if defined(LT) || defined(RN)
  225. movl KK, %eax
  226. #else
  227. movl K, %eax
  228. subl KK, %eax
  229. #endif
  230. andl $7, %eax
  231. BRANCH
  232. jle .L105
  233. ALIGN_2
  234. .L104:
  235. movss 0 * SIZE(B), %xmm0
  236. shufps $0x00, %xmm0, %xmm0
  237. movaps %xmm0, 0 * SIZE(BB)
  238. addl $1 * SIZE, B
  239. addl $4 * SIZE, BB
  240. decl %eax
  241. jne .L104
  242. ALIGN_4
  243. .L105:
  244. #if defined(LT) || defined(RN)
  245. movl A, AA
  246. #else
  247. movl A, %eax
  248. movl %eax, AORIG
  249. #endif
  250. #ifdef RT
  251. subl LDC, C
  252. #endif
  253. movl C, CO1
  254. #ifndef RT
  255. addl LDC, C
  256. #endif
  257. movl M, %ebx
  258. sarl $3, %ebx # i = (m >> 2)
  259. jle .L130
  260. ALIGN_4
  261. .L110:
  262. #ifdef LN
  263. movl K, %eax
  264. sall $3 + BASE_SHIFT, %eax
  265. subl %eax, AORIG
  266. #endif
  267. #if defined(LN) || defined(RT)
  268. movl KK, %eax
  269. movl AORIG, AA
  270. sall $3 + BASE_SHIFT, %eax
  271. addl %eax, AA
  272. #endif
  273. leal BUFFER, BB
  274. #if defined(LN) || defined(RT)
  275. movl KK, %eax
  276. sall $BASE_SHIFT, %eax
  277. leal (BB, %eax, 4), BB
  278. #endif
  279. movaps 0 * SIZE(BB), %xmm2
  280. xorps %xmm4, %xmm4
  281. movaps 0 * SIZE(AA), %xmm0
  282. xorps %xmm5, %xmm5
  283. movaps 16 * SIZE(BB), %xmm3
  284. xorps %xmm6, %xmm6
  285. movaps 16 * SIZE(AA), %xmm1
  286. xorps %xmm7, %xmm7
  287. PREFETCHW 7 * SIZE(CO1)
  288. #if defined(LT) || defined(RN)
  289. movl KK, %eax
  290. #else
  291. movl K, %eax
  292. subl KK, %eax
  293. #endif
  294. sarl $3, %eax
  295. je .L112
  296. ALIGN_2
  297. .L111:
  298. mulps %xmm2, %xmm0
  299. mulps 4 * SIZE(AA), %xmm2
  300. addps %xmm0, %xmm4
  301. movaps 8 * SIZE(AA), %xmm0
  302. addps %xmm2, %xmm6
  303. movaps 4 * SIZE(BB), %xmm2
  304. mulps %xmm2, %xmm0
  305. mulps 12 * SIZE(AA), %xmm2
  306. addps %xmm0, %xmm5
  307. movaps 32 * SIZE(AA), %xmm0
  308. addps %xmm2, %xmm7
  309. movaps 8 * SIZE(BB), %xmm2
  310. mulps %xmm2, %xmm1
  311. mulps 20 * SIZE(AA), %xmm2
  312. addps %xmm1, %xmm4
  313. movaps 24 * SIZE(AA), %xmm1
  314. addps %xmm2, %xmm6
  315. movaps 12 * SIZE(BB), %xmm2
  316. mulps %xmm2, %xmm1
  317. mulps 28 * SIZE(AA), %xmm2
  318. addps %xmm1, %xmm5
  319. movaps 48 * SIZE(AA), %xmm1
  320. addps %xmm2, %xmm7
  321. movaps 32 * SIZE(BB), %xmm2
  322. mulps %xmm3, %xmm0
  323. mulps 36 * SIZE(AA), %xmm3
  324. addps %xmm0, %xmm4
  325. movaps 40 * SIZE(AA), %xmm0
  326. addps %xmm3, %xmm6
  327. movaps 20 * SIZE(BB), %xmm3
  328. mulps %xmm3, %xmm0
  329. mulps 44 * SIZE(AA), %xmm3
  330. addps %xmm0, %xmm5
  331. movaps 64 * SIZE(AA), %xmm0
  332. addps %xmm3, %xmm7
  333. movaps 24 * SIZE(BB), %xmm3
  334. mulps %xmm3, %xmm1
  335. mulps 52 * SIZE(AA), %xmm3
  336. addps %xmm1, %xmm4
  337. movaps 56 * SIZE(AA), %xmm1
  338. addps %xmm3, %xmm6
  339. movaps 28 * SIZE(BB), %xmm3
  340. mulps %xmm3, %xmm1
  341. mulps 60 * SIZE(AA), %xmm3
  342. addps %xmm1, %xmm5
  343. movaps 80 * SIZE(AA), %xmm1
  344. addps %xmm3, %xmm7
  345. movaps 48 * SIZE(BB), %xmm3
  346. addl $64 * SIZE, AA
  347. addl $32 * SIZE, BB
  348. decl %eax
  349. jne .L111
  350. ALIGN_2
  351. .L112:
  352. #if defined(LT) || defined(RN)
  353. movl KK, %eax
  354. #else
  355. movl K, %eax
  356. subl KK, %eax
  357. #endif
  358. andl $7, %eax # if (k & 1)
  359. BRANCH
  360. je .L114
  361. .L113:
  362. movaps 0 * SIZE(BB), %xmm2
  363. movaps 0 * SIZE(AA), %xmm0
  364. mulps %xmm2, %xmm0
  365. addps %xmm0, %xmm4
  366. mulps 4 * SIZE(AA), %xmm2
  367. addps %xmm2, %xmm6
  368. addl $8 * SIZE, AA
  369. addl $4 * SIZE, BB
  370. subl $1, %eax
  371. jg .L113
  372. ALIGN_4
  373. .L114:
  374. addps %xmm5, %xmm4
  375. addps %xmm7, %xmm6
  376. #if defined(LN) || defined(RT)
  377. movl KK, %eax
  378. #ifdef LN
  379. subl $8, %eax
  380. #else
  381. subl $1, %eax
  382. #endif
  383. movl AORIG, AA
  384. movl BORIG, B
  385. leal BUFFER, BB
  386. sall $BASE_SHIFT, %eax
  387. leal (AA, %eax, 8), AA
  388. leal (B, %eax, 1), B
  389. leal (BB, %eax, 4), BB
  390. #endif
  391. #if defined(LN) || defined(LT)
  392. movsd 0 * SIZE(B), %xmm2
  393. movhps 2 * SIZE(B), %xmm2
  394. movsd 4 * SIZE(B), %xmm5
  395. movhps 6 * SIZE(B), %xmm5
  396. subps %xmm4, %xmm2
  397. subps %xmm6, %xmm5
  398. xorps %xmm0, %xmm0
  399. movaps %xmm2, %xmm3
  400. unpcklps %xmm0, %xmm2
  401. unpckhps %xmm0, %xmm3
  402. movaps %xmm5, %xmm7
  403. unpcklps %xmm0, %xmm5
  404. unpckhps %xmm0, %xmm7
  405. #else
  406. movaps 0 * SIZE(AA), %xmm0
  407. movaps 4 * SIZE(AA), %xmm1
  408. subps %xmm4, %xmm0
  409. subps %xmm6, %xmm1
  410. #endif
  411. #if defined(LN) || defined(LT)
  412. movaps TRMASK, %xmm6
  413. #endif
  414. #ifdef LN
  415. movss 63 * SIZE(AA), %xmm0
  416. movaps %xmm6, %xmm1
  417. shufps $0x00, %xmm0, %xmm1
  418. mulps %xmm1, %xmm7
  419. movaps %xmm7, %xmm1
  420. shufps $0xee, %xmm1, %xmm1
  421. movss 62 * SIZE(AA), %xmm0
  422. shufps $0x50, %xmm0, %xmm0
  423. mulps %xmm1, %xmm0
  424. subps %xmm0, %xmm7
  425. movsd 60 * SIZE(AA), %xmm0
  426. shufps $0x50, %xmm0, %xmm0
  427. mulps %xmm1, %xmm0
  428. subps %xmm0, %xmm5
  429. movsd 58 * SIZE(AA), %xmm0
  430. shufps $0x50, %xmm0, %xmm0
  431. mulps %xmm1, %xmm0
  432. subps %xmm0, %xmm3
  433. movsd 56 * SIZE(AA), %xmm0
  434. shufps $0x50, %xmm0, %xmm0
  435. mulps %xmm1, %xmm0
  436. subps %xmm0, %xmm2
  437. movss 54 * SIZE(AA), %xmm0
  438. shufps $0x00, %xmm6, %xmm0
  439. mulps %xmm0, %xmm7
  440. movaps %xmm7, %xmm1
  441. shufps $0x44, %xmm1, %xmm1
  442. movsd 52 * SIZE(AA), %xmm0
  443. shufps $0x50, %xmm0, %xmm0
  444. mulps %xmm1, %xmm0
  445. subps %xmm0, %xmm5
  446. movsd 50 * SIZE(AA), %xmm0
  447. shufps $0x50, %xmm0, %xmm0
  448. mulps %xmm1, %xmm0
  449. subps %xmm0, %xmm3
  450. movsd 48 * SIZE(AA), %xmm0
  451. shufps $0x50, %xmm0, %xmm0
  452. mulps %xmm1, %xmm0
  453. subps %xmm0, %xmm2
  454. movss 45 * SIZE(AA), %xmm0
  455. movaps %xmm6, %xmm1
  456. shufps $0x00, %xmm0, %xmm1
  457. mulps %xmm1, %xmm5
  458. movaps %xmm5, %xmm1
  459. shufps $0xee, %xmm1, %xmm1
  460. movss 44 * SIZE(AA), %xmm0
  461. shufps $0x50, %xmm0, %xmm0
  462. mulps %xmm1, %xmm0
  463. subps %xmm0, %xmm5
  464. movsd 42 * SIZE(AA), %xmm0
  465. shufps $0x50, %xmm0, %xmm0
  466. mulps %xmm1, %xmm0
  467. subps %xmm0, %xmm3
  468. movsd 40 * SIZE(AA), %xmm0
  469. shufps $0x50, %xmm0, %xmm0
  470. mulps %xmm1, %xmm0
  471. subps %xmm0, %xmm2
  472. movss 36 * SIZE(AA), %xmm0
  473. shufps $0x00, %xmm6, %xmm0
  474. mulps %xmm0, %xmm5
  475. movaps %xmm5, %xmm1
  476. shufps $0x44, %xmm1, %xmm1
  477. movsd 34 * SIZE(AA), %xmm0
  478. shufps $0x50, %xmm0, %xmm0
  479. mulps %xmm1, %xmm0
  480. subps %xmm0, %xmm3
  481. movsd 32 * SIZE(AA), %xmm0
  482. shufps $0x50, %xmm0, %xmm0
  483. mulps %xmm1, %xmm0
  484. subps %xmm0, %xmm2
  485. movss 27 * SIZE(AA), %xmm0
  486. movaps %xmm6, %xmm1
  487. shufps $0x00, %xmm0, %xmm1
  488. mulps %xmm1, %xmm3
  489. movaps %xmm3, %xmm1
  490. shufps $0xee, %xmm1, %xmm1
  491. movss 26 * SIZE(AA), %xmm0
  492. shufps $0x50, %xmm0, %xmm0
  493. mulps %xmm1, %xmm0
  494. subps %xmm0, %xmm3
  495. movsd 24 * SIZE(AA), %xmm0
  496. shufps $0x50, %xmm0, %xmm0
  497. mulps %xmm1, %xmm0
  498. subps %xmm0, %xmm2
  499. movss 18 * SIZE(AA), %xmm0
  500. shufps $0x00, %xmm6, %xmm0
  501. mulps %xmm0, %xmm3
  502. movaps %xmm3, %xmm1
  503. shufps $0x44, %xmm1, %xmm1
  504. movsd 16 * SIZE(AA), %xmm0
  505. shufps $0x50, %xmm0, %xmm0
  506. mulps %xmm1, %xmm0
  507. subps %xmm0, %xmm2
  508. movss 9 * SIZE(AA), %xmm0
  509. movaps %xmm6, %xmm1
  510. shufps $0x00, %xmm0, %xmm1
  511. mulps %xmm1, %xmm2
  512. movaps %xmm2, %xmm1
  513. shufps $0xee, %xmm1, %xmm1
  514. movss 8 * SIZE(AA), %xmm0
  515. shufps $0x50, %xmm0, %xmm0
  516. mulps %xmm1, %xmm0
  517. subps %xmm0, %xmm2
  518. movss 0 * SIZE(AA), %xmm0
  519. shufps $0x00, %xmm6, %xmm0
  520. mulps %xmm0, %xmm2
  521. #endif
  522. #ifdef LT
  523. movss 0 * SIZE(AA), %xmm0
  524. shufps $0x00, %xmm6, %xmm0
  525. mulps %xmm0, %xmm2
  526. movaps %xmm2, %xmm1
  527. shufps $0x44, %xmm1, %xmm1
  528. movss 1 * SIZE(AA), %xmm0
  529. shufps $0x05, %xmm0, %xmm0
  530. mulps %xmm1, %xmm0
  531. subps %xmm0, %xmm2
  532. movsd 2 * SIZE(AA), %xmm0
  533. shufps $0x50, %xmm0, %xmm0
  534. mulps %xmm1, %xmm0
  535. subps %xmm0, %xmm3
  536. movsd 4 * SIZE(AA), %xmm0
  537. shufps $0x50, %xmm0, %xmm0
  538. mulps %xmm1, %xmm0
  539. subps %xmm0, %xmm5
  540. movsd 6 * SIZE(AA), %xmm0
  541. shufps $0x50, %xmm0, %xmm0
  542. mulps %xmm1, %xmm0
  543. subps %xmm0, %xmm7
  544. movss 9 * SIZE(AA), %xmm0
  545. movaps %xmm6, %xmm1
  546. shufps $0x00, %xmm0, %xmm1
  547. mulps %xmm1, %xmm2
  548. movaps %xmm2, %xmm1
  549. shufps $0xee, %xmm1, %xmm1
  550. movsd 10 * SIZE(AA), %xmm0
  551. shufps $0x50, %xmm0, %xmm0
  552. mulps %xmm1, %xmm0
  553. subps %xmm0, %xmm3
  554. movsd 12 * SIZE(AA), %xmm0
  555. shufps $0x50, %xmm0, %xmm0
  556. mulps %xmm1, %xmm0
  557. subps %xmm0, %xmm5
  558. movsd 14 * SIZE(AA), %xmm0
  559. shufps $0x50, %xmm0, %xmm0
  560. mulps %xmm1, %xmm0
  561. subps %xmm0, %xmm7
  562. movss 18 * SIZE(AA), %xmm0
  563. shufps $0x00, %xmm6, %xmm0
  564. mulps %xmm0, %xmm3
  565. movaps %xmm3, %xmm1
  566. shufps $0x44, %xmm1, %xmm1
  567. movss 19 * SIZE(AA), %xmm0
  568. shufps $0x05, %xmm0, %xmm0
  569. mulps %xmm1, %xmm0
  570. subps %xmm0, %xmm3
  571. movsd 20 * SIZE(AA), %xmm0
  572. shufps $0x50, %xmm0, %xmm0
  573. mulps %xmm1, %xmm0
  574. subps %xmm0, %xmm5
  575. movsd 22 * SIZE(AA), %xmm0
  576. shufps $0x50, %xmm0, %xmm0
  577. mulps %xmm1, %xmm0
  578. subps %xmm0, %xmm7
  579. movss 27 * SIZE(AA), %xmm0
  580. movaps %xmm6, %xmm1
  581. shufps $0x00, %xmm0, %xmm1
  582. mulps %xmm1, %xmm3
  583. movaps %xmm3, %xmm1
  584. shufps $0xee, %xmm1, %xmm1
  585. movsd 28 * SIZE(AA), %xmm0
  586. shufps $0x50, %xmm0, %xmm0
  587. mulps %xmm1, %xmm0
  588. subps %xmm0, %xmm5
  589. movsd 30 * SIZE(AA), %xmm0
  590. shufps $0x50, %xmm0, %xmm0
  591. mulps %xmm1, %xmm0
  592. subps %xmm0, %xmm7
  593. movss 36 * SIZE(AA), %xmm0
  594. shufps $0x00, %xmm6, %xmm0
  595. mulps %xmm0, %xmm5
  596. movaps %xmm5, %xmm1
  597. shufps $0x44, %xmm1, %xmm1
  598. movss 37 * SIZE(AA), %xmm0
  599. shufps $0x05, %xmm0, %xmm0
  600. mulps %xmm1, %xmm0
  601. subps %xmm0, %xmm5
  602. movsd 38 * SIZE(AA), %xmm0
  603. shufps $0x50, %xmm0, %xmm0
  604. mulps %xmm1, %xmm0
  605. subps %xmm0, %xmm7
  606. movss 45 * SIZE(AA), %xmm0
  607. movaps %xmm6, %xmm1
  608. shufps $0x00, %xmm0, %xmm1
  609. mulps %xmm1, %xmm5
  610. movaps %xmm5, %xmm1
  611. shufps $0xee, %xmm1, %xmm1
  612. movsd 46 * SIZE(AA), %xmm0
  613. shufps $0x50, %xmm0, %xmm0
  614. mulps %xmm1, %xmm0
  615. subps %xmm0, %xmm7
  616. movss 54 * SIZE(AA), %xmm0
  617. shufps $0x00, %xmm6, %xmm0
  618. mulps %xmm0, %xmm7
  619. movaps %xmm7, %xmm1
  620. shufps $0x44, %xmm1, %xmm1
  621. movss 55 * SIZE(AA), %xmm0
  622. shufps $0x05, %xmm0, %xmm0
  623. mulps %xmm1, %xmm0
  624. subps %xmm0, %xmm7
  625. movss 63 * SIZE(AA), %xmm0
  626. movaps %xmm6, %xmm1
  627. shufps $0x00, %xmm0, %xmm1
  628. mulps %xmm1, %xmm7
  629. #endif
  630. #if defined(RN) || defined(RT)
  631. movss 0 * SIZE(B), %xmm6
  632. shufps $0x00, %xmm6, %xmm6
  633. mulps %xmm6, %xmm0
  634. mulps %xmm6, %xmm1
  635. #endif
  636. #if defined(LN) || defined(LT)
  637. shufps $0x88, %xmm3, %xmm2
  638. shufps $0x88, %xmm7, %xmm5
  639. movlps %xmm2, 0 * SIZE(B)
  640. movhps %xmm2, 2 * SIZE(B)
  641. movlps %xmm5, 4 * SIZE(B)
  642. movhps %xmm5, 6 * SIZE(B)
  643. #ifdef HAVE_SSE2
  644. pshufd $0x00, %xmm2, %xmm0
  645. pshufd $0x55, %xmm2, %xmm1
  646. pshufd $0xaa, %xmm2, %xmm4
  647. pshufd $0xff, %xmm2, %xmm6
  648. #else
  649. movaps %xmm2, %xmm0
  650. shufps $0x00, %xmm0, %xmm0
  651. movaps %xmm2, %xmm1
  652. shufps $0x55, %xmm1, %xmm1
  653. movaps %xmm2, %xmm4
  654. shufps $0xaa, %xmm4, %xmm4
  655. movaps %xmm2, %xmm6
  656. shufps $0xff, %xmm6, %xmm6
  657. #endif
  658. movaps %xmm0, 0 * SIZE(BB)
  659. movaps %xmm1, 4 * SIZE(BB)
  660. movaps %xmm4, 8 * SIZE(BB)
  661. movaps %xmm6, 12 * SIZE(BB)
  662. #ifdef HAVE_SSE2
  663. pshufd $0x00, %xmm5, %xmm0
  664. pshufd $0x55, %xmm5, %xmm1
  665. pshufd $0xaa, %xmm5, %xmm4
  666. pshufd $0xff, %xmm5, %xmm6
  667. #else
  668. movaps %xmm5, %xmm0
  669. shufps $0x00, %xmm0, %xmm0
  670. movaps %xmm5, %xmm1
  671. shufps $0x55, %xmm1, %xmm1
  672. movaps %xmm5, %xmm4
  673. shufps $0xaa, %xmm4, %xmm4
  674. movaps %xmm5, %xmm6
  675. shufps $0xff, %xmm6, %xmm6
  676. #endif
  677. movaps %xmm0, 16 * SIZE(BB)
  678. movaps %xmm1, 20 * SIZE(BB)
  679. movaps %xmm4, 24 * SIZE(BB)
  680. movaps %xmm6, 28 * SIZE(BB)
  681. #else
  682. movaps %xmm0, 0 * SIZE(AA)
  683. movaps %xmm1, 4 * SIZE(AA)
  684. #endif
  685. #ifdef LN
  686. subl $8 * SIZE, CO1
  687. #endif
  688. #if defined(LN) || defined(LT)
  689. movlps %xmm2, 0 * SIZE(CO1)
  690. movhps %xmm2, 2 * SIZE(CO1)
  691. movlps %xmm5, 4 * SIZE(CO1)
  692. movhps %xmm5, 6 * SIZE(CO1)
  693. #else
  694. movlps %xmm0, 0 * SIZE(CO1)
  695. movhps %xmm0, 2 * SIZE(CO1)
  696. movlps %xmm1, 4 * SIZE(CO1)
  697. movhps %xmm1, 6 * SIZE(CO1)
  698. #endif
  699. #ifndef LN
  700. addl $8 * SIZE, CO1
  701. #endif
  702. #if defined(LT) || defined(RN)
  703. movl K, %eax
  704. subl KK, %eax
  705. leal (,%eax, SIZE), %eax
  706. leal (AA, %eax, 8), AA
  707. #ifdef LT
  708. addl $8 * SIZE, B
  709. #endif
  710. #endif
  711. #ifdef LN
  712. subl $8, KK
  713. movl BORIG, B
  714. #endif
  715. #ifdef LT
  716. addl $8, KK
  717. #endif
  718. #ifdef RT
  719. movl K, %eax
  720. movl BORIG, B
  721. sall $3 + BASE_SHIFT, %eax
  722. addl %eax, AORIG
  723. #endif
  724. decl %ebx # i --
  725. jg .L110
  726. ALIGN_2
# ----------------------------------------------------------------------
# .L130: 4-row tail of M for the single right-hand-side column (N & 1).
# Phase 1: multiply-accumulate A(4 x k) against the splat-packed b column
#          in BUFFER (8 k-steps per trip through .L131, remainder .L133).
# Phase 2: 4x4 triangular solve (LN: backward, LT: forward) or a scalar
#          diagonal scale (RN/RT), then write the result back to B and
#          the splat buffer BB (or to the A panel) and to C via CO1.
# Register roles in the MAC loop: AA -> packed A panel, BB -> BUFFER
# (each b value replicated 4-wide), xmm4..xmm7 -> partial accumulators.
# NOTE(review): LN/LT/RN/RT, SIZE, BASE_SHIFT, TRMASK, BUFFER, AORIG,
# BORIG, KK, CO1, ALIGN_* are defined earlier in the file; comments
# assume the usual OpenBLAS trsm-kernel meanings — confirm there.
# ----------------------------------------------------------------------
  727. .L130:
# Process this tail only if bit 2 of M is set.
  728. testl $4, M
  729. jle .L150
# LN: step AORIG back over the 4 x K panel (K * 4 * SIZE bytes).
  730. #ifdef LN
  731. movl K, %eax
  732. sall $2 + BASE_SHIFT, %eax
  733. subl %eax, AORIG
  734. #endif
# LN/RT: AA = AORIG + KK * 4 * SIZE (skip the already-solved part).
  735. #if defined(LN) || defined(RT)
  736. movl KK, %eax
  737. movl AORIG, AA
  738. sall $2 + BASE_SHIFT, %eax
  739. addl %eax, AA
  740. #endif
  741. leal BUFFER, BB
# LN/RT: BB = BUFFER + KK * SIZE * 4 (one 4-wide splat entry per k).
  742. #if defined(LN) || defined(RT)
  743. movl KK, %eax
  744. sall $BASE_SHIFT, %eax
  745. leal (BB, %eax, 4), BB
  746. #endif
# Preload the first A/B data and clear the four accumulators.
  747. movaps 0 * SIZE(BB), %xmm2
  748. xorps %xmm4, %xmm4
  749. movsd 0 * SIZE(AA), %xmm0
  750. movhps 2 * SIZE(AA), %xmm0
  751. xorps %xmm5, %xmm5
  752. movaps 16 * SIZE(BB), %xmm3
  753. xorps %xmm6, %xmm6
  754. movsd 16 * SIZE(AA), %xmm1
  755. movhps 18 * SIZE(AA), %xmm1
  756. xorps %xmm7, %xmm7
# Trip count: KK for LT/RN, K - KK for LN/RT; main loop = 8 k-steps.
  757. #if defined(LT) || defined(RN)
  758. movl KK, %eax
  759. #else
  760. movl K, %eax
  761. subl KK, %eax
  762. #endif
  763. sarl $3, %eax
  764. je .L132
  765. ALIGN_2
# Main MAC loop, unrolled 8x: AA advances 32*SIZE (8 k * 4 floats),
# BB advances 32*SIZE (8 splat entries); loads are software-pipelined.
  766. .L131:
  767. mulps %xmm0, %xmm2
  768. movaps 4 * SIZE(AA), %xmm0
  769. addps %xmm2, %xmm4
  770. mulps 4 * SIZE(BB), %xmm0
  771. movaps 32 * SIZE(BB), %xmm2
  772. addps %xmm0, %xmm5
  773. movaps 8 * SIZE(AA), %xmm0
  774. mulps 8 * SIZE(BB), %xmm0
  775. addps %xmm0, %xmm6
  776. movaps 12 * SIZE(AA), %xmm0
  777. mulps 12 * SIZE(BB), %xmm0
  778. addps %xmm0, %xmm7
  779. movaps 32 * SIZE(AA), %xmm0
  780. mulps %xmm1, %xmm3
  781. movaps 20 * SIZE(AA), %xmm1
  782. addps %xmm3, %xmm4
  783. mulps 20 * SIZE(BB), %xmm1
  784. movaps 48 * SIZE(BB), %xmm3
  785. addps %xmm1, %xmm5
  786. movaps 24 * SIZE(AA), %xmm1
  787. mulps 24 * SIZE(BB), %xmm1
  788. addps %xmm1, %xmm6
  789. movaps 28 * SIZE(AA), %xmm1
  790. mulps 28 * SIZE(BB), %xmm1
  791. addps %xmm1, %xmm7
  792. movaps 48 * SIZE(AA), %xmm1
  793. addl $32 * SIZE, AA
  794. addl $32 * SIZE, BB
  795. decl %eax
  796. jne .L131
  797. ALIGN_2
# Remainder: handle the leftover (k & 7) iterations one k at a time.
  798. .L132:
  799. #if defined(LT) || defined(RN)
  800. movl KK, %eax
  801. #else
  802. movl K, %eax
  803. subl KK, %eax
  804. #endif
  805. andl $7, %eax # if (k & 1)
  806. BRANCH
  807. je .L134
  808. .L133:
  809. movaps 0 * SIZE(BB), %xmm2
  810. movaps 0 * SIZE(AA), %xmm0
  811. mulps %xmm0, %xmm2
  812. addps %xmm2, %xmm4
  813. addl $4 * SIZE, AA
  814. addl $4 * SIZE, BB
  815. decl %eax
  816. jg .L133
  817. ALIGN_4
# Collapse the four partial sums into xmm4.
  818. .L134:
  819. addps %xmm5, %xmm4
  820. addps %xmm7, %xmm6
  821. addps %xmm6, %xmm4
# LN/RT: rewind AA / B / BB to the current diagonal tile
# (KK - 4 rows for LN, KK - 1 column for RT).
  822. #if defined(LN) || defined(RT)
  823. movl KK, %eax
  824. #ifdef LN
  825. subl $4, %eax
  826. #else
  827. subl $1, %eax
  828. #endif
  829. movl AORIG, AA
  830. movl BORIG, B
  831. leal BUFFER, BB
  832. sall $BASE_SHIFT, %eax
  833. leal (AA, %eax, 4), AA
  834. leal (B, %eax, 1), B
  835. leal (BB, %eax, 4), BB
  836. #endif
# LN/LT: rhs = b - accumulated product; spread the 4 values across two
# registers with zeros interleaved (xmm2 = low pair, xmm3 = high pair)
# so the solve below can work on value/zero lane pairs.
  837. #if defined(LN) || defined(LT)
  838. movsd 0 * SIZE(B), %xmm2
  839. movhps 2 * SIZE(B), %xmm2
  840. subps %xmm4, %xmm2
  841. xorps %xmm5, %xmm5
  842. movaps %xmm2, %xmm3
  843. unpcklps %xmm5, %xmm2
  844. unpckhps %xmm5, %xmm3
  845. #else
# RN/RT: rhs stays packed in xmm0 = a - accumulated product.
  846. movaps 0 * SIZE(AA), %xmm0
  847. subps %xmm4, %xmm0
  848. #endif
  849. #if defined(LN) || defined(LT)
# xmm6 = TRMASK, combined with diagonal elements to build lane masks.
  850. movaps TRMASK, %xmm6
  851. #endif
# LN: backward substitution through the packed 4x4 tile; the values
# used as diagonal scales sit at offsets 15, 10, 5, 0 of the tile.
  852. #ifdef LN
  853. movss 15 * SIZE(AA), %xmm0
  854. movaps %xmm6, %xmm1
  855. shufps $0x00, %xmm0, %xmm1
  856. mulps %xmm1, %xmm3
  857. movaps %xmm3, %xmm1
  858. shufps $0xee, %xmm1, %xmm1
  859. movss 14 * SIZE(AA), %xmm0
  860. shufps $0x50, %xmm0, %xmm0
  861. mulps %xmm1, %xmm0
  862. subps %xmm0, %xmm3
  863. movsd 12 * SIZE(AA), %xmm0
  864. shufps $0x50, %xmm0, %xmm0
  865. mulps %xmm1, %xmm0
  866. subps %xmm0, %xmm2
  867. movss 10 * SIZE(AA), %xmm0
  868. shufps $0x00, %xmm6, %xmm0
  869. mulps %xmm0, %xmm3
  870. movaps %xmm3, %xmm1
  871. shufps $0x44, %xmm1, %xmm1
  872. movsd 8 * SIZE(AA), %xmm0
  873. shufps $0x50, %xmm0, %xmm0
  874. mulps %xmm1, %xmm0
  875. subps %xmm0, %xmm2
  876. movss 5 * SIZE(AA), %xmm0
  877. movaps %xmm6, %xmm1
  878. shufps $0x00, %xmm0, %xmm1
  879. mulps %xmm1, %xmm2
  880. movaps %xmm2, %xmm1
  881. shufps $0xee, %xmm1, %xmm1
  882. movss 4 * SIZE(AA), %xmm0
  883. shufps $0x50, %xmm0, %xmm0
  884. mulps %xmm1, %xmm0
  885. subps %xmm0, %xmm2
  886. movss 0 * SIZE(AA), %xmm0
  887. shufps $0x00, %xmm6, %xmm0
  888. mulps %xmm0, %xmm2
  889. #endif
# LT: forward substitution (scales taken at offsets 0, 5, 10, 15).
  890. #ifdef LT
  891. movss 0 * SIZE(AA), %xmm0
  892. shufps $0x00, %xmm6, %xmm0
  893. mulps %xmm0, %xmm2
  894. movaps %xmm2, %xmm1
  895. shufps $0x44, %xmm1, %xmm1
  896. movss 1 * SIZE(AA), %xmm0
  897. shufps $0x05, %xmm0, %xmm0
  898. mulps %xmm1, %xmm0
  899. subps %xmm0, %xmm2
  900. movsd 2 * SIZE(AA), %xmm0
  901. shufps $0x50, %xmm0, %xmm0
  902. mulps %xmm1, %xmm0
  903. subps %xmm0, %xmm3
  904. movss 5 * SIZE(AA), %xmm0
  905. movaps %xmm6, %xmm1
  906. shufps $0x00, %xmm0, %xmm1
  907. mulps %xmm1, %xmm2
  908. movaps %xmm2, %xmm1
  909. shufps $0xee, %xmm1, %xmm1
  910. movsd 6 * SIZE(AA), %xmm0
  911. shufps $0x50, %xmm0, %xmm0
  912. mulps %xmm1, %xmm0
  913. subps %xmm0, %xmm3
  914. movss 10 * SIZE(AA), %xmm0
  915. shufps $0x00, %xmm6, %xmm0
  916. mulps %xmm0, %xmm3
  917. movaps %xmm3, %xmm1
  918. shufps $0x44, %xmm1, %xmm1
  919. movss 11 * SIZE(AA), %xmm0
  920. shufps $0x05, %xmm0, %xmm0
  921. mulps %xmm1, %xmm0
  922. subps %xmm0, %xmm3
  923. movss 15 * SIZE(AA), %xmm0
  924. movaps %xmm6, %xmm1
  925. shufps $0x00, %xmm0, %xmm1
  926. mulps %xmm1, %xmm3
  927. #endif
# RN/RT: single-column right-side tile is 1x1, so the solve is just a
# broadcast multiply by b[0] (code is identical in both branches).
  928. #ifdef RN
  929. movss 0 * SIZE(B), %xmm6
  930. shufps $0x00, %xmm6, %xmm6
  931. mulps %xmm6, %xmm0
  932. #endif
  933. #ifdef RT
  934. movss 0 * SIZE(B), %xmm6
  935. shufps $0x00, %xmm6, %xmm6
  936. mulps %xmm6, %xmm0
  937. #endif
# Write back: LN/LT compress the lane pairs (shufps $0x88), store to B,
# and refresh the splat copies in BB; RN/RT store into the A panel.
  938. #if defined(LN) || defined(LT)
  939. shufps $0x88, %xmm3, %xmm2
  940. movlps %xmm2, 0 * SIZE(B)
  941. movhps %xmm2, 2 * SIZE(B)
  942. #ifdef HAVE_SSE2
  943. pshufd $0x00, %xmm2, %xmm0
  944. pshufd $0x55, %xmm2, %xmm1
  945. pshufd $0xaa, %xmm2, %xmm4
  946. pshufd $0xff, %xmm2, %xmm6
  947. #else
# No SSE2: emulate the lane broadcasts with copy + shufps.
  948. movaps %xmm2, %xmm0
  949. shufps $0x00, %xmm0, %xmm0
  950. movaps %xmm2, %xmm1
  951. shufps $0x55, %xmm1, %xmm1
  952. movaps %xmm2, %xmm4
  953. shufps $0xaa, %xmm4, %xmm4
  954. movaps %xmm2, %xmm6
  955. shufps $0xff, %xmm6, %xmm6
  956. #endif
  957. movaps %xmm0, 0 * SIZE(BB)
  958. movaps %xmm1, 4 * SIZE(BB)
  959. movaps %xmm4, 8 * SIZE(BB)
  960. movaps %xmm6, 12 * SIZE(BB)
  961. #else
  962. movaps %xmm0, 0 * SIZE(AA)
  963. #endif
# Store the 4 results to C; LN walks CO1 backwards through the row.
  964. #ifdef LN
  965. subl $4 * SIZE, CO1
  966. #endif
  967. #if defined(LN) || defined(LT)
  968. movlps %xmm2, 0 * SIZE(CO1)
  969. movhps %xmm2, 2 * SIZE(CO1)
  970. #else
  971. movlps %xmm0, 0 * SIZE(CO1)
  972. movhps %xmm0, 2 * SIZE(CO1)
  973. #endif
  974. #ifndef LN
  975. addl $4 * SIZE, CO1
  976. #endif
# Advance AA past the remaining (K - KK) panel for LT/RN, then update
# KK / B / AORIG for the next tile according to the transpose mode.
  977. #if defined(LT) || defined(RN)
  978. movl K, %eax
  979. subl KK, %eax
  980. leal (,%eax, SIZE), %eax
  981. leal (AA, %eax, 4), AA
  982. #ifdef LT
  983. addl $4 * SIZE, B
  984. #endif
  985. #endif
  986. #ifdef LN
  987. subl $4, KK
  988. movl BORIG, B
  989. #endif
  990. #ifdef LT
  991. addl $4, KK
  992. #endif
  993. #ifdef RT
  994. movl K, %eax
  995. movl BORIG, B
  996. sall $2 + BASE_SHIFT, %eax
  997. addl %eax, AORIG
  998. #endif
  999. ALIGN_2
# ----------------------------------------------------------------------
# .L150: 2-row tail of M for the single right-hand-side column.
# Same structure as the 4-row tail, but A is loaded 2-wide (movsd):
# accumulate A(2 x k) * b(k), then a 2x2 scalar solve (LN/LT) or a
# 1x1 diagonal scale (RN/RT), and store to B/BB (or AA) and to C.
# ----------------------------------------------------------------------
  1000. .L150:
# Process only if bit 1 of M is set.
  1001. testl $2, M
  1002. jle .L170
# LN: step AORIG back over the 2 x K panel (K * 2 * SIZE bytes).
  1003. #ifdef LN
  1004. movl K, %eax
  1005. sall $1 + BASE_SHIFT, %eax
  1006. subl %eax, AORIG
  1007. #endif
# LN/RT: AA = AORIG + KK * 2 * SIZE.
  1008. #if defined(LN) || defined(RT)
  1009. movl KK, %eax
  1010. movl AORIG, AA
  1011. sall $1 + BASE_SHIFT, %eax
  1012. addl %eax, AA
  1013. #endif
  1014. leal BUFFER, BB
  1015. #if defined(LN) || defined(RT)
  1016. movl KK, %eax
  1017. sall $BASE_SHIFT, %eax
  1018. leal (BB, %eax, 4), BB
  1019. #endif
# Preload first data and clear accumulators.  When `movsd` is remapped
# by a macro on some targets, pre-clear the destination register so no
# stale upper lanes survive the partial-width load (presumably this
# also breaks the false dependency — confirm against the macro defs).
  1020. movaps 0 * SIZE(BB), %xmm2
  1021. xorps %xmm4, %xmm4
  1022. #ifdef movsd
  1023. xorps %xmm0, %xmm0
  1024. #endif
  1025. movsd 0 * SIZE(AA), %xmm0
  1026. xorps %xmm5, %xmm5
  1027. movaps 16 * SIZE(BB), %xmm3
  1028. xorps %xmm6, %xmm6
  1029. #ifdef movsd
  1030. xorps %xmm1, %xmm1
  1031. #endif
  1032. movsd 8 * SIZE(AA), %xmm1
  1033. xorps %xmm7, %xmm7
# Trip count: KK for LT/RN, K - KK for LN/RT; 8 k-steps per trip.
  1034. #if defined(LT) || defined(RN)
  1035. movl KK, %eax
  1036. #else
  1037. movl K, %eax
  1038. subl KK, %eax
  1039. #endif
  1040. sarl $3, %eax
  1041. je .L152
  1042. ALIGN_2
# Main MAC loop: AA += 16*SIZE (8 k * 2 floats), BB += 32*SIZE.
  1043. .L151:
  1044. mulps %xmm0, %xmm2
  1045. movsd 2 * SIZE(AA), %xmm0
  1046. addps %xmm2, %xmm4
  1047. movaps 4 * SIZE(BB), %xmm2
  1048. mulps %xmm0, %xmm2
  1049. movsd 4 * SIZE(AA), %xmm0
  1050. addps %xmm2, %xmm5
  1051. movaps 8 * SIZE(BB), %xmm2
  1052. mulps %xmm0, %xmm2
  1053. movsd 6 * SIZE(AA), %xmm0
  1054. addps %xmm2, %xmm6
  1055. movaps 12 * SIZE(BB), %xmm2
  1056. mulps %xmm0, %xmm2
  1057. movsd 16 * SIZE(AA), %xmm0
  1058. addps %xmm2, %xmm7
  1059. movaps 32 * SIZE(BB), %xmm2
  1060. mulps %xmm1, %xmm3
  1061. movsd 10 * SIZE(AA), %xmm1
  1062. addps %xmm3, %xmm4
  1063. movaps 20 * SIZE(BB), %xmm3
  1064. mulps %xmm1, %xmm3
  1065. movsd 12 * SIZE(AA), %xmm1
  1066. addps %xmm3, %xmm5
  1067. movaps 24 * SIZE(BB), %xmm3
  1068. mulps %xmm1, %xmm3
  1069. movsd 14 * SIZE(AA), %xmm1
  1070. addps %xmm3, %xmm6
  1071. movaps 28 * SIZE(BB), %xmm3
  1072. mulps %xmm1, %xmm3
  1073. movsd 24 * SIZE(AA), %xmm1
  1074. addps %xmm3, %xmm7
  1075. movaps 48 * SIZE(BB), %xmm3
  1076. addl $16 * SIZE, AA
  1077. addl $32 * SIZE, BB
  1078. decl %eax
  1079. jne .L151
  1080. ALIGN_2
# Remainder: (k & 7) single k-steps, using the preloaded xmm0/xmm2.
  1081. .L152:
  1082. #if defined(LT) || defined(RN)
  1083. movl KK, %eax
  1084. #else
  1085. movl K, %eax
  1086. subl KK, %eax
  1087. #endif
  1088. andl $7, %eax # if (k & 1)
  1089. BRANCH
  1090. je .L154
  1091. .L153:
  1092. mulps %xmm0, %xmm2
  1093. movsd 2 * SIZE(AA), %xmm0
  1094. addps %xmm2, %xmm4
  1095. movaps 4 * SIZE(BB), %xmm2
  1096. addl $2 * SIZE, AA
  1097. addl $4 * SIZE, BB
  1098. decl %eax
  1099. jg .L153
  1100. ALIGN_4
# Collapse partial sums into xmm4 (only the low two lanes matter).
  1101. .L154:
  1102. addps %xmm5, %xmm4
  1103. addps %xmm7, %xmm6
  1104. addps %xmm6, %xmm4
# LN/RT: rewind AA / B / BB to the current tile (KK-2 rows / KK-1 col).
  1105. #if defined(LN) || defined(RT)
  1106. movl KK, %eax
  1107. #ifdef LN
  1108. subl $2, %eax
  1109. #else
  1110. subl $1, %eax
  1111. #endif
  1112. movl AORIG, AA
  1113. movl BORIG, B
  1114. leal BUFFER, BB
  1115. sall $BASE_SHIFT, %eax
  1116. leal (AA, %eax, 2), AA
  1117. leal (B, %eax, 1), B
  1118. leal (BB, %eax, 4), BB
  1119. #endif
# LN/LT: split the two sums into scalars, rhs_i = b_i - sum_i.
  1120. #if defined(LN) || defined(LT)
  1121. movaps %xmm4, %xmm5
  1122. shufps $1, %xmm5, %xmm5
  1123. movss 0 * SIZE(B), %xmm0
  1124. movss 1 * SIZE(B), %xmm1
  1125. subss %xmm4, %xmm0
  1126. subss %xmm5, %xmm1
  1127. #else
# RN/RT: rhs pair kept packed in xmm0.
  1128. #ifdef movsd
  1129. xorps %xmm0, %xmm0
  1130. #endif
  1131. movsd 0 * SIZE(AA), %xmm0
  1132. subps %xmm4, %xmm0
  1133. #endif
# LN: 2x2 backward substitution with scalar ops on the packed tile in
# xmm4 (lane 3 scales x1, lane 2 feeds the x0 update, lane 0 scales x0).
  1134. #ifdef LN
  1135. movaps 0 * SIZE(AA), %xmm4
  1136. movaps %xmm4, %xmm6
  1137. shufps $0xff, %xmm6, %xmm6
  1138. mulss %xmm6, %xmm1
  1139. movaps %xmm4, %xmm6
  1140. shufps $0xaa, %xmm6, %xmm6
  1141. mulss %xmm1, %xmm6
  1142. subss %xmm6, %xmm0
  1143. mulss %xmm4, %xmm0
  1144. #endif
# LT: 2x2 forward substitution (lane 0 scales x0, lane 1 feeds x1,
# lane 3 scales x1).
  1145. #ifdef LT
  1146. movaps 0 * SIZE(AA), %xmm4
  1147. mulss %xmm4, %xmm0
  1148. movaps %xmm4, %xmm6
  1149. shufps $0x55, %xmm6, %xmm6
  1150. mulss %xmm0, %xmm6
  1151. subss %xmm6, %xmm1
  1152. movaps %xmm4, %xmm6
  1153. shufps $0xff, %xmm6, %xmm6
  1154. mulss %xmm6, %xmm1
  1155. #endif
# RN/RT: 1x1 right-side tile — broadcast multiply by b[0].
  1156. #ifdef RN
  1157. movss 0 * SIZE(B), %xmm6
  1158. shufps $0x00, %xmm6, %xmm6
  1159. mulps %xmm6, %xmm0
  1160. #endif
  1161. #ifdef RT
  1162. movss 0 * SIZE(B), %xmm6
  1163. shufps $0x00, %xmm6, %xmm6
  1164. mulps %xmm6, %xmm0
  1165. #endif
# Write back to B plus the splat buffer BB, or to the A panel.
  1166. #if defined(LN) || defined(LT)
  1167. movss %xmm0, 0 * SIZE(B)
  1168. movss %xmm1, 1 * SIZE(B)
  1169. shufps $0x00, %xmm0, %xmm0
  1170. shufps $0x00, %xmm1, %xmm1
  1171. movaps %xmm0, 0 * SIZE(BB)
  1172. movaps %xmm1, 4 * SIZE(BB)
  1173. #else
  1174. movlps %xmm0, 0 * SIZE(AA)
  1175. #endif
# Store the 2 results to C; LN walks CO1 backwards.
  1176. #ifdef LN
  1177. subl $2 * SIZE, CO1
  1178. #endif
  1179. #if defined(LN) || defined(LT)
  1180. movss %xmm0, 0 * SIZE(CO1)
  1181. movss %xmm1, 1 * SIZE(CO1)
  1182. #else
  1183. movlps %xmm0, 0 * SIZE(CO1)
  1184. #endif
  1185. #ifndef LN
  1186. addl $2 * SIZE, CO1
  1187. #endif
# Pointer / KK bookkeeping for the next tile, per transpose mode.
  1188. #if defined(LT) || defined(RN)
  1189. movl K, %eax
  1190. subl KK, %eax
  1191. leal (,%eax, SIZE), %eax
  1192. leal (AA, %eax, 2), AA
  1193. #ifdef LT
  1194. addl $2 * SIZE, B
  1195. #endif
  1196. #endif
  1197. #ifdef LN
  1198. subl $2, KK
  1199. movl BORIG, B
  1200. #endif
  1201. #ifdef LT
  1202. addl $2, KK
  1203. #endif
  1204. #ifdef RT
  1205. movl K, %eax
  1206. movl BORIG, B
  1207. sall $1 + BASE_SHIFT, %eax
  1208. addl %eax, AORIG
  1209. #endif
  1210. ALIGN_2
# ----------------------------------------------------------------------
# .L170: 1-row tail of M for the single right-hand-side column.
# Fully scalar: accumulate a(k) * b(k) into xmm4..xmm7, then a 1x1
# "solve" that is a single scalar multiply, and store to B/BB (or AA)
# and to C.
# ----------------------------------------------------------------------
  1211. .L170:
# Process only if bit 0 of M is set.
  1212. testl $1, M
  1213. jle .L179
# LN: step AORIG back over the 1 x K panel (K * SIZE bytes).
  1214. #ifdef LN
  1215. movl K, %eax
  1216. sall $BASE_SHIFT, %eax
  1217. subl %eax, AORIG
  1218. #endif
# LN/RT: AA = AORIG + KK * SIZE.
  1219. #if defined(LN) || defined(RT)
  1220. movl KK, %eax
  1221. movl AORIG, AA
  1222. leal (AA, %eax, SIZE), AA
  1223. #endif
  1224. leal BUFFER, BB
  1225. #if defined(LN) || defined(RT)
  1226. movl KK, %eax
  1227. sall $BASE_SHIFT, %eax
  1228. leal (BB, %eax, 4), BB
  1229. #endif
# Preload first scalars and clear the four accumulators.
  1230. movss 0 * SIZE(BB), %xmm2
  1231. xorps %xmm4, %xmm4
  1232. movss 0 * SIZE(AA), %xmm0
  1233. xorps %xmm5, %xmm5
  1234. movss 16 * SIZE(BB), %xmm3
  1235. xorps %xmm6, %xmm6
  1236. movss 4 * SIZE(AA), %xmm1
  1237. xorps %xmm7, %xmm7
# Trip count: KK for LT/RN, K - KK for LN/RT; 8 k-steps per trip.
  1238. #if defined(LT) || defined(RN)
  1239. movl KK, %eax
  1240. #else
  1241. movl K, %eax
  1242. subl KK, %eax
  1243. #endif
  1244. sarl $3, %eax
  1245. je .L172
  1246. ALIGN_2
# Main scalar MAC loop: AA += 8*SIZE, BB += 32*SIZE (one splat
# entry of 4 floats per k; only the first lane is read here).
  1247. .L171:
  1248. mulss %xmm0, %xmm2
  1249. movss 1 * SIZE(AA), %xmm0
  1250. addss %xmm2, %xmm4
  1251. mulss 4 * SIZE(BB), %xmm0
  1252. movss 32 * SIZE(BB), %xmm2
  1253. addss %xmm0, %xmm5
  1254. movss 2 * SIZE(AA), %xmm0
  1255. mulss 8 * SIZE(BB), %xmm0
  1256. addss %xmm0, %xmm6
  1257. movss 3 * SIZE(AA), %xmm0
  1258. mulss 12 * SIZE(BB), %xmm0
  1259. addss %xmm0, %xmm7
  1260. movss 8 * SIZE(AA), %xmm0
  1261. mulss %xmm1, %xmm3
  1262. movss 5 * SIZE(AA), %xmm1
  1263. addss %xmm3, %xmm4
  1264. mulss 20 * SIZE(BB), %xmm1
  1265. movss 48 * SIZE(BB), %xmm3
  1266. addss %xmm1, %xmm5
  1267. movss 6 * SIZE(AA), %xmm1
  1268. mulss 24 * SIZE(BB), %xmm1
  1269. addss %xmm1, %xmm6
  1270. movss 7 * SIZE(AA), %xmm1
  1271. mulss 28 * SIZE(BB), %xmm1
  1272. addss %xmm1, %xmm7
  1273. movss 12 * SIZE(AA), %xmm1
  1274. addl $ 8 * SIZE, AA
  1275. addl $32 * SIZE, BB
  1276. decl %eax
  1277. jne .L171
  1278. ALIGN_2
# Remainder: (k & 7) single scalar k-steps.
  1279. .L172:
  1280. #if defined(LT) || defined(RN)
  1281. movl KK, %eax
  1282. #else
  1283. movl K, %eax
  1284. subl KK, %eax
  1285. #endif
  1286. andl $7, %eax # if (k & 1)
  1287. BRANCH
  1288. je .L174
  1289. .L173:
  1290. movss 0 * SIZE(AA), %xmm0
  1291. movss 0 * SIZE(BB), %xmm2
  1292. mulss %xmm0, %xmm2
  1293. addss %xmm2, %xmm4
  1294. addl $1 * SIZE, AA
  1295. addl $4 * SIZE, BB
  1296. decl %eax
  1297. jg .L173
  1298. ALIGN_4
# Collapse partial sums into xmm4.
  1299. .L174:
  1300. addss %xmm5, %xmm4
  1301. addss %xmm7, %xmm6
  1302. addss %xmm6, %xmm4
# LN/RT: rewind AA / B / BB to the current 1x1 tile (index KK - 1).
  1303. #if defined(LN) || defined(RT)
  1304. movl KK, %eax
  1305. subl $1, %eax
  1306. movl AORIG, AA
  1307. movl BORIG, B
  1308. leal BUFFER, BB
  1309. sall $ BASE_SHIFT, %eax
  1310. leal (AA, %eax, 1), AA
  1311. leal (B, %eax, 1), B
  1312. leal (BB, %eax, 4), BB
  1313. #endif
# rhs = b - sum (left modes) or a - sum (right modes).
  1314. #if defined(LN) || defined(LT)
  1315. movss 0 * SIZE(B), %xmm1
  1316. subss %xmm4, %xmm1
  1317. #else
  1318. movss 0 * SIZE(AA), %xmm0
  1319. subss %xmm4, %xmm0
  1320. #endif
# 1x1 solve: a single scalar multiply by the stored diagonal value.
  1321. #if defined(LN) || defined(LT)
  1322. mulss 0 * SIZE(AA), %xmm1
  1323. #endif
  1324. #if defined(RN) || defined(RT)
  1325. mulss 0 * SIZE(B), %xmm0
  1326. #endif
# Write back to B plus the splat buffer BB, or to the A panel.
  1327. #if defined(LN) || defined(LT)
  1328. movss %xmm1, 0 * SIZE(B)
  1329. shufps $0x00, %xmm1, %xmm1
  1330. movaps %xmm1, 0 * SIZE(BB)
  1331. #else
  1332. movss %xmm0, 0 * SIZE(AA)
  1333. #endif
# Store the result to C; LN walks CO1 backwards.
  1334. #ifdef LN
  1335. subl $1 * SIZE, CO1
  1336. #endif
  1337. #if defined(LN) || defined(LT)
  1338. movss %xmm1, 0 * SIZE(CO1)
  1339. #else
  1340. movss %xmm0, 0 * SIZE(CO1)
  1341. #endif
  1342. #ifndef LN
  1343. addl $1 * SIZE, CO1
  1344. #endif
# Pointer / KK bookkeeping for the next tile, per transpose mode.
  1345. #if defined(LT) || defined(RN)
  1346. movl K, %eax
  1347. subl KK, %eax
  1348. leal (AA, %eax, SIZE), AA
  1349. #ifdef LT
  1350. addl $1 * SIZE, B
  1351. #endif
  1352. #endif
  1353. #ifdef LN
  1354. subl $1, KK
  1355. movl BORIG, B
  1356. #endif
  1357. #ifdef LT
  1358. addl $1, KK
  1359. #endif
  1360. #ifdef RT
  1361. movl K, %eax
  1362. movl BORIG, B
  1363. sall $BASE_SHIFT, %eax
  1364. addl %eax, AORIG
  1365. #endif
  1366. ALIGN_2
# ----------------------------------------------------------------------
# .L179: end of the single-column (N & 1) pass — advance B past the
# consumed column and step the KK offset for the right-side modes.
# ----------------------------------------------------------------------
  1367. .L179:
# LN: B moves forward by the full K column.
  1368. #ifdef LN
  1369. movl K, %eax
  1370. leal (B, %eax, SIZE), B
  1371. #endif
# LT/RN: B moves forward by the unconsumed K - KK remainder.
  1372. #if defined(LT) || defined(RN)
  1373. movl K, %eax
  1374. subl KK, %eax
  1375. leal (B, %eax, SIZE), B
  1376. #endif
# RN counts columns upward, RT downward.
  1377. #ifdef RN
  1378. addl $1, KK
  1379. #endif
  1380. #ifdef RT
  1381. subl $1, KK
  1382. #endif
  1383. ALIGN_4
# ----------------------------------------------------------------------
# .L100: enter the two-column outer loop: J = N >> 1 column pairs;
# skip to the epilogue (.L999) when there are none.
# ----------------------------------------------------------------------
  1384. .L100:
  1385. movl N, %eax
  1386. sarl $1, %eax # j = (n >> 1)
  1387. movl %eax, J
  1388. jle .L999
  1389. ALIGN_2
# ----------------------------------------------------------------------
# .L01: per-column-pair setup for the two-column pass.
# Initializes KK/B/BB for the current transpose mode, then packs the
# two B columns into BUFFER with every scalar replicated 4-wide
# (.L02 handles 4 k-steps per trip, .L04 the k & 3 remainder), and
# finally sets up A/C pointers and the row-block count (.L05).
# ----------------------------------------------------------------------
  1390. .L01:
# LN: KK starts at OFFSET + M and counts down across row tiles.
  1391. #ifdef LN
  1392. movl OFFSET, %eax
  1393. addl M, %eax
  1394. movl %eax, KK
  1395. #endif
  1396. leal BUFFER, BB
# RT: step B back over this K x 2 panel before processing it.
  1397. #ifdef RT
  1398. movl K, %eax
  1399. sall $1 + BASE_SHIFT, %eax
  1400. subl %eax, B
  1401. #endif
# LN/RT: remember the panel start in BORIG and advance B / BB by
# KK * 2 * SIZE (BB entries are 4-wide splats, hence the ,4 scale).
  1402. #if defined(LN) || defined(RT)
  1403. movl KK, %eax
  1404. movl B, BORIG
  1405. sall $1 + BASE_SHIFT, %eax
  1406. leal (B, %eax, 1), B
  1407. leal (BB, %eax, 4), BB
  1408. #endif
# LT: KK starts at OFFSET.
  1409. #ifdef LT
  1410. movl OFFSET, %eax
  1411. movl %eax, KK
  1412. #endif
# Pack count: KK k-steps for LT/RN, K - KK for LN/RT; 4 per trip.
  1413. #if defined(LT) || defined(RN)
  1414. movl KK, %eax
  1415. #else
  1416. movl K, %eax
  1417. subl KK, %eax
  1418. #endif
  1419. sarl $2, %eax
  1420. jle .L03
  1421. ALIGN_4
# Packing loop: read 8 B values (4 k-steps x 2 columns) and store each
# broadcast to a 4-float vector: B += 8*SIZE, BB += 32*SIZE per trip.
  1422. .L02:
  1423. movsd 0 * SIZE(B), %xmm3
  1424. movhps 2 * SIZE(B), %xmm3
  1425. movsd 4 * SIZE(B), %xmm7
  1426. movhps 6 * SIZE(B), %xmm7
  1427. #ifdef HAVE_SSE2
  1428. pshufd $0x00, %xmm3, %xmm0
  1429. pshufd $0x55, %xmm3, %xmm1
  1430. pshufd $0xaa, %xmm3, %xmm2
  1431. pshufd $0xff, %xmm3, %xmm3
  1432. pshufd $0x00, %xmm7, %xmm4
  1433. pshufd $0x55, %xmm7, %xmm5
  1434. pshufd $0xaa, %xmm7, %xmm6
  1435. pshufd $0xff, %xmm7, %xmm7
  1436. #else
# No SSE2: emulate the broadcasts with copy + shufps (the $0xff case
# may shuffle in place since the source is consumed last).
  1437. movaps %xmm3, %xmm0
  1438. shufps $0x00, %xmm0, %xmm0
  1439. movaps %xmm3, %xmm1
  1440. shufps $0x55, %xmm1, %xmm1
  1441. movaps %xmm3, %xmm2
  1442. shufps $0xaa, %xmm2, %xmm2
  1443. shufps $0xff, %xmm3, %xmm3
  1444. movaps %xmm7, %xmm4
  1445. shufps $0x00, %xmm4, %xmm4
  1446. movaps %xmm7, %xmm5
  1447. shufps $0x55, %xmm5, %xmm5
  1448. movaps %xmm7, %xmm6
  1449. shufps $0xaa, %xmm6, %xmm6
  1450. shufps $0xff, %xmm7, %xmm7
  1451. #endif
  1452. movaps %xmm0, 0 * SIZE(BB)
  1453. movaps %xmm1, 4 * SIZE(BB)
  1454. movaps %xmm2, 8 * SIZE(BB)
  1455. movaps %xmm3, 12 * SIZE(BB)
  1456. movaps %xmm4, 16 * SIZE(BB)
  1457. movaps %xmm5, 20 * SIZE(BB)
  1458. movaps %xmm6, 24 * SIZE(BB)
  1459. movaps %xmm7, 28 * SIZE(BB)
  1460. addl $ 8 * SIZE, B
  1461. addl $32 * SIZE, BB
  1462. decl %eax
  1463. BRANCH
  1464. jne .L02
  1465. ALIGN_2
# Remainder: pack the leftover (k & 3) k-steps, 2 values at a time.
  1466. .L03:
  1467. #if defined(LT) || defined(RN)
  1468. movl KK, %eax
  1469. #else
  1470. movl K, %eax
  1471. subl KK, %eax
  1472. #endif
  1473. andl $3, %eax
  1474. BRANCH
  1475. jle .L05
  1476. ALIGN_2
  1477. .L04:
  1478. movsd 0 * SIZE(B), %xmm3
  1479. #ifdef HAVE_SSE2
  1480. pshufd $0x00, %xmm3, %xmm0
  1481. pshufd $0x55, %xmm3, %xmm1
  1482. #else
  1483. movaps %xmm3, %xmm0
  1484. shufps $0x00, %xmm0, %xmm0
  1485. movaps %xmm3, %xmm1
  1486. shufps $0x55, %xmm1, %xmm1
  1487. #endif
  1488. movaps %xmm0, 0 * SIZE(BB)
  1489. movaps %xmm1, 4 * SIZE(BB)
  1490. addl $2 * SIZE, B
  1491. addl $8 * SIZE, BB
  1492. decl %eax
  1493. jne .L04
  1494. ALIGN_4
# Set the A pointer (or its origin) and the C column pointer for this
# pair of columns; C advances (or RT: retreats) by 2 * LDC.  %ebx
# counts the M >> 3 eight-row blocks; fall through to .L30 when none.
  1495. .L05:
  1496. #if defined(LT) || defined(RN)
  1497. movl A, AA
  1498. #else
  1499. movl A, %eax
  1500. movl %eax, AORIG
  1501. #endif
  1502. leal (, LDC, 2), %eax
  1503. #ifdef RT
  1504. subl %eax, C
  1505. #endif
  1506. movl C, CO1
  1507. #ifndef RT
  1508. addl %eax, C
  1509. #endif
  1510. movl M, %ebx
  1511. sarl $3, %ebx
  1512. jle .L30
  1513. ALIGN_4
  1514. .L10:
  1515. #ifdef LN
  1516. movl K, %eax
  1517. sall $3 + BASE_SHIFT, %eax
  1518. subl %eax, AORIG
  1519. #endif
  1520. #if defined(LN) || defined(RT)
  1521. movl KK, %eax
  1522. movl AORIG, AA
  1523. sall $3 + BASE_SHIFT, %eax
  1524. addl %eax, AA
  1525. #endif
  1526. leal BUFFER, BB
  1527. #if defined(LN) || defined(RT)
  1528. movl KK, %eax
  1529. sall $1 + BASE_SHIFT, %eax
  1530. leal (BB, %eax, 4), BB
  1531. #endif
  1532. movaps 0 * SIZE(BB), %xmm2
  1533. xorps %xmm4, %xmm4
  1534. movaps 0 * SIZE(AA), %xmm0
  1535. xorps %xmm5, %xmm5
  1536. movaps 8 * SIZE(BB), %xmm3
  1537. xorps %xmm6, %xmm6
  1538. movaps 8 * SIZE(AA), %xmm1
  1539. xorps %xmm7, %xmm7
  1540. PREFETCHW 7 * SIZE(CO1)
  1541. PREFETCHW 7 * SIZE(CO1, LDC)
  1542. #if defined(LT) || defined(RN)
  1543. movl KK, %eax
  1544. #else
  1545. movl K, %eax
  1546. subl KK, %eax
  1547. #endif
  1548. sarl $3, %eax
  1549. je .L12
  1550. ALIGN_2
  1551. .L11:
  1552. mulps %xmm0, %xmm2
  1553. mulps 4 * SIZE(BB), %xmm0
  1554. addps %xmm2, %xmm4
  1555. movaps 0 * SIZE(BB), %xmm2
  1556. addps %xmm0, %xmm5
  1557. movaps 4 * SIZE(AA), %xmm0
  1558. mulps %xmm0, %xmm2
  1559. mulps 4 * SIZE(BB), %xmm0
  1560. addps %xmm2, %xmm6
  1561. movaps 16 * SIZE(BB), %xmm2
  1562. addps %xmm0, %xmm7
  1563. movaps 16 * SIZE(AA), %xmm0
  1564. mulps %xmm1, %xmm3
  1565. mulps 12 * SIZE(BB), %xmm1
  1566. addps %xmm3, %xmm4
  1567. movaps 8 * SIZE(BB), %xmm3
  1568. addps %xmm1, %xmm5
  1569. movaps 12 * SIZE(AA), %xmm1
  1570. mulps %xmm1, %xmm3
  1571. mulps 12 * SIZE(BB), %xmm1
  1572. addps %xmm3, %xmm6
  1573. movaps 24 * SIZE(BB), %xmm3
  1574. addps %xmm1, %xmm7
  1575. movaps 24 * SIZE(AA), %xmm1
  1576. mulps %xmm0, %xmm2
  1577. mulps 20 * SIZE(BB), %xmm0
  1578. addps %xmm2, %xmm4
  1579. movaps 16 * SIZE(BB), %xmm2
  1580. addps %xmm0, %xmm5
  1581. movaps 20 * SIZE(AA), %xmm0
  1582. mulps %xmm0, %xmm2
  1583. mulps 20 * SIZE(BB), %xmm0
  1584. addps %xmm2, %xmm6
  1585. movaps 32 * SIZE(BB), %xmm2
  1586. addps %xmm0, %xmm7
  1587. movaps 32 * SIZE(AA), %xmm0
  1588. mulps %xmm1, %xmm3
  1589. mulps 28 * SIZE(BB), %xmm1
  1590. addps %xmm3, %xmm4
  1591. movaps 24 * SIZE(BB), %xmm3
  1592. addps %xmm1, %xmm5
  1593. movaps 28 * SIZE(AA), %xmm1
  1594. mulps %xmm1, %xmm3
  1595. mulps 28 * SIZE(BB), %xmm1
  1596. addps %xmm3, %xmm6
  1597. movaps 40 * SIZE(BB), %xmm3
  1598. addps %xmm1, %xmm7
  1599. movaps 40 * SIZE(AA), %xmm1
  1600. mulps %xmm0, %xmm2
  1601. mulps 36 * SIZE(BB), %xmm0
  1602. addps %xmm2, %xmm4
  1603. movaps 32 * SIZE(BB), %xmm2
  1604. addps %xmm0, %xmm5
  1605. movaps 36 * SIZE(AA), %xmm0
  1606. mulps %xmm0, %xmm2
  1607. mulps 36 * SIZE(BB), %xmm0
  1608. addps %xmm2, %xmm6
  1609. movaps 48 * SIZE(BB), %xmm2
  1610. addps %xmm0, %xmm7
  1611. movaps 48 * SIZE(AA), %xmm0
  1612. mulps %xmm1, %xmm3
  1613. mulps 44 * SIZE(BB), %xmm1
  1614. addps %xmm3, %xmm4
  1615. movaps 40 * SIZE(BB), %xmm3
  1616. addps %xmm1, %xmm5
  1617. movaps 44 * SIZE(AA), %xmm1
  1618. mulps %xmm1, %xmm3
  1619. mulps 44 * SIZE(BB), %xmm1
  1620. addps %xmm3, %xmm6
  1621. movaps 56 * SIZE(BB), %xmm3
  1622. addps %xmm1, %xmm7
  1623. movaps 56 * SIZE(AA), %xmm1
  1624. mulps %xmm0, %xmm2
  1625. mulps 52 * SIZE(BB), %xmm0
  1626. addps %xmm2, %xmm4
  1627. movaps 48 * SIZE(BB), %xmm2
  1628. addps %xmm0, %xmm5
  1629. movaps 52 * SIZE(AA), %xmm0
  1630. mulps %xmm0, %xmm2
  1631. mulps 52 * SIZE(BB), %xmm0
  1632. addps %xmm2, %xmm6
  1633. movaps 64 * SIZE(BB), %xmm2
  1634. addps %xmm0, %xmm7
  1635. movaps 64 * SIZE(AA), %xmm0
  1636. mulps %xmm1, %xmm3
  1637. mulps 60 * SIZE(BB), %xmm1
  1638. addps %xmm3, %xmm4
  1639. movaps 56 * SIZE(BB), %xmm3
  1640. addps %xmm1, %xmm5
  1641. movaps 60 * SIZE(AA), %xmm1
  1642. mulps %xmm1, %xmm3
  1643. mulps 60 * SIZE(BB), %xmm1
  1644. addps %xmm3, %xmm6
  1645. movaps 72 * SIZE(BB), %xmm3
  1646. addps %xmm1, %xmm7
  1647. movaps 72 * SIZE(AA), %xmm1
  1648. addl $64 * SIZE, BB
  1649. addl $64 * SIZE, AA
  1650. decl %eax
  1651. jne .L11
  1652. ALIGN_2
  1653. .L12:
  1654. #if defined(LT) || defined(RN)
  1655. movl KK, %eax
  1656. #else
  1657. movl K, %eax
  1658. subl KK, %eax
  1659. #endif
  1660. andl $7, %eax # if (k & 1)
  1661. BRANCH
  1662. je .L14
  1663. .L13:
  1664. movaps 4 * SIZE(BB), %xmm1
  1665. mulps %xmm0, %xmm2
  1666. addps %xmm2, %xmm4
  1667. movaps 0 * SIZE(BB), %xmm2
  1668. mulps %xmm0, %xmm1
  1669. movaps 4 * SIZE(AA), %xmm0
  1670. addps %xmm1, %xmm5
  1671. movaps 4 * SIZE(BB), %xmm1
  1672. mulps %xmm0, %xmm2
  1673. addps %xmm2, %xmm6
  1674. movaps 8 * SIZE(BB), %xmm2
  1675. mulps %xmm0, %xmm1
  1676. movaps 8 * SIZE(AA), %xmm0
  1677. addps %xmm1, %xmm7
  1678. addl $8 * SIZE, AA
  1679. addl $8 * SIZE, BB
  1680. subl $1, %eax
  1681. jg .L13
  1682. ALIGN_4
  1683. .L14:
  1684. #if defined(LN) || defined(RT)
  1685. movl KK, %eax
  1686. #ifdef LN
  1687. subl $8, %eax
  1688. #else
  1689. subl $2, %eax
  1690. #endif
  1691. movl AORIG, AA
  1692. movl BORIG, B
  1693. leal BUFFER, BB
  1694. sall $BASE_SHIFT, %eax
  1695. leal (AA, %eax, 8), AA
  1696. leal (B, %eax, 2), B
  1697. leal (BB, %eax, 8), BB
  1698. #endif
  1699. #if defined(LN) || defined(LT)
  1700. movaps %xmm4, %xmm0
  1701. unpcklps %xmm5, %xmm4
  1702. unpckhps %xmm5, %xmm0
  1703. movaps %xmm6, %xmm1
  1704. unpcklps %xmm7, %xmm6
  1705. unpckhps %xmm7, %xmm1
  1706. movsd 0 * SIZE(B), %xmm2
  1707. movhps 2 * SIZE(B), %xmm2
  1708. movsd 4 * SIZE(B), %xmm3
  1709. movhps 6 * SIZE(B), %xmm3
  1710. movsd 8 * SIZE(B), %xmm5
  1711. movhps 10 * SIZE(B), %xmm5
  1712. movsd 12 * SIZE(B), %xmm7
  1713. movhps 14 * SIZE(B), %xmm7
  1714. subps %xmm4, %xmm2
  1715. subps %xmm0, %xmm3
  1716. subps %xmm6, %xmm5
  1717. subps %xmm1, %xmm7
  1718. #else
  1719. movaps 0 * SIZE(AA), %xmm0
  1720. movaps 4 * SIZE(AA), %xmm1
  1721. movaps 8 * SIZE(AA), %xmm2
  1722. movaps 12 * SIZE(AA), %xmm3
  1723. subps %xmm4, %xmm0
  1724. subps %xmm6, %xmm1
  1725. subps %xmm5, %xmm2
  1726. subps %xmm7, %xmm3
  1727. #endif
  1728. #if defined(LN) || defined(LT)
  1729. movaps TRMASK, %xmm6
  1730. #endif
  1731. #ifdef LN
  1732. movss 63 * SIZE(AA), %xmm0
  1733. movaps %xmm6, %xmm1
  1734. shufps $0x00, %xmm0, %xmm1
  1735. mulps %xmm1, %xmm7
  1736. movaps %xmm7, %xmm1
  1737. shufps $0xee, %xmm1, %xmm1
  1738. movss 62 * SIZE(AA), %xmm0
  1739. shufps $0x50, %xmm0, %xmm0
  1740. mulps %xmm1, %xmm0
  1741. subps %xmm0, %xmm7
  1742. movsd 60 * SIZE(AA), %xmm0
  1743. shufps $0x50, %xmm0, %xmm0
  1744. mulps %xmm1, %xmm0
  1745. subps %xmm0, %xmm5
  1746. movsd 58 * SIZE(AA), %xmm0
  1747. shufps $0x50, %xmm0, %xmm0
  1748. mulps %xmm1, %xmm0
  1749. subps %xmm0, %xmm3
  1750. movsd 56 * SIZE(AA), %xmm0
  1751. shufps $0x50, %xmm0, %xmm0
  1752. mulps %xmm1, %xmm0
  1753. subps %xmm0, %xmm2
  1754. movss 54 * SIZE(AA), %xmm0
  1755. shufps $0x00, %xmm6, %xmm0
  1756. mulps %xmm0, %xmm7
  1757. movaps %xmm7, %xmm1
  1758. shufps $0x44, %xmm1, %xmm1
  1759. movsd 52 * SIZE(AA), %xmm0
  1760. shufps $0x50, %xmm0, %xmm0
  1761. mulps %xmm1, %xmm0
  1762. subps %xmm0, %xmm5
  1763. movsd 50 * SIZE(AA), %xmm0
  1764. shufps $0x50, %xmm0, %xmm0
  1765. mulps %xmm1, %xmm0
  1766. subps %xmm0, %xmm3
  1767. movsd 48 * SIZE(AA), %xmm0
  1768. shufps $0x50, %xmm0, %xmm0
  1769. mulps %xmm1, %xmm0
  1770. subps %xmm0, %xmm2
/* Tail of the LN (lower-triangular, solved from the last row upward)
   back-substitution for the 8-row tile.  The first rows of this solve sit
   above this chunk, so only rows 5..0 are visible here.
   xmm2/xmm3/xmm5 hold interleaved right-hand-side pairs, xmm6 = TRMASK.
   Offsets of the form n*8+n (45,36,27,18,9,0) are the diagonal entries of
   the packed 8x8 triangular A panel.
   NOTE(review): the leading "1771."-style numbers are scrape artifacts of
   this view, not part of the logic. */
1771. movss 45 * SIZE(AA), %xmm0
1772. movaps %xmm6, %xmm1
1773. shufps $0x00, %xmm0, %xmm1
1774. mulps %xmm1, %xmm5
1775. movaps %xmm5, %xmm1
1776. shufps $0xee, %xmm1, %xmm1
1777. movss 44 * SIZE(AA), %xmm0
1778. shufps $0x50, %xmm0, %xmm0
1779. mulps %xmm1, %xmm0
1780. subps %xmm0, %xmm5
1781. movsd 42 * SIZE(AA), %xmm0
1782. shufps $0x50, %xmm0, %xmm0
1783. mulps %xmm1, %xmm0
1784. subps %xmm0, %xmm3
1785. movsd 40 * SIZE(AA), %xmm0
1786. shufps $0x50, %xmm0, %xmm0
1787. mulps %xmm1, %xmm0
1788. subps %xmm0, %xmm2
/* diagonal step for row 4 (offset 36 = 4*8+4), then eliminate its
   contribution from the remaining rows */
1789. movss 36 * SIZE(AA), %xmm0
1790. shufps $0x00, %xmm6, %xmm0
1791. mulps %xmm0, %xmm5
1792. movaps %xmm5, %xmm1
1793. shufps $0x44, %xmm1, %xmm1
1794. movsd 34 * SIZE(AA), %xmm0
1795. shufps $0x50, %xmm0, %xmm0
1796. mulps %xmm1, %xmm0
1797. subps %xmm0, %xmm3
1798. movsd 32 * SIZE(AA), %xmm0
1799. shufps $0x50, %xmm0, %xmm0
1800. mulps %xmm1, %xmm0
1801. subps %xmm0, %xmm2
/* rows 3 and 2 (diagonals 27 and 18) */
1802. movss 27 * SIZE(AA), %xmm0
1803. movaps %xmm6, %xmm1
1804. shufps $0x00, %xmm0, %xmm1
1805. mulps %xmm1, %xmm3
1806. movaps %xmm3, %xmm1
1807. shufps $0xee, %xmm1, %xmm1
1808. movss 26 * SIZE(AA), %xmm0
1809. shufps $0x50, %xmm0, %xmm0
1810. mulps %xmm1, %xmm0
1811. subps %xmm0, %xmm3
1812. movsd 24 * SIZE(AA), %xmm0
1813. shufps $0x50, %xmm0, %xmm0
1814. mulps %xmm1, %xmm0
1815. subps %xmm0, %xmm2
1816. movss 18 * SIZE(AA), %xmm0
1817. shufps $0x00, %xmm6, %xmm0
1818. mulps %xmm0, %xmm3
1819. movaps %xmm3, %xmm1
1820. shufps $0x44, %xmm1, %xmm1
1821. movsd 16 * SIZE(AA), %xmm0
1822. shufps $0x50, %xmm0, %xmm0
1823. mulps %xmm1, %xmm0
1824. subps %xmm0, %xmm2
/* rows 1 and 0 (diagonals 9 and 0) finish the back-substitution */
1825. movss 9 * SIZE(AA), %xmm0
1826. movaps %xmm6, %xmm1
1827. shufps $0x00, %xmm0, %xmm1
1828. mulps %xmm1, %xmm2
1829. movaps %xmm2, %xmm1
1830. shufps $0xee, %xmm1, %xmm1
1831. movss 8 * SIZE(AA), %xmm0
1832. shufps $0x50, %xmm0, %xmm0
1833. mulps %xmm1, %xmm0
1834. subps %xmm0, %xmm2
1835. movss 0 * SIZE(AA), %xmm0
1836. shufps $0x00, %xmm6, %xmm0
1837. mulps %xmm0, %xmm2
1838. #endif
/* LT variant: forward substitution over the 8-row tile, row 0 down to
   row 7.  Diagonal entries of the packed 8x8 panel sit at offsets
   0, 9, 18, 27, 36, 45, 54, 63; after each diagonal step the solved pair
   is eliminated from the later accumulators xmm3/xmm5/xmm7. */
1839. #ifdef LT
1840. movss 0 * SIZE(AA), %xmm0
1841. shufps $0x00, %xmm6, %xmm0
1842. mulps %xmm0, %xmm2
1843. movaps %xmm2, %xmm1
1844. shufps $0x44, %xmm1, %xmm1
1845. movss 1 * SIZE(AA), %xmm0
1846. shufps $0x05, %xmm0, %xmm0
1847. mulps %xmm1, %xmm0
1848. subps %xmm0, %xmm2
1849. movsd 2 * SIZE(AA), %xmm0
1850. shufps $0x50, %xmm0, %xmm0
1851. mulps %xmm1, %xmm0
1852. subps %xmm0, %xmm3
1853. movsd 4 * SIZE(AA), %xmm0
1854. shufps $0x50, %xmm0, %xmm0
1855. mulps %xmm1, %xmm0
1856. subps %xmm0, %xmm5
1857. movsd 6 * SIZE(AA), %xmm0
1858. shufps $0x50, %xmm0, %xmm0
1859. mulps %xmm1, %xmm0
1860. subps %xmm0, %xmm7
/* row 1 (diagonal offset 9) */
1861. movss 9 * SIZE(AA), %xmm0
1862. movaps %xmm6, %xmm1
1863. shufps $0x00, %xmm0, %xmm1
1864. mulps %xmm1, %xmm2
1865. movaps %xmm2, %xmm1
1866. shufps $0xee, %xmm1, %xmm1
1867. movsd 10 * SIZE(AA), %xmm0
1868. shufps $0x50, %xmm0, %xmm0
1869. mulps %xmm1, %xmm0
1870. subps %xmm0, %xmm3
1871. movsd 12 * SIZE(AA), %xmm0
1872. shufps $0x50, %xmm0, %xmm0
1873. mulps %xmm1, %xmm0
1874. subps %xmm0, %xmm5
1875. movsd 14 * SIZE(AA), %xmm0
1876. shufps $0x50, %xmm0, %xmm0
1877. mulps %xmm1, %xmm0
1878. subps %xmm0, %xmm7
/* rows 2 and 3 (diagonals 18 and 27) */
1879. movss 18 * SIZE(AA), %xmm0
1880. shufps $0x00, %xmm6, %xmm0
1881. mulps %xmm0, %xmm3
1882. movaps %xmm3, %xmm1
1883. shufps $0x44, %xmm1, %xmm1
1884. movss 19 * SIZE(AA), %xmm0
1885. shufps $0x05, %xmm0, %xmm0
1886. mulps %xmm1, %xmm0
1887. subps %xmm0, %xmm3
1888. movsd 20 * SIZE(AA), %xmm0
1889. shufps $0x50, %xmm0, %xmm0
1890. mulps %xmm1, %xmm0
1891. subps %xmm0, %xmm5
1892. movsd 22 * SIZE(AA), %xmm0
1893. shufps $0x50, %xmm0, %xmm0
1894. mulps %xmm1, %xmm0
1895. subps %xmm0, %xmm7
1896. movss 27 * SIZE(AA), %xmm0
1897. movaps %xmm6, %xmm1
1898. shufps $0x00, %xmm0, %xmm1
1899. mulps %xmm1, %xmm3
1900. movaps %xmm3, %xmm1
1901. shufps $0xee, %xmm1, %xmm1
1902. movsd 28 * SIZE(AA), %xmm0
1903. shufps $0x50, %xmm0, %xmm0
1904. mulps %xmm1, %xmm0
1905. subps %xmm0, %xmm5
1906. movsd 30 * SIZE(AA), %xmm0
1907. shufps $0x50, %xmm0, %xmm0
1908. mulps %xmm1, %xmm0
1909. subps %xmm0, %xmm7
/* rows 4 and 5 (diagonals 36 and 45) */
1910. movss 36 * SIZE(AA), %xmm0
1911. shufps $0x00, %xmm6, %xmm0
1912. mulps %xmm0, %xmm5
1913. movaps %xmm5, %xmm1
1914. shufps $0x44, %xmm1, %xmm1
1915. movss 37 * SIZE(AA), %xmm0
1916. shufps $0x05, %xmm0, %xmm0
1917. mulps %xmm1, %xmm0
1918. subps %xmm0, %xmm5
1919. movsd 38 * SIZE(AA), %xmm0
1920. shufps $0x50, %xmm0, %xmm0
1921. mulps %xmm1, %xmm0
1922. subps %xmm0, %xmm7
1923. movss 45 * SIZE(AA), %xmm0
1924. movaps %xmm6, %xmm1
1925. shufps $0x00, %xmm0, %xmm1
1926. mulps %xmm1, %xmm5
1927. movaps %xmm5, %xmm1
1928. shufps $0xee, %xmm1, %xmm1
1929. movsd 46 * SIZE(AA), %xmm0
1930. shufps $0x50, %xmm0, %xmm0
1931. mulps %xmm1, %xmm0
1932. subps %xmm0, %xmm7
/* rows 6 and 7 (diagonals 54 and 63) complete the forward solve */
1933. movss 54 * SIZE(AA), %xmm0
1934. shufps $0x00, %xmm6, %xmm0
1935. mulps %xmm0, %xmm7
1936. movaps %xmm7, %xmm1
1937. shufps $0x44, %xmm1, %xmm1
1938. movss 55 * SIZE(AA), %xmm0
1939. shufps $0x05, %xmm0, %xmm0
1940. mulps %xmm1, %xmm0
1941. subps %xmm0, %xmm7
1942. movss 63 * SIZE(AA), %xmm0
1943. movaps %xmm6, %xmm1
1944. shufps $0x00, %xmm0, %xmm1
1945. mulps %xmm1, %xmm7
1946. #endif
/* Right-side solves against the 2x2 triangular panel in B.
   RN: forward order (diagonal at 0, off-diagonal at 1, diagonal at 3).
   RT: reverse order (diagonal at 3, off-diagonal at 2, diagonal at 0).
   Here xmm0/xmm1 hold column 0 and xmm2/xmm3 column 1 of the 8-wide tile;
   each B scalar is broadcast across the lanes before scaling. */
1947. #ifdef RN
1948. movss 0 * SIZE(B), %xmm6
1949. shufps $0x00, %xmm6, %xmm6
1950. mulps %xmm6, %xmm0
1951. mulps %xmm6, %xmm1
1952. movss 1 * SIZE(B), %xmm6
1953. shufps $0x00, %xmm6, %xmm6
1954. movaps %xmm6, %xmm5
1955. mulps %xmm0, %xmm5
1956. mulps %xmm1, %xmm6
1957. subps %xmm5, %xmm2
1958. subps %xmm6, %xmm3
1959. movss 3 * SIZE(B), %xmm6
1960. shufps $0x00, %xmm6, %xmm6
1961. mulps %xmm6, %xmm2
1962. mulps %xmm6, %xmm3
1963. #endif
1964. #ifdef RT
1965. movss 3 * SIZE(B), %xmm6
1966. shufps $0x00, %xmm6, %xmm6
1967. mulps %xmm6, %xmm2
1968. mulps %xmm6, %xmm3
1969. movss 2 * SIZE(B), %xmm6
1970. shufps $0x00, %xmm6, %xmm6
1971. movaps %xmm6, %xmm5
1972. mulps %xmm2, %xmm5
1973. mulps %xmm3, %xmm6
1974. subps %xmm5, %xmm0
1975. subps %xmm6, %xmm1
1976. movss 0 * SIZE(B), %xmm6
1977. shufps $0x00, %xmm6, %xmm6
1978. mulps %xmm6, %xmm0
1979. mulps %xmm6, %xmm1
1980. #endif
/* Write the solved 8x2 tile back.
   LN/LT: store the 16 values to the packed B panel, then broadcast every
   scalar across a 4-lane vector into the expanded BB buffer (the form the
   multiply loops consume).  pshufd is used when SSE2 is available,
   otherwise movaps+shufps emulates the same broadcasts.
   RN/RT: the tile is written back to the packed AA panel instead. */
1981. #if defined(LN) || defined(LT)
1982. movlps %xmm2, 0 * SIZE(B)
1983. movhps %xmm2, 2 * SIZE(B)
1984. movlps %xmm3, 4 * SIZE(B)
1985. movhps %xmm3, 6 * SIZE(B)
1986. movlps %xmm5, 8 * SIZE(B)
1987. movhps %xmm5, 10 * SIZE(B)
1988. movlps %xmm7, 12 * SIZE(B)
1989. movhps %xmm7, 14 * SIZE(B)
1990. #ifdef HAVE_SSE2
1991. pshufd $0x00, %xmm2, %xmm0
1992. pshufd $0x55, %xmm2, %xmm1
1993. pshufd $0xaa, %xmm2, %xmm4
1994. pshufd $0xff, %xmm2, %xmm6
1995. #else
1996. movaps %xmm2, %xmm0
1997. shufps $0x00, %xmm0, %xmm0
1998. movaps %xmm2, %xmm1
1999. shufps $0x55, %xmm1, %xmm1
2000. movaps %xmm2, %xmm4
2001. shufps $0xaa, %xmm4, %xmm4
2002. movaps %xmm2, %xmm6
2003. shufps $0xff, %xmm6, %xmm6
2004. #endif
2005. movaps %xmm0, 0 * SIZE(BB)
2006. movaps %xmm1, 4 * SIZE(BB)
2007. movaps %xmm4, 8 * SIZE(BB)
2008. movaps %xmm6, 12 * SIZE(BB)
2009. #ifdef HAVE_SSE2
2010. pshufd $0x00, %xmm3, %xmm0
2011. pshufd $0x55, %xmm3, %xmm1
2012. pshufd $0xaa, %xmm3, %xmm4
2013. pshufd $0xff, %xmm3, %xmm6
2014. #else
2015. movaps %xmm3, %xmm0
2016. shufps $0x00, %xmm0, %xmm0
2017. movaps %xmm3, %xmm1
2018. shufps $0x55, %xmm1, %xmm1
2019. movaps %xmm3, %xmm4
2020. shufps $0xaa, %xmm4, %xmm4
2021. movaps %xmm3, %xmm6
2022. shufps $0xff, %xmm6, %xmm6
2023. #endif
2024. movaps %xmm0, 16 * SIZE(BB)
2025. movaps %xmm1, 20 * SIZE(BB)
2026. movaps %xmm4, 24 * SIZE(BB)
2027. movaps %xmm6, 28 * SIZE(BB)
2028. #ifdef HAVE_SSE2
2029. pshufd $0x00, %xmm5, %xmm0
2030. pshufd $0x55, %xmm5, %xmm1
2031. pshufd $0xaa, %xmm5, %xmm4
2032. pshufd $0xff, %xmm5, %xmm6
2033. #else
2034. movaps %xmm5, %xmm0
2035. shufps $0x00, %xmm0, %xmm0
2036. movaps %xmm5, %xmm1
2037. shufps $0x55, %xmm1, %xmm1
2038. movaps %xmm5, %xmm4
2039. shufps $0xaa, %xmm4, %xmm4
2040. movaps %xmm5, %xmm6
2041. shufps $0xff, %xmm6, %xmm6
2042. #endif
2043. movaps %xmm0, 32 * SIZE(BB)
2044. movaps %xmm1, 36 * SIZE(BB)
2045. movaps %xmm4, 40 * SIZE(BB)
2046. movaps %xmm6, 44 * SIZE(BB)
2047. #ifdef HAVE_SSE2
2048. pshufd $0x00, %xmm7, %xmm0
2049. pshufd $0x55, %xmm7, %xmm1
2050. pshufd $0xaa, %xmm7, %xmm4
2051. pshufd $0xff, %xmm7, %xmm6
2052. #else
2053. movaps %xmm7, %xmm0
2054. shufps $0x00, %xmm0, %xmm0
2055. movaps %xmm7, %xmm1
2056. shufps $0x55, %xmm1, %xmm1
2057. movaps %xmm7, %xmm4
2058. shufps $0xaa, %xmm4, %xmm4
2059. movaps %xmm7, %xmm6
2060. shufps $0xff, %xmm6, %xmm6
2061. #endif
2062. movaps %xmm0, 48 * SIZE(BB)
2063. movaps %xmm1, 52 * SIZE(BB)
2064. movaps %xmm4, 56 * SIZE(BB)
2065. movaps %xmm6, 60 * SIZE(BB)
2066. #else
2067. movaps %xmm0, 0 * SIZE(AA)
2068. movaps %xmm1, 4 * SIZE(AA)
2069. movaps %xmm2, 8 * SIZE(AA)
2070. movaps %xmm3, 12 * SIZE(AA)
2071. #endif
/* Store the tile to C.  LN walks C backwards, so step CO1 down first.
   In the LN/LT layout the two output columns are interleaved and must be
   de-interleaved (shufps 0x88 = even lanes, 0xdd = odd lanes) before the
   column stores to CO1 and CO1+LDC. */
2072. #ifdef LN
2073. subl $8 * SIZE, CO1
2074. #endif
2075. #if defined(LN) || defined(LT)
2076. movaps %xmm2, %xmm0
2077. shufps $0x88, %xmm3, %xmm2
2078. shufps $0xdd, %xmm3, %xmm0
2079. movaps %xmm5, %xmm4
2080. shufps $0x88, %xmm7, %xmm5
2081. shufps $0xdd, %xmm7, %xmm4
2082. movlps %xmm2, 0 * SIZE(CO1)
2083. movhps %xmm2, 2 * SIZE(CO1)
2084. movlps %xmm5, 4 * SIZE(CO1)
2085. movhps %xmm5, 6 * SIZE(CO1)
2086. movlps %xmm0, 0 * SIZE(CO1, LDC)
2087. movhps %xmm0, 2 * SIZE(CO1, LDC)
2088. movlps %xmm4, 4 * SIZE(CO1, LDC)
2089. movhps %xmm4, 6 * SIZE(CO1, LDC)
2090. #else
2091. movlps %xmm0, 0 * SIZE(CO1)
2092. movhps %xmm0, 2 * SIZE(CO1)
2093. movlps %xmm1, 4 * SIZE(CO1)
2094. movhps %xmm1, 6 * SIZE(CO1)
2095. movlps %xmm2, 0 * SIZE(CO1, LDC)
2096. movhps %xmm2, 2 * SIZE(CO1, LDC)
2097. movlps %xmm3, 4 * SIZE(CO1, LDC)
2098. movhps %xmm3, 6 * SIZE(CO1, LDC)
2099. #endif
2100. #ifndef LN
2101. addl $8 * SIZE, CO1
2102. #endif
/* Per-variant pointer and KK bookkeeping for the next 8-row tile:
   advance AA past the remaining K-KK rows, step B, and adjust KK by the
   tile height (8).  RT rewinds AORIG by 8 columns of A instead. */
2103. #if defined(LT) || defined(RN)
2104. movl K, %eax
2105. subl KK, %eax
2106. leal (,%eax, SIZE), %eax
2107. leal (AA, %eax, 8), AA
2108. #ifdef LT
2109. addl $16 * SIZE, B
2110. #endif
2111. #endif
2112. #ifdef LN
2113. subl $8, KK
2114. movl BORIG, B
2115. #endif
2116. #ifdef LT
2117. addl $8, KK
2118. #endif
2119. #ifdef RT
2120. movl K, %eax
2121. movl BORIG, B
2122. sall $3 + BASE_SHIFT, %eax
2123. addl %eax, AORIG
2124. #endif
2125. decl %ebx # i --
2126. jg .L10
2127. ALIGN_2
/* .L30: handle a remaining block of 4 rows (M & 4).  Same structure as the
   8-wide path: set up AA/BB for this tile, accumulate C -= no, rather
   accumulate A*B into xmm4..xmm7 over K, then solve the 4x2 triangle and
   write back.  NOTE(review): pointer math mirrors the 8-wide path with the
   shift reduced from 3 to 2 (tile height 4). */
2128. .L30:
2129. testl $4, M
2130. jle .L50
2131. #ifdef LN
2132. movl K, %eax
2133. sall $2 + BASE_SHIFT, %eax
2134. subl %eax, AORIG
2135. #endif
2136. #if defined(LN) || defined(RT)
2137. movl KK, %eax
2138. movl AORIG, AA
2139. sall $2 + BASE_SHIFT, %eax
2140. addl %eax, AA
2141. #endif
2142. leal BUFFER, BB
2143. #if defined(LN) || defined(RT)
2144. movl KK, %eax
2145. sall $1 + BASE_SHIFT, %eax
2146. leal (BB, %eax, 4), BB
2147. #endif
/* preload first A/BB vectors and clear the four accumulators */
2148. movaps 0 * SIZE(BB), %xmm2
2149. xorps %xmm4, %xmm4
2150. movaps 0 * SIZE(AA), %xmm0
2151. xorps %xmm5, %xmm5
2152. movaps 16 * SIZE(BB), %xmm3
2153. xorps %xmm6, %xmm6
2154. movaps 16 * SIZE(AA), %xmm1
2155. xorps %xmm7, %xmm7
2156. #if defined(LT) || defined(RN)
2157. movl KK, %eax
2158. #else
2159. movl K, %eax
2160. subl KK, %eax
2161. #endif
2162. sarl $3, %eax
2163. je .L32
2164. ALIGN_2
/* .L31: K loop unrolled by 8; per iteration AA advances 32*SIZE (8 k-steps
   x 4 rows) and BB 64*SIZE (8 k-steps x 2 broadcast columns x 4 lanes). */
2165. .L31:
2166. mulps %xmm0, %xmm2
2167. mulps 4 * SIZE(BB), %xmm0
2168. addps %xmm2, %xmm4
2169. movaps 8 * SIZE(BB), %xmm2
2170. addps %xmm0, %xmm5
2171. movaps 4 * SIZE(AA), %xmm0
2172. mulps %xmm0, %xmm2
2173. mulps 12 * SIZE(BB), %xmm0
2174. addps %xmm2, %xmm6
2175. movaps 32 * SIZE(BB), %xmm2
2176. addps %xmm0, %xmm7
2177. movaps 8 * SIZE(AA), %xmm0
2178. mulps %xmm0, %xmm3
2179. mulps 20 * SIZE(BB), %xmm0
2180. addps %xmm3, %xmm4
2181. movaps 24 * SIZE(BB), %xmm3
2182. addps %xmm0, %xmm5
2183. movaps 12 * SIZE(AA), %xmm0
2184. mulps %xmm0, %xmm3
2185. mulps 28 * SIZE(BB), %xmm0
2186. addps %xmm3, %xmm6
2187. movaps 48 * SIZE(BB), %xmm3
2188. addps %xmm0, %xmm7
2189. movaps 32 * SIZE(AA), %xmm0
2190. mulps %xmm1, %xmm2
2191. mulps 36 * SIZE(BB), %xmm1
2192. addps %xmm2, %xmm4
2193. movaps 40 * SIZE(BB), %xmm2
2194. addps %xmm1, %xmm5
2195. movaps 20 * SIZE(AA), %xmm1
2196. mulps %xmm1, %xmm2
2197. mulps 44 * SIZE(BB), %xmm1
2198. addps %xmm2, %xmm6
2199. movaps 64 * SIZE(BB), %xmm2
2200. addps %xmm1, %xmm7
2201. movaps 24 * SIZE(AA), %xmm1
2202. mulps %xmm1, %xmm3
2203. mulps 52 * SIZE(BB), %xmm1
2204. addps %xmm3, %xmm4
2205. movaps 56 * SIZE(BB), %xmm3
2206. addps %xmm1, %xmm5
2207. movaps 28 * SIZE(AA), %xmm1
2208. mulps %xmm1, %xmm3
2209. mulps 60 * SIZE(BB), %xmm1
2210. addps %xmm3, %xmm6
2211. movaps 80 * SIZE(BB), %xmm3
2212. addps %xmm1, %xmm7
2213. movaps 48 * SIZE(AA), %xmm1
2214. addl $32 * SIZE, AA
2215. addl $64 * SIZE, BB
2216. decl %eax
2217. jne .L31
2218. ALIGN_2
/* .L32/.L33: remainder loop for K & 7, one k-step per iteration */
2219. .L32:
2220. #if defined(LT) || defined(RN)
2221. movl KK, %eax
2222. #else
2223. movl K, %eax
2224. subl KK, %eax
2225. #endif
2226. andl $7, %eax # if (k & 1)
2227. BRANCH
2228. je .L34
2229. .L33:
2230. mulps %xmm0, %xmm2
2231. mulps 4 * SIZE(BB), %xmm0
2232. addps %xmm2, %xmm4
2233. movaps 8 * SIZE(BB), %xmm2
2234. addps %xmm0, %xmm5
2235. movaps 4 * SIZE(AA), %xmm0
2236. addl $4 * SIZE, AA
2237. addl $8 * SIZE, BB
2238. decl %eax
2239. jg .L33
2240. ALIGN_4
/* .L34: fold the two accumulator pairs, then recompute AA/B/BB for the
   solve.  KK is stepped back by the tile dimension: 4 rows for LN,
   2 columns otherwise (RT). */
2241. .L34:
2242. addps %xmm6, %xmm4
2243. addps %xmm7, %xmm5
2244. #if defined(LN) || defined(RT)
2245. movl KK, %eax
2246. #ifdef LN
2247. subl $4, %eax
2248. #else
2249. subl $2, %eax
2250. #endif
2251. movl AORIG, AA
2252. movl BORIG, B
2253. leal BUFFER, BB
2254. sall $BASE_SHIFT, %eax
2255. leal (AA, %eax, 4), AA
2256. leal (B, %eax, 2), B
2257. leal (BB, %eax, 8), BB
2258. #endif
/* load the right-hand side and subtract the accumulated product */
2259. #if defined(LN) || defined(LT)
2260. movaps %xmm4, %xmm0
2261. unpcklps %xmm5, %xmm4
2262. unpckhps %xmm5, %xmm0
2263. movsd 0 * SIZE(B), %xmm2
2264. movhps 2 * SIZE(B), %xmm2
2265. movsd 4 * SIZE(B), %xmm3
2266. movhps 6 * SIZE(B), %xmm3
2267. subps %xmm4, %xmm2
2268. subps %xmm0, %xmm3
2269. #else
2270. movaps 0 * SIZE(AA), %xmm0
2271. movaps 4 * SIZE(AA), %xmm2
2272. subps %xmm4, %xmm0
2273. subps %xmm5, %xmm2
2274. #endif
2275. #if defined(LN) || defined(LT)
2276. movaps TRMASK, %xmm6
2277. #endif
/* LN: back-substitution over the 4x4 panel, diagonals at 15, 10, 5, 0 */
2278. #ifdef LN
2279. movss 15 * SIZE(AA), %xmm0
2280. movaps %xmm6, %xmm1
2281. shufps $0x00, %xmm0, %xmm1
2282. mulps %xmm1, %xmm3
2283. movaps %xmm3, %xmm1
2284. shufps $0xee, %xmm1, %xmm1
2285. movss 14 * SIZE(AA), %xmm0
2286. shufps $0x50, %xmm0, %xmm0
2287. mulps %xmm1, %xmm0
2288. subps %xmm0, %xmm3
2289. movsd 12 * SIZE(AA), %xmm0
2290. shufps $0x50, %xmm0, %xmm0
2291. mulps %xmm1, %xmm0
2292. subps %xmm0, %xmm2
2293. movss 10 * SIZE(AA), %xmm0
2294. shufps $0x00, %xmm6, %xmm0
2295. mulps %xmm0, %xmm3
2296. movaps %xmm3, %xmm1
2297. shufps $0x44, %xmm1, %xmm1
2298. movsd 8 * SIZE(AA), %xmm0
2299. shufps $0x50, %xmm0, %xmm0
2300. mulps %xmm1, %xmm0
2301. subps %xmm0, %xmm2
2302. movss 5 * SIZE(AA), %xmm0
2303. movaps %xmm6, %xmm1
2304. shufps $0x00, %xmm0, %xmm1
2305. mulps %xmm1, %xmm2
2306. movaps %xmm2, %xmm1
2307. shufps $0xee, %xmm1, %xmm1
2308. movss 4 * SIZE(AA), %xmm0
2309. shufps $0x50, %xmm0, %xmm0
2310. mulps %xmm1, %xmm0
2311. subps %xmm0, %xmm2
2312. movss 0 * SIZE(AA), %xmm0
2313. shufps $0x00, %xmm6, %xmm0
2314. mulps %xmm0, %xmm2
2315. #endif
/* LT: forward substitution, diagonals at 0, 5, 10, 15 */
2316. #ifdef LT
2317. movss 0 * SIZE(AA), %xmm0
2318. shufps $0x00, %xmm6, %xmm0
2319. mulps %xmm0, %xmm2
2320. movaps %xmm2, %xmm1
2321. shufps $0x44, %xmm1, %xmm1
2322. movss 1 * SIZE(AA), %xmm0
2323. shufps $0x05, %xmm0, %xmm0
2324. mulps %xmm1, %xmm0
2325. subps %xmm0, %xmm2
2326. movsd 2 * SIZE(AA), %xmm0
2327. shufps $0x50, %xmm0, %xmm0
2328. mulps %xmm1, %xmm0
2329. subps %xmm0, %xmm3
2330. movss 5 * SIZE(AA), %xmm0
2331. movaps %xmm6, %xmm1
2332. shufps $0x00, %xmm0, %xmm1
2333. mulps %xmm1, %xmm2
2334. movaps %xmm2, %xmm1
2335. shufps $0xee, %xmm1, %xmm1
2336. movsd 6 * SIZE(AA), %xmm0
2337. shufps $0x50, %xmm0, %xmm0
2338. mulps %xmm1, %xmm0
2339. subps %xmm0, %xmm3
2340. movss 10 * SIZE(AA), %xmm0
2341. shufps $0x00, %xmm6, %xmm0
2342. mulps %xmm0, %xmm3
2343. movaps %xmm3, %xmm1
2344. shufps $0x44, %xmm1, %xmm1
2345. movss 11 * SIZE(AA), %xmm0
2346. shufps $0x05, %xmm0, %xmm0
2347. mulps %xmm1, %xmm0
2348. subps %xmm0, %xmm3
2349. movss 15 * SIZE(AA), %xmm0
2350. movaps %xmm6, %xmm1
2351. shufps $0x00, %xmm0, %xmm1
2352. mulps %xmm1, %xmm3
2353. #endif
/* RN/RT: 2x2 right-side solves, same pattern as the 8-wide tile */
2354. #ifdef RN
2355. movss 0 * SIZE(B), %xmm6
2356. shufps $0x00, %xmm6, %xmm6
2357. mulps %xmm6, %xmm0
2358. movss 1 * SIZE(B), %xmm6
2359. shufps $0x00, %xmm6, %xmm6
2360. movaps %xmm6, %xmm5
2361. mulps %xmm0, %xmm5
2362. subps %xmm5, %xmm2
2363. movss 3 * SIZE(B), %xmm6
2364. shufps $0x00, %xmm6, %xmm6
2365. mulps %xmm6, %xmm2
2366. #endif
2367. #ifdef RT
2368. movss 3 * SIZE(B), %xmm6
2369. shufps $0x00, %xmm6, %xmm6
2370. mulps %xmm6, %xmm2
2371. movss 2 * SIZE(B), %xmm6
2372. shufps $0x00, %xmm6, %xmm6
2373. movaps %xmm6, %xmm5
2374. mulps %xmm2, %xmm5
2375. subps %xmm5, %xmm0
2376. movss 0 * SIZE(B), %xmm6
2377. shufps $0x00, %xmm6, %xmm6
2378. mulps %xmm6, %xmm0
2379. #endif
/* write back to B+BB (LN/LT) or AA (RN/RT), then store the tile to C */
2380. #if defined(LN) || defined(LT)
2381. movlps %xmm2, 0 * SIZE(B)
2382. movhps %xmm2, 2 * SIZE(B)
2383. movlps %xmm3, 4 * SIZE(B)
2384. movhps %xmm3, 6 * SIZE(B)
2385. #ifdef HAVE_SSE2
2386. pshufd $0x00, %xmm2, %xmm0
2387. pshufd $0x55, %xmm2, %xmm1
2388. pshufd $0xaa, %xmm2, %xmm4
2389. pshufd $0xff, %xmm2, %xmm6
2390. #else
2391. movaps %xmm2, %xmm0
2392. shufps $0x00, %xmm0, %xmm0
2393. movaps %xmm2, %xmm1
2394. shufps $0x55, %xmm1, %xmm1
2395. movaps %xmm2, %xmm4
2396. shufps $0xaa, %xmm4, %xmm4
2397. movaps %xmm2, %xmm6
2398. shufps $0xff, %xmm6, %xmm6
2399. #endif
2400. movaps %xmm0, 0 * SIZE(BB)
2401. movaps %xmm1, 4 * SIZE(BB)
2402. movaps %xmm4, 8 * SIZE(BB)
2403. movaps %xmm6, 12 * SIZE(BB)
2404. #ifdef HAVE_SSE2
2405. pshufd $0x00, %xmm3, %xmm0
2406. pshufd $0x55, %xmm3, %xmm1
2407. pshufd $0xaa, %xmm3, %xmm4
2408. pshufd $0xff, %xmm3, %xmm6
2409. #else
2410. movaps %xmm3, %xmm0
2411. shufps $0x00, %xmm0, %xmm0
2412. movaps %xmm3, %xmm1
2413. shufps $0x55, %xmm1, %xmm1
2414. movaps %xmm3, %xmm4
2415. shufps $0xaa, %xmm4, %xmm4
2416. movaps %xmm3, %xmm6
2417. shufps $0xff, %xmm6, %xmm6
2418. #endif
2419. movaps %xmm0, 16 * SIZE(BB)
2420. movaps %xmm1, 20 * SIZE(BB)
2421. movaps %xmm4, 24 * SIZE(BB)
2422. movaps %xmm6, 28 * SIZE(BB)
2423. #else
2424. movaps %xmm0, 0 * SIZE(AA)
2425. movaps %xmm2, 4 * SIZE(AA)
2426. #endif
2427. #ifdef LN
2428. subl $4 * SIZE, CO1
2429. #endif
2430. #if defined(LN) || defined(LT)
2431. movaps %xmm2, %xmm0
2432. shufps $0x88, %xmm3, %xmm2
2433. shufps $0xdd, %xmm3, %xmm0
2434. movlps %xmm2, 0 * SIZE(CO1)
2435. movhps %xmm2, 2 * SIZE(CO1)
2436. movlps %xmm0, 0 * SIZE(CO1, LDC)
2437. movhps %xmm0, 2 * SIZE(CO1, LDC)
2438. #else
2439. movlps %xmm0, 0 * SIZE(CO1)
2440. movhps %xmm0, 2 * SIZE(CO1)
2441. movlps %xmm2, 0 * SIZE(CO1, LDC)
2442. movhps %xmm2, 2 * SIZE(CO1, LDC)
2443. #endif
2444. #ifndef LN
2445. addl $4 * SIZE, CO1
2446. #endif
/* bookkeeping: advance AA/B and adjust KK by 4 for the next tile */
2447. #if defined(LT) || defined(RN)
2448. movl K, %eax
2449. subl KK, %eax
2450. leal (,%eax, SIZE), %eax
2451. leal (AA, %eax, 4), AA
2452. #ifdef LT
2453. addl $8 * SIZE, B
2454. #endif
2455. #endif
2456. #ifdef LN
2457. subl $4, KK
2458. movl BORIG, B
2459. #endif
2460. #ifdef LT
2461. addl $4, KK
2462. #endif
2463. #ifdef RT
2464. movl K, %eax
2465. movl BORIG, B
2466. sall $2 + BASE_SHIFT, %eax
2467. addl %eax, AORIG
2468. #endif
2469. ALIGN_2
/* .L50: handle a remaining block of 2 rows (M & 2).  A is loaded with
   movsd (2 floats per k-step); the accumulate/solve/store structure
   mirrors the wider tiles with the 2x2 triangular panel of A. */
2470. .L50:
2471. testl $2, M
2472. jle .L70
2473. #ifdef LN
2474. movl K, %eax
2475. sall $1 + BASE_SHIFT, %eax
2476. subl %eax, AORIG
2477. #endif
2478. #if defined(LN) || defined(RT)
2479. movl KK, %eax
2480. movl AORIG, AA
2481. sall $1 + BASE_SHIFT, %eax
2482. addl %eax, AA
2483. #endif
2484. leal BUFFER, BB
2485. #if defined(LN) || defined(RT)
2486. movl KK, %eax
2487. sall $1 + BASE_SHIFT, %eax
2488. leal (BB, %eax, 4), BB
2489. #endif
2490. movaps 0 * SIZE(BB), %xmm2
2491. xorps %xmm4, %xmm4
2492. movaps 0 * SIZE(AA), %xmm0
2493. xorps %xmm5, %xmm5
2494. movaps 16 * SIZE(BB), %xmm3
2495. xorps %xmm6, %xmm6
2496. movaps 8 * SIZE(AA), %xmm1
2497. xorps %xmm7, %xmm7
2498. #if defined(LT) || defined(RN)
2499. movl KK, %eax
2500. #else
2501. movl K, %eax
2502. subl KK, %eax
2503. #endif
2504. sarl $3, %eax
2505. je .L52
2506. ALIGN_2
/* .L51: K loop unrolled by 8; AA advances 16*SIZE (8 x 2 rows),
   BB 64*SIZE per iteration */
2507. .L51:
2508. mulps %xmm0, %xmm2
2509. addps %xmm2, %xmm4
2510. movaps 4 * SIZE(BB), %xmm2
2511. mulps %xmm0, %xmm2
2512. movsd 2 * SIZE(AA), %xmm0
2513. addps %xmm2, %xmm5
2514. movaps 8 * SIZE(BB), %xmm2
2515. mulps %xmm0, %xmm2
2516. addps %xmm2, %xmm6
2517. movaps 12 * SIZE(BB), %xmm2
2518. mulps %xmm0, %xmm2
2519. movsd 4 * SIZE(AA), %xmm0
2520. addps %xmm2, %xmm7
2521. movaps 32 * SIZE(BB), %xmm2
2522. mulps %xmm0, %xmm3
2523. addps %xmm3, %xmm4
2524. movaps 20 * SIZE(BB), %xmm3
2525. mulps %xmm0, %xmm3
2526. movsd 6 * SIZE(AA), %xmm0
2527. addps %xmm3, %xmm5
2528. movaps 24 * SIZE(BB), %xmm3
2529. mulps %xmm0, %xmm3
2530. addps %xmm3, %xmm6
2531. movaps 28 * SIZE(BB), %xmm3
2532. mulps %xmm0, %xmm3
2533. movsd 16 * SIZE(AA), %xmm0
2534. addps %xmm3, %xmm7
2535. movaps 48 * SIZE(BB), %xmm3
2536. mulps %xmm1, %xmm2
2537. addps %xmm2, %xmm4
2538. movaps 36 * SIZE(BB), %xmm2
2539. mulps %xmm1, %xmm2
2540. movsd 10 * SIZE(AA), %xmm1
2541. addps %xmm2, %xmm5
2542. movaps 40 * SIZE(BB), %xmm2
2543. mulps %xmm1, %xmm2
2544. addps %xmm2, %xmm6
2545. movaps 44 * SIZE(BB), %xmm2
2546. mulps %xmm1, %xmm2
2547. movsd 12 * SIZE(AA), %xmm1
2548. addps %xmm2, %xmm7
2549. movaps 64 * SIZE(BB), %xmm2
2550. mulps %xmm1, %xmm3
2551. addps %xmm3, %xmm4
2552. movaps 52 * SIZE(BB), %xmm3
2553. mulps %xmm1, %xmm3
2554. movsd 14 * SIZE(AA), %xmm1
2555. addps %xmm3, %xmm5
2556. movaps 56 * SIZE(BB), %xmm3
2557. mulps %xmm1, %xmm3
2558. addps %xmm3, %xmm6
2559. movaps 60 * SIZE(BB), %xmm3
2560. mulps %xmm1, %xmm3
2561. movsd 24 * SIZE(AA), %xmm1
2562. addps %xmm3, %xmm7
2563. movaps 80 * SIZE(BB), %xmm3
2564. addl $16 * SIZE, AA
2565. addl $64 * SIZE, BB
2566. decl %eax
2567. jne .L51
2568. ALIGN_2
/* .L52/.L53: remainder loop for K & 7 */
2569. .L52:
2570. #if defined(LT) || defined(RN)
2571. movl KK, %eax
2572. #else
2573. movl K, %eax
2574. subl KK, %eax
2575. #endif
2576. andl $7, %eax # if (k & 1)
2577. BRANCH
2578. je .L54
2579. .L53:
2580. mulps %xmm0, %xmm2
2581. addps %xmm2, %xmm4
2582. movaps 4 * SIZE(BB), %xmm2
2583. mulps %xmm0, %xmm2
2584. movsd 2 * SIZE(AA), %xmm0
2585. addps %xmm2, %xmm5
2586. movaps 8 * SIZE(BB), %xmm2
2587. addl $2 * SIZE, AA
2588. addl $8 * SIZE, BB
2589. decl %eax
2590. jg .L53
2591. ALIGN_4
/* .L54: fold accumulators and recompute pointers for the solve.
   NOTE(review): both branches of the #ifdef below subtract 2 - the LN
   step-back (tile height M=2) happens to equal the RT step-back
   (N=2), so the conditional is redundant but harmless here. */
2592. .L54:
2593. addps %xmm6, %xmm4
2594. addps %xmm7, %xmm5
2595. #if defined(LN) || defined(RT)
2596. movl KK, %eax
2597. #ifdef LN
2598. subl $2, %eax
2599. #else
2600. subl $2, %eax
2601. #endif
2602. movl AORIG, AA
2603. movl BORIG, B
2604. leal BUFFER, BB
2605. sall $BASE_SHIFT, %eax
2606. leal (AA, %eax, 2), AA
2607. leal (B, %eax, 2), B
2608. leal (BB, %eax, 8), BB
2609. #endif
/* load right-hand side; the "#ifdef movsd / xorps" guards clear the
   destination before a movsd *load* for builds where movsd is remapped
   to movlps (which leaves the upper half unwritten) */
2610. #if defined(LN) || defined(LT)
2611. unpcklps %xmm5, %xmm4
2612. movsd 0 * SIZE(B), %xmm2
2613. movhps 2 * SIZE(B), %xmm2
2614. subps %xmm4, %xmm2
2615. #else
2616. #ifdef movsd
2617. xorps %xmm0, %xmm0
2618. #endif
2619. movsd 0 * SIZE(AA), %xmm0
2620. #ifdef movsd
2621. xorps %xmm2, %xmm2
2622. #endif
2623. movsd 2 * SIZE(AA), %xmm2
2624. subps %xmm4, %xmm0
2625. subps %xmm5, %xmm2
2626. #endif
2627. #if defined(LN) || defined(LT)
2628. movaps TRMASK, %xmm6
2629. #endif
/* LN: 2x2 back-substitution, diagonals at 3 and 0 */
2630. #ifdef LN
2631. movss 3 * SIZE(AA), %xmm0
2632. movaps %xmm6, %xmm1
2633. shufps $0x00, %xmm0, %xmm1
2634. mulps %xmm1, %xmm2
2635. movaps %xmm2, %xmm1
2636. shufps $0xee, %xmm1, %xmm1
2637. movss 2 * SIZE(AA), %xmm0
2638. shufps $0x50, %xmm0, %xmm0
2639. mulps %xmm1, %xmm0
2640. subps %xmm0, %xmm2
2641. movss 0 * SIZE(AA), %xmm0
2642. shufps $0x00, %xmm6, %xmm0
2643. mulps %xmm0, %xmm2
2644. #endif
/* LT: 2x2 forward substitution, diagonals at 0 and 3 */
2645. #ifdef LT
2646. movss 0 * SIZE(AA), %xmm0
2647. shufps $0x00, %xmm6, %xmm0
2648. mulps %xmm0, %xmm2
2649. movaps %xmm2, %xmm1
2650. shufps $0x44, %xmm1, %xmm1
2651. movss 1 * SIZE(AA), %xmm0
2652. shufps $0x05, %xmm0, %xmm0
2653. mulps %xmm1, %xmm0
2654. subps %xmm0, %xmm2
2655. movss 3 * SIZE(AA), %xmm0
2656. movaps %xmm6, %xmm1
2657. shufps $0x00, %xmm0, %xmm1
2658. mulps %xmm1, %xmm2
2659. #endif
2660. #ifdef RN
2661. movss 0 * SIZE(B), %xmm6
2662. shufps $0x00, %xmm6, %xmm6
2663. mulps %xmm6, %xmm0
2664. movss 1 * SIZE(B), %xmm6
2665. shufps $0x00, %xmm6, %xmm6
2666. movaps %xmm6, %xmm5
2667. mulps %xmm0, %xmm5
2668. subps %xmm5, %xmm2
2669. movss 3 * SIZE(B), %xmm6
2670. shufps $0x00, %xmm6, %xmm6
2671. mulps %xmm6, %xmm2
2672. #endif
2673. #ifdef RT
2674. movss 3 * SIZE(B), %xmm6
2675. shufps $0x00, %xmm6, %xmm6
2676. mulps %xmm6, %xmm2
2677. movss 2 * SIZE(B), %xmm6
2678. shufps $0x00, %xmm6, %xmm6
2679. movaps %xmm6, %xmm5
2680. mulps %xmm2, %xmm5
2681. subps %xmm5, %xmm0
2682. movss 0 * SIZE(B), %xmm6
2683. shufps $0x00, %xmm6, %xmm6
2684. mulps %xmm6, %xmm0
2685. #endif
/* write back to B+BB or AA, then to C (only the low 2 lanes are live) */
2686. #if defined(LN) || defined(LT)
2687. movlps %xmm2, 0 * SIZE(B)
2688. movhps %xmm2, 2 * SIZE(B)
2689. #ifdef HAVE_SSE2
2690. pshufd $0x00, %xmm2, %xmm0
2691. pshufd $0x55, %xmm2, %xmm1
2692. pshufd $0xaa, %xmm2, %xmm4
2693. pshufd $0xff, %xmm2, %xmm6
2694. #else
2695. movaps %xmm2, %xmm0
2696. shufps $0x00, %xmm0, %xmm0
2697. movaps %xmm2, %xmm1
2698. shufps $0x55, %xmm1, %xmm1
2699. movaps %xmm2, %xmm4
2700. shufps $0xaa, %xmm4, %xmm4
2701. movaps %xmm2, %xmm6
2702. shufps $0xff, %xmm6, %xmm6
2703. #endif
2704. movaps %xmm0, 0 * SIZE(BB)
2705. movaps %xmm1, 4 * SIZE(BB)
2706. movaps %xmm4, 8 * SIZE(BB)
2707. movaps %xmm6, 12 * SIZE(BB)
2708. #else
2709. movlps %xmm0, 0 * SIZE(AA)
2710. movlps %xmm2, 2 * SIZE(AA)
2711. #endif
2712. #ifdef LN
2713. subl $2 * SIZE, CO1
2714. #endif
2715. #if defined(LN) || defined(LT)
2716. movaps %xmm2, %xmm0
2717. shufps $0x88, %xmm3, %xmm2
2718. shufps $0xdd, %xmm3, %xmm0
2719. movlps %xmm2, 0 * SIZE(CO1)
2720. movlps %xmm0, 0 * SIZE(CO1, LDC)
2721. #else
2722. movlps %xmm0, 0 * SIZE(CO1)
2723. movlps %xmm2, 0 * SIZE(CO1, LDC)
2724. #endif
2725. #ifndef LN
2726. addl $2 * SIZE, CO1
2727. #endif
/* bookkeeping: advance AA/B and adjust KK by 2 for the next tile */
2728. #if defined(LT) || defined(RN)
2729. movl K, %eax
2730. subl KK, %eax
2731. leal (,%eax, SIZE), %eax
2732. leal (AA, %eax, 2), AA
2733. #ifdef LT
2734. addl $4 * SIZE, B
2735. #endif
2736. #endif
2737. #ifdef LN
2738. subl $2, KK
2739. movl BORIG, B
2740. #endif
2741. #ifdef LT
2742. addl $2, KK
2743. #endif
2744. #ifdef RT
2745. movl K, %eax
2746. movl BORIG, B
2747. sall $1 + BASE_SHIFT, %eax
2748. addl %eax, AORIG
2749. #endif
2750. ALIGN_2
/* .L70: handle the last single row (M & 1).  Everything is scalar
   (movss/mulss/addss); BB still holds each B value broadcast over 4
   lanes, which is why its stride stays 8*SIZE per k-step. */
2751. .L70:
2752. testl $1, M
2753. jle .L99
2754. #ifdef LN
2755. movl K, %eax
2756. sall $BASE_SHIFT, %eax
2757. subl %eax, AORIG
2758. #endif
2759. #if defined(LN) || defined(RT)
2760. movl KK, %eax
2761. movl AORIG, AA
2762. sall $BASE_SHIFT, %eax
2763. addl %eax, AA
2764. #endif
2765. leal BUFFER, BB
2766. #if defined(LN) || defined(RT)
2767. movl KK, %eax
2768. sall $1 + BASE_SHIFT, %eax
2769. leal (BB, %eax, 4), BB
2770. #endif
2771. movss 0 * SIZE(BB), %xmm2
2772. xorps %xmm4, %xmm4
2773. movss 0 * SIZE(AA), %xmm0
2774. xorps %xmm5, %xmm5
2775. movss 16 * SIZE(BB), %xmm3
2776. xorps %xmm6, %xmm6
2777. movss 4 * SIZE(AA), %xmm1
2778. xorps %xmm7, %xmm7
2779. #if defined(LT) || defined(RN)
2780. movl KK, %eax
2781. #else
2782. movl K, %eax
2783. subl KK, %eax
2784. #endif
2785. sarl $3, %eax
2786. je .L72
2787. ALIGN_2
/* .L71: scalar K loop unrolled by 8 */
2788. .L71:
2789. mulss %xmm0, %xmm2
2790. mulss 4 * SIZE(BB), %xmm0
2791. addss %xmm2, %xmm4
2792. movss 8 * SIZE(BB), %xmm2
2793. addss %xmm0, %xmm5
2794. movss 1 * SIZE(AA), %xmm0
2795. mulss %xmm0, %xmm2
2796. mulss 12 * SIZE(BB), %xmm0
2797. addss %xmm2, %xmm6
2798. movss 32 * SIZE(BB), %xmm2
2799. addss %xmm0, %xmm7
2800. movss 2 * SIZE(AA), %xmm0
2801. mulss %xmm0, %xmm3
2802. mulss 20 * SIZE(BB), %xmm0
2803. addss %xmm3, %xmm4
2804. movss 24 * SIZE(BB), %xmm3
2805. addss %xmm0, %xmm5
2806. movss 3 * SIZE(AA), %xmm0
2807. mulss %xmm0, %xmm3
2808. mulss 28 * SIZE(BB), %xmm0
2809. addss %xmm3, %xmm6
2810. movss 48 * SIZE(BB), %xmm3
2811. addss %xmm0, %xmm7
2812. movss 8 * SIZE(AA), %xmm0
2813. mulss %xmm1, %xmm2
2814. mulss 36 * SIZE(BB), %xmm1
2815. addss %xmm2, %xmm4
2816. movss 40 * SIZE(BB), %xmm2
2817. addss %xmm1, %xmm5
2818. movss 5 * SIZE(AA), %xmm1
2819. mulss %xmm1, %xmm2
2820. mulss 44 * SIZE(BB), %xmm1
2821. addss %xmm2, %xmm6
2822. movss 64 * SIZE(BB), %xmm2
2823. addss %xmm1, %xmm7
2824. movss 6 * SIZE(AA), %xmm1
2825. mulss %xmm1, %xmm3
2826. mulss 52 * SIZE(BB), %xmm1
2827. addss %xmm3, %xmm4
2828. movss 56 * SIZE(BB), %xmm3
2829. addss %xmm1, %xmm5
2830. movss 7 * SIZE(AA), %xmm1
2831. mulss %xmm1, %xmm3
2832. mulss 60 * SIZE(BB), %xmm1
2833. addss %xmm3, %xmm6
2834. movss 80 * SIZE(BB), %xmm3
2835. addss %xmm1, %xmm7
2836. movss 12 * SIZE(AA), %xmm1
2837. addl $ 8 * SIZE, AA
2838. addl $64 * SIZE, BB
2839. decl %eax
2840. jne .L71
2841. ALIGN_2
/* .L72/.L73: scalar remainder loop for K & 7 */
2842. .L72:
2843. #if defined(LT) || defined(RN)
2844. movl KK, %eax
2845. #else
2846. movl K, %eax
2847. subl KK, %eax
2848. #endif
2849. andl $7, %eax # if (k & 1)
2850. BRANCH
2851. je .L74
2852. .L73:
2853. mulss %xmm0, %xmm2
2854. mulss 4 * SIZE(BB), %xmm0
2855. addss %xmm2, %xmm4
2856. movss 8 * SIZE(BB), %xmm2
2857. addss %xmm0, %xmm5
2858. movss 1 * SIZE(AA), %xmm0
2859. addl $1 * SIZE, AA
2860. addl $8 * SIZE, BB
2861. decl %eax
2862. jg .L73
2863. ALIGN_4
/* .L74: fold accumulators, recompute pointers (LN steps KK back by 1 = M,
   otherwise by 2 = N), load the right-hand side and solve the 1x2 tile. */
2864. .L74:
2865. addss %xmm6, %xmm4
2866. addss %xmm7, %xmm5
2867. #if defined(LN) || defined(RT)
2868. movl KK, %eax
2869. #ifdef LN
2870. subl $1, %eax
2871. #else
2872. subl $2, %eax
2873. #endif
2874. movl AORIG, AA
2875. movl BORIG, B
2876. leal BUFFER, BB
2877. sall $BASE_SHIFT, %eax
2878. leal (AA, %eax, 1), AA
2879. leal (B, %eax, 2), B
2880. leal (BB, %eax, 8), BB
2881. #endif
2882. #if defined(LN) || defined(LT)
2883. unpcklps %xmm5, %xmm4
2884. #ifdef movsd
2885. xorps %xmm2, %xmm2
2886. #endif
2887. movsd 0 * SIZE(B), %xmm2
2888. subps %xmm4, %xmm2
2889. #else
2890. movss 0 * SIZE(AA), %xmm0
2891. movss 1 * SIZE(AA), %xmm2
2892. subss %xmm4, %xmm0
2893. subss %xmm5, %xmm2
2894. #endif
2895. #if defined(LN) || defined(LT)
2896. movaps TRMASK, %xmm6
2897. #endif
/* 1x1 triangle: just scale by the (single) diagonal element */
2898. #if defined(LN) || defined(LT)
2899. movss 0 * SIZE(AA), %xmm0
2900. shufps $0x00, %xmm6, %xmm0
2901. mulps %xmm0, %xmm2
2902. #endif
2903. #ifdef RN
2904. movss 0 * SIZE(B), %xmm6
2905. mulss %xmm6, %xmm0
2906. movss 1 * SIZE(B), %xmm6
2907. movaps %xmm6, %xmm5
2908. mulss %xmm0, %xmm5
2909. subss %xmm5, %xmm2
2910. movss 3 * SIZE(B), %xmm6
2911. mulss %xmm6, %xmm2
2912. #endif
2913. #ifdef RT
2914. movss 3 * SIZE(B), %xmm6
2915. mulss %xmm6, %xmm2
2916. movss 2 * SIZE(B), %xmm6
2917. movaps %xmm6, %xmm5
2918. mulss %xmm2, %xmm5
2919. subss %xmm5, %xmm0
2920. movss 0 * SIZE(B), %xmm6
2921. mulss %xmm6, %xmm0
2922. #endif
  2923. #if defined(LN) || defined(LT)
  2924. #ifdef movsd
  2925. xorps %xmm2, %xmm2
  2926. #endif
  2927. movsd %xmm2, 0 * SIZE(B)
  2928. movaps %xmm2, %xmm0
  2929. shufps $0x00, %xmm0, %xmm0
  2930. movaps %xmm2, %xmm1
  2931. shufps $0x55, %xmm1, %xmm1
  2932. movaps %xmm0, 0 * SIZE(BB)
  2933. movaps %xmm1, 4 * SIZE(BB)
  2934. #else
  2935. movss %xmm0, 0 * SIZE(AA)
  2936. movss %xmm2, 1 * SIZE(AA)
  2937. #endif
/* Store the single result element to both C columns, then do the
   per-variant pointer and KK bookkeeping (step 1 = tile height). */
2938. #ifdef LN
2939. subl $1 * SIZE, CO1
2940. #endif
2941. #if defined(LN) || defined(LT)
2942. movaps %xmm2, %xmm0
/* only lane 0 of each shuffle result is stored: $0x88 picks xmm2[0]
   (column 0), $0xdd picks xmm2[1] (column 1); xmm3's lanes are unused */
2943. shufps $0x88, %xmm3, %xmm2
2944. shufps $0xdd, %xmm3, %xmm0
2945. movss %xmm2, 0 * SIZE(CO1)
2946. movss %xmm0, 0 * SIZE(CO1, LDC)
2947. #else
2948. movss %xmm0, 0 * SIZE(CO1)
2949. movss %xmm2, 0 * SIZE(CO1, LDC)
2950. #endif
2951. #ifndef LN
2952. addl $1 * SIZE, CO1
2953. #endif
2954. #if defined(LT) || defined(RN)
2955. movl K, %eax
2956. subl KK, %eax
2957. leal (,%eax, SIZE), %eax
2958. leal (AA, %eax, 1), AA
2959. #ifdef LT
2960. addl $2 * SIZE, B
2961. #endif
2962. #endif
2963. #ifdef LN
2964. subl $1, KK
2965. movl BORIG, B
2966. #endif
2967. #ifdef LT
2968. addl $1, KK
2969. #endif
2970. #ifdef RT
2971. movl K, %eax
2972. movl BORIG, B
2973. sall $BASE_SHIFT, %eax
2974. addl %eax, AORIG
2975. #endif
2976. ALIGN_2
/* .L99: end of one 2-column panel of B.  Advance B past the panel
   (2 floats per k-step), adjust KK by the panel width, and loop over J.
   .L999: restore the IA-32 callee-saved registers pushed in the prologue
   (above this chunk) and return. */
2977. .L99:
2978. #ifdef LN
2979. movl K, %eax
2980. leal (, %eax, SIZE), %eax
2981. leal (B, %eax, 2), B
2982. #endif
2983. #if defined(LT) || defined(RN)
2984. movl K, %eax
2985. subl KK, %eax
2986. leal (,%eax, SIZE), %eax
2987. leal (B, %eax, 2), B
2988. #endif
2989. #ifdef RN
2990. addl $2, KK
2991. #endif
2992. #ifdef RT
2993. subl $2, KK
2994. #endif
2995. decl J # j --
2996. jg .L01
2997. ALIGN_2
2998. .L999:
2999. movl OLD_STACK, %esp
3000. popl %ebx
3001. popl %esi
3002. popl %edi
3003. popl %ebp
3004. ret
3005. EPILOGUE