You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LT_8x2_sse.S 65 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #if !defined(HAVE_SSE) || !defined(HAVE_MMX)
  41. #error You have to check your configuration.
  42. #endif
  43. #define STACK 16
  44. #define ARGS 0
  45. #define STACK_M 4 + STACK + ARGS(%esi)
  46. #define STACK_N 8 + STACK + ARGS(%esi)
  47. #define STACK_K 12 + STACK + ARGS(%esi)
  48. #define STACK_A 20 + STACK + ARGS(%esi)
  49. #define STACK_B 24 + STACK + ARGS(%esi)
  50. #define STACK_C 28 + STACK + ARGS(%esi)
  51. #define STACK_LDC 32 + STACK + ARGS(%esi)
  52. #define STACK_OFFT 36 + STACK + ARGS(%esi)
  53. #define TRMASK 0(%esp)
  54. #define K 16(%esp)
  55. #define N 20(%esp)
  56. #define M 24(%esp)
  57. #define A 28(%esp)
  58. #define C 32(%esp)
  59. #define J 36(%esp)
  60. #define OLD_STACK 40(%esp)
  61. #define OFFSET 44(%esp)
  62. #define KK 48(%esp)
  63. #define KKK 52(%esp)
  64. #define AORIG 56(%esp)
  65. #define BORIG 60(%esp)
  66. #define BUFFER 128(%esp)
  67. #ifdef HAVE_3DNOW
  68. #define PREFETCH prefetch
  69. #define PREFETCHW prefetchw
  70. #define PREFETCHSIZE (16 * 10 + 8)
  71. #else
  72. #define PREFETCH prefetcht0
  73. #define PREFETCHW prefetcht0
  74. #define PREFETCHSIZE 96
  75. #endif
  76. #define B %edi
  77. #define AA %edx
  78. #define BB %ecx
  79. #define LDC %ebp
  80. #define CO1 %esi
  81. #define STACK_ALIGN 4096
  82. #define STACK_OFFSET 1024
  83. #if !defined(HAVE_SSE2) || defined(OPTERON)
  84. #define movsd movlps
  85. #endif
  86. #ifdef HAVE_SSE2
  87. #define xorps pxor
  88. #endif
  89. PROLOGUE
  90. pushl %ebp
  91. pushl %edi
  92. pushl %esi
  93. pushl %ebx
  94. PROFCODE
  95. movl %esp, %esi # save old stack
  96. subl $128 + LOCAL_BUFFER_SIZE, %esp
  97. andl $-STACK_ALIGN, %esp
  98. STACK_TOUCHING
  99. movss STACK_M, %xmm0
  100. movl STACK_N, %eax
  101. movss STACK_K, %xmm1
  102. movss STACK_A, %xmm2
  103. movl STACK_B, B
  104. movss STACK_C, %xmm3
  105. movl STACK_LDC, LDC
  106. movss STACK_OFFT, %xmm4
  107. movss %xmm1, K
  108. movl %eax, N
  109. movss %xmm0, M
  110. movss %xmm2, A
  111. movss %xmm3, C
  112. movl %esi, OLD_STACK
  113. movss %xmm4, OFFSET
  114. movss %xmm4, KK
  115. leal (, LDC, SIZE), LDC
  116. #ifdef LN
  117. movl M, %eax
  118. leal (, %eax, SIZE), %eax
  119. addl %eax, C
  120. imull K, %eax
  121. addl %eax, A
  122. #endif
  123. #ifdef RT
  124. movl N, %eax
  125. leal (, %eax, SIZE), %eax
  126. imull K, %eax
  127. addl %eax, B
  128. movl N, %eax
  129. imull LDC, %eax
  130. addl %eax, C
  131. #endif
  132. #ifdef RN
  133. negl KK
  134. #endif
  135. #ifdef RT
  136. movl N, %eax
  137. subl OFFSET, %eax
  138. movl %eax, KK
  139. #endif
  140. #if defined(LN) || defined(LT)
  141. movl $0x3f800000, 0 + TRMASK # 1.0
  142. movl $0x00000000, 4 + TRMASK # 0.0
  143. movl $0x3f800000, 8 + TRMASK # 1.0
  144. movl $0x00000000, 12 + TRMASK # 0.0
  145. #endif
  146. movl N, %eax
  147. sarl $1, %eax # j = (n >> 1)
  148. movl %eax, J
  149. jle .L100
  150. ALIGN_2
  151. .L01:
  152. #ifdef LN
  153. movl OFFSET, %eax
  154. addl M, %eax
  155. movl %eax, KK
  156. #endif
  157. leal BUFFER, BB
  158. #ifdef RT
  159. movl K, %eax
  160. sall $1 + BASE_SHIFT, %eax
  161. subl %eax, B
  162. #endif
  163. #if defined(LN) || defined(RT)
  164. movl KK, %eax
  165. movl B, BORIG
  166. sall $1 + BASE_SHIFT, %eax
  167. leal (B, %eax, 1), B
  168. leal (BB, %eax, 4), BB
  169. #endif
  170. #ifdef LT
  171. movl OFFSET, %eax
  172. movl %eax, KK
  173. #endif
  174. #if defined(LT) || defined(RN)
  175. movl KK, %eax
  176. #else
  177. movl K, %eax
  178. subl KK, %eax
  179. #endif
  180. sarl $2, %eax
  181. jle .L03
  182. ALIGN_4
  183. .L02:
  184. movsd 0 * SIZE(B), %xmm3
  185. movhps 2 * SIZE(B), %xmm3
  186. movsd 4 * SIZE(B), %xmm7
  187. movhps 6 * SIZE(B), %xmm7
  188. #ifdef HAVE_SSE2
  189. pshufd $0x00, %xmm3, %xmm0
  190. pshufd $0x55, %xmm3, %xmm1
  191. pshufd $0xaa, %xmm3, %xmm2
  192. pshufd $0xff, %xmm3, %xmm3
  193. pshufd $0x00, %xmm7, %xmm4
  194. pshufd $0x55, %xmm7, %xmm5
  195. pshufd $0xaa, %xmm7, %xmm6
  196. pshufd $0xff, %xmm7, %xmm7
  197. #else
  198. movaps %xmm3, %xmm0
  199. shufps $0x00, %xmm0, %xmm0
  200. movaps %xmm3, %xmm1
  201. shufps $0x55, %xmm1, %xmm1
  202. movaps %xmm3, %xmm2
  203. shufps $0xaa, %xmm2, %xmm2
  204. shufps $0xff, %xmm3, %xmm3
  205. movaps %xmm7, %xmm4
  206. shufps $0x00, %xmm4, %xmm4
  207. movaps %xmm7, %xmm5
  208. shufps $0x55, %xmm5, %xmm5
  209. movaps %xmm7, %xmm6
  210. shufps $0xaa, %xmm6, %xmm6
  211. shufps $0xff, %xmm7, %xmm7
  212. #endif
  213. movaps %xmm0, 0 * SIZE(BB)
  214. movaps %xmm1, 4 * SIZE(BB)
  215. movaps %xmm2, 8 * SIZE(BB)
  216. movaps %xmm3, 12 * SIZE(BB)
  217. movaps %xmm4, 16 * SIZE(BB)
  218. movaps %xmm5, 20 * SIZE(BB)
  219. movaps %xmm6, 24 * SIZE(BB)
  220. movaps %xmm7, 28 * SIZE(BB)
  221. addl $ 8 * SIZE, B
  222. addl $32 * SIZE, BB
  223. decl %eax
  224. BRANCH
  225. jne .L02
  226. ALIGN_2
  227. .L03:
  228. #if defined(LT) || defined(RN)
  229. movl KK, %eax
  230. #else
  231. movl K, %eax
  232. subl KK, %eax
  233. #endif
  234. andl $3, %eax
  235. BRANCH
  236. jle .L05
  237. ALIGN_2
  238. .L04:
  239. movsd 0 * SIZE(B), %xmm3
  240. #ifdef HAVE_SSE2
  241. pshufd $0x00, %xmm3, %xmm0
  242. pshufd $0x55, %xmm3, %xmm1
  243. #else
  244. movaps %xmm3, %xmm0
  245. shufps $0x00, %xmm0, %xmm0
  246. movaps %xmm3, %xmm1
  247. shufps $0x55, %xmm1, %xmm1
  248. #endif
  249. movaps %xmm0, 0 * SIZE(BB)
  250. movaps %xmm1, 4 * SIZE(BB)
  251. addl $2 * SIZE, B
  252. addl $8 * SIZE, BB
  253. decl %eax
  254. jne .L04
  255. ALIGN_4
  256. .L05:
  257. #if defined(LT) || defined(RN)
  258. movl A, AA
  259. #else
  260. movl A, %eax
  261. movl %eax, AORIG
  262. #endif
  263. leal (, LDC, 2), %eax
  264. #ifdef RT
  265. subl %eax, C
  266. #endif
  267. movl C, CO1
  268. #ifndef RT
  269. addl %eax, C
  270. #endif
  271. movl M, %ebx
  272. sarl $3, %ebx
  273. jle .L30
  274. ALIGN_4
  275. .L10:
  276. #ifdef LN
  277. movl K, %eax
  278. sall $3 + BASE_SHIFT, %eax
  279. subl %eax, AORIG
  280. #endif
  281. #if defined(LN) || defined(RT)
  282. movl KK, %eax
  283. movl AORIG, AA
  284. sall $3 + BASE_SHIFT, %eax
  285. addl %eax, AA
  286. #endif
  287. leal BUFFER, BB
  288. #if defined(LN) || defined(RT)
  289. movl KK, %eax
  290. sall $1 + BASE_SHIFT, %eax
  291. leal (BB, %eax, 4), BB
  292. #endif
  293. movaps 0 * SIZE(BB), %xmm2
  294. xorps %xmm4, %xmm4
  295. movaps 0 * SIZE(AA), %xmm0
  296. xorps %xmm5, %xmm5
  297. movaps 8 * SIZE(BB), %xmm3
  298. xorps %xmm6, %xmm6
  299. movaps 8 * SIZE(AA), %xmm1
  300. xorps %xmm7, %xmm7
  301. PREFETCHW 7 * SIZE(CO1)
  302. PREFETCHW 7 * SIZE(CO1, LDC)
  303. #if defined(LT) || defined(RN)
  304. movl KK, %eax
  305. #else
  306. movl K, %eax
  307. subl KK, %eax
  308. #endif
  309. sarl $3, %eax
  310. je .L12
  311. ALIGN_2
  312. .L11:
  313. mulps %xmm0, %xmm2
  314. mulps 4 * SIZE(BB), %xmm0
  315. addps %xmm2, %xmm4
  316. movaps 0 * SIZE(BB), %xmm2
  317. addps %xmm0, %xmm5
  318. movaps 4 * SIZE(AA), %xmm0
  319. mulps %xmm0, %xmm2
  320. mulps 4 * SIZE(BB), %xmm0
  321. addps %xmm2, %xmm6
  322. movaps 16 * SIZE(BB), %xmm2
  323. addps %xmm0, %xmm7
  324. movaps 16 * SIZE(AA), %xmm0
  325. mulps %xmm1, %xmm3
  326. mulps 12 * SIZE(BB), %xmm1
  327. addps %xmm3, %xmm4
  328. movaps 8 * SIZE(BB), %xmm3
  329. addps %xmm1, %xmm5
  330. movaps 12 * SIZE(AA), %xmm1
  331. mulps %xmm1, %xmm3
  332. mulps 12 * SIZE(BB), %xmm1
  333. addps %xmm3, %xmm6
  334. movaps 24 * SIZE(BB), %xmm3
  335. addps %xmm1, %xmm7
  336. movaps 24 * SIZE(AA), %xmm1
  337. mulps %xmm0, %xmm2
  338. mulps 20 * SIZE(BB), %xmm0
  339. addps %xmm2, %xmm4
  340. movaps 16 * SIZE(BB), %xmm2
  341. addps %xmm0, %xmm5
  342. movaps 20 * SIZE(AA), %xmm0
  343. mulps %xmm0, %xmm2
  344. mulps 20 * SIZE(BB), %xmm0
  345. addps %xmm2, %xmm6
  346. movaps 32 * SIZE(BB), %xmm2
  347. addps %xmm0, %xmm7
  348. movaps 32 * SIZE(AA), %xmm0
  349. mulps %xmm1, %xmm3
  350. mulps 28 * SIZE(BB), %xmm1
  351. addps %xmm3, %xmm4
  352. movaps 24 * SIZE(BB), %xmm3
  353. addps %xmm1, %xmm5
  354. movaps 28 * SIZE(AA), %xmm1
  355. mulps %xmm1, %xmm3
  356. mulps 28 * SIZE(BB), %xmm1
  357. addps %xmm3, %xmm6
  358. movaps 40 * SIZE(BB), %xmm3
  359. addps %xmm1, %xmm7
  360. movaps 40 * SIZE(AA), %xmm1
  361. mulps %xmm0, %xmm2
  362. mulps 36 * SIZE(BB), %xmm0
  363. addps %xmm2, %xmm4
  364. movaps 32 * SIZE(BB), %xmm2
  365. addps %xmm0, %xmm5
  366. movaps 36 * SIZE(AA), %xmm0
  367. mulps %xmm0, %xmm2
  368. mulps 36 * SIZE(BB), %xmm0
  369. addps %xmm2, %xmm6
  370. movaps 48 * SIZE(BB), %xmm2
  371. addps %xmm0, %xmm7
  372. movaps 48 * SIZE(AA), %xmm0
  373. mulps %xmm1, %xmm3
  374. mulps 44 * SIZE(BB), %xmm1
  375. addps %xmm3, %xmm4
  376. movaps 40 * SIZE(BB), %xmm3
  377. addps %xmm1, %xmm5
  378. movaps 44 * SIZE(AA), %xmm1
  379. mulps %xmm1, %xmm3
  380. mulps 44 * SIZE(BB), %xmm1
  381. addps %xmm3, %xmm6
  382. movaps 56 * SIZE(BB), %xmm3
  383. addps %xmm1, %xmm7
  384. movaps 56 * SIZE(AA), %xmm1
  385. mulps %xmm0, %xmm2
  386. mulps 52 * SIZE(BB), %xmm0
  387. addps %xmm2, %xmm4
  388. movaps 48 * SIZE(BB), %xmm2
  389. addps %xmm0, %xmm5
  390. movaps 52 * SIZE(AA), %xmm0
  391. mulps %xmm0, %xmm2
  392. mulps 52 * SIZE(BB), %xmm0
  393. addps %xmm2, %xmm6
  394. movaps 64 * SIZE(BB), %xmm2
  395. addps %xmm0, %xmm7
  396. movaps 64 * SIZE(AA), %xmm0
  397. mulps %xmm1, %xmm3
  398. mulps 60 * SIZE(BB), %xmm1
  399. addps %xmm3, %xmm4
  400. movaps 56 * SIZE(BB), %xmm3
  401. addps %xmm1, %xmm5
  402. movaps 60 * SIZE(AA), %xmm1
  403. mulps %xmm1, %xmm3
  404. mulps 60 * SIZE(BB), %xmm1
  405. addps %xmm3, %xmm6
  406. movaps 72 * SIZE(BB), %xmm3
  407. addps %xmm1, %xmm7
  408. movaps 72 * SIZE(AA), %xmm1
  409. addl $64 * SIZE, BB
  410. addl $64 * SIZE, AA
  411. decl %eax
  412. jne .L11
  413. ALIGN_2
  414. .L12:
  415. #if defined(LT) || defined(RN)
  416. movl KK, %eax
  417. #else
  418. movl K, %eax
  419. subl KK, %eax
  420. #endif
  421. andl $7, %eax # if (k & 1)
  422. BRANCH
  423. je .L14
  424. .L13:
  425. movaps 4 * SIZE(BB), %xmm1
  426. mulps %xmm0, %xmm2
  427. addps %xmm2, %xmm4
  428. movaps 0 * SIZE(BB), %xmm2
  429. mulps %xmm0, %xmm1
  430. movaps 4 * SIZE(AA), %xmm0
  431. addps %xmm1, %xmm5
  432. movaps 4 * SIZE(BB), %xmm1
  433. mulps %xmm0, %xmm2
  434. addps %xmm2, %xmm6
  435. movaps 8 * SIZE(BB), %xmm2
  436. mulps %xmm0, %xmm1
  437. movaps 8 * SIZE(AA), %xmm0
  438. addps %xmm1, %xmm7
  439. addl $8 * SIZE, AA
  440. addl $8 * SIZE, BB
  441. subl $1, %eax
  442. jg .L13
  443. ALIGN_4
  444. .L14:
  445. #if defined(LN) || defined(RT)
  446. movl KK, %eax
  447. #ifdef LN
  448. subl $8, %eax
  449. #else
  450. subl $2, %eax
  451. #endif
  452. movl AORIG, AA
  453. movl BORIG, B
  454. leal BUFFER, BB
  455. sall $BASE_SHIFT, %eax
  456. leal (AA, %eax, 8), AA
  457. leal (B, %eax, 2), B
  458. leal (BB, %eax, 8), BB
  459. #endif
  460. #if defined(LN) || defined(LT)
  461. movaps %xmm4, %xmm0
  462. unpcklps %xmm5, %xmm4
  463. unpckhps %xmm5, %xmm0
  464. movaps %xmm6, %xmm1
  465. unpcklps %xmm7, %xmm6
  466. unpckhps %xmm7, %xmm1
  467. movsd 0 * SIZE(B), %xmm2
  468. movhps 2 * SIZE(B), %xmm2
  469. movsd 4 * SIZE(B), %xmm3
  470. movhps 6 * SIZE(B), %xmm3
  471. movsd 8 * SIZE(B), %xmm5
  472. movhps 10 * SIZE(B), %xmm5
  473. movsd 12 * SIZE(B), %xmm7
  474. movhps 14 * SIZE(B), %xmm7
  475. subps %xmm4, %xmm2
  476. subps %xmm0, %xmm3
  477. subps %xmm6, %xmm5
  478. subps %xmm1, %xmm7
  479. #else
  480. movaps 0 * SIZE(AA), %xmm0
  481. movaps 4 * SIZE(AA), %xmm1
  482. movaps 8 * SIZE(AA), %xmm2
  483. movaps 12 * SIZE(AA), %xmm3
  484. subps %xmm4, %xmm0
  485. subps %xmm6, %xmm1
  486. subps %xmm5, %xmm2
  487. subps %xmm7, %xmm3
  488. #endif
  489. #if defined(LN) || defined(LT)
  490. movaps TRMASK, %xmm6
  491. #endif
  492. #ifdef LN
  493. movss 63 * SIZE(AA), %xmm0
  494. movaps %xmm6, %xmm1
  495. shufps $0x00, %xmm0, %xmm1
  496. mulps %xmm1, %xmm7
  497. movaps %xmm7, %xmm1
  498. shufps $0xee, %xmm1, %xmm1
  499. movss 62 * SIZE(AA), %xmm0
  500. shufps $0x50, %xmm0, %xmm0
  501. mulps %xmm1, %xmm0
  502. subps %xmm0, %xmm7
  503. movsd 60 * SIZE(AA), %xmm0
  504. shufps $0x50, %xmm0, %xmm0
  505. mulps %xmm1, %xmm0
  506. subps %xmm0, %xmm5
  507. movsd 58 * SIZE(AA), %xmm0
  508. shufps $0x50, %xmm0, %xmm0
  509. mulps %xmm1, %xmm0
  510. subps %xmm0, %xmm3
  511. movsd 56 * SIZE(AA), %xmm0
  512. shufps $0x50, %xmm0, %xmm0
  513. mulps %xmm1, %xmm0
  514. subps %xmm0, %xmm2
  515. movss 54 * SIZE(AA), %xmm0
  516. shufps $0x00, %xmm6, %xmm0
  517. mulps %xmm0, %xmm7
  518. movaps %xmm7, %xmm1
  519. shufps $0x44, %xmm1, %xmm1
  520. movsd 52 * SIZE(AA), %xmm0
  521. shufps $0x50, %xmm0, %xmm0
  522. mulps %xmm1, %xmm0
  523. subps %xmm0, %xmm5
  524. movsd 50 * SIZE(AA), %xmm0
  525. shufps $0x50, %xmm0, %xmm0
  526. mulps %xmm1, %xmm0
  527. subps %xmm0, %xmm3
  528. movsd 48 * SIZE(AA), %xmm0
  529. shufps $0x50, %xmm0, %xmm0
  530. mulps %xmm1, %xmm0
  531. subps %xmm0, %xmm2
  532. movss 45 * SIZE(AA), %xmm0
  533. movaps %xmm6, %xmm1
  534. shufps $0x00, %xmm0, %xmm1
  535. mulps %xmm1, %xmm5
  536. movaps %xmm5, %xmm1
  537. shufps $0xee, %xmm1, %xmm1
  538. movss 44 * SIZE(AA), %xmm0
  539. shufps $0x50, %xmm0, %xmm0
  540. mulps %xmm1, %xmm0
  541. subps %xmm0, %xmm5
  542. movsd 42 * SIZE(AA), %xmm0
  543. shufps $0x50, %xmm0, %xmm0
  544. mulps %xmm1, %xmm0
  545. subps %xmm0, %xmm3
  546. movsd 40 * SIZE(AA), %xmm0
  547. shufps $0x50, %xmm0, %xmm0
  548. mulps %xmm1, %xmm0
  549. subps %xmm0, %xmm2
  550. movss 36 * SIZE(AA), %xmm0
  551. shufps $0x00, %xmm6, %xmm0
  552. mulps %xmm0, %xmm5
  553. movaps %xmm5, %xmm1
  554. shufps $0x44, %xmm1, %xmm1
  555. movsd 34 * SIZE(AA), %xmm0
  556. shufps $0x50, %xmm0, %xmm0
  557. mulps %xmm1, %xmm0
  558. subps %xmm0, %xmm3
  559. movsd 32 * SIZE(AA), %xmm0
  560. shufps $0x50, %xmm0, %xmm0
  561. mulps %xmm1, %xmm0
  562. subps %xmm0, %xmm2
  563. movss 27 * SIZE(AA), %xmm0
  564. movaps %xmm6, %xmm1
  565. shufps $0x00, %xmm0, %xmm1
  566. mulps %xmm1, %xmm3
  567. movaps %xmm3, %xmm1
  568. shufps $0xee, %xmm1, %xmm1
  569. movss 26 * SIZE(AA), %xmm0
  570. shufps $0x50, %xmm0, %xmm0
  571. mulps %xmm1, %xmm0
  572. subps %xmm0, %xmm3
  573. movsd 24 * SIZE(AA), %xmm0
  574. shufps $0x50, %xmm0, %xmm0
  575. mulps %xmm1, %xmm0
  576. subps %xmm0, %xmm2
  577. movss 18 * SIZE(AA), %xmm0
  578. shufps $0x00, %xmm6, %xmm0
  579. mulps %xmm0, %xmm3
  580. movaps %xmm3, %xmm1
  581. shufps $0x44, %xmm1, %xmm1
  582. movsd 16 * SIZE(AA), %xmm0
  583. shufps $0x50, %xmm0, %xmm0
  584. mulps %xmm1, %xmm0
  585. subps %xmm0, %xmm2
  586. movss 9 * SIZE(AA), %xmm0
  587. movaps %xmm6, %xmm1
  588. shufps $0x00, %xmm0, %xmm1
  589. mulps %xmm1, %xmm2
  590. movaps %xmm2, %xmm1
  591. shufps $0xee, %xmm1, %xmm1
  592. movss 8 * SIZE(AA), %xmm0
  593. shufps $0x50, %xmm0, %xmm0
  594. mulps %xmm1, %xmm0
  595. subps %xmm0, %xmm2
  596. movss 0 * SIZE(AA), %xmm0
  597. shufps $0x00, %xmm6, %xmm0
  598. mulps %xmm0, %xmm2
  599. #endif
  600. #ifdef LT
  601. movss 0 * SIZE(AA), %xmm0
  602. shufps $0x00, %xmm6, %xmm0
  603. mulps %xmm0, %xmm2
  604. movaps %xmm2, %xmm1
  605. shufps $0x44, %xmm1, %xmm1
  606. movss 1 * SIZE(AA), %xmm0
  607. shufps $0x05, %xmm0, %xmm0
  608. mulps %xmm1, %xmm0
  609. subps %xmm0, %xmm2
  610. movsd 2 * SIZE(AA), %xmm0
  611. shufps $0x50, %xmm0, %xmm0
  612. mulps %xmm1, %xmm0
  613. subps %xmm0, %xmm3
  614. movsd 4 * SIZE(AA), %xmm0
  615. shufps $0x50, %xmm0, %xmm0
  616. mulps %xmm1, %xmm0
  617. subps %xmm0, %xmm5
  618. movsd 6 * SIZE(AA), %xmm0
  619. shufps $0x50, %xmm0, %xmm0
  620. mulps %xmm1, %xmm0
  621. subps %xmm0, %xmm7
  622. movss 9 * SIZE(AA), %xmm0
  623. movaps %xmm6, %xmm1
  624. shufps $0x00, %xmm0, %xmm1
  625. mulps %xmm1, %xmm2
  626. movaps %xmm2, %xmm1
  627. shufps $0xee, %xmm1, %xmm1
  628. movsd 10 * SIZE(AA), %xmm0
  629. shufps $0x50, %xmm0, %xmm0
  630. mulps %xmm1, %xmm0
  631. subps %xmm0, %xmm3
  632. movsd 12 * SIZE(AA), %xmm0
  633. shufps $0x50, %xmm0, %xmm0
  634. mulps %xmm1, %xmm0
  635. subps %xmm0, %xmm5
  636. movsd 14 * SIZE(AA), %xmm0
  637. shufps $0x50, %xmm0, %xmm0
  638. mulps %xmm1, %xmm0
  639. subps %xmm0, %xmm7
  640. movss 18 * SIZE(AA), %xmm0
  641. shufps $0x00, %xmm6, %xmm0
  642. mulps %xmm0, %xmm3
  643. movaps %xmm3, %xmm1
  644. shufps $0x44, %xmm1, %xmm1
  645. movss 19 * SIZE(AA), %xmm0
  646. shufps $0x05, %xmm0, %xmm0
  647. mulps %xmm1, %xmm0
  648. subps %xmm0, %xmm3
  649. movsd 20 * SIZE(AA), %xmm0
  650. shufps $0x50, %xmm0, %xmm0
  651. mulps %xmm1, %xmm0
  652. subps %xmm0, %xmm5
  653. movsd 22 * SIZE(AA), %xmm0
  654. shufps $0x50, %xmm0, %xmm0
  655. mulps %xmm1, %xmm0
  656. subps %xmm0, %xmm7
  657. movss 27 * SIZE(AA), %xmm0
  658. movaps %xmm6, %xmm1
  659. shufps $0x00, %xmm0, %xmm1
  660. mulps %xmm1, %xmm3
  661. movaps %xmm3, %xmm1
  662. shufps $0xee, %xmm1, %xmm1
  663. movsd 28 * SIZE(AA), %xmm0
  664. shufps $0x50, %xmm0, %xmm0
  665. mulps %xmm1, %xmm0
  666. subps %xmm0, %xmm5
  667. movsd 30 * SIZE(AA), %xmm0
  668. shufps $0x50, %xmm0, %xmm0
  669. mulps %xmm1, %xmm0
  670. subps %xmm0, %xmm7
  671. movss 36 * SIZE(AA), %xmm0
  672. shufps $0x00, %xmm6, %xmm0
  673. mulps %xmm0, %xmm5
  674. movaps %xmm5, %xmm1
  675. shufps $0x44, %xmm1, %xmm1
  676. movss 37 * SIZE(AA), %xmm0
  677. shufps $0x05, %xmm0, %xmm0
  678. mulps %xmm1, %xmm0
  679. subps %xmm0, %xmm5
  680. movsd 38 * SIZE(AA), %xmm0
  681. shufps $0x50, %xmm0, %xmm0
  682. mulps %xmm1, %xmm0
  683. subps %xmm0, %xmm7
  684. movss 45 * SIZE(AA), %xmm0
  685. movaps %xmm6, %xmm1
  686. shufps $0x00, %xmm0, %xmm1
  687. mulps %xmm1, %xmm5
  688. movaps %xmm5, %xmm1
  689. shufps $0xee, %xmm1, %xmm1
  690. movsd 46 * SIZE(AA), %xmm0
  691. shufps $0x50, %xmm0, %xmm0
  692. mulps %xmm1, %xmm0
  693. subps %xmm0, %xmm7
  694. movss 54 * SIZE(AA), %xmm0
  695. shufps $0x00, %xmm6, %xmm0
  696. mulps %xmm0, %xmm7
  697. movaps %xmm7, %xmm1
  698. shufps $0x44, %xmm1, %xmm1
  699. movss 55 * SIZE(AA), %xmm0
  700. shufps $0x05, %xmm0, %xmm0
  701. mulps %xmm1, %xmm0
  702. subps %xmm0, %xmm7
  703. movss 63 * SIZE(AA), %xmm0
  704. movaps %xmm6, %xmm1
  705. shufps $0x00, %xmm0, %xmm1
  706. mulps %xmm1, %xmm7
  707. #endif
  708. #ifdef RN
  709. movss 0 * SIZE(B), %xmm6
  710. shufps $0x00, %xmm6, %xmm6
  711. mulps %xmm6, %xmm0
  712. mulps %xmm6, %xmm1
  713. movss 1 * SIZE(B), %xmm6
  714. shufps $0x00, %xmm6, %xmm6
  715. movaps %xmm6, %xmm5
  716. mulps %xmm0, %xmm5
  717. mulps %xmm1, %xmm6
  718. subps %xmm5, %xmm2
  719. subps %xmm6, %xmm3
  720. movss 3 * SIZE(B), %xmm6
  721. shufps $0x00, %xmm6, %xmm6
  722. mulps %xmm6, %xmm2
  723. mulps %xmm6, %xmm3
  724. #endif
  725. #ifdef RT
  726. movss 3 * SIZE(B), %xmm6
  727. shufps $0x00, %xmm6, %xmm6
  728. mulps %xmm6, %xmm2
  729. mulps %xmm6, %xmm3
  730. movss 2 * SIZE(B), %xmm6
  731. shufps $0x00, %xmm6, %xmm6
  732. movaps %xmm6, %xmm5
  733. mulps %xmm2, %xmm5
  734. mulps %xmm3, %xmm6
  735. subps %xmm5, %xmm0
  736. subps %xmm6, %xmm1
  737. movss 0 * SIZE(B), %xmm6
  738. shufps $0x00, %xmm6, %xmm6
  739. mulps %xmm6, %xmm0
  740. mulps %xmm6, %xmm1
  741. #endif
  742. #if defined(LN) || defined(LT)
  743. movlps %xmm2, 0 * SIZE(B)
  744. movhps %xmm2, 2 * SIZE(B)
  745. movlps %xmm3, 4 * SIZE(B)
  746. movhps %xmm3, 6 * SIZE(B)
  747. movlps %xmm5, 8 * SIZE(B)
  748. movhps %xmm5, 10 * SIZE(B)
  749. movlps %xmm7, 12 * SIZE(B)
  750. movhps %xmm7, 14 * SIZE(B)
  751. #ifdef HAVE_SSE2
  752. pshufd $0x00, %xmm2, %xmm0
  753. pshufd $0x55, %xmm2, %xmm1
  754. pshufd $0xaa, %xmm2, %xmm4
  755. pshufd $0xff, %xmm2, %xmm6
  756. #else
  757. movaps %xmm2, %xmm0
  758. shufps $0x00, %xmm0, %xmm0
  759. movaps %xmm2, %xmm1
  760. shufps $0x55, %xmm1, %xmm1
  761. movaps %xmm2, %xmm4
  762. shufps $0xaa, %xmm4, %xmm4
  763. movaps %xmm2, %xmm6
  764. shufps $0xff, %xmm6, %xmm6
  765. #endif
  766. movaps %xmm0, 0 * SIZE(BB)
  767. movaps %xmm1, 4 * SIZE(BB)
  768. movaps %xmm4, 8 * SIZE(BB)
  769. movaps %xmm6, 12 * SIZE(BB)
  770. #ifdef HAVE_SSE2
  771. pshufd $0x00, %xmm3, %xmm0
  772. pshufd $0x55, %xmm3, %xmm1
  773. pshufd $0xaa, %xmm3, %xmm4
  774. pshufd $0xff, %xmm3, %xmm6
  775. #else
  776. movaps %xmm3, %xmm0
  777. shufps $0x00, %xmm0, %xmm0
  778. movaps %xmm3, %xmm1
  779. shufps $0x55, %xmm1, %xmm1
  780. movaps %xmm3, %xmm4
  781. shufps $0xaa, %xmm4, %xmm4
  782. movaps %xmm3, %xmm6
  783. shufps $0xff, %xmm6, %xmm6
  784. #endif
  785. movaps %xmm0, 16 * SIZE(BB)
  786. movaps %xmm1, 20 * SIZE(BB)
  787. movaps %xmm4, 24 * SIZE(BB)
  788. movaps %xmm6, 28 * SIZE(BB)
  789. #ifdef HAVE_SSE2
  790. pshufd $0x00, %xmm5, %xmm0
  791. pshufd $0x55, %xmm5, %xmm1
  792. pshufd $0xaa, %xmm5, %xmm4
  793. pshufd $0xff, %xmm5, %xmm6
  794. #else
  795. movaps %xmm5, %xmm0
  796. shufps $0x00, %xmm0, %xmm0
  797. movaps %xmm5, %xmm1
  798. shufps $0x55, %xmm1, %xmm1
  799. movaps %xmm5, %xmm4
  800. shufps $0xaa, %xmm4, %xmm4
  801. movaps %xmm5, %xmm6
  802. shufps $0xff, %xmm6, %xmm6
  803. #endif
  804. movaps %xmm0, 32 * SIZE(BB)
  805. movaps %xmm1, 36 * SIZE(BB)
  806. movaps %xmm4, 40 * SIZE(BB)
  807. movaps %xmm6, 44 * SIZE(BB)
  808. #ifdef HAVE_SSE2
  809. pshufd $0x00, %xmm7, %xmm0
  810. pshufd $0x55, %xmm7, %xmm1
  811. pshufd $0xaa, %xmm7, %xmm4
  812. pshufd $0xff, %xmm7, %xmm6
  813. #else
  814. movaps %xmm7, %xmm0
  815. shufps $0x00, %xmm0, %xmm0
  816. movaps %xmm7, %xmm1
  817. shufps $0x55, %xmm1, %xmm1
  818. movaps %xmm7, %xmm4
  819. shufps $0xaa, %xmm4, %xmm4
  820. movaps %xmm7, %xmm6
  821. shufps $0xff, %xmm6, %xmm6
  822. #endif
  823. movaps %xmm0, 48 * SIZE(BB)
  824. movaps %xmm1, 52 * SIZE(BB)
  825. movaps %xmm4, 56 * SIZE(BB)
  826. movaps %xmm6, 60 * SIZE(BB)
  827. #else
  828. movaps %xmm0, 0 * SIZE(AA)
  829. movaps %xmm1, 4 * SIZE(AA)
  830. movaps %xmm2, 8 * SIZE(AA)
  831. movaps %xmm3, 12 * SIZE(AA)
  832. #endif
  833. #ifdef LN
  834. subl $8 * SIZE, CO1
  835. #endif
  836. #if defined(LN) || defined(LT)
  837. movaps %xmm2, %xmm0
  838. shufps $0x88, %xmm3, %xmm2
  839. shufps $0xdd, %xmm3, %xmm0
  840. movaps %xmm5, %xmm4
  841. shufps $0x88, %xmm7, %xmm5
  842. shufps $0xdd, %xmm7, %xmm4
  843. movlps %xmm2, 0 * SIZE(CO1)
  844. movhps %xmm2, 2 * SIZE(CO1)
  845. movlps %xmm5, 4 * SIZE(CO1)
  846. movhps %xmm5, 6 * SIZE(CO1)
  847. movlps %xmm0, 0 * SIZE(CO1, LDC)
  848. movhps %xmm0, 2 * SIZE(CO1, LDC)
  849. movlps %xmm4, 4 * SIZE(CO1, LDC)
  850. movhps %xmm4, 6 * SIZE(CO1, LDC)
  851. #else
  852. movlps %xmm0, 0 * SIZE(CO1)
  853. movhps %xmm0, 2 * SIZE(CO1)
  854. movlps %xmm1, 4 * SIZE(CO1)
  855. movhps %xmm1, 6 * SIZE(CO1)
  856. movlps %xmm2, 0 * SIZE(CO1, LDC)
  857. movhps %xmm2, 2 * SIZE(CO1, LDC)
  858. movlps %xmm3, 4 * SIZE(CO1, LDC)
  859. movhps %xmm3, 6 * SIZE(CO1, LDC)
  860. #endif
  861. #ifndef LN
  862. addl $8 * SIZE, CO1
  863. #endif
  864. #if defined(LT) || defined(RN)
  865. movl K, %eax
  866. subl KK, %eax
  867. leal (,%eax, SIZE), %eax
  868. leal (AA, %eax, 8), AA
  869. #ifdef LT
  870. addl $16 * SIZE, B
  871. #endif
  872. #endif
  873. #ifdef LN
  874. subl $8, KK
  875. movl BORIG, B
  876. #endif
  877. #ifdef LT
  878. addl $8, KK
  879. #endif
  880. #ifdef RT
  881. movl K, %eax
  882. movl BORIG, B
  883. sall $3 + BASE_SHIFT, %eax
  884. addl %eax, AORIG
  885. #endif
  886. decl %ebx # i --
  887. jg .L10
  888. ALIGN_2
  889. .L30:
  890. testl $4, M
  891. jle .L50
  892. #ifdef LN
  893. movl K, %eax
  894. sall $2 + BASE_SHIFT, %eax
  895. subl %eax, AORIG
  896. #endif
  897. #if defined(LN) || defined(RT)
  898. movl KK, %eax
  899. movl AORIG, AA
  900. sall $2 + BASE_SHIFT, %eax
  901. addl %eax, AA
  902. #endif
  903. leal BUFFER, BB
  904. #if defined(LN) || defined(RT)
  905. movl KK, %eax
  906. sall $1 + BASE_SHIFT, %eax
  907. leal (BB, %eax, 4), BB
  908. #endif
  909. movaps 0 * SIZE(BB), %xmm2
  910. xorps %xmm4, %xmm4
  911. movaps 0 * SIZE(AA), %xmm0
  912. xorps %xmm5, %xmm5
  913. movaps 16 * SIZE(BB), %xmm3
  914. xorps %xmm6, %xmm6
  915. movaps 16 * SIZE(AA), %xmm1
  916. xorps %xmm7, %xmm7
  917. #if defined(LT) || defined(RN)
  918. movl KK, %eax
  919. #else
  920. movl K, %eax
  921. subl KK, %eax
  922. #endif
  923. sarl $3, %eax
  924. je .L32
  925. ALIGN_2
  926. .L31:
  927. mulps %xmm0, %xmm2
  928. mulps 4 * SIZE(BB), %xmm0
  929. addps %xmm2, %xmm4
  930. movaps 8 * SIZE(BB), %xmm2
  931. addps %xmm0, %xmm5
  932. movaps 4 * SIZE(AA), %xmm0
  933. mulps %xmm0, %xmm2
  934. mulps 12 * SIZE(BB), %xmm0
  935. addps %xmm2, %xmm6
  936. movaps 32 * SIZE(BB), %xmm2
  937. addps %xmm0, %xmm7
  938. movaps 8 * SIZE(AA), %xmm0
  939. mulps %xmm0, %xmm3
  940. mulps 20 * SIZE(BB), %xmm0
  941. addps %xmm3, %xmm4
  942. movaps 24 * SIZE(BB), %xmm3
  943. addps %xmm0, %xmm5
  944. movaps 12 * SIZE(AA), %xmm0
  945. mulps %xmm0, %xmm3
  946. mulps 28 * SIZE(BB), %xmm0
  947. addps %xmm3, %xmm6
  948. movaps 48 * SIZE(BB), %xmm3
  949. addps %xmm0, %xmm7
  950. movaps 32 * SIZE(AA), %xmm0
  951. mulps %xmm1, %xmm2
  952. mulps 36 * SIZE(BB), %xmm1
  953. addps %xmm2, %xmm4
  954. movaps 40 * SIZE(BB), %xmm2
  955. addps %xmm1, %xmm5
  956. movaps 20 * SIZE(AA), %xmm1
  957. mulps %xmm1, %xmm2
  958. mulps 44 * SIZE(BB), %xmm1
  959. addps %xmm2, %xmm6
  960. movaps 64 * SIZE(BB), %xmm2
  961. addps %xmm1, %xmm7
  962. movaps 24 * SIZE(AA), %xmm1
  963. mulps %xmm1, %xmm3
  964. mulps 52 * SIZE(BB), %xmm1
  965. addps %xmm3, %xmm4
  966. movaps 56 * SIZE(BB), %xmm3
  967. addps %xmm1, %xmm5
  968. movaps 28 * SIZE(AA), %xmm1
  969. mulps %xmm1, %xmm3
  970. mulps 60 * SIZE(BB), %xmm1
  971. addps %xmm3, %xmm6
  972. movaps 80 * SIZE(BB), %xmm3
  973. addps %xmm1, %xmm7
  974. movaps 48 * SIZE(AA), %xmm1
  975. addl $32 * SIZE, AA
  976. addl $64 * SIZE, BB
  977. decl %eax
  978. jne .L31
  979. ALIGN_2
  980. .L32:
  981. #if defined(LT) || defined(RN)
  982. movl KK, %eax
  983. #else
  984. movl K, %eax
  985. subl KK, %eax
  986. #endif
  987. andl $7, %eax # if (k & 1)
  988. BRANCH
  989. je .L34
  990. .L33:
  991. mulps %xmm0, %xmm2
  992. mulps 4 * SIZE(BB), %xmm0
  993. addps %xmm2, %xmm4
  994. movaps 8 * SIZE(BB), %xmm2
  995. addps %xmm0, %xmm5
  996. movaps 4 * SIZE(AA), %xmm0
  997. addl $4 * SIZE, AA
  998. addl $8 * SIZE, BB
  999. decl %eax
  1000. jg .L33
  1001. ALIGN_4
  1002. .L34:
  1003. addps %xmm6, %xmm4
  1004. addps %xmm7, %xmm5
  1005. #if defined(LN) || defined(RT)
  1006. movl KK, %eax
  1007. #ifdef LN
  1008. subl $4, %eax
  1009. #else
  1010. subl $2, %eax
  1011. #endif
  1012. movl AORIG, AA
  1013. movl BORIG, B
  1014. leal BUFFER, BB
  1015. sall $BASE_SHIFT, %eax
  1016. leal (AA, %eax, 4), AA
  1017. leal (B, %eax, 2), B
  1018. leal (BB, %eax, 8), BB
  1019. #endif
  1020. #if defined(LN) || defined(LT)
  1021. movaps %xmm4, %xmm0
  1022. unpcklps %xmm5, %xmm4
  1023. unpckhps %xmm5, %xmm0
  1024. movsd 0 * SIZE(B), %xmm2
  1025. movhps 2 * SIZE(B), %xmm2
  1026. movsd 4 * SIZE(B), %xmm3
  1027. movhps 6 * SIZE(B), %xmm3
  1028. subps %xmm4, %xmm2
  1029. subps %xmm0, %xmm3
  1030. #else
  1031. movaps 0 * SIZE(AA), %xmm0
  1032. movaps 4 * SIZE(AA), %xmm2
  1033. subps %xmm4, %xmm0
  1034. subps %xmm5, %xmm2
  1035. #endif
  1036. #if defined(LN) || defined(LT)
  1037. movaps TRMASK, %xmm6
  1038. #endif
  1039. #ifdef LN
  1040. movss 15 * SIZE(AA), %xmm0
  1041. movaps %xmm6, %xmm1
  1042. shufps $0x00, %xmm0, %xmm1
  1043. mulps %xmm1, %xmm3
  1044. movaps %xmm3, %xmm1
  1045. shufps $0xee, %xmm1, %xmm1
  1046. movss 14 * SIZE(AA), %xmm0
  1047. shufps $0x50, %xmm0, %xmm0
  1048. mulps %xmm1, %xmm0
  1049. subps %xmm0, %xmm3
  1050. movsd 12 * SIZE(AA), %xmm0
  1051. shufps $0x50, %xmm0, %xmm0
  1052. mulps %xmm1, %xmm0
  1053. subps %xmm0, %xmm2
  1054. movss 10 * SIZE(AA), %xmm0
  1055. shufps $0x00, %xmm6, %xmm0
  1056. mulps %xmm0, %xmm3
  1057. movaps %xmm3, %xmm1
  1058. shufps $0x44, %xmm1, %xmm1
  1059. movsd 8 * SIZE(AA), %xmm0
  1060. shufps $0x50, %xmm0, %xmm0
  1061. mulps %xmm1, %xmm0
  1062. subps %xmm0, %xmm2
  1063. movss 5 * SIZE(AA), %xmm0
  1064. movaps %xmm6, %xmm1
  1065. shufps $0x00, %xmm0, %xmm1
  1066. mulps %xmm1, %xmm2
  1067. movaps %xmm2, %xmm1
  1068. shufps $0xee, %xmm1, %xmm1
  1069. movss 4 * SIZE(AA), %xmm0
  1070. shufps $0x50, %xmm0, %xmm0
  1071. mulps %xmm1, %xmm0
  1072. subps %xmm0, %xmm2
  1073. movss 0 * SIZE(AA), %xmm0
  1074. shufps $0x00, %xmm6, %xmm0
  1075. mulps %xmm0, %xmm2
  1076. #endif
  1077. #ifdef LT
  1078. movss 0 * SIZE(AA), %xmm0
  1079. shufps $0x00, %xmm6, %xmm0
  1080. mulps %xmm0, %xmm2
  1081. movaps %xmm2, %xmm1
  1082. shufps $0x44, %xmm1, %xmm1
  1083. movss 1 * SIZE(AA), %xmm0
  1084. shufps $0x05, %xmm0, %xmm0
  1085. mulps %xmm1, %xmm0
  1086. subps %xmm0, %xmm2
  1087. movsd 2 * SIZE(AA), %xmm0
  1088. shufps $0x50, %xmm0, %xmm0
  1089. mulps %xmm1, %xmm0
  1090. subps %xmm0, %xmm3
  1091. movss 5 * SIZE(AA), %xmm0
  1092. movaps %xmm6, %xmm1
  1093. shufps $0x00, %xmm0, %xmm1
  1094. mulps %xmm1, %xmm2
  1095. movaps %xmm2, %xmm1
  1096. shufps $0xee, %xmm1, %xmm1
  1097. movsd 6 * SIZE(AA), %xmm0
  1098. shufps $0x50, %xmm0, %xmm0
  1099. mulps %xmm1, %xmm0
  1100. subps %xmm0, %xmm3
  1101. movss 10 * SIZE(AA), %xmm0
  1102. shufps $0x00, %xmm6, %xmm0
  1103. mulps %xmm0, %xmm3
  1104. movaps %xmm3, %xmm1
  1105. shufps $0x44, %xmm1, %xmm1
  1106. movss 11 * SIZE(AA), %xmm0
  1107. shufps $0x05, %xmm0, %xmm0
  1108. mulps %xmm1, %xmm0
  1109. subps %xmm0, %xmm3
  1110. movss 15 * SIZE(AA), %xmm0
  1111. movaps %xmm6, %xmm1
  1112. shufps $0x00, %xmm0, %xmm1
  1113. mulps %xmm1, %xmm3
  1114. #endif
  1115. #ifdef RN
  1116. movss 0 * SIZE(B), %xmm6
  1117. shufps $0x00, %xmm6, %xmm6
  1118. mulps %xmm6, %xmm0
  1119. movss 1 * SIZE(B), %xmm6
  1120. shufps $0x00, %xmm6, %xmm6
  1121. movaps %xmm6, %xmm5
  1122. mulps %xmm0, %xmm5
  1123. subps %xmm5, %xmm2
  1124. movss 3 * SIZE(B), %xmm6
  1125. shufps $0x00, %xmm6, %xmm6
  1126. mulps %xmm6, %xmm2
  1127. #endif
  1128. #ifdef RT
  1129. movss 3 * SIZE(B), %xmm6
  1130. shufps $0x00, %xmm6, %xmm6
  1131. mulps %xmm6, %xmm2
  1132. movss 2 * SIZE(B), %xmm6
  1133. shufps $0x00, %xmm6, %xmm6
  1134. movaps %xmm6, %xmm5
  1135. mulps %xmm2, %xmm5
  1136. subps %xmm5, %xmm0
  1137. movss 0 * SIZE(B), %xmm6
  1138. shufps $0x00, %xmm6, %xmm6
  1139. mulps %xmm6, %xmm0
  1140. #endif
  1141. #if defined(LN) || defined(LT)
  1142. movlps %xmm2, 0 * SIZE(B)
  1143. movhps %xmm2, 2 * SIZE(B)
  1144. movlps %xmm3, 4 * SIZE(B)
  1145. movhps %xmm3, 6 * SIZE(B)
  1146. #ifdef HAVE_SSE2
  1147. pshufd $0x00, %xmm2, %xmm0
  1148. pshufd $0x55, %xmm2, %xmm1
  1149. pshufd $0xaa, %xmm2, %xmm4
  1150. pshufd $0xff, %xmm2, %xmm6
  1151. #else
  1152. movaps %xmm2, %xmm0
  1153. shufps $0x00, %xmm0, %xmm0
  1154. movaps %xmm2, %xmm1
  1155. shufps $0x55, %xmm1, %xmm1
  1156. movaps %xmm2, %xmm4
  1157. shufps $0xaa, %xmm4, %xmm4
  1158. movaps %xmm2, %xmm6
  1159. shufps $0xff, %xmm6, %xmm6
  1160. #endif
  1161. movaps %xmm0, 0 * SIZE(BB)
  1162. movaps %xmm1, 4 * SIZE(BB)
  1163. movaps %xmm4, 8 * SIZE(BB)
  1164. movaps %xmm6, 12 * SIZE(BB)
  1165. #ifdef HAVE_SSE2
  1166. pshufd $0x00, %xmm3, %xmm0
  1167. pshufd $0x55, %xmm3, %xmm1
  1168. pshufd $0xaa, %xmm3, %xmm4
  1169. pshufd $0xff, %xmm3, %xmm6
  1170. #else
  1171. movaps %xmm3, %xmm0
  1172. shufps $0x00, %xmm0, %xmm0
  1173. movaps %xmm3, %xmm1
  1174. shufps $0x55, %xmm1, %xmm1
  1175. movaps %xmm3, %xmm4
  1176. shufps $0xaa, %xmm4, %xmm4
  1177. movaps %xmm3, %xmm6
  1178. shufps $0xff, %xmm6, %xmm6
  1179. #endif
  1180. movaps %xmm0, 16 * SIZE(BB)
  1181. movaps %xmm1, 20 * SIZE(BB)
  1182. movaps %xmm4, 24 * SIZE(BB)
  1183. movaps %xmm6, 28 * SIZE(BB)
  1184. #else
  1185. movaps %xmm0, 0 * SIZE(AA)
  1186. movaps %xmm2, 4 * SIZE(AA)
  1187. #endif
  1188. #ifdef LN
  1189. subl $4 * SIZE, CO1
  1190. #endif
  1191. #if defined(LN) || defined(LT)
  1192. movaps %xmm2, %xmm0
  1193. shufps $0x88, %xmm3, %xmm2
  1194. shufps $0xdd, %xmm3, %xmm0
  1195. movlps %xmm2, 0 * SIZE(CO1)
  1196. movhps %xmm2, 2 * SIZE(CO1)
  1197. movlps %xmm0, 0 * SIZE(CO1, LDC)
  1198. movhps %xmm0, 2 * SIZE(CO1, LDC)
  1199. #else
  1200. movlps %xmm0, 0 * SIZE(CO1)
  1201. movhps %xmm0, 2 * SIZE(CO1)
  1202. movlps %xmm2, 0 * SIZE(CO1, LDC)
  1203. movhps %xmm2, 2 * SIZE(CO1, LDC)
  1204. #endif
  1205. #ifndef LN
  1206. addl $4 * SIZE, CO1
  1207. #endif
  1208. #if defined(LT) || defined(RN)
  1209. movl K, %eax
  1210. subl KK, %eax
  1211. leal (,%eax, SIZE), %eax
  1212. leal (AA, %eax, 4), AA
  1213. #ifdef LT
  1214. addl $8 * SIZE, B
  1215. #endif
  1216. #endif
  1217. #ifdef LN
  1218. subl $4, KK
  1219. movl BORIG, B
  1220. #endif
  1221. #ifdef LT
  1222. addl $4, KK
  1223. #endif
  1224. #ifdef RT
  1225. movl K, %eax
  1226. movl BORIG, B
  1227. sall $2 + BASE_SHIFT, %eax
  1228. addl %eax, AORIG
  1229. #endif
  1230. ALIGN_2
  1231. .L50:
  1232. testl $2, M
  1233. jle .L70
  1234. #ifdef LN
  1235. movl K, %eax
  1236. sall $1 + BASE_SHIFT, %eax
  1237. subl %eax, AORIG
  1238. #endif
  1239. #if defined(LN) || defined(RT)
  1240. movl KK, %eax
  1241. movl AORIG, AA
  1242. sall $1 + BASE_SHIFT, %eax
  1243. addl %eax, AA
  1244. #endif
  1245. leal BUFFER, BB
  1246. #if defined(LN) || defined(RT)
  1247. movl KK, %eax
  1248. sall $1 + BASE_SHIFT, %eax
  1249. leal (BB, %eax, 4), BB
  1250. #endif
  1251. movaps 0 * SIZE(BB), %xmm2
  1252. xorps %xmm4, %xmm4
  1253. movaps 0 * SIZE(AA), %xmm0
  1254. xorps %xmm5, %xmm5
  1255. movaps 16 * SIZE(BB), %xmm3
  1256. xorps %xmm6, %xmm6
  1257. movaps 8 * SIZE(AA), %xmm1
  1258. xorps %xmm7, %xmm7
  1259. #if defined(LT) || defined(RN)
  1260. movl KK, %eax
  1261. #else
  1262. movl K, %eax
  1263. subl KK, %eax
  1264. #endif
  1265. sarl $3, %eax
  1266. je .L52
  1267. ALIGN_2
  1268. .L51:
  1269. mulps %xmm0, %xmm2
  1270. addps %xmm2, %xmm4
  1271. movaps 4 * SIZE(BB), %xmm2
  1272. mulps %xmm0, %xmm2
  1273. movsd 2 * SIZE(AA), %xmm0
  1274. addps %xmm2, %xmm5
  1275. movaps 8 * SIZE(BB), %xmm2
  1276. mulps %xmm0, %xmm2
  1277. addps %xmm2, %xmm6
  1278. movaps 12 * SIZE(BB), %xmm2
  1279. mulps %xmm0, %xmm2
  1280. movsd 4 * SIZE(AA), %xmm0
  1281. addps %xmm2, %xmm7
  1282. movaps 32 * SIZE(BB), %xmm2
  1283. mulps %xmm0, %xmm3
  1284. addps %xmm3, %xmm4
  1285. movaps 20 * SIZE(BB), %xmm3
  1286. mulps %xmm0, %xmm3
  1287. movsd 6 * SIZE(AA), %xmm0
  1288. addps %xmm3, %xmm5
  1289. movaps 24 * SIZE(BB), %xmm3
  1290. mulps %xmm0, %xmm3
  1291. addps %xmm3, %xmm6
  1292. movaps 28 * SIZE(BB), %xmm3
  1293. mulps %xmm0, %xmm3
  1294. movsd 16 * SIZE(AA), %xmm0
  1295. addps %xmm3, %xmm7
  1296. movaps 48 * SIZE(BB), %xmm3
  1297. mulps %xmm1, %xmm2
  1298. addps %xmm2, %xmm4
  1299. movaps 36 * SIZE(BB), %xmm2
  1300. mulps %xmm1, %xmm2
  1301. movsd 10 * SIZE(AA), %xmm1
  1302. addps %xmm2, %xmm5
  1303. movaps 40 * SIZE(BB), %xmm2
  1304. mulps %xmm1, %xmm2
  1305. addps %xmm2, %xmm6
  1306. movaps 44 * SIZE(BB), %xmm2
  1307. mulps %xmm1, %xmm2
  1308. movsd 12 * SIZE(AA), %xmm1
  1309. addps %xmm2, %xmm7
  1310. movaps 64 * SIZE(BB), %xmm2
  1311. mulps %xmm1, %xmm3
  1312. addps %xmm3, %xmm4
  1313. movaps 52 * SIZE(BB), %xmm3
  1314. mulps %xmm1, %xmm3
  1315. movsd 14 * SIZE(AA), %xmm1
  1316. addps %xmm3, %xmm5
  1317. movaps 56 * SIZE(BB), %xmm3
  1318. mulps %xmm1, %xmm3
  1319. addps %xmm3, %xmm6
  1320. movaps 60 * SIZE(BB), %xmm3
  1321. mulps %xmm1, %xmm3
  1322. movsd 24 * SIZE(AA), %xmm1
  1323. addps %xmm3, %xmm7
  1324. movaps 80 * SIZE(BB), %xmm3
  1325. addl $16 * SIZE, AA
  1326. addl $64 * SIZE, BB
  1327. decl %eax
  1328. jne .L51
  1329. ALIGN_2
  1330. .L52:
  1331. #if defined(LT) || defined(RN)
  1332. movl KK, %eax
  1333. #else
  1334. movl K, %eax
  1335. subl KK, %eax
  1336. #endif
  1337. andl $7, %eax # if (k & 1)
  1338. BRANCH
  1339. je .L54
  1340. .L53:
  1341. mulps %xmm0, %xmm2
  1342. addps %xmm2, %xmm4
  1343. movaps 4 * SIZE(BB), %xmm2
  1344. mulps %xmm0, %xmm2
  1345. movsd 2 * SIZE(AA), %xmm0
  1346. addps %xmm2, %xmm5
  1347. movaps 8 * SIZE(BB), %xmm2
  1348. addl $2 * SIZE, AA
  1349. addl $8 * SIZE, BB
  1350. decl %eax
  1351. jg .L53
  1352. ALIGN_4
  1353. .L54:
  1354. addps %xmm6, %xmm4
  1355. addps %xmm7, %xmm5
  1356. #if defined(LN) || defined(RT)
  1357. movl KK, %eax
  1358. #ifdef LN
  1359. subl $2, %eax
  1360. #else
  1361. subl $2, %eax
  1362. #endif
  1363. movl AORIG, AA
  1364. movl BORIG, B
  1365. leal BUFFER, BB
  1366. sall $BASE_SHIFT, %eax
  1367. leal (AA, %eax, 2), AA
  1368. leal (B, %eax, 2), B
  1369. leal (BB, %eax, 8), BB
  1370. #endif
  1371. #if defined(LN) || defined(LT)
  1372. unpcklps %xmm5, %xmm4
  1373. movsd 0 * SIZE(B), %xmm2
  1374. movhps 2 * SIZE(B), %xmm2
  1375. subps %xmm4, %xmm2
  1376. #else
  1377. #ifdef movsd
  1378. xorps %xmm0, %xmm0
  1379. #endif
  1380. movsd 0 * SIZE(AA), %xmm0
  1381. #ifdef movsd
  1382. xorps %xmm2, %xmm2
  1383. #endif
  1384. movsd 2 * SIZE(AA), %xmm2
  1385. subps %xmm4, %xmm0
  1386. subps %xmm5, %xmm2
  1387. #endif
  1388. #if defined(LN) || defined(LT)
  1389. movaps TRMASK, %xmm6
  1390. #endif
  1391. #ifdef LN
  1392. movss 3 * SIZE(AA), %xmm0
  1393. movaps %xmm6, %xmm1
  1394. shufps $0x00, %xmm0, %xmm1
  1395. mulps %xmm1, %xmm2
  1396. movaps %xmm2, %xmm1
  1397. shufps $0xee, %xmm1, %xmm1
  1398. movss 2 * SIZE(AA), %xmm0
  1399. shufps $0x50, %xmm0, %xmm0
  1400. mulps %xmm1, %xmm0
  1401. subps %xmm0, %xmm2
  1402. movss 0 * SIZE(AA), %xmm0
  1403. shufps $0x00, %xmm6, %xmm0
  1404. mulps %xmm0, %xmm2
  1405. #endif
  1406. #ifdef LT
  1407. movss 0 * SIZE(AA), %xmm0
  1408. shufps $0x00, %xmm6, %xmm0
  1409. mulps %xmm0, %xmm2
  1410. movaps %xmm2, %xmm1
  1411. shufps $0x44, %xmm1, %xmm1
  1412. movss 1 * SIZE(AA), %xmm0
  1413. shufps $0x05, %xmm0, %xmm0
  1414. mulps %xmm1, %xmm0
  1415. subps %xmm0, %xmm2
  1416. movss 3 * SIZE(AA), %xmm0
  1417. movaps %xmm6, %xmm1
  1418. shufps $0x00, %xmm0, %xmm1
  1419. mulps %xmm1, %xmm2
  1420. #endif
  1421. #ifdef RN
  1422. movss 0 * SIZE(B), %xmm6
  1423. shufps $0x00, %xmm6, %xmm6
  1424. mulps %xmm6, %xmm0
  1425. movss 1 * SIZE(B), %xmm6
  1426. shufps $0x00, %xmm6, %xmm6
  1427. movaps %xmm6, %xmm5
  1428. mulps %xmm0, %xmm5
  1429. subps %xmm5, %xmm2
  1430. movss 3 * SIZE(B), %xmm6
  1431. shufps $0x00, %xmm6, %xmm6
  1432. mulps %xmm6, %xmm2
  1433. #endif
  1434. #ifdef RT
  1435. movss 3 * SIZE(B), %xmm6
  1436. shufps $0x00, %xmm6, %xmm6
  1437. mulps %xmm6, %xmm2
  1438. movss 2 * SIZE(B), %xmm6
  1439. shufps $0x00, %xmm6, %xmm6
  1440. movaps %xmm6, %xmm5
  1441. mulps %xmm2, %xmm5
  1442. subps %xmm5, %xmm0
  1443. movss 0 * SIZE(B), %xmm6
  1444. shufps $0x00, %xmm6, %xmm6
  1445. mulps %xmm6, %xmm0
  1446. #endif
  1447. #if defined(LN) || defined(LT)
  1448. movlps %xmm2, 0 * SIZE(B)
  1449. movhps %xmm2, 2 * SIZE(B)
  1450. #ifdef HAVE_SSE2
  1451. pshufd $0x00, %xmm2, %xmm0
  1452. pshufd $0x55, %xmm2, %xmm1
  1453. pshufd $0xaa, %xmm2, %xmm4
  1454. pshufd $0xff, %xmm2, %xmm6
  1455. #else
  1456. movaps %xmm2, %xmm0
  1457. shufps $0x00, %xmm0, %xmm0
  1458. movaps %xmm2, %xmm1
  1459. shufps $0x55, %xmm1, %xmm1
  1460. movaps %xmm2, %xmm4
  1461. shufps $0xaa, %xmm4, %xmm4
  1462. movaps %xmm2, %xmm6
  1463. shufps $0xff, %xmm6, %xmm6
  1464. #endif
  1465. movaps %xmm0, 0 * SIZE(BB)
  1466. movaps %xmm1, 4 * SIZE(BB)
  1467. movaps %xmm4, 8 * SIZE(BB)
  1468. movaps %xmm6, 12 * SIZE(BB)
  1469. #else
  1470. movlps %xmm0, 0 * SIZE(AA)
  1471. movlps %xmm2, 2 * SIZE(AA)
  1472. #endif
  1473. #ifdef LN
  1474. subl $2 * SIZE, CO1
  1475. #endif
  1476. #if defined(LN) || defined(LT)
  1477. movaps %xmm2, %xmm0
  1478. shufps $0x88, %xmm3, %xmm2
  1479. shufps $0xdd, %xmm3, %xmm0
  1480. movlps %xmm2, 0 * SIZE(CO1)
  1481. movlps %xmm0, 0 * SIZE(CO1, LDC)
  1482. #else
  1483. movlps %xmm0, 0 * SIZE(CO1)
  1484. movlps %xmm2, 0 * SIZE(CO1, LDC)
  1485. #endif
  1486. #ifndef LN
  1487. addl $2 * SIZE, CO1
  1488. #endif
  1489. #if defined(LT) || defined(RN)
  1490. movl K, %eax
  1491. subl KK, %eax
  1492. leal (,%eax, SIZE), %eax
  1493. leal (AA, %eax, 2), AA
  1494. #ifdef LT
  1495. addl $4 * SIZE, B
  1496. #endif
  1497. #endif
  1498. #ifdef LN
  1499. subl $2, KK
  1500. movl BORIG, B
  1501. #endif
  1502. #ifdef LT
  1503. addl $2, KK
  1504. #endif
  1505. #ifdef RT
  1506. movl K, %eax
  1507. movl BORIG, B
  1508. sall $1 + BASE_SHIFT, %eax
  1509. addl %eax, AORIG
  1510. #endif
  1511. ALIGN_2
  1512. .L70:
  1513. testl $1, M
  1514. jle .L99
  1515. #ifdef LN
  1516. movl K, %eax
  1517. sall $BASE_SHIFT, %eax
  1518. subl %eax, AORIG
  1519. #endif
  1520. #if defined(LN) || defined(RT)
  1521. movl KK, %eax
  1522. movl AORIG, AA
  1523. sall $BASE_SHIFT, %eax
  1524. addl %eax, AA
  1525. #endif
  1526. leal BUFFER, BB
  1527. #if defined(LN) || defined(RT)
  1528. movl KK, %eax
  1529. sall $1 + BASE_SHIFT, %eax
  1530. leal (BB, %eax, 4), BB
  1531. #endif
  1532. movss 0 * SIZE(BB), %xmm2
  1533. xorps %xmm4, %xmm4
  1534. movss 0 * SIZE(AA), %xmm0
  1535. xorps %xmm5, %xmm5
  1536. movss 16 * SIZE(BB), %xmm3
  1537. xorps %xmm6, %xmm6
  1538. movss 4 * SIZE(AA), %xmm1
  1539. xorps %xmm7, %xmm7
  1540. #if defined(LT) || defined(RN)
  1541. movl KK, %eax
  1542. #else
  1543. movl K, %eax
  1544. subl KK, %eax
  1545. #endif
  1546. sarl $3, %eax
  1547. je .L72
  1548. ALIGN_2
  1549. .L71:
  1550. mulss %xmm0, %xmm2
  1551. mulss 4 * SIZE(BB), %xmm0
  1552. addss %xmm2, %xmm4
  1553. movss 8 * SIZE(BB), %xmm2
  1554. addss %xmm0, %xmm5
  1555. movss 1 * SIZE(AA), %xmm0
  1556. mulss %xmm0, %xmm2
  1557. mulss 12 * SIZE(BB), %xmm0
  1558. addss %xmm2, %xmm6
  1559. movss 32 * SIZE(BB), %xmm2
  1560. addss %xmm0, %xmm7
  1561. movss 2 * SIZE(AA), %xmm0
  1562. mulss %xmm0, %xmm3
  1563. mulss 20 * SIZE(BB), %xmm0
  1564. addss %xmm3, %xmm4
  1565. movss 24 * SIZE(BB), %xmm3
  1566. addss %xmm0, %xmm5
  1567. movss 3 * SIZE(AA), %xmm0
  1568. mulss %xmm0, %xmm3
  1569. mulss 28 * SIZE(BB), %xmm0
  1570. addss %xmm3, %xmm6
  1571. movss 48 * SIZE(BB), %xmm3
  1572. addss %xmm0, %xmm7
  1573. movss 8 * SIZE(AA), %xmm0
  1574. mulss %xmm1, %xmm2
  1575. mulss 36 * SIZE(BB), %xmm1
  1576. addss %xmm2, %xmm4
  1577. movss 40 * SIZE(BB), %xmm2
  1578. addss %xmm1, %xmm5
  1579. movss 5 * SIZE(AA), %xmm1
  1580. mulss %xmm1, %xmm2
  1581. mulss 44 * SIZE(BB), %xmm1
  1582. addss %xmm2, %xmm6
  1583. movss 64 * SIZE(BB), %xmm2
  1584. addss %xmm1, %xmm7
  1585. movss 6 * SIZE(AA), %xmm1
  1586. mulss %xmm1, %xmm3
  1587. mulss 52 * SIZE(BB), %xmm1
  1588. addss %xmm3, %xmm4
  1589. movss 56 * SIZE(BB), %xmm3
  1590. addss %xmm1, %xmm5
  1591. movss 7 * SIZE(AA), %xmm1
  1592. mulss %xmm1, %xmm3
  1593. mulss 60 * SIZE(BB), %xmm1
  1594. addss %xmm3, %xmm6
  1595. movss 80 * SIZE(BB), %xmm3
  1596. addss %xmm1, %xmm7
  1597. movss 12 * SIZE(AA), %xmm1
  1598. addl $ 8 * SIZE, AA
  1599. addl $64 * SIZE, BB
  1600. decl %eax
  1601. jne .L71
  1602. ALIGN_2
  1603. .L72:
  1604. #if defined(LT) || defined(RN)
  1605. movl KK, %eax
  1606. #else
  1607. movl K, %eax
  1608. subl KK, %eax
  1609. #endif
  1610. andl $7, %eax # if (k & 1)
  1611. BRANCH
  1612. je .L74
  1613. .L73:
  1614. mulss %xmm0, %xmm2
  1615. mulss 4 * SIZE(BB), %xmm0
  1616. addss %xmm2, %xmm4
  1617. movss 8 * SIZE(BB), %xmm2
  1618. addss %xmm0, %xmm5
  1619. movss 1 * SIZE(AA), %xmm0
  1620. addl $1 * SIZE, AA
  1621. addl $8 * SIZE, BB
  1622. decl %eax
  1623. jg .L73
  1624. ALIGN_4
  1625. .L74:
  1626. addss %xmm6, %xmm4
  1627. addss %xmm7, %xmm5
  1628. #if defined(LN) || defined(RT)
  1629. movl KK, %eax
  1630. #ifdef LN
  1631. subl $1, %eax
  1632. #else
  1633. subl $2, %eax
  1634. #endif
  1635. movl AORIG, AA
  1636. movl BORIG, B
  1637. leal BUFFER, BB
  1638. sall $BASE_SHIFT, %eax
  1639. leal (AA, %eax, 1), AA
  1640. leal (B, %eax, 2), B
  1641. leal (BB, %eax, 8), BB
  1642. #endif
  1643. #if defined(LN) || defined(LT)
  1644. unpcklps %xmm5, %xmm4
  1645. #ifdef movsd
  1646. xorps %xmm2, %xmm2
  1647. #endif
  1648. movsd 0 * SIZE(B), %xmm2
  1649. subps %xmm4, %xmm2
  1650. #else
  1651. movss 0 * SIZE(AA), %xmm0
  1652. movss 1 * SIZE(AA), %xmm2
  1653. subss %xmm4, %xmm0
  1654. subss %xmm5, %xmm2
  1655. #endif
  1656. #if defined(LN) || defined(LT)
  1657. movaps TRMASK, %xmm6
  1658. #endif
  1659. #if defined(LN) || defined(LT)
  1660. movss 0 * SIZE(AA), %xmm0
  1661. shufps $0x00, %xmm6, %xmm0
  1662. mulps %xmm0, %xmm2
  1663. #endif
  1664. #ifdef RN
  1665. movss 0 * SIZE(B), %xmm6
  1666. mulss %xmm6, %xmm0
  1667. movss 1 * SIZE(B), %xmm6
  1668. movaps %xmm6, %xmm5
  1669. mulss %xmm0, %xmm5
  1670. subss %xmm5, %xmm2
  1671. movss 3 * SIZE(B), %xmm6
  1672. mulss %xmm6, %xmm2
  1673. #endif
  1674. #ifdef RT
  1675. movss 3 * SIZE(B), %xmm6
  1676. mulss %xmm6, %xmm2
  1677. movss 2 * SIZE(B), %xmm6
  1678. movaps %xmm6, %xmm5
  1679. mulss %xmm2, %xmm5
  1680. subss %xmm5, %xmm0
  1681. movss 0 * SIZE(B), %xmm6
  1682. mulss %xmm6, %xmm0
  1683. #endif
  1684. #if defined(LN) || defined(LT)
  1685. movlps %xmm2, 0 * SIZE(B)
  1686. movaps %xmm2, %xmm0
  1687. shufps $0x00, %xmm0, %xmm0
  1688. movaps %xmm2, %xmm1
  1689. shufps $0x55, %xmm1, %xmm1
  1690. movaps %xmm0, 0 * SIZE(BB)
  1691. movaps %xmm1, 4 * SIZE(BB)
  1692. #else
  1693. movss %xmm0, 0 * SIZE(AA)
  1694. movss %xmm2, 1 * SIZE(AA)
  1695. #endif
  1696. #ifdef LN
  1697. subl $1 * SIZE, CO1
  1698. #endif
  1699. #if defined(LN) || defined(LT)
  1700. movaps %xmm2, %xmm0
  1701. shufps $0x88, %xmm3, %xmm2
  1702. shufps $0xdd, %xmm3, %xmm0
  1703. movss %xmm2, 0 * SIZE(CO1)
  1704. movss %xmm0, 0 * SIZE(CO1, LDC)
  1705. #else
  1706. movss %xmm0, 0 * SIZE(CO1)
  1707. movss %xmm2, 0 * SIZE(CO1, LDC)
  1708. #endif
  1709. #ifndef LN
  1710. addl $1 * SIZE, CO1
  1711. #endif
  1712. #if defined(LT) || defined(RN)
  1713. movl K, %eax
  1714. subl KK, %eax
  1715. leal (,%eax, SIZE), %eax
  1716. leal (AA, %eax, 1), AA
  1717. #ifdef LT
  1718. addl $2 * SIZE, B
  1719. #endif
  1720. #endif
  1721. #ifdef LN
  1722. subl $1, KK
  1723. movl BORIG, B
  1724. #endif
  1725. #ifdef LT
  1726. addl $1, KK
  1727. #endif
  1728. #ifdef RT
  1729. movl K, %eax
  1730. movl BORIG, B
  1731. sall $BASE_SHIFT, %eax
  1732. addl %eax, AORIG
  1733. #endif
  1734. ALIGN_2
  1735. .L99:
  1736. #ifdef LN
  1737. movl K, %eax
  1738. leal (, %eax, SIZE), %eax
  1739. leal (B, %eax, 2), B
  1740. #endif
  1741. #if defined(LT) || defined(RN)
  1742. movl K, %eax
  1743. subl KK, %eax
  1744. leal (,%eax, SIZE), %eax
  1745. leal (B, %eax, 2), B
  1746. #endif
  1747. #ifdef RN
  1748. addl $2, KK
  1749. #endif
  1750. #ifdef RT
  1751. subl $2, KK
  1752. #endif
  1753. decl J # j --
  1754. jg .L01
  1755. ALIGN_2
  1756. .L100:
  1757. testl $1, N
  1758. jle .L999
  1759. #ifdef LN
  1760. movl OFFSET, %eax
  1761. addl M, %eax
  1762. movl %eax, KK
  1763. #endif
  1764. leal BUFFER, BB
  1765. #ifdef RT
  1766. movl K, %eax
  1767. sall $BASE_SHIFT, %eax
  1768. subl %eax, B
  1769. #endif
  1770. #if defined(LN) || defined(RT)
  1771. movl KK, %eax
  1772. movl B, BORIG
  1773. sall $BASE_SHIFT, %eax
  1774. leal (B, %eax, 1), B
  1775. leal (BB, %eax, 4), BB
  1776. #endif
  1777. #ifdef LT
  1778. movl OFFSET, %eax
  1779. movl %eax, KK
  1780. #endif
  1781. #if defined(LT) || defined(RN)
  1782. movl KK, %eax
  1783. #else
  1784. movl K, %eax
  1785. subl KK, %eax
  1786. #endif
  1787. sarl $3, %eax
  1788. jle .L103
  1789. ALIGN_4
  1790. .L102:
  1791. movsd 0 * SIZE(B), %xmm3
  1792. movhps 2 * SIZE(B), %xmm3
  1793. movsd 4 * SIZE(B), %xmm7
  1794. movhps 6 * SIZE(B), %xmm7
  1795. #ifdef HAVE_SSE2
  1796. pshufd $0x00, %xmm3, %xmm0
  1797. pshufd $0x55, %xmm3, %xmm1
  1798. pshufd $0xaa, %xmm3, %xmm2
  1799. pshufd $0xff, %xmm3, %xmm3
  1800. pshufd $0x00, %xmm7, %xmm4
  1801. pshufd $0x55, %xmm7, %xmm5
  1802. pshufd $0xaa, %xmm7, %xmm6
  1803. pshufd $0xff, %xmm7, %xmm7
  1804. #else
  1805. movaps %xmm3, %xmm0
  1806. shufps $0x00, %xmm0, %xmm0
  1807. movaps %xmm3, %xmm1
  1808. shufps $0x55, %xmm1, %xmm1
  1809. movaps %xmm3, %xmm2
  1810. shufps $0xaa, %xmm2, %xmm2
  1811. shufps $0xff, %xmm3, %xmm3
  1812. movaps %xmm7, %xmm4
  1813. shufps $0x00, %xmm4, %xmm4
  1814. movaps %xmm7, %xmm5
  1815. shufps $0x55, %xmm5, %xmm5
  1816. movaps %xmm7, %xmm6
  1817. shufps $0xaa, %xmm6, %xmm6
  1818. shufps $0xff, %xmm7, %xmm7
  1819. #endif
  1820. movaps %xmm0, 0 * SIZE(BB)
  1821. movaps %xmm1, 4 * SIZE(BB)
  1822. movaps %xmm2, 8 * SIZE(BB)
  1823. movaps %xmm3, 12 * SIZE(BB)
  1824. movaps %xmm4, 16 * SIZE(BB)
  1825. movaps %xmm5, 20 * SIZE(BB)
  1826. movaps %xmm6, 24 * SIZE(BB)
  1827. movaps %xmm7, 28 * SIZE(BB)
  1828. addl $ 8 * SIZE, B
  1829. addl $32 * SIZE, BB
  1830. decl %eax
  1831. BRANCH
  1832. jne .L102
  1833. ALIGN_2
  1834. .L103:
  1835. #if defined(LT) || defined(RN)
  1836. movl KK, %eax
  1837. #else
  1838. movl K, %eax
  1839. subl KK, %eax
  1840. #endif
  1841. andl $7, %eax
  1842. BRANCH
  1843. jle .L105
  1844. ALIGN_2
  1845. .L104:
  1846. movss 0 * SIZE(B), %xmm0
  1847. shufps $0x00, %xmm0, %xmm0
  1848. movaps %xmm0, 0 * SIZE(BB)
  1849. addl $1 * SIZE, B
  1850. addl $4 * SIZE, BB
  1851. decl %eax
  1852. jne .L104
  1853. ALIGN_4
  1854. .L105:
  1855. #if defined(LT) || defined(RN)
  1856. movl A, AA
  1857. #else
  1858. movl A, %eax
  1859. movl %eax, AORIG
  1860. #endif
  1861. #ifdef RT
  1862. subl LDC, C
  1863. #endif
  1864. movl C, CO1
  1865. #ifndef RT
  1866. addl LDC, C
  1867. #endif
  1868. movl M, %ebx
  1869. sarl $3, %ebx # i = (m >> 2)
  1870. jle .L130
  1871. ALIGN_4
  1872. .L110:
  1873. #ifdef LN
  1874. movl K, %eax
  1875. sall $3 + BASE_SHIFT, %eax
  1876. subl %eax, AORIG
  1877. #endif
  1878. #if defined(LN) || defined(RT)
  1879. movl KK, %eax
  1880. movl AORIG, AA
  1881. sall $3 + BASE_SHIFT, %eax
  1882. addl %eax, AA
  1883. #endif
  1884. leal BUFFER, BB
  1885. #if defined(LN) || defined(RT)
  1886. movl KK, %eax
  1887. sall $BASE_SHIFT, %eax
  1888. leal (BB, %eax, 4), BB
  1889. #endif
  1890. movaps 0 * SIZE(BB), %xmm2
  1891. xorps %xmm4, %xmm4
  1892. movaps 0 * SIZE(AA), %xmm0
  1893. xorps %xmm5, %xmm5
  1894. movaps 16 * SIZE(BB), %xmm3
  1895. xorps %xmm6, %xmm6
  1896. movaps 16 * SIZE(AA), %xmm1
  1897. xorps %xmm7, %xmm7
  1898. PREFETCHW 7 * SIZE(CO1)
  1899. #if defined(LT) || defined(RN)
  1900. movl KK, %eax
  1901. #else
  1902. movl K, %eax
  1903. subl KK, %eax
  1904. #endif
  1905. sarl $3, %eax
  1906. je .L112
  1907. ALIGN_2
  1908. .L111:
  1909. mulps %xmm2, %xmm0
  1910. mulps 4 * SIZE(AA), %xmm2
  1911. addps %xmm0, %xmm4
  1912. movaps 8 * SIZE(AA), %xmm0
  1913. addps %xmm2, %xmm6
  1914. movaps 4 * SIZE(BB), %xmm2
  1915. mulps %xmm2, %xmm0
  1916. mulps 12 * SIZE(AA), %xmm2
  1917. addps %xmm0, %xmm5
  1918. movaps 32 * SIZE(AA), %xmm0
  1919. addps %xmm2, %xmm7
  1920. movaps 8 * SIZE(BB), %xmm2
  1921. mulps %xmm2, %xmm1
  1922. mulps 20 * SIZE(AA), %xmm2
  1923. addps %xmm1, %xmm4
  1924. movaps 24 * SIZE(AA), %xmm1
  1925. addps %xmm2, %xmm6
  1926. movaps 12 * SIZE(BB), %xmm2
  1927. mulps %xmm2, %xmm1
  1928. mulps 28 * SIZE(AA), %xmm2
  1929. addps %xmm1, %xmm5
  1930. movaps 48 * SIZE(AA), %xmm1
  1931. addps %xmm2, %xmm7
  1932. movaps 32 * SIZE(BB), %xmm2
  1933. mulps %xmm3, %xmm0
  1934. mulps 36 * SIZE(AA), %xmm3
  1935. addps %xmm0, %xmm4
  1936. movaps 40 * SIZE(AA), %xmm0
  1937. addps %xmm3, %xmm6
  1938. movaps 20 * SIZE(BB), %xmm3
  1939. mulps %xmm3, %xmm0
  1940. mulps 44 * SIZE(AA), %xmm3
  1941. addps %xmm0, %xmm5
  1942. movaps 64 * SIZE(AA), %xmm0
  1943. addps %xmm3, %xmm7
  1944. movaps 24 * SIZE(BB), %xmm3
  1945. mulps %xmm3, %xmm1
  1946. mulps 52 * SIZE(AA), %xmm3
  1947. addps %xmm1, %xmm4
  1948. movaps 56 * SIZE(AA), %xmm1
  1949. addps %xmm3, %xmm6
  1950. movaps 28 * SIZE(BB), %xmm3
  1951. mulps %xmm3, %xmm1
  1952. mulps 60 * SIZE(AA), %xmm3
  1953. addps %xmm1, %xmm5
  1954. movaps 80 * SIZE(AA), %xmm1
  1955. addps %xmm3, %xmm7
  1956. movaps 48 * SIZE(BB), %xmm3
  1957. addl $64 * SIZE, AA
  1958. addl $32 * SIZE, BB
  1959. decl %eax
  1960. jne .L111
  1961. ALIGN_2
  1962. .L112:
  1963. #if defined(LT) || defined(RN)
  1964. movl KK, %eax
  1965. #else
  1966. movl K, %eax
  1967. subl KK, %eax
  1968. #endif
  1969. andl $7, %eax # if (k & 1)
  1970. BRANCH
  1971. je .L114
  1972. .L113:
  1973. movaps 0 * SIZE(BB), %xmm2
  1974. movaps 0 * SIZE(AA), %xmm0
  1975. mulps %xmm2, %xmm0
  1976. addps %xmm0, %xmm4
  1977. mulps 4 * SIZE(AA), %xmm2
  1978. addps %xmm2, %xmm6
  1979. addl $8 * SIZE, AA
  1980. addl $4 * SIZE, BB
  1981. subl $1, %eax
  1982. jg .L113
  1983. ALIGN_4
  1984. .L114:
  1985. addps %xmm5, %xmm4
  1986. addps %xmm7, %xmm6
  1987. #if defined(LN) || defined(RT)
  1988. movl KK, %eax
  1989. #ifdef LN
  1990. subl $8, %eax
  1991. #else
  1992. subl $1, %eax
  1993. #endif
  1994. movl AORIG, AA
  1995. movl BORIG, B
  1996. leal BUFFER, BB
  1997. sall $BASE_SHIFT, %eax
  1998. leal (AA, %eax, 8), AA
  1999. leal (B, %eax, 1), B
  2000. leal (BB, %eax, 4), BB
  2001. #endif
  2002. #if defined(LN) || defined(LT)
  2003. movsd 0 * SIZE(B), %xmm2
  2004. movhps 2 * SIZE(B), %xmm2
  2005. movsd 4 * SIZE(B), %xmm5
  2006. movhps 6 * SIZE(B), %xmm5
  2007. subps %xmm4, %xmm2
  2008. subps %xmm6, %xmm5
  2009. xorps %xmm0, %xmm0
  2010. movaps %xmm2, %xmm3
  2011. unpcklps %xmm0, %xmm2
  2012. unpckhps %xmm0, %xmm3
  2013. movaps %xmm5, %xmm7
  2014. unpcklps %xmm0, %xmm5
  2015. unpckhps %xmm0, %xmm7
  2016. #else
  2017. movaps 0 * SIZE(AA), %xmm0
  2018. movaps 4 * SIZE(AA), %xmm1
  2019. subps %xmm4, %xmm0
  2020. subps %xmm6, %xmm1
  2021. #endif
  2022. #if defined(LN) || defined(LT)
  2023. movaps TRMASK, %xmm6
  2024. #endif
  2025. #ifdef LN
  2026. movss 63 * SIZE(AA), %xmm0
  2027. movaps %xmm6, %xmm1
  2028. shufps $0x00, %xmm0, %xmm1
  2029. mulps %xmm1, %xmm7
  2030. movaps %xmm7, %xmm1
  2031. shufps $0xee, %xmm1, %xmm1
  2032. movss 62 * SIZE(AA), %xmm0
  2033. shufps $0x50, %xmm0, %xmm0
  2034. mulps %xmm1, %xmm0
  2035. subps %xmm0, %xmm7
  2036. movsd 60 * SIZE(AA), %xmm0
  2037. shufps $0x50, %xmm0, %xmm0
  2038. mulps %xmm1, %xmm0
  2039. subps %xmm0, %xmm5
  2040. movsd 58 * SIZE(AA), %xmm0
  2041. shufps $0x50, %xmm0, %xmm0
  2042. mulps %xmm1, %xmm0
  2043. subps %xmm0, %xmm3
  2044. movsd 56 * SIZE(AA), %xmm0
  2045. shufps $0x50, %xmm0, %xmm0
  2046. mulps %xmm1, %xmm0
  2047. subps %xmm0, %xmm2
  2048. movss 54 * SIZE(AA), %xmm0
  2049. shufps $0x00, %xmm6, %xmm0
  2050. mulps %xmm0, %xmm7
  2051. movaps %xmm7, %xmm1
  2052. shufps $0x44, %xmm1, %xmm1
  2053. movsd 52 * SIZE(AA), %xmm0
  2054. shufps $0x50, %xmm0, %xmm0
  2055. mulps %xmm1, %xmm0
  2056. subps %xmm0, %xmm5
  2057. movsd 50 * SIZE(AA), %xmm0
  2058. shufps $0x50, %xmm0, %xmm0
  2059. mulps %xmm1, %xmm0
  2060. subps %xmm0, %xmm3
  2061. movsd 48 * SIZE(AA), %xmm0
  2062. shufps $0x50, %xmm0, %xmm0
  2063. mulps %xmm1, %xmm0
  2064. subps %xmm0, %xmm2
  2065. movss 45 * SIZE(AA), %xmm0
  2066. movaps %xmm6, %xmm1
  2067. shufps $0x00, %xmm0, %xmm1
  2068. mulps %xmm1, %xmm5
  2069. movaps %xmm5, %xmm1
  2070. shufps $0xee, %xmm1, %xmm1
  2071. movss 44 * SIZE(AA), %xmm0
  2072. shufps $0x50, %xmm0, %xmm0
  2073. mulps %xmm1, %xmm0
  2074. subps %xmm0, %xmm5
  2075. movsd 42 * SIZE(AA), %xmm0
  2076. shufps $0x50, %xmm0, %xmm0
  2077. mulps %xmm1, %xmm0
  2078. subps %xmm0, %xmm3
  2079. movsd 40 * SIZE(AA), %xmm0
  2080. shufps $0x50, %xmm0, %xmm0
  2081. mulps %xmm1, %xmm0
  2082. subps %xmm0, %xmm2
  2083. movss 36 * SIZE(AA), %xmm0
  2084. shufps $0x00, %xmm6, %xmm0
  2085. mulps %xmm0, %xmm5
  2086. movaps %xmm5, %xmm1
  2087. shufps $0x44, %xmm1, %xmm1
  2088. movsd 34 * SIZE(AA), %xmm0
  2089. shufps $0x50, %xmm0, %xmm0
  2090. mulps %xmm1, %xmm0
  2091. subps %xmm0, %xmm3
  2092. movsd 32 * SIZE(AA), %xmm0
  2093. shufps $0x50, %xmm0, %xmm0
  2094. mulps %xmm1, %xmm0
  2095. subps %xmm0, %xmm2
  2096. movss 27 * SIZE(AA), %xmm0
  2097. movaps %xmm6, %xmm1
  2098. shufps $0x00, %xmm0, %xmm1
  2099. mulps %xmm1, %xmm3
  2100. movaps %xmm3, %xmm1
  2101. shufps $0xee, %xmm1, %xmm1
  2102. movss 26 * SIZE(AA), %xmm0
  2103. shufps $0x50, %xmm0, %xmm0
  2104. mulps %xmm1, %xmm0
  2105. subps %xmm0, %xmm3
  2106. movsd 24 * SIZE(AA), %xmm0
  2107. shufps $0x50, %xmm0, %xmm0
  2108. mulps %xmm1, %xmm0
  2109. subps %xmm0, %xmm2
  2110. movss 18 * SIZE(AA), %xmm0
  2111. shufps $0x00, %xmm6, %xmm0
  2112. mulps %xmm0, %xmm3
  2113. movaps %xmm3, %xmm1
  2114. shufps $0x44, %xmm1, %xmm1
  2115. movsd 16 * SIZE(AA), %xmm0
  2116. shufps $0x50, %xmm0, %xmm0
  2117. mulps %xmm1, %xmm0
  2118. subps %xmm0, %xmm2
  2119. movss 9 * SIZE(AA), %xmm0
  2120. movaps %xmm6, %xmm1
  2121. shufps $0x00, %xmm0, %xmm1
  2122. mulps %xmm1, %xmm2
  2123. movaps %xmm2, %xmm1
  2124. shufps $0xee, %xmm1, %xmm1
  2125. movss 8 * SIZE(AA), %xmm0
  2126. shufps $0x50, %xmm0, %xmm0
  2127. mulps %xmm1, %xmm0
  2128. subps %xmm0, %xmm2
  2129. movss 0 * SIZE(AA), %xmm0
  2130. shufps $0x00, %xmm6, %xmm0
  2131. mulps %xmm0, %xmm2
  2132. #endif
  2133. #ifdef LT
  2134. movss 0 * SIZE(AA), %xmm0
  2135. shufps $0x00, %xmm6, %xmm0
  2136. mulps %xmm0, %xmm2
  2137. movaps %xmm2, %xmm1
  2138. shufps $0x44, %xmm1, %xmm1
  2139. movss 1 * SIZE(AA), %xmm0
  2140. shufps $0x05, %xmm0, %xmm0
  2141. mulps %xmm1, %xmm0
  2142. subps %xmm0, %xmm2
  2143. movsd 2 * SIZE(AA), %xmm0
  2144. shufps $0x50, %xmm0, %xmm0
  2145. mulps %xmm1, %xmm0
  2146. subps %xmm0, %xmm3
  2147. movsd 4 * SIZE(AA), %xmm0
  2148. shufps $0x50, %xmm0, %xmm0
  2149. mulps %xmm1, %xmm0
  2150. subps %xmm0, %xmm5
  2151. movsd 6 * SIZE(AA), %xmm0
  2152. shufps $0x50, %xmm0, %xmm0
  2153. mulps %xmm1, %xmm0
  2154. subps %xmm0, %xmm7
  2155. movss 9 * SIZE(AA), %xmm0
  2156. movaps %xmm6, %xmm1
  2157. shufps $0x00, %xmm0, %xmm1
  2158. mulps %xmm1, %xmm2
  2159. movaps %xmm2, %xmm1
  2160. shufps $0xee, %xmm1, %xmm1
  2161. movsd 10 * SIZE(AA), %xmm0
  2162. shufps $0x50, %xmm0, %xmm0
  2163. mulps %xmm1, %xmm0
  2164. subps %xmm0, %xmm3
  2165. movsd 12 * SIZE(AA), %xmm0
  2166. shufps $0x50, %xmm0, %xmm0
  2167. mulps %xmm1, %xmm0
  2168. subps %xmm0, %xmm5
  2169. movsd 14 * SIZE(AA), %xmm0
  2170. shufps $0x50, %xmm0, %xmm0
  2171. mulps %xmm1, %xmm0
  2172. subps %xmm0, %xmm7
  2173. movss 18 * SIZE(AA), %xmm0
  2174. shufps $0x00, %xmm6, %xmm0
  2175. mulps %xmm0, %xmm3
  2176. movaps %xmm3, %xmm1
  2177. shufps $0x44, %xmm1, %xmm1
  2178. movss 19 * SIZE(AA), %xmm0
  2179. shufps $0x05, %xmm0, %xmm0
  2180. mulps %xmm1, %xmm0
  2181. subps %xmm0, %xmm3
  2182. movsd 20 * SIZE(AA), %xmm0
  2183. shufps $0x50, %xmm0, %xmm0
  2184. mulps %xmm1, %xmm0
  2185. subps %xmm0, %xmm5
  2186. movsd 22 * SIZE(AA), %xmm0
  2187. shufps $0x50, %xmm0, %xmm0
  2188. mulps %xmm1, %xmm0
  2189. subps %xmm0, %xmm7
  2190. movss 27 * SIZE(AA), %xmm0
  2191. movaps %xmm6, %xmm1
  2192. shufps $0x00, %xmm0, %xmm1
  2193. mulps %xmm1, %xmm3
  2194. movaps %xmm3, %xmm1
  2195. shufps $0xee, %xmm1, %xmm1
  2196. movsd 28 * SIZE(AA), %xmm0
  2197. shufps $0x50, %xmm0, %xmm0
  2198. mulps %xmm1, %xmm0
  2199. subps %xmm0, %xmm5
  2200. movsd 30 * SIZE(AA), %xmm0
  2201. shufps $0x50, %xmm0, %xmm0
  2202. mulps %xmm1, %xmm0
  2203. subps %xmm0, %xmm7
  2204. movss 36 * SIZE(AA), %xmm0
  2205. shufps $0x00, %xmm6, %xmm0
  2206. mulps %xmm0, %xmm5
  2207. movaps %xmm5, %xmm1
  2208. shufps $0x44, %xmm1, %xmm1
  2209. movss 37 * SIZE(AA), %xmm0
  2210. shufps $0x05, %xmm0, %xmm0
  2211. mulps %xmm1, %xmm0
  2212. subps %xmm0, %xmm5
  2213. movsd 38 * SIZE(AA), %xmm0
  2214. shufps $0x50, %xmm0, %xmm0
  2215. mulps %xmm1, %xmm0
  2216. subps %xmm0, %xmm7
  2217. movss 45 * SIZE(AA), %xmm0
  2218. movaps %xmm6, %xmm1
  2219. shufps $0x00, %xmm0, %xmm1
  2220. mulps %xmm1, %xmm5
  2221. movaps %xmm5, %xmm1
  2222. shufps $0xee, %xmm1, %xmm1
  2223. movsd 46 * SIZE(AA), %xmm0
  2224. shufps $0x50, %xmm0, %xmm0
  2225. mulps %xmm1, %xmm0
  2226. subps %xmm0, %xmm7
  2227. movss 54 * SIZE(AA), %xmm0
  2228. shufps $0x00, %xmm6, %xmm0
  2229. mulps %xmm0, %xmm7
  2230. movaps %xmm7, %xmm1
  2231. shufps $0x44, %xmm1, %xmm1
  2232. movss 55 * SIZE(AA), %xmm0
  2233. shufps $0x05, %xmm0, %xmm0
  2234. mulps %xmm1, %xmm0
  2235. subps %xmm0, %xmm7
  2236. movss 63 * SIZE(AA), %xmm0
  2237. movaps %xmm6, %xmm1
  2238. shufps $0x00, %xmm0, %xmm1
  2239. mulps %xmm1, %xmm7
  2240. #endif
  2241. #if defined(RN) || defined(RT)
  2242. movss 0 * SIZE(B), %xmm6
  2243. shufps $0x00, %xmm6, %xmm6
  2244. mulps %xmm6, %xmm0
  2245. mulps %xmm6, %xmm1
  2246. #endif
  2247. #if defined(LN) || defined(LT)
  2248. shufps $0x88, %xmm3, %xmm2
  2249. shufps $0x88, %xmm7, %xmm5
  2250. movlps %xmm2, 0 * SIZE(B)
  2251. movhps %xmm2, 2 * SIZE(B)
  2252. movlps %xmm5, 4 * SIZE(B)
  2253. movhps %xmm5, 6 * SIZE(B)
  2254. #ifdef HAVE_SSE2
  2255. pshufd $0x00, %xmm2, %xmm0
  2256. pshufd $0x55, %xmm2, %xmm1
  2257. pshufd $0xaa, %xmm2, %xmm4
  2258. pshufd $0xff, %xmm2, %xmm6
  2259. #else
  2260. movaps %xmm2, %xmm0
  2261. shufps $0x00, %xmm0, %xmm0
  2262. movaps %xmm2, %xmm1
  2263. shufps $0x55, %xmm1, %xmm1
  2264. movaps %xmm2, %xmm4
  2265. shufps $0xaa, %xmm4, %xmm4
  2266. movaps %xmm2, %xmm6
  2267. shufps $0xff, %xmm6, %xmm6
  2268. #endif
  2269. movaps %xmm0, 0 * SIZE(BB)
  2270. movaps %xmm1, 4 * SIZE(BB)
  2271. movaps %xmm4, 8 * SIZE(BB)
  2272. movaps %xmm6, 12 * SIZE(BB)
  2273. #ifdef HAVE_SSE2
  2274. pshufd $0x00, %xmm5, %xmm0
  2275. pshufd $0x55, %xmm5, %xmm1
  2276. pshufd $0xaa, %xmm5, %xmm4
  2277. pshufd $0xff, %xmm5, %xmm6
  2278. #else
  2279. movaps %xmm5, %xmm0
  2280. shufps $0x00, %xmm0, %xmm0
  2281. movaps %xmm5, %xmm1
  2282. shufps $0x55, %xmm1, %xmm1
  2283. movaps %xmm5, %xmm4
  2284. shufps $0xaa, %xmm4, %xmm4
  2285. movaps %xmm5, %xmm6
  2286. shufps $0xff, %xmm6, %xmm6
  2287. #endif
  2288. movaps %xmm0, 16 * SIZE(BB)
  2289. movaps %xmm1, 20 * SIZE(BB)
  2290. movaps %xmm4, 24 * SIZE(BB)
  2291. movaps %xmm6, 28 * SIZE(BB)
  2292. #else
  2293. movaps %xmm0, 0 * SIZE(AA)
  2294. movaps %xmm1, 4 * SIZE(AA)
  2295. #endif
  2296. #ifdef LN
  2297. subl $8 * SIZE, CO1
  2298. #endif
  2299. #if defined(LN) || defined(LT)
  2300. movlps %xmm2, 0 * SIZE(CO1)
  2301. movhps %xmm2, 2 * SIZE(CO1)
  2302. movlps %xmm5, 4 * SIZE(CO1)
  2303. movhps %xmm5, 6 * SIZE(CO1)
  2304. #else
  2305. movlps %xmm0, 0 * SIZE(CO1)
  2306. movhps %xmm0, 2 * SIZE(CO1)
  2307. movlps %xmm1, 4 * SIZE(CO1)
  2308. movhps %xmm1, 6 * SIZE(CO1)
  2309. #endif
  2310. #ifndef LN
  2311. addl $8 * SIZE, CO1
  2312. #endif
  2313. #if defined(LT) || defined(RN)
  2314. movl K, %eax
  2315. subl KK, %eax
  2316. leal (,%eax, SIZE), %eax
  2317. leal (AA, %eax, 8), AA
  2318. #ifdef LT
  2319. addl $8 * SIZE, B
  2320. #endif
  2321. #endif
  2322. #ifdef LN
  2323. subl $8, KK
  2324. movl BORIG, B
  2325. #endif
  2326. #ifdef LT
  2327. addl $8, KK
  2328. #endif
  2329. #ifdef RT
  2330. movl K, %eax
  2331. movl BORIG, B
  2332. sall $3 + BASE_SHIFT, %eax
  2333. addl %eax, AORIG
  2334. #endif
  2335. decl %ebx # i --
  2336. jg .L110
  2337. ALIGN_2
  2338. .L130:
  2339. testl $4, M
  2340. jle .L150
  2341. #ifdef LN
  2342. movl K, %eax
  2343. sall $2 + BASE_SHIFT, %eax
  2344. subl %eax, AORIG
  2345. #endif
  2346. #if defined(LN) || defined(RT)
  2347. movl KK, %eax
  2348. movl AORIG, AA
  2349. sall $2 + BASE_SHIFT, %eax
  2350. addl %eax, AA
  2351. #endif
  2352. leal BUFFER, BB
  2353. #if defined(LN) || defined(RT)
  2354. movl KK, %eax
  2355. sall $BASE_SHIFT, %eax
  2356. leal (BB, %eax, 4), BB
  2357. #endif
  2358. movaps 0 * SIZE(BB), %xmm2
  2359. xorps %xmm4, %xmm4
  2360. movsd 0 * SIZE(AA), %xmm0
  2361. movhps 2 * SIZE(AA), %xmm0
  2362. xorps %xmm5, %xmm5
  2363. movaps 16 * SIZE(BB), %xmm3
  2364. xorps %xmm6, %xmm6
  2365. movsd 16 * SIZE(AA), %xmm1
  2366. movhps 18 * SIZE(AA), %xmm1
  2367. xorps %xmm7, %xmm7
  2368. #if defined(LT) || defined(RN)
  2369. movl KK, %eax
  2370. #else
  2371. movl K, %eax
  2372. subl KK, %eax
  2373. #endif
  2374. sarl $3, %eax
  2375. je .L132
  2376. ALIGN_2
  2377. .L131:
  2378. mulps %xmm0, %xmm2
  2379. movaps 4 * SIZE(AA), %xmm0
  2380. addps %xmm2, %xmm4
  2381. mulps 4 * SIZE(BB), %xmm0
  2382. movaps 32 * SIZE(BB), %xmm2
  2383. addps %xmm0, %xmm5
  2384. movaps 8 * SIZE(AA), %xmm0
  2385. mulps 8 * SIZE(BB), %xmm0
  2386. addps %xmm0, %xmm6
  2387. movaps 12 * SIZE(AA), %xmm0
  2388. mulps 12 * SIZE(BB), %xmm0
  2389. addps %xmm0, %xmm7
  2390. movaps 32 * SIZE(AA), %xmm0
  2391. mulps %xmm1, %xmm3
  2392. movaps 20 * SIZE(AA), %xmm1
  2393. addps %xmm3, %xmm4
  2394. mulps 20 * SIZE(BB), %xmm1
  2395. movaps 48 * SIZE(BB), %xmm3
  2396. addps %xmm1, %xmm5
  2397. movaps 24 * SIZE(AA), %xmm1
  2398. mulps 24 * SIZE(BB), %xmm1
  2399. addps %xmm1, %xmm6
  2400. movaps 28 * SIZE(AA), %xmm1
  2401. mulps 28 * SIZE(BB), %xmm1
  2402. addps %xmm1, %xmm7
  2403. movaps 48 * SIZE(AA), %xmm1
  2404. addl $32 * SIZE, AA
  2405. addl $32 * SIZE, BB
  2406. decl %eax
  2407. jne .L131
  2408. ALIGN_2
  2409. .L132:
  2410. #if defined(LT) || defined(RN)
  2411. movl KK, %eax
  2412. #else
  2413. movl K, %eax
  2414. subl KK, %eax
  2415. #endif
  2416. andl $7, %eax # if (k & 1)
  2417. BRANCH
  2418. je .L134
  2419. .L133:
  2420. movaps 0 * SIZE(BB), %xmm2
  2421. movaps 0 * SIZE(AA), %xmm0
  2422. mulps %xmm0, %xmm2
  2423. addps %xmm2, %xmm4
  2424. addl $4 * SIZE, AA
  2425. addl $4 * SIZE, BB
  2426. decl %eax
  2427. jg .L133
  2428. ALIGN_4
  2429. .L134:
  2430. addps %xmm5, %xmm4
  2431. addps %xmm7, %xmm6
  2432. addps %xmm6, %xmm4
  2433. #if defined(LN) || defined(RT)
  2434. movl KK, %eax
  2435. #ifdef LN
  2436. subl $4, %eax
  2437. #else
  2438. subl $1, %eax
  2439. #endif
  2440. movl AORIG, AA
  2441. movl BORIG, B
  2442. leal BUFFER, BB
  2443. sall $BASE_SHIFT, %eax
  2444. leal (AA, %eax, 4), AA
  2445. leal (B, %eax, 1), B
  2446. leal (BB, %eax, 4), BB
  2447. #endif
  2448. #if defined(LN) || defined(LT)
  2449. movsd 0 * SIZE(B), %xmm2
  2450. movhps 2 * SIZE(B), %xmm2
  2451. subps %xmm4, %xmm2
  2452. xorps %xmm5, %xmm5
  2453. movaps %xmm2, %xmm3
  2454. unpcklps %xmm5, %xmm2
  2455. unpckhps %xmm5, %xmm3
  2456. #else
  2457. movaps 0 * SIZE(AA), %xmm0
  2458. subps %xmm4, %xmm0
  2459. #endif
  2460. #if defined(LN) || defined(LT)
  2461. movaps TRMASK, %xmm6
  2462. #endif
  2463. #ifdef LN
  2464. movss 15 * SIZE(AA), %xmm0
  2465. movaps %xmm6, %xmm1
  2466. shufps $0x00, %xmm0, %xmm1
  2467. mulps %xmm1, %xmm3
  2468. movaps %xmm3, %xmm1
  2469. shufps $0xee, %xmm1, %xmm1
  2470. movss 14 * SIZE(AA), %xmm0
  2471. shufps $0x50, %xmm0, %xmm0
  2472. mulps %xmm1, %xmm0
  2473. subps %xmm0, %xmm3
  2474. movsd 12 * SIZE(AA), %xmm0
  2475. shufps $0x50, %xmm0, %xmm0
  2476. mulps %xmm1, %xmm0
  2477. subps %xmm0, %xmm2
  2478. movss 10 * SIZE(AA), %xmm0
  2479. shufps $0x00, %xmm6, %xmm0
  2480. mulps %xmm0, %xmm3
  2481. movaps %xmm3, %xmm1
  2482. shufps $0x44, %xmm1, %xmm1
  2483. movsd 8 * SIZE(AA), %xmm0
  2484. shufps $0x50, %xmm0, %xmm0
  2485. mulps %xmm1, %xmm0
  2486. subps %xmm0, %xmm2
  2487. movss 5 * SIZE(AA), %xmm0
  2488. movaps %xmm6, %xmm1
  2489. shufps $0x00, %xmm0, %xmm1
  2490. mulps %xmm1, %xmm2
  2491. movaps %xmm2, %xmm1
  2492. shufps $0xee, %xmm1, %xmm1
  2493. movss 4 * SIZE(AA), %xmm0
  2494. shufps $0x50, %xmm0, %xmm0
  2495. mulps %xmm1, %xmm0
  2496. subps %xmm0, %xmm2
  2497. movss 0 * SIZE(AA), %xmm0
  2498. shufps $0x00, %xmm6, %xmm0
  2499. mulps %xmm0, %xmm2
  2500. #endif
  2501. #ifdef LT
  2502. movss 0 * SIZE(AA), %xmm0
  2503. shufps $0x00, %xmm6, %xmm0
  2504. mulps %xmm0, %xmm2
  2505. movaps %xmm2, %xmm1
  2506. shufps $0x44, %xmm1, %xmm1
  2507. movss 1 * SIZE(AA), %xmm0
  2508. shufps $0x05, %xmm0, %xmm0
  2509. mulps %xmm1, %xmm0
  2510. subps %xmm0, %xmm2
  2511. movsd 2 * SIZE(AA), %xmm0
  2512. shufps $0x50, %xmm0, %xmm0
  2513. mulps %xmm1, %xmm0
  2514. subps %xmm0, %xmm3
  2515. movss 5 * SIZE(AA), %xmm0
  2516. movaps %xmm6, %xmm1
  2517. shufps $0x00, %xmm0, %xmm1
  2518. mulps %xmm1, %xmm2
  2519. movaps %xmm2, %xmm1
  2520. shufps $0xee, %xmm1, %xmm1
  2521. movsd 6 * SIZE(AA), %xmm0
  2522. shufps $0x50, %xmm0, %xmm0
  2523. mulps %xmm1, %xmm0
  2524. subps %xmm0, %xmm3
  2525. movss 10 * SIZE(AA), %xmm0
  2526. shufps $0x00, %xmm6, %xmm0
  2527. mulps %xmm0, %xmm3
  2528. movaps %xmm3, %xmm1
  2529. shufps $0x44, %xmm1, %xmm1
  2530. movss 11 * SIZE(AA), %xmm0
  2531. shufps $0x05, %xmm0, %xmm0
  2532. mulps %xmm1, %xmm0
  2533. subps %xmm0, %xmm3
  2534. movss 15 * SIZE(AA), %xmm0
  2535. movaps %xmm6, %xmm1
  2536. shufps $0x00, %xmm0, %xmm1
  2537. mulps %xmm1, %xmm3
  2538. #endif
  2539. #ifdef RN
  2540. movss 0 * SIZE(B), %xmm6
  2541. shufps $0x00, %xmm6, %xmm6
  2542. mulps %xmm6, %xmm0
  2543. #endif
  2544. #ifdef RT
  2545. movss 0 * SIZE(B), %xmm6
  2546. shufps $0x00, %xmm6, %xmm6
  2547. mulps %xmm6, %xmm0
  2548. #endif
  2549. #if defined(LN) || defined(LT)
  2550. shufps $0x88, %xmm3, %xmm2
  2551. movlps %xmm2, 0 * SIZE(B)
  2552. movhps %xmm2, 2 * SIZE(B)
  2553. #ifdef HAVE_SSE2
  2554. pshufd $0x00, %xmm2, %xmm0
  2555. pshufd $0x55, %xmm2, %xmm1
  2556. pshufd $0xaa, %xmm2, %xmm4
  2557. pshufd $0xff, %xmm2, %xmm6
  2558. #else
  2559. movaps %xmm2, %xmm0
  2560. shufps $0x00, %xmm0, %xmm0
  2561. movaps %xmm2, %xmm1
  2562. shufps $0x55, %xmm1, %xmm1
  2563. movaps %xmm2, %xmm4
  2564. shufps $0xaa, %xmm4, %xmm4
  2565. movaps %xmm2, %xmm6
  2566. shufps $0xff, %xmm6, %xmm6
  2567. #endif
  2568. movaps %xmm0, 0 * SIZE(BB)
  2569. movaps %xmm1, 4 * SIZE(BB)
  2570. movaps %xmm4, 8 * SIZE(BB)
  2571. movaps %xmm6, 12 * SIZE(BB)
  2572. #else
  2573. movaps %xmm0, 0 * SIZE(AA)
  2574. #endif
  2575. #ifdef LN
  2576. subl $4 * SIZE, CO1
  2577. #endif
  2578. #if defined(LN) || defined(LT)
  2579. movlps %xmm2, 0 * SIZE(CO1)
  2580. movhps %xmm2, 2 * SIZE(CO1)
  2581. #else
  2582. movlps %xmm0, 0 * SIZE(CO1)
  2583. movhps %xmm0, 2 * SIZE(CO1)
  2584. #endif
  2585. #ifndef LN
  2586. addl $4 * SIZE, CO1
  2587. #endif
  2588. #if defined(LT) || defined(RN)
  2589. movl K, %eax
  2590. subl KK, %eax
  2591. leal (,%eax, SIZE), %eax
  2592. leal (AA, %eax, 4), AA
  2593. #ifdef LT
  2594. addl $4 * SIZE, B
  2595. #endif
  2596. #endif
  2597. #ifdef LN
  2598. subl $4, KK
  2599. movl BORIG, B
  2600. #endif
  2601. #ifdef LT
  2602. addl $4, KK
  2603. #endif
  2604. #ifdef RT
  2605. movl K, %eax
  2606. movl BORIG, B
  2607. sall $2 + BASE_SHIFT, %eax
  2608. addl %eax, AORIG
  2609. #endif
  2610. ALIGN_2
  2611. .L150:
  2612. testl $2, M
  2613. jle .L170
  2614. #ifdef LN
  2615. movl K, %eax
  2616. sall $1 + BASE_SHIFT, %eax
  2617. subl %eax, AORIG
  2618. #endif
  2619. #if defined(LN) || defined(RT)
  2620. movl KK, %eax
  2621. movl AORIG, AA
  2622. sall $1 + BASE_SHIFT, %eax
  2623. addl %eax, AA
  2624. #endif
  2625. leal BUFFER, BB
  2626. #if defined(LN) || defined(RT)
  2627. movl KK, %eax
  2628. sall $BASE_SHIFT, %eax
  2629. leal (BB, %eax, 4), BB
  2630. #endif
  2631. movaps 0 * SIZE(BB), %xmm2
  2632. xorps %xmm4, %xmm4
  2633. #ifdef movsd
  2634. xorps %xmm0, %xmm0
  2635. #endif
  2636. movsd 0 * SIZE(AA), %xmm0
  2637. xorps %xmm5, %xmm5
  2638. movaps 16 * SIZE(BB), %xmm3
  2639. xorps %xmm6, %xmm6
  2640. #ifdef movsd
  2641. xorps %xmm1, %xmm1
  2642. #endif
  2643. movsd 8 * SIZE(AA), %xmm1
  2644. xorps %xmm7, %xmm7
  2645. #if defined(LT) || defined(RN)
  2646. movl KK, %eax
  2647. #else
  2648. movl K, %eax
  2649. subl KK, %eax
  2650. #endif
  2651. sarl $3, %eax
  2652. je .L152
  2653. ALIGN_2
  2654. .L151:
  2655. mulps %xmm0, %xmm2
  2656. movsd 2 * SIZE(AA), %xmm0
  2657. addps %xmm2, %xmm4
  2658. movaps 4 * SIZE(BB), %xmm2
  2659. mulps %xmm0, %xmm2
  2660. movsd 4 * SIZE(AA), %xmm0
  2661. addps %xmm2, %xmm5
  2662. movaps 8 * SIZE(BB), %xmm2
  2663. mulps %xmm0, %xmm2
  2664. movsd 6 * SIZE(AA), %xmm0
  2665. addps %xmm2, %xmm6
  2666. movaps 12 * SIZE(BB), %xmm2
  2667. mulps %xmm0, %xmm2
  2668. movsd 16 * SIZE(AA), %xmm0
  2669. addps %xmm2, %xmm7
  2670. movaps 32 * SIZE(BB), %xmm2
  2671. mulps %xmm1, %xmm3
  2672. movsd 10 * SIZE(AA), %xmm1
  2673. addps %xmm3, %xmm4
  2674. movaps 20 * SIZE(BB), %xmm3
  2675. mulps %xmm1, %xmm3
  2676. movsd 12 * SIZE(AA), %xmm1
  2677. addps %xmm3, %xmm5
  2678. movaps 24 * SIZE(BB), %xmm3
  2679. mulps %xmm1, %xmm3
  2680. movsd 14 * SIZE(AA), %xmm1
  2681. addps %xmm3, %xmm6
  2682. movaps 28 * SIZE(BB), %xmm3
  2683. mulps %xmm1, %xmm3
  2684. movsd 24 * SIZE(AA), %xmm1
  2685. addps %xmm3, %xmm7
  2686. movaps 48 * SIZE(BB), %xmm3
  2687. addl $16 * SIZE, AA
  2688. addl $32 * SIZE, BB
  2689. decl %eax
  2690. jne .L151
  2691. ALIGN_2
  2692. .L152:
  2693. #if defined(LT) || defined(RN)
  2694. movl KK, %eax
  2695. #else
  2696. movl K, %eax
  2697. subl KK, %eax
  2698. #endif
  2699. andl $7, %eax # if (k & 1)
  2700. BRANCH
  2701. je .L154
  2702. .L153:
  2703. mulps %xmm0, %xmm2
  2704. movsd 2 * SIZE(AA), %xmm0
  2705. addps %xmm2, %xmm4
  2706. movaps 4 * SIZE(BB), %xmm2
  2707. addl $2 * SIZE, AA
  2708. addl $4 * SIZE, BB
  2709. decl %eax
  2710. jg .L153
  2711. ALIGN_4
  2712. .L154:
  2713. addps %xmm5, %xmm4
  2714. addps %xmm7, %xmm6
  2715. addps %xmm6, %xmm4
  2716. #if defined(LN) || defined(RT)
  2717. movl KK, %eax
  2718. #ifdef LN
  2719. subl $2, %eax
  2720. #else
  2721. subl $1, %eax
  2722. #endif
  2723. movl AORIG, AA
  2724. movl BORIG, B
  2725. leal BUFFER, BB
  2726. sall $BASE_SHIFT, %eax
  2727. leal (AA, %eax, 2), AA
  2728. leal (B, %eax, 1), B
  2729. leal (BB, %eax, 4), BB
  2730. #endif
  2731. #if defined(LN) || defined(LT)
  2732. movaps %xmm4, %xmm5
  2733. shufps $1, %xmm5, %xmm5
  2734. movss 0 * SIZE(B), %xmm0
  2735. movss 1 * SIZE(B), %xmm1
  2736. subss %xmm4, %xmm0
  2737. subss %xmm5, %xmm1
  2738. #else
  2739. #ifdef movsd
  2740. xorps %xmm0, %xmm0
  2741. #endif
  2742. movsd 0 * SIZE(AA), %xmm0
  2743. subps %xmm4, %xmm0
  2744. #endif
  2745. #ifdef LN
  2746. movaps 0 * SIZE(AA), %xmm4
  2747. movaps %xmm4, %xmm6
  2748. shufps $0xff, %xmm6, %xmm6
  2749. mulss %xmm6, %xmm1
  2750. movaps %xmm4, %xmm6
  2751. shufps $0xaa, %xmm6, %xmm6
  2752. mulss %xmm1, %xmm6
  2753. subss %xmm6, %xmm0
  2754. mulss %xmm4, %xmm0
  2755. #endif
  2756. #ifdef LT
  2757. movaps 0 * SIZE(AA), %xmm4
  2758. mulss %xmm4, %xmm0
  2759. movaps %xmm4, %xmm6
  2760. shufps $0x55, %xmm6, %xmm6
  2761. mulss %xmm0, %xmm6
  2762. subss %xmm6, %xmm1
  2763. movaps %xmm4, %xmm6
  2764. shufps $0xff, %xmm6, %xmm6
  2765. mulss %xmm6, %xmm1
  2766. #endif
  2767. #ifdef RN
  2768. movss 0 * SIZE(B), %xmm6
  2769. shufps $0x00, %xmm6, %xmm6
  2770. mulps %xmm6, %xmm0
  2771. #endif
  2772. #ifdef RT
  2773. movss 0 * SIZE(B), %xmm6
  2774. shufps $0x00, %xmm6, %xmm6
  2775. mulps %xmm6, %xmm0
  2776. #endif
  2777. #if defined(LN) || defined(LT)
  2778. movss %xmm0, 0 * SIZE(B)
  2779. movss %xmm1, 1 * SIZE(B)
  2780. shufps $0x00, %xmm0, %xmm0
  2781. shufps $0x00, %xmm1, %xmm1
  2782. movaps %xmm0, 0 * SIZE(BB)
  2783. movaps %xmm1, 4 * SIZE(BB)
  2784. #else
  2785. movlps %xmm0, 0 * SIZE(AA)
  2786. #endif
  2787. #ifdef LN
  2788. subl $2 * SIZE, CO1
  2789. #endif
  2790. #if defined(LN) || defined(LT)
  2791. movss %xmm0, 0 * SIZE(CO1)
  2792. movss %xmm1, 1 * SIZE(CO1)
  2793. #else
  2794. movlps %xmm0, 0 * SIZE(CO1)
  2795. #endif
  2796. #ifndef LN
  2797. addl $2 * SIZE, CO1
  2798. #endif
  2799. #if defined(LT) || defined(RN)
  2800. movl K, %eax
  2801. subl KK, %eax
  2802. leal (,%eax, SIZE), %eax
  2803. leal (AA, %eax, 2), AA
  2804. #ifdef LT
  2805. addl $2 * SIZE, B
  2806. #endif
  2807. #endif
  2808. #ifdef LN
  2809. subl $2, KK
  2810. movl BORIG, B
  2811. #endif
  2812. #ifdef LT
  2813. addl $2, KK
  2814. #endif
  2815. #ifdef RT
  2816. movl K, %eax
  2817. movl BORIG, B
  2818. sall $1 + BASE_SHIFT, %eax
  2819. addl %eax, AORIG
  2820. #endif
  2821. ALIGN_2
  2822. .L170:
  2823. testl $1, M
  2824. jle .L179
  2825. #ifdef LN
  2826. movl K, %eax
  2827. sall $BASE_SHIFT, %eax
  2828. subl %eax, AORIG
  2829. #endif
  2830. #if defined(LN) || defined(RT)
  2831. movl KK, %eax
  2832. movl AORIG, AA
  2833. leal (AA, %eax, SIZE), AA
  2834. #endif
  2835. leal BUFFER, BB
  2836. #if defined(LN) || defined(RT)
  2837. movl KK, %eax
  2838. sall $BASE_SHIFT, %eax
  2839. leal (BB, %eax, 4), BB
  2840. #endif
  2841. movss 0 * SIZE(BB), %xmm2
  2842. xorps %xmm4, %xmm4
  2843. movss 0 * SIZE(AA), %xmm0
  2844. xorps %xmm5, %xmm5
  2845. movss 16 * SIZE(BB), %xmm3
  2846. xorps %xmm6, %xmm6
  2847. movss 4 * SIZE(AA), %xmm1
  2848. xorps %xmm7, %xmm7
  2849. #if defined(LT) || defined(RN)
  2850. movl KK, %eax
  2851. #else
  2852. movl K, %eax
  2853. subl KK, %eax
  2854. #endif
  2855. sarl $3, %eax
  2856. je .L172
  2857. ALIGN_2
  2858. .L171:
  2859. mulss %xmm0, %xmm2
  2860. movss 1 * SIZE(AA), %xmm0
  2861. addss %xmm2, %xmm4
  2862. mulss 4 * SIZE(BB), %xmm0
  2863. movss 32 * SIZE(BB), %xmm2
  2864. addss %xmm0, %xmm5
  2865. movss 2 * SIZE(AA), %xmm0
  2866. mulss 8 * SIZE(BB), %xmm0
  2867. addss %xmm0, %xmm6
  2868. movss 3 * SIZE(AA), %xmm0
  2869. mulss 12 * SIZE(BB), %xmm0
  2870. addss %xmm0, %xmm7
  2871. movss 8 * SIZE(AA), %xmm0
  2872. mulss %xmm1, %xmm3
  2873. movss 5 * SIZE(AA), %xmm1
  2874. addss %xmm3, %xmm4
  2875. mulss 20 * SIZE(BB), %xmm1
  2876. movss 48 * SIZE(BB), %xmm3
  2877. addss %xmm1, %xmm5
  2878. movss 6 * SIZE(AA), %xmm1
  2879. mulss 24 * SIZE(BB), %xmm1
  2880. addss %xmm1, %xmm6
  2881. movss 7 * SIZE(AA), %xmm1
  2882. mulss 28 * SIZE(BB), %xmm1
  2883. addss %xmm1, %xmm7
  2884. movss 12 * SIZE(AA), %xmm1
  2885. addl $ 8 * SIZE, AA
  2886. addl $32 * SIZE, BB
  2887. decl %eax
  2888. jne .L171
  2889. ALIGN_2
  2890. .L172:
  2891. #if defined(LT) || defined(RN)
  2892. movl KK, %eax
  2893. #else
  2894. movl K, %eax
  2895. subl KK, %eax
  2896. #endif
  2897. andl $7, %eax # if (k & 1)
  2898. BRANCH
  2899. je .L174
  2900. .L173:
  2901. movss 0 * SIZE(AA), %xmm0
  2902. movss 0 * SIZE(BB), %xmm2
  2903. mulss %xmm0, %xmm2
  2904. addss %xmm2, %xmm4
  2905. addl $1 * SIZE, AA
  2906. addl $4 * SIZE, BB
  2907. decl %eax
  2908. jg .L173
  2909. ALIGN_4
  2910. .L174:
  2911. addss %xmm5, %xmm4
  2912. addss %xmm7, %xmm6
  2913. addss %xmm6, %xmm4
  2914. #if defined(LN) || defined(RT)
  2915. movl KK, %eax
  2916. subl $1, %eax
  2917. movl AORIG, AA
  2918. movl BORIG, B
  2919. leal BUFFER, BB
  2920. sall $ BASE_SHIFT, %eax
  2921. leal (AA, %eax, 1), AA
  2922. leal (B, %eax, 1), B
  2923. leal (BB, %eax, 4), BB
  2924. #endif
  2925. #if defined(LN) || defined(LT)
  2926. movss 0 * SIZE(B), %xmm1
  2927. subss %xmm4, %xmm1
  2928. #else
  2929. movss 0 * SIZE(AA), %xmm0
  2930. subss %xmm4, %xmm0
  2931. #endif
  2932. #if defined(LN) || defined(LT)
  2933. mulss 0 * SIZE(AA), %xmm1
  2934. #endif
  2935. #if defined(RN) || defined(RT)
  2936. mulss 0 * SIZE(B), %xmm0
  2937. #endif
  2938. #if defined(LN) || defined(LT)
  2939. movss %xmm1, 0 * SIZE(B)
  2940. shufps $0x00, %xmm1, %xmm1
  2941. movaps %xmm1, 0 * SIZE(BB)
  2942. #else
  2943. movss %xmm0, 0 * SIZE(AA)
  2944. #endif
  2945. #ifdef LN
  2946. subl $1 * SIZE, CO1
  2947. #endif
  2948. #if defined(LN) || defined(LT)
  2949. movss %xmm1, 0 * SIZE(CO1)
  2950. #else
  2951. movss %xmm0, 0 * SIZE(CO1)
  2952. #endif
  2953. #ifndef LN
  2954. addl $1 * SIZE, CO1
  2955. #endif
  2956. #if defined(LT) || defined(RN)
  2957. movl K, %eax
  2958. subl KK, %eax
  2959. leal (AA, %eax, SIZE), AA
  2960. #ifdef LT
  2961. addl $1 * SIZE, B
  2962. #endif
  2963. #endif
  2964. #ifdef LN
  2965. subl $1, KK
  2966. movl BORIG, B
  2967. #endif
  2968. #ifdef LT
  2969. addl $1, KK
  2970. #endif
  2971. #ifdef RT
  2972. movl K, %eax
  2973. movl BORIG, B
  2974. sall $BASE_SHIFT, %eax
  2975. addl %eax, AORIG
  2976. #endif
  2977. ALIGN_2
  2978. .L179:
  2979. #ifdef LN
  2980. movl K, %eax
  2981. leal (B, %eax, SIZE), B
  2982. #endif
  2983. #if defined(LT) || defined(RN)
  2984. movl K, %eax
  2985. subl KK, %eax
  2986. leal (B, %eax, SIZE), B
  2987. #endif
  2988. #ifdef RN
  2989. addl $1, KK
  2990. #endif
  2991. #ifdef RT
  2992. subl $1, KK
  2993. #endif
  2994. ALIGN_4
  2995. .L999:
  2996. movl OLD_STACK, %esp
  2997. popl %ebx
  2998. popl %esi
  2999. popl %edi
  3000. popl %ebp
  3001. ret
  3002. EPILOGUE