
trsm_kernel_LN_4x4_penryn.S 55 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
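/* Single-precision TRSM kernel for 32-bit x86 with SSE, tuned for
   Penryn-class cores: solves a small triangular system against a
   packed panel using 4x4 register blocking. The LN/LT/RN/RT macros
   select the side/transpose variant at compile time (GotoBLAS/OpenBLAS
   convention; this description is inferred from the file name and the
   code below). */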
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 16

#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define ALPHA 16 + STACK + ARGS(%esp)
#define A 20 + STACK + ARGS(%esp)
#define ARG_B 24 + STACK + ARGS(%esp)
#define C 28 + STACK + ARGS(%esp)
#define ARG_LDC 32 + STACK + ARGS(%esp)
#define OFFSET 36 + STACK + ARGS(%esp)

#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#ifdef ATOM
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 8 + 4)
#endif

#ifdef NANO
#define PREFETCH prefetcht0
#define PREFETCHSIZE (16 * 2)
#endif

#define B %edi
#define AA %edx
#define BB %ecx
#define LDC %ebp
#define CO1 %esi
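/* Register roles: B is the packed B panel, AA/BB are the current A/B
   block pointers, LDC is the leading dimension of C (scaled to bytes
   in the prologue), and CO1 points to the current block of C. */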
        PROLOGUE

        subl $ARGS, %esp

        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl ARG_B, B
        movl ARG_LDC, LDC
        movl OFFSET, %eax
#ifdef RN
        negl %eax
#endif
        movl %eax, KK

        leal (, LDC, SIZE), LDC

        subl $-32 * SIZE, A
        subl $-32 * SIZE, B

#ifdef LN
        movl M, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, C
        imull K, %eax
        addl %eax, A
#endif

#ifdef RT
        movl N, %eax
        leal (, %eax, SIZE), %eax
        imull K, %eax
        addl %eax, B

        movl N, %eax
        imull LDC, %eax
        addl %eax, C
#endif

#ifdef RT
        movl N, %eax
        subl OFFSET, %eax
        movl %eax, KK
#endif

        movl N, %eax
        sarl $2, %eax
        movl %eax, J
        jle .L40
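/* Outer loop over groups of four columns of B (j = N >> 2). */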
.L10:
#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif

#ifdef RT
        movl K, %eax
        sall $2 + BASE_SHIFT, %eax
        subl %eax, B
#endif

        leal (, LDC, 4), %eax

#ifdef RT
        subl %eax, C
#endif
        movl C, CO1
#ifndef RT
        addl %eax, C
#endif

#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif

#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif
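/* M & 1: one leftover row of A against the four-column panel. */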
        testl $1, M
        je .L20

#ifdef LN
        movl K, %eax
        sall $BASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (AA, %eax, SIZE), AA
#endif

        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $2 + BASE_SHIFT, %eax
        addl %eax, BB
#endif

        pxor %xmm4, %xmm4
        movsd -32 * SIZE(AA), %xmm0
        pxor %xmm5, %xmm5
        movaps -32 * SIZE(BB), %xmm1
        pxor %xmm6, %xmm6
        pxor %xmm7, %xmm7

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L35
        ALIGN_4
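/* K loop unrolled by 8: broadcast one A element per step and
   accumulate a 1x4 row of results in %xmm4. */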
.L32:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -28 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -24 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -20 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -28 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -16 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -12 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -26 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -8 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -4 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -24 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps 0 * SIZE(BB), %xmm1

        subl $ -8 * SIZE, AA
        subl $-32 * SIZE, BB
        subl $1, %eax
        jne .L32
        ALIGN_4

.L35:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax  # if (k & 7)
        BRANCH
        je .L38
        ALIGN_4

.L36:
        pshufd $0x00, %xmm0, %xmm2
        movss -31 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -28 * SIZE(BB), %xmm1

        addl $1 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L36
        ALIGN_4

.L38:
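/* Rewind AA/BB to the start of the current block, then run the
   triangular solve against the diagonal block. */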
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $4, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (B, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
        movaps -32 * SIZE(BB), %xmm1
        subps %xmm4, %xmm1
#else
        movsd -32 * SIZE(AA), %xmm0
        movhps -30 * SIZE(AA), %xmm0
        subps %xmm4, %xmm0

        pshufd $0xff, %xmm0, %xmm3
        pshufd $0xaa, %xmm0, %xmm2
        pshufd $0x55, %xmm0, %xmm1
        pshufd $0x00, %xmm0, %xmm0
#endif

#if defined(LN) || defined(LT)
        movss -32 * SIZE(AA), %xmm4
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm6, %xmm1
#endif

#ifdef RN
        movaps -32 * SIZE(BB), %xmm6
        pshufd $0x00, %xmm6, %xmm7
        mulss %xmm7, %xmm0
        pshufd $0x55, %xmm6, %xmm7
        mulss %xmm0, %xmm7
        subss %xmm7, %xmm1
        pshufd $0xaa, %xmm6, %xmm7
        mulss %xmm0, %xmm7
        subss %xmm7, %xmm2
        pshufd $0xff, %xmm6, %xmm7
        mulss %xmm0, %xmm7
        subss %xmm7, %xmm3

        movaps -28 * SIZE(BB), %xmm6
        pshufd $0x55, %xmm6, %xmm7
        mulss %xmm7, %xmm1
        pshufd $0xaa, %xmm6, %xmm7
        mulss %xmm1, %xmm7
        subss %xmm7, %xmm2
        pshufd $0xff, %xmm6, %xmm7
        mulss %xmm1, %xmm7
        subss %xmm7, %xmm3

        movaps -24 * SIZE(BB), %xmm6
        pshufd $0xaa, %xmm6, %xmm7
        mulss %xmm7, %xmm2
        pshufd $0xff, %xmm6, %xmm7
        mulss %xmm2, %xmm7
        subss %xmm7, %xmm3

        movaps -20 * SIZE(BB), %xmm6
        pshufd $0xff, %xmm6, %xmm7
        mulss %xmm7, %xmm3
#endif

#ifdef RT
        movaps -20 * SIZE(BB), %xmm6
        pshufd $0xff, %xmm6, %xmm7
        mulss %xmm7, %xmm3
        pshufd $0xaa, %xmm6, %xmm7
        mulss %xmm3, %xmm7
        subss %xmm7, %xmm2
        pshufd $0x55, %xmm6, %xmm7
        mulss %xmm3, %xmm7
        subss %xmm7, %xmm1
        pshufd $0x00, %xmm6, %xmm7
        mulss %xmm3, %xmm7
        subss %xmm7, %xmm0

        movaps -24 * SIZE(BB), %xmm6
        pshufd $0xaa, %xmm6, %xmm7
        mulss %xmm7, %xmm2
        pshufd $0x55, %xmm6, %xmm7
        mulss %xmm2, %xmm7
        subss %xmm7, %xmm1
        pshufd $0x00, %xmm6, %xmm7
        mulss %xmm2, %xmm7
        subss %xmm7, %xmm0

        movaps -28 * SIZE(BB), %xmm6
        pshufd $0x55, %xmm6, %xmm7
        mulss %xmm7, %xmm1
        pshufd $0x00, %xmm6, %xmm7
        mulss %xmm1, %xmm7
        subss %xmm7, %xmm0

        movaps -32 * SIZE(BB), %xmm6
        pshufd $0x00, %xmm6, %xmm7
        mulss %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
        movaps %xmm1, -32 * SIZE(BB)
#else
        movss %xmm0, -32 * SIZE(AA)
        movss %xmm1, -31 * SIZE(AA)
        movss %xmm2, -30 * SIZE(AA)
        movss %xmm3, -29 * SIZE(AA)
#endif

#ifdef LN
        subl $1 * SIZE, CO1
#endif

        leal (LDC, LDC, 2), %eax

#if defined(LN) || defined(LT)
        movaps %xmm1, %xmm0
        unpcklps %xmm5, %xmm1
        unpckhps %xmm5, %xmm0

        movaps %xmm3, %xmm4
        unpcklps %xmm7, %xmm3
        unpckhps %xmm7, %xmm4

        movaps %xmm1, %xmm2
        unpcklps %xmm3, %xmm1
        unpckhps %xmm3, %xmm2

        movaps %xmm0, %xmm6
        unpcklps %xmm4, %xmm0
        unpckhps %xmm4, %xmm6

        movss %xmm1, 0 * SIZE(CO1)
        movss %xmm2, 0 * SIZE(CO1, LDC, 1)
        movss %xmm0, 0 * SIZE(CO1, LDC, 2)
        movss %xmm6, 0 * SIZE(CO1, %eax, 1)
#else
        movss %xmm0, 0 * SIZE(CO1)
        movss %xmm1, 0 * SIZE(CO1, LDC, 1)
        movss %xmm2, 0 * SIZE(CO1, LDC, 2)
        movss %xmm3, 0 * SIZE(CO1, %eax, 1)
#endif

#ifndef LN
        addl $1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 4), BB
#endif

#ifdef LN
        subl $1, KK
#endif
#ifdef LT
        addl $1, KK
#endif

#ifdef RT
        movl K, %eax
        sall $BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4
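/* M & 2: two leftover rows of A against the four-column panel. */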
.L20:
        testl $2, M
        je .L30

#ifdef LN
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
#endif

        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $2 + BASE_SHIFT, %eax
        addl %eax, BB
#endif

        pxor %xmm4, %xmm4
        movaps -32 * SIZE(AA), %xmm0
        pxor %xmm5, %xmm5
        movaps -32 * SIZE(BB), %xmm1
        pxor %xmm6, %xmm6
        pxor %xmm7, %xmm7

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L25
        ALIGN_4
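/* K loop unrolled by 8: pshufd pairs form a 2x4 tile per step,
   accumulating into %xmm4-%xmm7. */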
.L22:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

        pshufd $0x44, %xmm0, %xmm2
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm6

        pshufd $0xee, %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -24 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm7

        pshufd $0x44, %xmm0, %xmm2
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -20 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm6

        pshufd $0xee, %xmm0, %xmm2
        movaps -24 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -16 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm7

        pshufd $0x44, %xmm0, %xmm2
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -12 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm6

        pshufd $0xee, %xmm0, %xmm2
        movaps -20 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -8 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm7

        pshufd $0x44, %xmm0, %xmm2
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -4 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm6

        pshufd $0xee, %xmm0, %xmm2
        movaps -16 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps 0 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm7

        subl $-16 * SIZE, AA
        subl $-32 * SIZE, BB
        subl $1, %eax
        jne .L22
        ALIGN_4

.L25:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax  # if (k & 7)
        BRANCH
        je .L28
        ALIGN_4

.L26:
        pshufd $0x44, %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm6

        addl $2 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L26
        ALIGN_4

.L28:
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $4, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (B, %eax, 4), BB
#endif

        addps %xmm5, %xmm4
        addps %xmm7, %xmm6
        movhlps %xmm4, %xmm5
        movhlps %xmm6, %xmm7

#if defined(LN) || defined(LT)
        unpcklps %xmm6, %xmm4
        unpcklps %xmm7, %xmm5

        movaps %xmm4, %xmm6
        unpcklps %xmm5, %xmm4
        unpckhps %xmm5, %xmm6

        movaps -32 * SIZE(BB), %xmm1
        movaps -28 * SIZE(BB), %xmm3
        subps %xmm4, %xmm1
        subps %xmm6, %xmm3
#else
        movsd -32 * SIZE(AA), %xmm0
        movsd -30 * SIZE(AA), %xmm1
        movsd -28 * SIZE(AA), %xmm2
        movsd -26 * SIZE(AA), %xmm3

        subps %xmm4, %xmm0
        subps %xmm5, %xmm1
        subps %xmm6, %xmm2
        subps %xmm7, %xmm3
#endif

#ifdef LN
        movaps -32 * SIZE(AA), %xmm4
        pshufd $0xff, %xmm4, %xmm6
        mulps %xmm6, %xmm3
        pshufd $0xaa, %xmm4, %xmm6
        mulps %xmm3, %xmm6
        subps %xmm6, %xmm1
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm6, %xmm1
#endif

#ifdef LT
        movaps -32 * SIZE(AA), %xmm4
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm6, %xmm1
        pshufd $0x55, %xmm4, %xmm6
        mulps %xmm1, %xmm6
        subps %xmm6, %xmm3
        pshufd $0xff, %xmm4, %xmm6
        mulps %xmm6, %xmm3
#endif

#ifdef RN
        movaps -32 * SIZE(BB), %xmm6
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm7, %xmm0
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm0, %xmm7
        subps %xmm7, %xmm1
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm0, %xmm7
        subps %xmm7, %xmm2
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm0, %xmm7
        subps %xmm7, %xmm3

        movaps -28 * SIZE(BB), %xmm6
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm7, %xmm1
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm1, %xmm7
        subps %xmm7, %xmm2
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm1, %xmm7
        subps %xmm7, %xmm3

        movaps -24 * SIZE(BB), %xmm6
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm7, %xmm2
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm2, %xmm7
        subps %xmm7, %xmm3

        movaps -20 * SIZE(BB), %xmm6
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm7, %xmm3
#endif

#ifdef RT
        movaps -20 * SIZE(BB), %xmm6
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm7, %xmm3
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm3, %xmm7
        subps %xmm7, %xmm2
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm3, %xmm7
        subps %xmm7, %xmm1
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm3, %xmm7
        subps %xmm7, %xmm0

        movaps -24 * SIZE(BB), %xmm6
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm7, %xmm2
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm2, %xmm7
        subps %xmm7, %xmm1
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm2, %xmm7
        subps %xmm7, %xmm0

        movaps -28 * SIZE(BB), %xmm6
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm7, %xmm1
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm1, %xmm7
        subps %xmm7, %xmm0

        movaps -32 * SIZE(BB), %xmm6
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
        movaps %xmm1, -32 * SIZE(BB)
        movaps %xmm3, -28 * SIZE(BB)
#else
        movlps %xmm0, -32 * SIZE(AA)
        movlps %xmm1, -30 * SIZE(AA)
        movlps %xmm2, -28 * SIZE(AA)
        movlps %xmm3, -26 * SIZE(AA)
#endif

#ifdef LN
        subl $2 * SIZE, CO1
#endif

        leal (LDC, LDC, 2), %eax

#if defined(LN) || defined(LT)
        movaps %xmm1, %xmm0
        unpcklps %xmm5, %xmm1
        unpckhps %xmm5, %xmm0

        movaps %xmm3, %xmm4
        unpcklps %xmm7, %xmm3
        unpckhps %xmm7, %xmm4

        movaps %xmm1, %xmm2
        unpcklps %xmm3, %xmm1
        unpckhps %xmm3, %xmm2

        movaps %xmm0, %xmm6
        unpcklps %xmm4, %xmm0
        unpckhps %xmm4, %xmm6

        movlps %xmm1, 0 * SIZE(CO1)
        movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
        movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
        movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
#else
        movlps %xmm0, 0 * SIZE(CO1)
        movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
        movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
        movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
#endif

#ifndef LN
        addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB
#endif

#ifdef LN
        subl $2, KK
#endif
#ifdef LT
        addl $2, KK
#endif

#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4
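/* Main case: loop over blocks of four rows (i = M >> 2). */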
.L30:
        movl M, %ebx
        sarl $2, %ebx  # i = (m >> 2)
        jle .L39
        ALIGN_4

.L11:
#ifdef LN
        movl K, %eax
        sall $2 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 4), AA
#endif

        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $2 + BASE_SHIFT, %eax
        addl %eax, BB
#endif

        leal (CO1, LDC, 2), %eax

        movaps -32 * SIZE(AA), %xmm0
        pxor %xmm2, %xmm2
        movaps -32 * SIZE(BB), %xmm1
        pxor %xmm3, %xmm3

#ifdef LN
        pxor %xmm4, %xmm4
        prefetcht0 -4 * SIZE(CO1)
        pxor %xmm5, %xmm5
        prefetcht0 -4 * SIZE(CO1, LDC)
        pxor %xmm6, %xmm6
        prefetcht0 -4 * SIZE(%eax)
        pxor %xmm7, %xmm7
        prefetcht0 -4 * SIZE(%eax, LDC)
#else
        pxor %xmm4, %xmm4
        prefetcht0 3 * SIZE(CO1)
        pxor %xmm5, %xmm5
        prefetcht0 3 * SIZE(CO1, LDC)
        pxor %xmm6, %xmm6
        prefetcht0 3 * SIZE(%eax)
        pxor %xmm7, %xmm7
        prefetcht0 3 * SIZE(%eax, LDC)
#endif

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L15
        ALIGN_4
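/* Main 4x4 microkernel, K unrolled by 8: pshufd $0x93 rotates the B
   row one lane per update, so four rotated rank-1 products fill the
   whole 4x4 tile in %xmm4-%xmm7 (unrotated afterwards at .L18). */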
.L12:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -24 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -24 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -20 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -20 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -16 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -16 * SIZE(AA), %xmm0

        PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -12 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -8 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -4 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -4 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        subl $-32 * SIZE, BB
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        subl $-32 * SIZE, AA
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -32 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -32 * SIZE(AA), %xmm0

        subl $1, %eax
        jne .L12
        ALIGN_4

.L15:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax  # if (k & 7)
        BRANCH
        je .L18
        ALIGN_4

.L16:
        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0

        addl $4 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L16
        ALIGN_4

.L18:
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $4, %eax
#else
        subl $4, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        leal (B, %eax, 4), BB
#endif

        addps %xmm3, %xmm6
        addps %xmm2, %xmm7
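/* Rearrange the rotated accumulators back into plain row order and
   subtract them from the packed B block (LN/LT) or packed A block
   (RN/RT). */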
#if defined(LN) || defined(LT)
        movaps %xmm4, %xmm0
        unpcklps %xmm7, %xmm0
        unpckhps %xmm7, %xmm4

        movaps %xmm6, %xmm2
        unpcklps %xmm5, %xmm2
        unpckhps %xmm5, %xmm6

        movaps %xmm0, %xmm1
        movlhps %xmm2, %xmm0
        movhlps %xmm2, %xmm1

        movaps %xmm6, %xmm7
        movlhps %xmm4, %xmm6
        movhlps %xmm4, %xmm7

        pshufd $0x39, %xmm1, %xmm2
        pshufd $0x39, %xmm7, %xmm4

        movaps -32 * SIZE(BB), %xmm1
        movaps -28 * SIZE(BB), %xmm3
        movaps -24 * SIZE(BB), %xmm5
        movaps -20 * SIZE(BB), %xmm7

        subps %xmm0, %xmm1
        subps %xmm2, %xmm3
        subps %xmm6, %xmm5
        subps %xmm4, %xmm7
#else
        pshufd $0x39, %xmm5, %xmm2
        pshufd $0x4e, %xmm6, %xmm0
        pshufd $0x93, %xmm7, %xmm7

        movaps %xmm4, %xmm6
        unpcklps %xmm0, %xmm4
        unpckhps %xmm0, %xmm6

        movaps %xmm2, %xmm1
        unpcklps %xmm7, %xmm2
        unpckhps %xmm7, %xmm1

        movaps %xmm4, %xmm5
        unpcklps %xmm2, %xmm4
        unpckhps %xmm2, %xmm5

        movaps %xmm6, %xmm7
        unpcklps %xmm1, %xmm6
        unpckhps %xmm1, %xmm7

        pshufd $0x93, %xmm5, %xmm5
        pshufd $0x4e, %xmm6, %xmm6
        pshufd $0x39, %xmm7, %xmm7

        movaps -32 * SIZE(AA), %xmm0
        movaps -28 * SIZE(AA), %xmm1
        movaps -24 * SIZE(AA), %xmm2
        movaps -20 * SIZE(AA), %xmm3

        subps %xmm4, %xmm0
        subps %xmm5, %xmm1
        subps %xmm6, %xmm2
        subps %xmm7, %xmm3
#endif
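/* 4x4 triangular solve against the diagonal block. The diagonal
   entries appear to be stored pre-inverted by the packing routines
   (there is no divide instruction anywhere in the solve), so mulps by
   the broadcast diagonal element performs the division. */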
#ifdef LN
        movaps -20 * SIZE(AA), %xmm4
        pshufd $0xff, %xmm4, %xmm6
        mulps %xmm6, %xmm7
        pshufd $0xaa, %xmm4, %xmm6
        mulps %xmm7, %xmm6
        subps %xmm6, %xmm5
        pshufd $0x55, %xmm4, %xmm6
        mulps %xmm7, %xmm6
        subps %xmm6, %xmm3
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm7, %xmm6
        subps %xmm6, %xmm1

        movaps -24 * SIZE(AA), %xmm4
        pshufd $0xaa, %xmm4, %xmm6
        mulps %xmm6, %xmm5
        pshufd $0x55, %xmm4, %xmm6
        mulps %xmm5, %xmm6
        subps %xmm6, %xmm3
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm5, %xmm6
        subps %xmm6, %xmm1

        movaps -28 * SIZE(AA), %xmm4
        pshufd $0x55, %xmm4, %xmm6
        mulps %xmm6, %xmm3
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm3, %xmm6
        subps %xmm6, %xmm1

        movaps -32 * SIZE(AA), %xmm4
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm6, %xmm1
#endif

#ifdef LT
        movaps -32 * SIZE(AA), %xmm4
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm6, %xmm1
        pshufd $0x55, %xmm4, %xmm6
        mulps %xmm1, %xmm6
        subps %xmm6, %xmm3
        pshufd $0xaa, %xmm4, %xmm6
        mulps %xmm1, %xmm6
        subps %xmm6, %xmm5
        pshufd $0xff, %xmm4, %xmm6
        mulps %xmm1, %xmm6
        subps %xmm6, %xmm7

        movaps -28 * SIZE(AA), %xmm4
        pshufd $0x55, %xmm4, %xmm6
        mulps %xmm6, %xmm3
        pshufd $0xaa, %xmm4, %xmm6
        mulps %xmm3, %xmm6
        subps %xmm6, %xmm5
        pshufd $0xff, %xmm4, %xmm6
        mulps %xmm3, %xmm6
        subps %xmm6, %xmm7

        movaps -24 * SIZE(AA), %xmm4
        pshufd $0xaa, %xmm4, %xmm6
        mulps %xmm6, %xmm5
        pshufd $0xff, %xmm4, %xmm6
        mulps %xmm5, %xmm6
        subps %xmm6, %xmm7

        movaps -20 * SIZE(AA), %xmm4
        pshufd $0xff, %xmm4, %xmm6
        mulps %xmm6, %xmm7
#endif

#ifdef RN
        movaps -32 * SIZE(BB), %xmm6
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm7, %xmm0
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm0, %xmm7
        subps %xmm7, %xmm1
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm0, %xmm7
        subps %xmm7, %xmm2
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm0, %xmm7
        subps %xmm7, %xmm3

        movaps -28 * SIZE(BB), %xmm6
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm7, %xmm1
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm1, %xmm7
        subps %xmm7, %xmm2
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm1, %xmm7
        subps %xmm7, %xmm3

        movaps -24 * SIZE(BB), %xmm6
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm7, %xmm2
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm2, %xmm7
        subps %xmm7, %xmm3

        movaps -20 * SIZE(BB), %xmm6
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm7, %xmm3
#endif

#ifdef RT
        movaps -20 * SIZE(BB), %xmm6
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm7, %xmm3
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm3, %xmm7
        subps %xmm7, %xmm2
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm3, %xmm7
        subps %xmm7, %xmm1
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm3, %xmm7
        subps %xmm7, %xmm0

        movaps -24 * SIZE(BB), %xmm6
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm7, %xmm2
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm2, %xmm7
        subps %xmm7, %xmm1
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm2, %xmm7
        subps %xmm7, %xmm0

        movaps -28 * SIZE(BB), %xmm6
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm7, %xmm1
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm1, %xmm7
        subps %xmm7, %xmm0

        movaps -32 * SIZE(BB), %xmm6
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
        movaps %xmm1, -32 * SIZE(BB)
        movaps %xmm3, -28 * SIZE(BB)
        movaps %xmm5, -24 * SIZE(BB)
        movaps %xmm7, -20 * SIZE(BB)
#else
        movaps %xmm0, -32 * SIZE(AA)
        movaps %xmm1, -28 * SIZE(AA)
        movaps %xmm2, -24 * SIZE(AA)
        movaps %xmm3, -20 * SIZE(AA)
#endif

#ifdef LN
        subl $4 * SIZE, CO1
#endif

        leal (LDC, LDC, 2), %eax

#if defined(LN) || defined(LT)
        movaps %xmm1, %xmm0
        unpcklps %xmm5, %xmm1
        unpckhps %xmm5, %xmm0

        movaps %xmm3, %xmm4
        unpcklps %xmm7, %xmm3
        unpckhps %xmm7, %xmm4

        movaps %xmm1, %xmm2
        unpcklps %xmm3, %xmm1
        unpckhps %xmm3, %xmm2

        movaps %xmm0, %xmm6
        unpcklps %xmm4, %xmm0
        unpckhps %xmm4, %xmm6

        movlps %xmm1, 0 * SIZE(CO1)
        movhps %xmm1, 2 * SIZE(CO1)
        movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
        movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
        movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
        movhps %xmm0, 2 * SIZE(CO1, LDC, 2)
        movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
        movhps %xmm6, 2 * SIZE(CO1, %eax, 1)
#else
        movlps %xmm0, 0 * SIZE(CO1)
        movhps %xmm0, 2 * SIZE(CO1)
        movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
        movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
        movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
        movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
        movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
        movhps %xmm3, 2 * SIZE(CO1, %eax, 1)
#endif

#ifndef LN
        addl $4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        leal (BB, %eax, 4), BB
#endif

#ifdef LN
        subl $4, KK
#endif
#ifdef LT
        addl $4, KK
#endif

#ifdef RT
        movl K, %eax
        sall $2 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif

        decl %ebx  # i --
        jg .L11
        ALIGN_4

.L39:
#ifdef LN
        movl K, %eax
        leal (, %eax, SIZE), %eax
        leal (B, %eax, 4), B
#endif

#if defined(LT) || defined(RN)
        movl BB, B
#endif

#ifdef RN
        addl $4, KK
#endif
#ifdef RT
        subl $4, KK
#endif

        decl J  # j --
        jg .L10
        ALIGN_4
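/* N & 2: handle the remaining pair of columns with the same structure. */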
.L40:
        testl $2, N
        je .L80

#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif

#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, B
#endif

        leal (, LDC, 2), %eax

#ifdef RT
        subl %eax, C
#endif
        movl C, CO1
#ifndef RT
        addl %eax, C
#endif

#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif

#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif
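/* M & 1: one leftover row against the two-column panel. */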
        testl $1, M
        je .L60

#ifdef LN
        movl K, %eax
        sall $BASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (AA, %eax, SIZE), AA
#endif

        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, BB
#endif

        pxor %xmm4, %xmm4
        movsd -32 * SIZE(AA), %xmm0
        pxor %xmm5, %xmm5
        movsd -32 * SIZE(BB), %xmm1

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L75
        ALIGN_4

.L72:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -30 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm5
        movsd -28 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -26 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -28 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm5
        movsd -24 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -22 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -26 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm5
        movsd -20 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -18 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -24 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm5
        movsd -16 * SIZE(BB), %xmm1

        subl $ -8 * SIZE, AA
        subl $-16 * SIZE, BB
        subl $1, %eax
        jne .L72
        ALIGN_4

.L75:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax  # if (k & 7)
        BRANCH
        je .L78
        ALIGN_4

.L76:
        pshufd $0x00, %xmm0, %xmm2
        movss -31 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -30 * SIZE(BB), %xmm1

        addl $1 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L76
        ALIGN_4

.L78:
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $2, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (B, %eax, 2), BB
#endif

        addps %xmm5, %xmm4
        pshufd $0x55, %xmm4, %xmm5
        pshufd $0x00, %xmm4, %xmm4

#if defined(LN) || defined(LT)
        unpcklps %xmm5, %xmm4

        movsd -32 * SIZE(BB), %xmm1
        subps %xmm4, %xmm1
#else
        movss -32 * SIZE(AA), %xmm0
        movss -31 * SIZE(AA), %xmm1
        subss %xmm4, %xmm0
        subss %xmm5, %xmm1
#endif

#if defined(LN) || defined(LT)
        movss -32 * SIZE(AA), %xmm4
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm6, %xmm1
#endif

#ifdef RN
        movaps -32 * SIZE(BB), %xmm6
        pshufd $0x00, %xmm6, %xmm7
        mulss %xmm7, %xmm0
        pshufd $0x55, %xmm6, %xmm7
        mulss %xmm0, %xmm7
        subss %xmm7, %xmm1
        pshufd $0xff, %xmm6, %xmm7
        mulss %xmm7, %xmm1
#endif

#ifdef RT
        movaps -32 * SIZE(BB), %xmm6
        pshufd $0xff, %xmm6, %xmm7
        mulss %xmm7, %xmm1
        pshufd $0xaa, %xmm6, %xmm7
        mulss %xmm1, %xmm7
        subss %xmm7, %xmm0
        pshufd $0x00, %xmm6, %xmm7
        mulss %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
        movlps %xmm1, -32 * SIZE(BB)
#else
        movss %xmm0, -32 * SIZE(AA)
        movss %xmm1, -31 * SIZE(AA)
#endif

#ifdef LN
        subl $1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
        pshufd $1, %xmm1, %xmm3
        movss %xmm1, 0 * SIZE(CO1)
        movss %xmm3, 0 * SIZE(CO1, LDC)
#else
        movss %xmm0, 0 * SIZE(CO1)
        movss %xmm1, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
        addl $1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 2), BB
#endif

#ifdef LN
        subl $1, KK
#endif
#ifdef LT
        addl $1, KK
#endif

#ifdef RT
        movl K, %eax
        sall $BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4
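/* M & 2: two leftover rows against the two-column panel. */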
.L60:
        testl $2, M
        je .L70

#ifdef LN
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
#endif

        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, BB
#endif

        movaps -32 * SIZE(AA), %xmm0
        pxor %xmm3, %xmm3
        movaps -32 * SIZE(BB), %xmm1
        pxor %xmm4, %xmm4
        pxor %xmm5, %xmm5

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L65
        ALIGN_4

.L62:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

        pshufd $0x44, %xmm0, %xmm2
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3

        pshufd $0xee, %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        pshufd $0x44, %xmm0, %xmm2
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3

        pshufd $0xee, %xmm0, %xmm2
        movaps -24 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -24 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        pshufd $0x44, %xmm0, %xmm2
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3

        pshufd $0xee, %xmm0, %xmm2
        movaps -20 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -20 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        pshufd $0x44, %xmm0, %xmm2
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3

        pshufd $0xee, %xmm0, %xmm2
        movaps -16 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -16 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        subl $-16 * SIZE, AA
        subl $-16 * SIZE, BB
        subl $1, %eax
        jne .L62
        ALIGN_4

.L65:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax  # if (k & 7)
        BRANCH
        je .L68
        ALIGN_4

.L66:
        pshufd $0x44, %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        movsd -30 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        addl $2 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L66
        ALIGN_4

.L68:
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $2, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (B, %eax, 2), BB
#endif

        addps %xmm3, %xmm4
        addps %xmm5, %xmm4
        movhlps %xmm4, %xmm5

#if defined(LN) || defined(LT)
        unpcklps %xmm6, %xmm4
        unpcklps %xmm7, %xmm5

        movaps %xmm4, %xmm6
        unpcklps %xmm5, %xmm4
        unpckhps %xmm5, %xmm6

        movsd -32 * SIZE(BB), %xmm1
        movsd -30 * SIZE(BB), %xmm3
        subps %xmm4, %xmm1
        subps %xmm6, %xmm3
#else
        movsd -32 * SIZE(AA), %xmm0
        movsd -30 * SIZE(AA), %xmm1
        subps %xmm4, %xmm0
        subps %xmm5, %xmm1
#endif

#ifdef LN
        movaps -32 * SIZE(AA), %xmm4
        pshufd $0xff, %xmm4, %xmm6
        mulps %xmm6, %xmm3
        pshufd $0xaa, %xmm4, %xmm6
        mulps %xmm3, %xmm6
        subps %xmm6, %xmm1
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm6, %xmm1
#endif

#ifdef LT
        movaps -32 * SIZE(AA), %xmm4
        pshufd $0x00, %xmm4, %xmm6
        mulps %xmm6, %xmm1
        pshufd $0x55, %xmm4, %xmm6
        mulps %xmm1, %xmm6
        subps %xmm6, %xmm3
        pshufd $0xff, %xmm4, %xmm6
        mulps %xmm6, %xmm3
#endif

#ifdef RN
        movaps -32 * SIZE(BB), %xmm6
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm7, %xmm0
        pshufd $0x55, %xmm6, %xmm7
        mulps %xmm0, %xmm7
        subps %xmm7, %xmm1
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm7, %xmm1
#endif

#ifdef RT
        movaps -32 * SIZE(BB), %xmm6
        pshufd $0xff, %xmm6, %xmm7
        mulps %xmm7, %xmm1
        pshufd $0xaa, %xmm6, %xmm7
        mulps %xmm1, %xmm7
        subps %xmm7, %xmm0
        pshufd $0x00, %xmm6, %xmm7
        mulps %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
        movlps %xmm1, -32 * SIZE(BB)
        movlps %xmm3, -30 * SIZE(BB)
#else
        movlps %xmm0, -32 * SIZE(AA)
        movlps %xmm1, -30 * SIZE(AA)
#endif

#ifdef LN
        subl $2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
        unpcklps %xmm3, %xmm1

        movlps %xmm1, 0 * SIZE(CO1)
        movhps %xmm1, 0 * SIZE(CO1, LDC)
#else
        movlps %xmm0, 0 * SIZE(CO1)
        movlps %xmm1, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
        addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 2), BB
#endif

#ifdef LN
        subl $2, KK
#endif
#ifdef LT
        addl $2, KK
#endif

#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4
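/* Blocks of four rows against the two-column panel (i = M >> 2). */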
.L70:
	movl	M, %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L79
	ALIGN_4

.L51:
#ifdef LN
	movl	K, %eax
	sall	$2 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, BB
#endif
	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

#ifdef LN
	pxor	%xmm4, %xmm4
	prefetcht0	-4 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0	-4 * SIZE(CO1, LDC)
#else
	pxor	%xmm4, %xmm4
	prefetcht0	3 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0	3 * SIZE(CO1, LDC)
#endif
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax	# k / 8: loop below is unrolled by 8
	je	.L55
	ALIGN_4
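/*
 * .L52: inner product loop, unrolled 8x in k.  Each 4-float load of
 * B covers two k steps of the two columns; pshufd $0x00/$0x55
 * broadcast the column entries for step k, $0xaa/$0xff for step k+1.
 * Partial sums alternate between xmm4..xmm7 to hide mulps latency.
 */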
.L52:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-16 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-12 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-8 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-4 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	 0 * SIZE(AA), %xmm0

	subl	$-32 * SIZE, AA
	subl	$-16 * SIZE, BB

	subl	$1, %eax
	jne	.L52
	ALIGN_4
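/*
 * .L55/.L56: handle the k % 8 leftover iterations one at a time.
 */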
.L55:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax	# k & 7 (remainder iterations)
	BRANCH
	je	.L58
	ALIGN_4

.L56:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$2 * SIZE, BB

	decl	%eax
	jg	.L56
	ALIGN_4
.L58:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax
#else
	subl	$2, %eax
#endif
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(B,  %eax, 2), BB
#endif
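/*
 * Fold the alternating accumulators and, for the left-side cases,
 * reorder the 4x2 result with unpck so it matches the layout of the
 * packed right-hand side before subtracting.
 */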
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5
	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm5

#if defined(LN) || defined(LT)
	movaps	%xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0

	movaps	%xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1

	movaps	%xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

	movaps	%xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2

	movsd	-32 * SIZE(BB), %xmm1
	movsd	-30 * SIZE(BB), %xmm3
	movsd	-28 * SIZE(BB), %xmm5
	movsd	-26 * SIZE(BB), %xmm7

	subps	%xmm4, %xmm1
	subps	%xmm6, %xmm3
	subps	%xmm0, %xmm5
	subps	%xmm2, %xmm7
#else
	movaps	-32 * SIZE(AA), %xmm0
	movaps	-28 * SIZE(AA), %xmm1

	subps	%xmm4, %xmm0
	subps	%xmm5, %xmm1
#endif
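/*
 * Back-substitution against the packed triangular panel.  LN walks
 * the 4x4 triangle of A from the last row upward, LT from the first
 * row downward; RN/RT solve against the 2x2 triangle of B.  As above,
 * the diagonal elements are presumably stored pre-inverted, so each
 * division becomes a mulps.
 */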
#ifdef LN
	movaps	-20 * SIZE(AA), %xmm4

	pshufd	$0xff, %xmm4, %xmm6
	mulps	%xmm6, %xmm7
	pshufd	$0xaa, %xmm4, %xmm6
	mulps	%xmm7, %xmm6
	subps	%xmm6, %xmm5
	pshufd	$0x55, %xmm4, %xmm6
	mulps	%xmm7, %xmm6
	subps	%xmm6, %xmm3
	pshufd	$0x00, %xmm4, %xmm6
	mulps	%xmm7, %xmm6
	subps	%xmm6, %xmm1

	movaps	-24 * SIZE(AA), %xmm4

	pshufd	$0xaa, %xmm4, %xmm6
	mulps	%xmm6, %xmm5
	pshufd	$0x55, %xmm4, %xmm6
	mulps	%xmm5, %xmm6
	subps	%xmm6, %xmm3
	pshufd	$0x00, %xmm4, %xmm6
	mulps	%xmm5, %xmm6
	subps	%xmm6, %xmm1

	movaps	-28 * SIZE(AA), %xmm4

	pshufd	$0x55, %xmm4, %xmm6
	mulps	%xmm6, %xmm3
	pshufd	$0x00, %xmm4, %xmm6
	mulps	%xmm3, %xmm6
	subps	%xmm6, %xmm1

	movaps	-32 * SIZE(AA), %xmm4

	pshufd	$0x00, %xmm4, %xmm6
	mulps	%xmm6, %xmm1
#endif

#ifdef LT
	movaps	-32 * SIZE(AA), %xmm4

	pshufd	$0x00, %xmm4, %xmm6
	mulps	%xmm6, %xmm1
	pshufd	$0x55, %xmm4, %xmm6
	mulps	%xmm1, %xmm6
	subps	%xmm6, %xmm3
	pshufd	$0xaa, %xmm4, %xmm6
	mulps	%xmm1, %xmm6
	subps	%xmm6, %xmm5
	pshufd	$0xff, %xmm4, %xmm6
	mulps	%xmm1, %xmm6
	subps	%xmm6, %xmm7

	movaps	-28 * SIZE(AA), %xmm4

	pshufd	$0x55, %xmm4, %xmm6
	mulps	%xmm6, %xmm3
	pshufd	$0xaa, %xmm4, %xmm6
	mulps	%xmm3, %xmm6
	subps	%xmm6, %xmm5
	pshufd	$0xff, %xmm4, %xmm6
	mulps	%xmm3, %xmm6
	subps	%xmm6, %xmm7

	movaps	-24 * SIZE(AA), %xmm4

	pshufd	$0xaa, %xmm4, %xmm6
	mulps	%xmm6, %xmm5
	pshufd	$0xff, %xmm4, %xmm6
	mulps	%xmm5, %xmm6
	subps	%xmm6, %xmm7

	movaps	-20 * SIZE(AA), %xmm4

	pshufd	$0xff, %xmm4, %xmm6
	mulps	%xmm6, %xmm7
#endif

#ifdef RN
	movaps	-32 * SIZE(BB), %xmm6

	pshufd	$0x00, %xmm6, %xmm7
	mulps	%xmm7, %xmm0
	pshufd	$0x55, %xmm6, %xmm7
	mulps	%xmm0, %xmm7
	subps	%xmm7, %xmm1
	pshufd	$0xff, %xmm6, %xmm7
	mulps	%xmm7, %xmm1
#endif

#ifdef RT
	movaps	-32 * SIZE(BB), %xmm6

	pshufd	$0xff, %xmm6, %xmm7
	mulps	%xmm7, %xmm1
	pshufd	$0xaa, %xmm6, %xmm7
	mulps	%xmm1, %xmm7
	subps	%xmm7, %xmm0
	pshufd	$0x00, %xmm6, %xmm7
	mulps	%xmm7, %xmm0
#endif
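/*
 * Write the solved block back into the packed buffer and into C
 * (two columns, CO1 and CO1 + LDC), then advance or retreat CO1.
 */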
#if defined(LN) || defined(LT)
	movlps	%xmm1, -32 * SIZE(BB)
	movlps	%xmm3, -30 * SIZE(BB)
	movlps	%xmm5, -28 * SIZE(BB)
	movlps	%xmm7, -26 * SIZE(BB)
#else
	movaps	%xmm0, -32 * SIZE(AA)
	movaps	%xmm1, -28 * SIZE(AA)
#endif

#ifdef LN
	subl	$4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm1
	unpcklps %xmm7, %xmm3

	movaps	%xmm1, %xmm2
	unpcklps %xmm3, %xmm1
	unpckhps %xmm3, %xmm2

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC, 1)
	movhps	%xmm2, 2 * SIZE(CO1, LDC, 1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movlps	%xmm1, 0 * SIZE(CO1, LDC, 1)
	movhps	%xmm1, 2 * SIZE(CO1, LDC, 1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$4, KK
#endif

#ifdef LT
	addl	$4, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$2 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx	# i --
	jg	.L51
	ALIGN_4
.L79:
#ifdef LN
	movl	K, %eax
	leal	(, %eax, SIZE), %eax
	leal	(B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif
	ALIGN_4
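/*
 * .L80: handle the last column when n is odd (n & 1).  Same structure
 * as the two-column pass above, but with a single-column B panel, so
 * only CO1 is written.
 */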
.L80:
	testl	$1, N
	je	.L999

#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, B
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif
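/*
 * 1x1 corner (m odd): a plain dot product accumulated in xmm4,
 * followed by a single scalar multiply for the solve.
 */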
	testl	$1, M
	je	.L100

#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, BB
#endif

	pxor	%xmm4, %xmm4
	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm5, %xmm5
	movsd	-32 * SIZE(BB), %xmm1

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L115
	ALIGN_4
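/*
 * .L112: dot-product loop, two elements per movsd, eight k steps
 * per pass.
 */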
.L112:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	mulps	%xmm0, %xmm1
	movsd	-30 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-30 * SIZE(BB), %xmm1

	mulps	%xmm0, %xmm1
	movsd	-28 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-28 * SIZE(BB), %xmm1

	mulps	%xmm0, %xmm1
	movsd	-26 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-26 * SIZE(BB), %xmm1

	mulps	%xmm0, %xmm1
	movsd	-24 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-24 * SIZE(BB), %xmm1

	subl	$-8 * SIZE, AA
	subl	$-8 * SIZE, BB

	subl	$1, %eax
	jne	.L112
	ALIGN_4
.L115:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax	# k & 7 (remainder iterations)
	BRANCH
	je	.L118
	ALIGN_4

.L116:
	mulss	%xmm0, %xmm1
	movss	-31 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm4
	movss	-31 * SIZE(BB), %xmm1

	addl	$1 * SIZE, AA
	addl	$1 * SIZE, BB

	decl	%eax
	jg	.L116
	ALIGN_4
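/*
 * .L118: reduce the two partial sums with haddps and solve the
 * single element.
 */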
.L118:
#if defined(LN) || defined(RT)
	movl	KK, %eax
	subl	$1, %eax

	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA
	leal	(B,  %eax, SIZE), BB
#endif

	haddps	%xmm4, %xmm4

#if defined(LN) || defined(LT)
	movss	-32 * SIZE(BB), %xmm1
	subss	%xmm4, %xmm1
#else
	movss	-32 * SIZE(AA), %xmm0
	subss	%xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	mulss	-32 * SIZE(AA), %xmm1
#endif

#if defined(RN) || defined(RT)
	mulss	-32 * SIZE(BB), %xmm0
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1, -32 * SIZE(BB)
#else
	movss	%xmm0, -32 * SIZE(AA)
#endif
#ifdef LN
	subl	$1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)
#else
	movss	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(AA, %eax, SIZE), AA
	leal	(BB, %eax, SIZE), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
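/*
 * .L100: 2x1 tile (m & 2) against the single B column.
 */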
.L100:
	testl	$2, M
	je	.L110

#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm3, %xmm3
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L105
	ALIGN_4
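/*
 * .L102: 2x1 inner loop; each movsd of B supplies two k steps, which
 * pshufd $0x00/$0x55 broadcast against the 2-wide A pairs.
 */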
.L102:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	movsd	-30 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4

	pshufd	$0x55, %xmm1, %xmm2
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movsd	-28 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5

	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	movsd	-26 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4

	pshufd	$0x55, %xmm1, %xmm2
	movsd	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movsd	-24 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5

	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	movsd	-22 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4

	pshufd	$0x55, %xmm1, %xmm2
	movsd	-26 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movsd	-20 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5

	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	movsd	-18 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4

	pshufd	$0x55, %xmm1, %xmm2
	movsd	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movsd	-16 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5

	subl	$-16 * SIZE, AA
	subl	$ -8 * SIZE, BB

	subl	$1, %eax
	jne	.L102
	ALIGN_4
.L105:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax	# k & 7 (remainder iterations)
	BRANCH
	je	.L108
	ALIGN_4

.L106:
	pshufd	$0x00, %xmm1, %xmm2
	movss	-31 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movsd	-30 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4

	addl	$2 * SIZE, AA
	addl	$1 * SIZE, BB

	decl	%eax
	jg	.L106
	ALIGN_4
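/*
 * .L108: rewind for LN/RT, split the two lanes of the accumulator,
 * and run the scalar 2x2 back-substitution.
 */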
.L108:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 1), BB
#endif

	addps	%xmm5, %xmm4

#if defined(LN) || defined(LT)
	pshufd	$1, %xmm4, %xmm6

	movss	-32 * SIZE(BB), %xmm1
	movss	-31 * SIZE(BB), %xmm3

	subss	%xmm4, %xmm1
	subss	%xmm6, %xmm3
#else
	movsd	-32 * SIZE(AA), %xmm0
	subps	%xmm4, %xmm0
#endif
#ifdef LN
	movsd	-32 * SIZE(AA), %xmm4
	movhps	-30 * SIZE(AA), %xmm4

	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm6, %xmm3
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm3, %xmm6
	subss	%xmm6, %xmm1
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm6, %xmm1
#endif

#ifdef LT
	movaps	-32 * SIZE(AA), %xmm4

	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm6, %xmm1
	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm1, %xmm6
	subss	%xmm6, %xmm3
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm6, %xmm3
#endif

#if defined(RN) || defined(RT)
	movss	-32 * SIZE(BB), %xmm6
	pshufd	$0x00, %xmm6, %xmm7
	mulps	%xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
	movss	%xmm1, -32 * SIZE(BB)
	movss	%xmm3, -31 * SIZE(BB)
#else
	movlps	%xmm0, -32 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)
	movss	%xmm3, 1 * SIZE(CO1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
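/*
 * .L110/.L91: loop over the 4x1 tiles (i = m >> 2) for the odd column.
 */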
.L110:
	movl	M, %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L119
	ALIGN_4

.L91:
#ifdef LN
	movl	K, %eax
	sall	$2 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm4, %xmm4

#ifdef LN
	prefetcht0	-4 * SIZE(CO1)
#else
	prefetcht0	3 * SIZE(CO1)
#endif
	pxor	%xmm5, %xmm5

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L95
	ALIGN_4
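/*
 * .L92: 4x1 inner loop, eight k steps per pass; B scalars are
 * broadcast with pshufd against 4-wide A vectors.
 */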
.L92:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	movaps	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm5
	pshufd	$0x55, %xmm1, %xmm2
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	movaps	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm5
	pshufd	$0x55, %xmm1, %xmm2
	movsd	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-16 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	movaps	-12 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm5
	pshufd	$0x55, %xmm1, %xmm2
	movsd	-26 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-8 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	movaps	-4 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm5
	pshufd	$0x55, %xmm1, %xmm2
	movsd	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	 0 * SIZE(AA), %xmm0

	subl	$-32 * SIZE, AA
	subl	$ -8 * SIZE, BB

	subl	$1, %eax
	jne	.L92
	ALIGN_4
.L95:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax	# k & 7 (remainder iterations)
	BRANCH
	je	.L98
	ALIGN_4

.L96:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	movss	-31 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$1 * SIZE, BB

	decl	%eax
	jg	.L96
	ALIGN_4
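/*
 * .L98: rewind for LN/RT, fold the accumulators, scatter the four
 * lanes into scalars for the left-side cases, and back-substitute
 * through the packed 4x4 triangle of A -- scalar mulss/subss this
 * time, since there is only one column.
 */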
.L98:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax
#else
	subl	$1, %eax
#endif
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(B,  %eax, 1), BB
#endif

	addps	%xmm2, %xmm4
	addps	%xmm5, %xmm4

#if defined(LN) || defined(LT)
	movaps	%xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0

	movaps	%xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1

	movaps	%xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

	movaps	%xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2

	movss	-32 * SIZE(BB), %xmm1
	movss	-31 * SIZE(BB), %xmm3
	movss	-30 * SIZE(BB), %xmm5
	movss	-29 * SIZE(BB), %xmm7

	subss	%xmm4, %xmm1
	subss	%xmm6, %xmm3
	subss	%xmm0, %xmm5
	subss	%xmm2, %xmm7
#else
	movaps	-32 * SIZE(AA), %xmm0
	subps	%xmm4, %xmm0
#endif
#ifdef LN
	movaps	-20 * SIZE(AA), %xmm4

	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm6, %xmm7
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm7, %xmm6
	subss	%xmm6, %xmm5
	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm7, %xmm6
	subss	%xmm6, %xmm3
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm7, %xmm6
	subss	%xmm6, %xmm1

	movaps	-24 * SIZE(AA), %xmm4

	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm6, %xmm5
	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm5, %xmm6
	subss	%xmm6, %xmm3
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm5, %xmm6
	subss	%xmm6, %xmm1

	movaps	-28 * SIZE(AA), %xmm4

	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm6, %xmm3
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm3, %xmm6
	subss	%xmm6, %xmm1

	movaps	-32 * SIZE(AA), %xmm4

	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm6, %xmm1
#endif

#ifdef LT
	movaps	-32 * SIZE(AA), %xmm4

	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm6, %xmm1
	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm1, %xmm6
	subss	%xmm6, %xmm3
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm1, %xmm6
	subss	%xmm6, %xmm5
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm1, %xmm6
	subss	%xmm6, %xmm7

	movaps	-28 * SIZE(AA), %xmm4

	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm6, %xmm3
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm3, %xmm6
	subss	%xmm6, %xmm5
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm3, %xmm6
	subss	%xmm6, %xmm7

	movaps	-24 * SIZE(AA), %xmm4

	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm6, %xmm5
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm5, %xmm6
	subss	%xmm6, %xmm7

	movaps	-20 * SIZE(AA), %xmm4

	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm6, %xmm7
#endif

#if defined(RN) || defined(RT)
	movss	-32 * SIZE(BB), %xmm6
	pshufd	$0x00, %xmm6, %xmm7
	mulps	%xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
	movss	%xmm1, -32 * SIZE(BB)
	movss	%xmm3, -31 * SIZE(BB)
	movss	%xmm5, -30 * SIZE(BB)
	movss	%xmm7, -29 * SIZE(BB)
#else
	movaps	%xmm0, -32 * SIZE(AA)
#endif

#ifdef LN
	subl	$4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm1
	unpcklps %xmm7, %xmm3
	unpcklps %xmm3, %xmm1

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$4, KK
#endif

#ifdef LT
	addl	$4, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$2 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx	# i --
	jg	.L91
	ALIGN_4
  2570. ALIGN_4
  2571. .L119:
  2572. #ifdef LN
  2573. movl K, %eax
  2574. leal (B, %eax, SIZE), B
  2575. #endif
  2576. #if defined(LT) || defined(RN)
  2577. movl BB, B
  2578. #endif
  2579. #ifdef RN
  2580. addl $1, KK
  2581. #endif
  2582. #ifdef RT
  2583. subl $1, KK
  2584. #endif
  2585. ALIGN_4
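/*
 * .L999: epilogue -- restore the callee-saved registers, release the
 * stack frame reserved in the prologue, and return.
 */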
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE