
ztrsm_kernel_RT_2x2_sse.S 39 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
#define STACK_A 24 + STACK + ARGS(%esi)
#define STACK_B 28 + STACK + ARGS(%esi)
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)

#define POSINV 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
#define A 28(%esp)
#define C 32(%esp)
#define J 36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET 48(%esp)
#define KK 52(%esp)
#define KKK 56(%esp)
#define AORIG 60(%esp)
#define BORIG 64(%esp)
#define BUFFER 128(%esp)

#define B %edi
#define LDC %ebp
#define AA %edx
#define BB %ecx
#define CO1 %esi

#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
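
/* The STACK_* macros read the incoming C arguments through %esi, which
   holds the caller's stack pointer saved in the prologue: m, n, k at
   offsets 4/8/12, then a, b, c, ldc and the TRSM offset.  K, N, M, A,
   C, J, ... name scratch slots on the aligned local stack, and B, LDC,
   AA, BB, CO1 alias the registers used for the matrix pointers. */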

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#endif

#if defined(PENTIUM4) || defined(PENTIUMM)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 168
#define PREFETCHW prefetcht0
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 168
#define PREFETCHW prefetcht0
#endif

#if defined(OPTERON) || !defined(HAVE_SSE2)
#define movsd movlps
#endif

#ifdef HAVE_SSE2
#define xorps pxor
#endif

#define KERNEL1(address) \
	mulps %xmm0, %xmm2; \
	PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
	addps %xmm2, %xmm4; \
	movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm0, %xmm2; \
	addps %xmm2, %xmm5; \
	movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm0, %xmm2; \
	mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps %xmm2, %xmm6; \
	movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps %xmm0, %xmm7; \
	movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
	mulps %xmm0, %xmm3; \
	addps %xmm3, %xmm4; \
	movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm0, %xmm3; \
	addps %xmm3, %xmm5; \
	movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm0, %xmm3; \
	mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps %xmm3, %xmm6; \
	movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps %xmm0, %xmm7; \
	movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
	mulps %xmm0, %xmm2; \
	addps %xmm2, %xmm4; \
	movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm0, %xmm2; \
	addps %xmm2, %xmm5; \
	movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm0, %xmm2; \
	mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps %xmm2, %xmm6; \
	movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps %xmm0, %xmm7; \
	movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
	mulps %xmm0, %xmm3; \
	addps %xmm3, %xmm4; \
	movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm0, %xmm3; \
	addps %xmm3, %xmm5; \
	movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm0, %xmm3; \
	mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps %xmm3, %xmm6; \
	movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps %xmm0, %xmm7; \
	movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
	mulps %xmm1, %xmm2; \
	addps %xmm2, %xmm4; \
	movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm1, %xmm2; \
	addps %xmm2, %xmm5; \
	movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm1, %xmm2; \
	mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps %xmm2, %xmm6; \
	movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps %xmm1, %xmm7; \
	movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
	mulps %xmm1, %xmm3; \
	addps %xmm3, %xmm4; \
	movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm1, %xmm3; \
	addps %xmm3, %xmm5; \
	movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm1, %xmm3; \
	mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps %xmm3, %xmm6; \
	movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps %xmm1, %xmm7; \
	movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
	mulps %xmm1, %xmm2; \
	addps %xmm2, %xmm4; \
	movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm1, %xmm2; \
	addps %xmm2, %xmm5; \
	movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps %xmm1, %xmm2; \
	mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps %xmm2, %xmm6; \
	movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps %xmm1, %xmm7; \
	movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
	mulps %xmm1, %xmm3; \
	addps %xmm3, %xmm4; \
	movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm1, %xmm3; \
	addps %xmm3, %xmm5; \
	movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps %xmm1, %xmm3; \
	mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps %xmm3, %xmm6; \
	movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps %xmm1, %xmm7; \
	movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
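
/* KERNEL1..KERNEL8 together form one 8-way unrolled iteration of the
   2x2 inner loop: each step multiplies a packed vector of A by four
   broadcast B values (expanded into BUFFER by the copy loops below),
   accumulating into %xmm4-%xmm7 while the next A and B vectors are
   streamed in. */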

	PROLOGUE

	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx

	PROFCODE

	movl %esp, %esi	# save old stack
	subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl $-STACK_ALIGN, %esp	# align stack
	addl $STACK_OFFSET, %esp

	STACK_TOUCHING

	movl STACK_M, %ebx
	movl STACK_N, %eax
	movl STACK_K, %ecx
	movl STACK_A, %edx

	movl %ebx, M
	movl %eax, N
	movl %ecx, K
	movl %edx, A
	movl %esi, OLD_STACK

	movl STACK_B, %edi
	movl STACK_C, %ebx
	movss STACK_OFFT, %xmm4

	xorps %xmm7, %xmm7
	pcmpeqb %xmm7, %xmm7
	pslld $31, %xmm7
	xorps %xmm2, %xmm2

#ifndef CONJ
	movss %xmm7, 0 + POSINV
	movss %xmm2, 4 + POSINV
	movss %xmm7, 8 + POSINV
	movss %xmm2, 12 + POSINV
#else
	movss %xmm2, 0 + POSINV
	movss %xmm7, 4 + POSINV
	movss %xmm2, 8 + POSINV
	movss %xmm7, 12 + POSINV
#endif
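
/* POSINV is a per-lane sign-flip mask (0x80000000 in either the real
   or the imaginary lanes, selected by CONJ); xorps against it negates
   the matching components during the complex multiplies below. */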

	EMMS

	movl %ebx, C
	movl STACK_LDC, LDC

	movss %xmm4, OFFSET
	movss %xmm4, KK

	sall $ZBASE_SHIFT, LDC

#ifdef LN
	movl M, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, C
	imull K, %eax
	addl %eax, A
#endif

#ifdef RT
	movl N, %eax
	sall $ZBASE_SHIFT, %eax
	imull K, %eax
	addl %eax, B
	movl N, %eax
	imull LDC, %eax
	addl %eax, C
#endif

#ifdef RN
	negl KK
#endif

#ifdef RT
	movl N, %eax
	subl OFFSET, %eax
	movl %eax, KK
#endif

	movl N, %eax
	andl $1, %eax
	jle .L100
	ALIGN_4
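
/* If N is odd, handle the leftover single column before the main
   two-column loop at .L01. */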
.L101:
#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif
	leal BUFFER, %ecx
#ifdef RT
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	subl %eax, B
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl B, BORIG
	sall $ZBASE_SHIFT, %eax
	addl %eax, B
	leal (BB, %eax, 4), BB
#endif
#if defined(LT)
	movl OFFSET, %eax
	movl %eax, KK
#endif
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $2, %eax
	jle .L103
	ALIGN_4

.L102:
	movaps 0 * SIZE(B), %xmm3
	movaps 4 * SIZE(B), %xmm7
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm2
	pshufd $0xff, %xmm3, %xmm3
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	movaps %xmm2, 8 * SIZE(BB)
	movaps %xmm3, 12 * SIZE(BB)
	pshufd $0x00, %xmm7, %xmm4
	pshufd $0x55, %xmm7, %xmm5
	pshufd $0xaa, %xmm7, %xmm6
	pshufd $0xff, %xmm7, %xmm7
	movaps %xmm4, 16 * SIZE(BB)
	movaps %xmm5, 20 * SIZE(BB)
	movaps %xmm6, 24 * SIZE(BB)
	movaps %xmm7, 28 * SIZE(BB)
	addl $ 8 * SIZE, B
	addl $32 * SIZE, BB
	decl %eax
	jne .L102
	ALIGN_4

.L103:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $3, %eax
	BRANCH
	jle .L105
	ALIGN_4

.L104:
#ifdef movsd
	xorps %xmm3, %xmm3
#endif
	movsd 0 * SIZE(B), %xmm3
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	addl $ 2 * SIZE, %edi
	addl $ 8 * SIZE, %ecx
	decl %eax
	jne .L104
	ALIGN_4

.L105:
#if defined(LT) || defined(RN)
	movl A, %eax
	movl %eax, AA
#else
	movl A, %eax
	movl %eax, AORIG
#endif
#ifdef RT
	subl LDC, C
#endif
	movl C, CO1
#ifndef RT
	addl LDC, C
#endif
	movl M, %ebx
	sarl $1, %ebx
	jle .L130
	ALIGN_4
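
/* Loop over row pairs (M / 2) for this single column. */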
.L110:
#ifdef LN
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl AORIG, %eax
	movl %eax, AA
	movl KK, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AA
#endif
	leal BUFFER, BB	# boffset1 = boffset
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $2 + ZBASE_SHIFT, %eax
	addl %eax, BB
#endif
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7
	movaps 0 * SIZE(AA), %xmm0
	movaps 16 * SIZE(AA), %xmm1
	movaps 0 * SIZE(BB), %xmm2
	movaps 16 * SIZE(BB), %xmm3
	PREFETCHW 3 * SIZE(CO1)
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L112
	ALIGN_4

.L111:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movaps 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movaps 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm6
	movaps 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movaps 8 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movaps 32 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm4
	movaps 20 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movaps 12 * SIZE(AA), %xmm0
	addps %xmm3, %xmm5
	movaps 24 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm6
	movaps 28 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movaps 32 * SIZE(AA), %xmm0
	addps %xmm3, %xmm7
	movaps 48 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm4
	movaps 36 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movaps 20 * SIZE(AA), %xmm1
	addps %xmm2, %xmm5
	movaps 40 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm6
	movaps 44 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movaps 24 * SIZE(AA), %xmm1
	addps %xmm2, %xmm7
	movaps 64 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm4
	movaps 52 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movaps 28 * SIZE(AA), %xmm1
	addps %xmm3, %xmm5
	movaps 56 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm6
	movaps 60 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movaps 48 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movaps 80 * SIZE(BB), %xmm3
	addl $ 32 * SIZE, AA
	addl $ 64 * SIZE, BB
	decl %eax
	jne .L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder: k & 7
	BRANCH
	je .L114
	ALIGN_4

.L113:
	mulps %xmm0, %xmm2
	mulps 4 * SIZE(BB), %xmm0
	addps %xmm2, %xmm4
	movaps 8 * SIZE(BB), %xmm2
	addps %xmm0, %xmm5
	movaps 4 * SIZE(AA), %xmm0
	addl $ 4 * SIZE, AA
	addl $ 8 * SIZE, BB
	decl %eax
	jg .L113
	ALIGN_4

.L114:
	addps %xmm6, %xmm4
	addps %xmm7, %xmm5
	movaps POSINV, %xmm0
	shufps $0xb1, %xmm5, %xmm5
#if defined(LN) || defined(LT)
#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm4
#endif
#else
	xorps %xmm0, %xmm5
#endif
	addps %xmm5, %xmm4
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $2, %eax
#else
	subl $1, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 1), B
	leal (BB, %eax, 4), BB
#endif
#if defined(LN) || defined(LT)
	movaps %xmm4, %xmm5
	unpcklpd %xmm6, %xmm4
	unpckhpd %xmm6, %xmm5
#ifdef movsd
	xorps %xmm2, %xmm2
#endif
	movsd 0 * SIZE(B), %xmm2
#ifdef movsd
	xorps %xmm3, %xmm3
#endif
	movsd 2 * SIZE(B), %xmm3
	subps %xmm4, %xmm2
	subps %xmm5, %xmm3
#else
	movaps 0 * SIZE(AA), %xmm1
	subps %xmm4, %xmm1
#endif
#ifdef LN
	movaps 4 * SIZE(AA), %xmm5
	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7
	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm3
#ifndef CONJ
	xorps %xmm0, %xmm3
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm3
	addps %xmm4, %xmm3
	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7
	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm1
	subps %xmm4, %xmm2
	subps %xmm1, %xmm2
	movaps 0 * SIZE(AA), %xmm5
	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7
	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
#endif
#ifdef LT
	movaps 0 * SIZE(AA), %xmm5
	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7
	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7
	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm1
	subps %xmm4, %xmm3
	subps %xmm1, %xmm3
	movaps 4 * SIZE(AA), %xmm5
	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7
	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm3
#ifndef CONJ
	xorps %xmm0, %xmm3
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm3
	addps %xmm4, %xmm3
#endif
#if defined(RN) || defined(RT)
	movaps 0 * SIZE(B), %xmm4
	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7
	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
#endif
#ifdef LN
	subl $4 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	movlps %xmm2, 0 * SIZE(B)
	movlps %xmm3, 2 * SIZE(B)
	pshufd $0x00, %xmm2, %xmm0
	pshufd $0x55, %xmm2, %xmm1
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	movaps %xmm0, 8 * SIZE(BB)
	movaps %xmm1, 12 * SIZE(BB)
	movlps %xmm2, 0 * SIZE(CO1)
	movlps %xmm3, 2 * SIZE(CO1)
#else
	movaps %xmm1, 0 * SIZE(AA)
	movlps %xmm1, 0 * SIZE(CO1)
	movhps %xmm1, 2 * SIZE(CO1)
#endif
#ifndef LN
	addl $4 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AA
#ifdef LT
	addl $4 * SIZE, B
#endif
#endif
#ifdef LN
	subl $2, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $2, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	decl %ebx	# i --
	jg .L110
	ALIGN_4
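
/* If M is odd, handle the final single row for this column. */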
.L130:
	movl M, %ebx
	andl $1, %ebx
	jle .L149
#ifdef LN
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl AORIG, %eax
	movl %eax, AA
	movl KK, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, AA
#endif
	leal BUFFER, BB	# boffset1 = boffset
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $2 + ZBASE_SHIFT, %eax
	addl %eax, BB
#endif
#ifdef movsd
	xorps %xmm0, %xmm0
#endif
	movsd 0 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 8 * SIZE(AA), %xmm1
	xorps %xmm5, %xmm5
	movaps 0 * SIZE(BB), %xmm2
	xorps %xmm6, %xmm6
	movaps 16 * SIZE(BB), %xmm3
	xorps %xmm7, %xmm7
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L142
	ALIGN_4

.L141:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movaps 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm6
	movaps 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movaps 32 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm4
	movaps 20 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 6 * SIZE(AA), %xmm0
	addps %xmm3, %xmm5
	movaps 24 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm6
	movaps 28 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 16 * SIZE(AA), %xmm0
	addps %xmm3, %xmm7
	movaps 48 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm4
	movaps 36 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 10 * SIZE(AA), %xmm1
	addps %xmm2, %xmm5
	movaps 40 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm6
	movaps 44 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 12 * SIZE(AA), %xmm1
	addps %xmm2, %xmm7
	movaps 64 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm4
	movaps 52 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 14 * SIZE(AA), %xmm1
	addps %xmm3, %xmm5
	movaps 56 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm6
	movaps 60 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 24 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movaps 80 * SIZE(BB), %xmm3
	addl $ 16 * SIZE, AA
	addl $ 64 * SIZE, BB
	decl %eax
	jne .L141
	ALIGN_4

.L142:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder: k & 7
	BRANCH
	je .L144
	ALIGN_4

.L143:
	mulps %xmm0, %xmm2
	mulps 4 * SIZE(BB), %xmm0
	addps %xmm2, %xmm4
	movaps 8 * SIZE(BB), %xmm2
	addps %xmm0, %xmm5
	movsd 2 * SIZE(AA), %xmm0
	addl $2 * SIZE, AA
	addl $8 * SIZE, BB
	decl %eax
	jg .L143
	ALIGN_4

.L144:
	addps %xmm6, %xmm4
	addps %xmm7, %xmm5
	movaps POSINV, %xmm0
	shufps $0xb1, %xmm5, %xmm5
#if defined(LN) || defined(LT)
#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm4
#endif
#else
	xorps %xmm0, %xmm5
#endif
	addps %xmm5, %xmm4
#if defined(LN) || defined(RT)
	movl KK, %eax
	subl $1, %eax
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $ZBASE_SHIFT, %eax
	addl %eax, AA
	addl %eax, B
	leal (BB, %eax, 4), BB
#endif
#if defined(LN) || defined(LT)
#ifdef movsd
	xorps %xmm2, %xmm2
#endif
	movsd 0 * SIZE(B), %xmm2
	subps %xmm4, %xmm2
#else
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 0 * SIZE(AA), %xmm1
	subps %xmm4, %xmm1
#endif
#if defined(LN) || defined(LT)
	movaps 0 * SIZE(AA), %xmm5
	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7
	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
#endif
#if defined(RN) || defined(RT)
	movaps 0 * SIZE(B), %xmm4
	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7
	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
#endif
#ifdef LN
	subl $2 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	movlps %xmm2, 0 * SIZE(B)
	pshufd $0x00, %xmm2, %xmm0
	pshufd $0x55, %xmm2, %xmm1
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	movlps %xmm2, 0 * SIZE(CO1)
#else
	movlps %xmm1, 0 * SIZE(AA)
	movlps %xmm1, 0 * SIZE(CO1)
#endif
#ifndef LN
	addl $2 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, AA
#ifdef LT
	addl $2 * SIZE, B
#endif
#endif
#ifdef LN
	subl $1, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $1, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $ZBASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4

.L149:
#ifdef LN
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, B
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, B
#endif
#ifdef RN
	addl $1, KK
#endif
#ifdef RT
	subl $1, KK
#endif
	ALIGN_4

.L100:
	movl N, %eax
	movl %eax, J
	sarl $1, J
	jle .L999
	ALIGN_4
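
/* Main loop: J = N / 2 passes, two columns of C per pass. */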
.L01:
#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif
	leal BUFFER, %ecx
#ifdef RT
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	subl %eax, B
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl B, BORIG
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, B
	leal (BB, %eax, 4), BB
#endif
#if defined(LT)
	movl OFFSET, %eax
	movl %eax, KK
#endif
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $1, %eax
	jle .L03
	ALIGN_4

.L02:
	movaps 0 * SIZE(B), %xmm3
	movaps 4 * SIZE(B), %xmm7
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm2
	pshufd $0xff, %xmm3, %xmm3
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	movaps %xmm2, 8 * SIZE(BB)
	movaps %xmm3, 12 * SIZE(BB)
	pshufd $0x00, %xmm7, %xmm4
	pshufd $0x55, %xmm7, %xmm5
	pshufd $0xaa, %xmm7, %xmm6
	pshufd $0xff, %xmm7, %xmm7
	movaps %xmm4, 16 * SIZE(BB)
	movaps %xmm5, 20 * SIZE(BB)
	movaps %xmm6, 24 * SIZE(BB)
	movaps %xmm7, 28 * SIZE(BB)
	addl $ 8 * SIZE, B
	addl $32 * SIZE, BB
	decl %eax
	jne .L02
	ALIGN_4

.L03:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $1, %eax
	BRANCH
	jle .L05
	ALIGN_4

.L04:
	movaps 0 * SIZE(B), %xmm3
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm2
	pshufd $0xff, %xmm3, %xmm3
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	movaps %xmm2, 8 * SIZE(BB)
	movaps %xmm3, 12 * SIZE(BB)
	addl $ 4 * SIZE, B
	ALIGN_4

.L05:
#if defined(LT) || defined(RN)
	movl A, %eax
	movl %eax, AA
#else
	movl A, %eax
	movl %eax, AORIG
#endif
	leal (, LDC, 2), %eax
#ifdef RT
	subl %eax, C
#endif
	movl C, CO1
#ifndef RT
	addl %eax, C
#endif
	movl M, %ebx
	sarl $1, %ebx
	jle .L30
	ALIGN_4

.L10:
#ifdef LN
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl AORIG, %eax
	movl %eax, AA
	movl KK, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AA
#endif
	leal BUFFER, BB	# boffset1 = boffset
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $3 + ZBASE_SHIFT, %eax
	addl %eax, BB
#endif
	movaps 0 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movaps 16 * SIZE(AA), %xmm1
	xorps %xmm5, %xmm5
	movaps 0 * SIZE(BB), %xmm2
	xorps %xmm6, %xmm6
	movaps 16 * SIZE(BB), %xmm3
	xorps %xmm7, %xmm7
	PREFETCHW 3 * SIZE(CO1)
	PREFETCHW 3 * SIZE(CO1, LDC)
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L15
	ALIGN_4
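
/* 8-way unrolled main loop built from the KERNEL1..KERNEL8 macros. */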
.L11:
	KERNEL1(0 * 16)
	KERNEL2(0 * 16)
	KERNEL3(0 * 16)
	KERNEL4(0 * 16)
	KERNEL5(0 * 16)
	KERNEL6(0 * 16)
	KERNEL7(0 * 16)
	KERNEL8(0 * 16)
	addl $ 32 * SIZE, AA
	addl $128 * SIZE, BB
	decl %eax
	jne .L11
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder: k & 7
	BRANCH
	je .L14
	ALIGN_4

.L13:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm5
	movaps 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	mulps 12 * SIZE(BB), %xmm0
	addps %xmm2, %xmm6
	movaps 16 * SIZE(BB), %xmm2
	addps %xmm0, %xmm7
	movaps 4 * SIZE(AA), %xmm0
	addl $ 4 * SIZE, AA
	addl $16 * SIZE, BB
	decl %eax
	jg .L13
	ALIGN_4

.L14:
	movaps POSINV, %xmm0
	shufps $0xb1, %xmm5, %xmm5
	shufps $0xb1, %xmm7, %xmm7
#if defined(LN) || defined(LT)
#ifndef CONJ
	xorps %xmm0, %xmm5
	xorps %xmm0, %xmm7
#else
	xorps %xmm0, %xmm4
	xorps %xmm0, %xmm6
#endif
#else
	xorps %xmm0, %xmm5
	xorps %xmm0, %xmm7
#endif
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $2, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 2), B
	leal (BB, %eax, 8), BB
#endif
#if defined(LN) || defined(LT)
	movaps %xmm4, %xmm5
	unpcklpd %xmm6, %xmm4
	unpckhpd %xmm6, %xmm5
	movaps 0 * SIZE(B), %xmm2
	movaps 4 * SIZE(B), %xmm3
	subps %xmm4, %xmm2
	subps %xmm5, %xmm3
#else
	movaps 0 * SIZE(AA), %xmm1
	movaps 4 * SIZE(AA), %xmm5
	subps %xmm4, %xmm1
	subps %xmm6, %xmm5
#endif
#ifdef LN
	movaps 4 * SIZE(AA), %xmm5
	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7
	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm3
#ifndef CONJ
	xorps %xmm0, %xmm3
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm3
	addps %xmm4, %xmm3
	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7
	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm1
	subps %xmm4, %xmm2
	subps %xmm1, %xmm2
	movaps 0 * SIZE(AA), %xmm5
	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7
	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
#endif
#ifdef LT
	movaps 0 * SIZE(AA), %xmm5
	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7
	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7
	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm1
	subps %xmm4, %xmm3
	subps %xmm1, %xmm3
	movaps 4 * SIZE(AA), %xmm5
	pshufd $0xee, %xmm5, %xmm6
	pshufd $0xbb, %xmm5, %xmm7
	pshufd $0xa0, %xmm3, %xmm4
	pshufd $0xf5, %xmm3, %xmm3
#ifndef CONJ
	xorps %xmm0, %xmm3
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm3
	addps %xmm4, %xmm3
#endif
#ifdef RN
	movaps 0 * SIZE(B), %xmm4
	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7
	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7
	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm2
	subps %xmm3, %xmm5
	subps %xmm2, %xmm5
	movaps 4 * SIZE(B), %xmm4
	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7
	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm5
#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm3, %xmm5
#endif
#ifdef RT
	movaps 4 * SIZE(B), %xmm4
	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7
	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm5
#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm3, %xmm5
	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7
	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm2
	subps %xmm3, %xmm1
	subps %xmm2, %xmm1
	movaps 0 * SIZE(B), %xmm4
	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7
	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
#endif
#ifdef LN
	subl $4 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	movaps %xmm2, 0 * SIZE(B)
	movaps %xmm3, 4 * SIZE(B)
	pshufd $0x00, %xmm2, %xmm0
	pshufd $0x55, %xmm2, %xmm1
	pshufd $0xaa, %xmm2, %xmm4
	pshufd $0xff, %xmm2, %xmm5
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	movaps %xmm4, 8 * SIZE(BB)
	movaps %xmm5, 12 * SIZE(BB)
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm4
	pshufd $0xff, %xmm3, %xmm5
	movaps %xmm0, 16 * SIZE(BB)
	movaps %xmm1, 20 * SIZE(BB)
	movaps %xmm4, 24 * SIZE(BB)
	movaps %xmm5, 28 * SIZE(BB)
	movlps %xmm2, 0 * SIZE(CO1)
	movlps %xmm3, 2 * SIZE(CO1)
	movhps %xmm2, 0 * SIZE(CO1, LDC)
	movhps %xmm3, 2 * SIZE(CO1, LDC)
#else
	movaps %xmm1, 0 * SIZE(AA)
	movaps %xmm5, 4 * SIZE(AA)
	movlps %xmm1, 0 * SIZE(CO1)
	movhps %xmm1, 2 * SIZE(CO1)
	movlps %xmm5, 0 * SIZE(CO1, LDC)
	movhps %xmm5, 2 * SIZE(CO1, LDC)
#endif
#ifndef LN
	addl $4 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AA
#ifdef LT
	addl $8 * SIZE, B
#endif
#endif
#ifdef LN
	subl $2, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $2, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	decl %ebx
	jg .L10
	ALIGN_4
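
/* If M is odd, handle the final single row for this column pair. */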
.L30:
	movl M, %ebx
	andl $1, %ebx
	jle .L99
	ALIGN_4

.L40:
#ifdef LN
	movl K, %eax
	sall $ZBASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl AORIG, %eax
	movl %eax, AA
	movl KK, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, AA
#endif
	leal BUFFER, BB	# boffset1 = boffset
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $3 + ZBASE_SHIFT, %eax
	addl %eax, BB
#endif
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7
#ifdef movsd
	xorps %xmm0, %xmm0
#endif
	movsd 0 * SIZE(AA), %xmm0
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 8 * SIZE(AA), %xmm1
	movaps 0 * SIZE(BB), %xmm2
	movaps 16 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L42
	ALIGN_4

.L41:
	mulps %xmm0, %xmm2
	prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA)
	addps %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm5
	movaps 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	mulps 12 * SIZE(BB), %xmm0
	addps %xmm2, %xmm6
	movaps 32 * SIZE(BB), %xmm2
	addps %xmm0, %xmm7
	movsd 2 * SIZE(AA), %xmm0
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm4
	movaps 20 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm5
	movaps 24 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	mulps 28 * SIZE(BB), %xmm0
	addps %xmm3, %xmm6
	movaps 48 * SIZE(BB), %xmm3
	addps %xmm0, %xmm7
	movsd 4 * SIZE(AA), %xmm0
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 36 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm5
	movaps 40 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	mulps 44 * SIZE(BB), %xmm0
	addps %xmm2, %xmm6
	movaps 64 * SIZE(BB), %xmm2
	addps %xmm0, %xmm7
	movsd 6 * SIZE(AA), %xmm0
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm4
	movaps 52 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm5
	movaps 56 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	mulps 60 * SIZE(BB), %xmm0
	addps %xmm3, %xmm6
	movaps 80 * SIZE(BB), %xmm3
	addps %xmm0, %xmm7
	movsd 16 * SIZE(AA), %xmm0
	mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
	addps %xmm2, %xmm4
	movaps 68 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm5
	movaps 72 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	mulps 76 * SIZE(BB), %xmm1
	addps %xmm2, %xmm6
	movaps 96 * SIZE(BB), %xmm2
	addps %xmm1, %xmm7
	movsd 10 * SIZE(AA), %xmm1
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm4
	movaps 84 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm5
	movaps 88 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	mulps 92 * SIZE(BB), %xmm1
	addps %xmm3, %xmm6
	movaps 112 * SIZE(BB), %xmm3
	addps %xmm1, %xmm7
	movsd 12 * SIZE(AA), %xmm1
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm4
	movaps 100 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm5
	movaps 104 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	mulps 108 * SIZE(BB), %xmm1
	addps %xmm2, %xmm6
	movaps 128 * SIZE(BB), %xmm2
	addps %xmm1, %xmm7
	movsd 14 * SIZE(AA), %xmm1
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm4
	movaps 116 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm5
	movaps 120 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	mulps 124 * SIZE(BB), %xmm1
	addps %xmm3, %xmm6
	movaps 144 * SIZE(BB), %xmm3
	addps %xmm1, %xmm7
	movsd 24 * SIZE(AA), %xmm1
	addl $ 16 * SIZE, AA
	addl $128 * SIZE, BB
	decl %eax
	jne .L41
	ALIGN_4

.L42:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder: k & 7
	BRANCH
	je .L44
	ALIGN_4

.L43:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm5
	movaps 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	mulps 12 * SIZE(BB), %xmm0
	addps %xmm2, %xmm6
	movaps 16 * SIZE(BB), %xmm2
	addps %xmm0, %xmm7
	movsd 2 * SIZE(AA), %xmm0
	addl $ 2 * SIZE, AA
	addl $16 * SIZE, BB
	decl %eax
	jg .L43
	ALIGN_4

.L44:
	movaps POSINV, %xmm0
	shufps $0xb1, %xmm5, %xmm5
	shufps $0xb1, %xmm7, %xmm7
#if defined(LN) || defined(LT)
#ifndef CONJ
	xorps %xmm0, %xmm5
	xorps %xmm0, %xmm7
#else
	xorps %xmm0, %xmm4
	xorps %xmm0, %xmm6
#endif
#else
	xorps %xmm0, %xmm5
	xorps %xmm0, %xmm7
#endif
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $1, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $ZBASE_SHIFT, %eax
	leal (AA, %eax, 1), AA
	leal (B, %eax, 2), B
	leal (BB, %eax, 8), BB
#endif
#if defined(LN) || defined(LT)
	unpcklpd %xmm6, %xmm4
	movaps 0 * SIZE(B), %xmm2
	subps %xmm4, %xmm2
#else
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 0 * SIZE(AA), %xmm1
#ifdef movsd
	xorps %xmm5, %xmm5
#endif
	movsd 2 * SIZE(AA), %xmm5
	subps %xmm4, %xmm1
	subps %xmm6, %xmm5
#endif
#if defined(LN) || defined(LT)
	movaps 0 * SIZE(AA), %xmm5
	pshufd $0x44, %xmm5, %xmm6
	pshufd $0x11, %xmm5, %xmm7
	pshufd $0xa0, %xmm2, %xmm4
	pshufd $0xf5, %xmm2, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm4
#endif
	mulps %xmm6, %xmm4
	mulps %xmm7, %xmm2
	addps %xmm4, %xmm2
#endif
#ifdef RN
	movaps 0 * SIZE(B), %xmm4
	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7
	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7
	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm2
	subps %xmm3, %xmm5
	subps %xmm2, %xmm5
	movaps 4 * SIZE(B), %xmm4
	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7
	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm5
#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm3, %xmm5
#endif
#ifdef RT
	movaps 4 * SIZE(B), %xmm4
	pshufd $0xee, %xmm4, %xmm6
	pshufd $0xbb, %xmm4, %xmm7
	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm5
#ifndef CONJ
	xorps %xmm0, %xmm5
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm3, %xmm5
	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7
	pshufd $0xa0, %xmm5, %xmm3
	pshufd $0xf5, %xmm5, %xmm2
#ifndef CONJ
	xorps %xmm0, %xmm2
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm2
	subps %xmm3, %xmm1
	subps %xmm2, %xmm1
	movaps 0 * SIZE(B), %xmm4
	pshufd $0x44, %xmm4, %xmm6
	pshufd $0x11, %xmm4, %xmm7
	pshufd $0xa0, %xmm1, %xmm3
	pshufd $0xf5, %xmm1, %xmm1
#ifndef CONJ
	xorps %xmm0, %xmm1
#else
	xorps %xmm0, %xmm3
#endif
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm1
	addps %xmm3, %xmm1
#endif
#ifdef LN
	subl $2 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	movaps %xmm2, 0 * SIZE(B)
	pshufd $0x00, %xmm2, %xmm0
	pshufd $0x55, %xmm2, %xmm1
	pshufd $0xaa, %xmm2, %xmm4
	pshufd $0xff, %xmm2, %xmm5
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	movaps %xmm4, 8 * SIZE(BB)
	movaps %xmm5, 12 * SIZE(BB)
	movlps %xmm2, 0 * SIZE(CO1)
	movhps %xmm2, 0 * SIZE(CO1, LDC)
#else
	movlps %xmm1, 0 * SIZE(AA)
	movlps %xmm5, 2 * SIZE(AA)
	movlps %xmm1, 0 * SIZE(CO1)
	movlps %xmm5, 0 * SIZE(CO1, LDC)
#endif
#ifndef LN
	addl $2 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $ZBASE_SHIFT, %eax
	addl %eax, AA
#ifdef LT
	addl $4 * SIZE, B
#endif
#endif
#ifdef LN
	subl $1, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $1, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $ZBASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4

.L99:
#ifdef LN
	movl K, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, B
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	sall $1 + ZBASE_SHIFT, %eax
	addl %eax, B
#endif
#ifdef RN
	addl $2, KK
#endif
#ifdef RT
	subl $2, KK
#endif
	decl J	# j --
	jg .L01
	ALIGN_4

.L999:
	EMMS
	movl OLD_STACK, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
	EPILOGUE