
trsm_kernel_LN_2x4_penryn.S 36 kB
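This appears to be OpenBLAS's (originally GotoBLAS2) hand-written SSE2 kernel for double-precision TRSM (triangular solve with multiple right-hand sides), unrolled 2x4 and tuned for Penryn-class x86 CPUs; the LN/LT/RN/RT preprocessor branches in the listing select which side/transpose variant is generated at build time. As orientation only, here is a minimal scalar sketch of the underlying operation for the left-side, lower-triangular, non-transposed case in column-major storage. The function name is illustrative, and the plain lda/ldb layout is an assumption: the assembly itself works on packed panels with pre-inverted diagonal entries, so it multiplies by reciprocals where this sketch divides.

/* Plain forward substitution: solve L * X = B for X, where L is an
 * m x m lower-triangular matrix (non-unit diagonal) and B is m x n.
 * Both are column-major; X overwrites B in place. */
void trsm_ln_ref(int m, int n, const double *a, int lda,
                 double *b, int ldb)
{
    for (int j = 0; j < n; j++) {            /* each right-hand side  */
        for (int i = 0; i < m; i++) {        /* forward substitution  */
            double x = b[i + j * ldb];
            for (int k = 0; k < i; k++)      /* subtract solved terms */
                x -= a[i + k * lda] * b[k + j * ldb];
            b[i + j * ldb] = x / a[i + i * lda];
        }
    }
}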

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 16
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 24 + STACK + ARGS(%esp)
  47. #define ARG_B 28 + STACK + ARGS(%esp)
  48. #define C 32 + STACK + ARGS(%esp)
  49. #define ARG_LDC 36 + STACK + ARGS(%esp)
  50. #define OFFSET 40 + STACK + ARGS(%esp)
  51. #define J 0 + STACK(%esp)
  52. #define KK 4 + STACK(%esp)
  53. #define KKK 8 + STACK(%esp)
  54. #define AORIG 12 + STACK(%esp)
  55. #if defined(PENRYN) || defined(DUNNINGTON)
  56. #define PREFETCH prefetcht0
  57. #define PREFETCHSIZE (8 * 21 + 4)
  58. #endif
  59. #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE)
  60. #define PREFETCH prefetcht0
  61. #define PREFETCHSIZE (8 * 21 + 4)
  62. #endif
  63. #ifdef NANO
  64. #define PREFETCH prefetcht0
  65. #define PREFETCHSIZE (8 * 2)
  66. #endif
  67. #define AA %edx
  68. #define BB %ecx
  69. #define LDC %ebp
  70. #define B %edi
  71. #define CO1 %esi
  72. PROLOGUE
  73. subl $ARGS, %esp
  74. pushl %ebp
  75. pushl %edi
  76. pushl %esi
  77. pushl %ebx
  78. PROFCODE
  79. movl ARG_B, B
  80. movl ARG_LDC, LDC
  81. movl OFFSET, %eax
  82. #ifdef RN
  83. negl %eax
  84. #endif
  85. movl %eax, KK
  86. leal (, LDC, SIZE), LDC
  87. subl $-16 * SIZE, A
  88. subl $-16 * SIZE, B
  89. #ifdef LN
  90. movl M, %eax
  91. leal (, %eax, SIZE), %eax
  92. addl %eax, C
  93. imull K, %eax
  94. addl %eax, A
  95. #endif
  96. #ifdef RT
  97. movl N, %eax
  98. leal (, %eax, SIZE), %eax
  99. imull K, %eax
  100. addl %eax, B
  101. movl N, %eax
  102. imull LDC, %eax
  103. addl %eax, C
  104. #endif
  105. #ifdef RT
  106. movl N, %eax
  107. subl OFFSET, %eax
  108. movl %eax, KK
  109. #endif
  110. movl N, %eax
  111. sarl $2, %eax
  112. movl %eax, J
  113. jle .L30
  114. ALIGN_4
  115. .L10:
  116. #if defined(LT) || defined(RN)
  117. movl A, AA
  118. #else
  119. movl A, %eax
  120. movl %eax, AORIG
  121. #endif
  122. #ifdef RT
  123. movl K, %eax
  124. sall $2 + BASE_SHIFT, %eax
  125. subl %eax, B
  126. #endif
  127. leal (, LDC, 4), %eax
  128. #ifdef RT
  129. subl %eax, C
  130. #endif
  131. movl C, CO1
  132. #ifndef RT
  133. addl %eax, C
  134. #endif
  135. #ifdef LN
  136. movl OFFSET, %eax
  137. addl M, %eax
  138. movl %eax, KK
  139. #endif
  140. #ifdef LT
  141. movl OFFSET, %eax
  142. movl %eax, KK
  143. #endif
  144. movl M, %ebx
  145. testl $1, %ebx
  146. jle .L20
  147. #ifdef LN
  148. movl K, %eax
  149. sall $BASE_SHIFT, %eax
  150. subl %eax, AORIG
  151. #endif
  152. #if defined(LN) || defined(RT)
  153. movl KK, %eax
  154. movl AORIG, AA
  155. leal (AA, %eax, SIZE), AA
  156. #endif
  157. movl B, BB
  158. #if defined(LN) || defined(RT)
  159. movl KK, %eax
  160. sall $2 + BASE_SHIFT, %eax
  161. addl %eax, BB
  162. #endif
  163. movsd -16 * SIZE(AA), %xmm0
  164. movhps -15 * SIZE(AA), %xmm0
  165. pxor %xmm4, %xmm4
  166. movaps -16 * SIZE(BB), %xmm2
  167. pxor %xmm5, %xmm5
  168. movaps -14 * SIZE(BB), %xmm3
  169. pxor %xmm6, %xmm6
  170. pxor %xmm7, %xmm7
  171. #if defined(LT) || defined(RN)
  172. movl KK, %eax
  173. #else
  174. movl K, %eax
  175. subl KK, %eax
  176. #endif
  177. sarl $3, %eax
  178. je .L25
  179. ALIGN_4
  180. .L22:
  181. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  182. pshufd $0x44, %xmm0, %xmm1
  183. mulpd %xmm1, %xmm2
  184. mulpd %xmm1, %xmm3
  185. addpd %xmm2, %xmm4
  186. movaps -12 * SIZE(BB), %xmm2
  187. addpd %xmm3, %xmm5
  188. movaps -10 * SIZE(BB), %xmm3
  189. pshufd $0xee, %xmm0, %xmm1
  190. movaps -14 * SIZE(AA), %xmm0
  191. mulpd %xmm1, %xmm2
  192. mulpd %xmm1, %xmm3
  193. addpd %xmm2, %xmm6
  194. movaps -8 * SIZE(BB), %xmm2
  195. addpd %xmm3, %xmm7
  196. movaps -6 * SIZE(BB), %xmm3
  197. pshufd $0x44, %xmm0, %xmm1
  198. mulpd %xmm1, %xmm2
  199. mulpd %xmm1, %xmm3
  200. addpd %xmm2, %xmm4
  201. movaps -4 * SIZE(BB), %xmm2
  202. addpd %xmm3, %xmm5
  203. movaps -2 * SIZE(BB), %xmm3
  204. pshufd $0xee, %xmm0, %xmm1
  205. movaps -12 * SIZE(AA), %xmm0
  206. mulpd %xmm1, %xmm2
  207. mulpd %xmm1, %xmm3
  208. addpd %xmm2, %xmm6
  209. movaps 0 * SIZE(BB), %xmm2
  210. addpd %xmm3, %xmm7
  211. movaps 2 * SIZE(BB), %xmm3
  212. pshufd $0x44, %xmm0, %xmm1
  213. mulpd %xmm1, %xmm2
  214. mulpd %xmm1, %xmm3
  215. addpd %xmm2, %xmm4
  216. movaps 4 * SIZE(BB), %xmm2
  217. addpd %xmm3, %xmm5
  218. movaps 6 * SIZE(BB), %xmm3
  219. pshufd $0xee, %xmm0, %xmm1
  220. movaps -10 * SIZE(AA), %xmm0
  221. mulpd %xmm1, %xmm2
  222. mulpd %xmm1, %xmm3
  223. addpd %xmm2, %xmm6
  224. movaps 8 * SIZE(BB), %xmm2
  225. addpd %xmm3, %xmm7
  226. movaps 10 * SIZE(BB), %xmm3
  227. pshufd $0x44, %xmm0, %xmm1
  228. mulpd %xmm1, %xmm2
  229. mulpd %xmm1, %xmm3
  230. addpd %xmm2, %xmm4
  231. movaps 12 * SIZE(BB), %xmm2
  232. addpd %xmm3, %xmm5
  233. movaps 14 * SIZE(BB), %xmm3
  234. pshufd $0xee, %xmm0, %xmm1
  235. movaps -8 * SIZE(AA), %xmm0
  236. mulpd %xmm1, %xmm2
  237. mulpd %xmm1, %xmm3
  238. addpd %xmm2, %xmm6
  239. movaps 16 * SIZE(BB), %xmm2
  240. addpd %xmm3, %xmm7
  241. movaps 18 * SIZE(BB), %xmm3
  242. subl $ -8 * SIZE, AA
  243. subl $-32 * SIZE, BB
  244. subl $1, %eax
  245. jne .L22
  246. ALIGN_4
  247. .L25:
  248. #if defined(LT) || defined(RN)
  249. movl KK, %eax
  250. #else
  251. movl K, %eax
  252. subl KK, %eax
  253. #endif
  254. andl $7, %eax
  255. BRANCH
  256. je .L28
  257. .L26:
  258. pshufd $0x44, %xmm0, %xmm1
  259. movsd -15 * SIZE(AA), %xmm0
  260. mulpd %xmm1, %xmm2
  261. mulpd %xmm1, %xmm3
  262. addpd %xmm2, %xmm4
  263. movaps -12 * SIZE(BB), %xmm2
  264. addpd %xmm3, %xmm5
  265. movaps -10 * SIZE(BB), %xmm3
  266. addl $1 * SIZE, AA
  267. addl $4 * SIZE, BB
  268. decl %eax
  269. jg .L26
  270. ALIGN_4
  271. .L28:
  272. addpd %xmm6, %xmm4
  273. addpd %xmm7, %xmm5
  274. #if defined(LN) || defined(RT)
  275. movl KK, %eax
  276. #ifdef LN
  277. subl $1, %eax
  278. #else
  279. subl $4, %eax
  280. #endif
  281. movl AORIG, AA
  282. leal (, %eax, SIZE), %eax
  283. leal (AA, %eax, 1), AA
  284. leal (B, %eax, 4), BB
  285. #endif
  286. #if defined(LN) || defined(LT)
  287. movapd -16 * SIZE(BB), %xmm0
  288. movapd -14 * SIZE(BB), %xmm1
  289. subpd %xmm4, %xmm0
  290. subpd %xmm5, %xmm1
  291. #else
  292. movapd -16 * SIZE(AA), %xmm1
  293. movapd -14 * SIZE(AA), %xmm3
  294. subpd %xmm4, %xmm1
  295. subpd %xmm5, %xmm3
  296. movapd %xmm1, %xmm0
  297. unpckhpd %xmm1, %xmm1
  298. movapd %xmm3, %xmm2
  299. unpckhpd %xmm3, %xmm3
  300. #endif
  301. #ifdef LN
  302. movddup -16 * SIZE(AA), %xmm4
  303. mulpd %xmm4, %xmm0
  304. mulpd %xmm4, %xmm1
  305. #endif
  306. #ifdef LT
  307. movddup -16 * SIZE(AA), %xmm4
  308. mulpd %xmm4, %xmm0
  309. mulpd %xmm4, %xmm1
  310. #endif
  311. #ifdef RN
  312. movsd -16 * SIZE(BB), %xmm4
  313. mulsd %xmm4, %xmm0
  314. movsd -15 * SIZE(BB), %xmm4
  315. mulsd %xmm0, %xmm4
  316. subsd %xmm4, %xmm1
  317. movsd -14 * SIZE(BB), %xmm4
  318. mulsd %xmm0, %xmm4
  319. subsd %xmm4, %xmm2
  320. movsd -13 * SIZE(BB), %xmm4
  321. mulsd %xmm0, %xmm4
  322. subsd %xmm4, %xmm3
  323. movsd -11 * SIZE(BB), %xmm4
  324. mulsd %xmm4, %xmm1
  325. movsd -10 * SIZE(BB), %xmm4
  326. mulsd %xmm1, %xmm4
  327. subsd %xmm4, %xmm2
  328. movsd -9 * SIZE(BB), %xmm4
  329. mulsd %xmm1, %xmm4
  330. subsd %xmm4, %xmm3
  331. movsd -6 * SIZE(BB), %xmm4
  332. mulsd %xmm4, %xmm2
  333. movsd -5 * SIZE(BB), %xmm4
  334. mulsd %xmm2, %xmm4
  335. subsd %xmm4, %xmm3
  336. movsd -1 * SIZE(BB), %xmm4
  337. mulsd %xmm4, %xmm3
  338. #endif
  339. #ifdef RT
  340. movsd -1 * SIZE(BB), %xmm4
  341. mulsd %xmm4, %xmm3
  342. movsd -2 * SIZE(BB), %xmm4
  343. mulsd %xmm3, %xmm4
  344. subsd %xmm4, %xmm2
  345. movsd -3 * SIZE(BB), %xmm4
  346. mulsd %xmm3, %xmm4
  347. subsd %xmm4, %xmm1
  348. movsd -4 * SIZE(BB), %xmm4
  349. mulsd %xmm3, %xmm4
  350. subsd %xmm4, %xmm0
  351. movsd -6 * SIZE(BB), %xmm4
  352. mulsd %xmm4, %xmm2
  353. movsd -7 * SIZE(BB), %xmm4
  354. mulsd %xmm2, %xmm4
  355. subsd %xmm4, %xmm1
  356. movsd -8 * SIZE(BB), %xmm4
  357. mulsd %xmm2, %xmm4
  358. subsd %xmm4, %xmm0
  359. movsd -11 * SIZE(BB), %xmm4
  360. mulsd %xmm4, %xmm1
  361. movsd -12 * SIZE(BB), %xmm4
  362. mulsd %xmm1, %xmm4
  363. subsd %xmm4, %xmm0
  364. movsd -16 * SIZE(BB), %xmm4
  365. mulsd %xmm4, %xmm0
  366. #endif
  367. #if defined(LN) || defined(LT)
  368. movapd %xmm0, -16 * SIZE(BB)
  369. movapd %xmm1, -14 * SIZE(BB)
  370. #else
  371. movsd %xmm0, -16 * SIZE(AA)
  372. movsd %xmm1, -15 * SIZE(AA)
  373. movsd %xmm2, -14 * SIZE(AA)
  374. movsd %xmm3, -13 * SIZE(AA)
  375. #endif
  376. #ifdef LN
  377. subl $1 * SIZE, CO1
  378. #endif
  379. leal (LDC, LDC, 2), %eax
  380. #if defined(LN) || defined(LT)
  381. movsd %xmm0, 0 * SIZE(CO1)
  382. movhpd %xmm0, 0 * SIZE(CO1, LDC, 1)
  383. movsd %xmm1, 0 * SIZE(CO1, LDC, 2)
  384. movhpd %xmm1, 0 * SIZE(CO1, %eax, 1)
  385. #else
  386. movsd %xmm0, 0 * SIZE(CO1)
  387. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  388. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  389. movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
  390. #endif
  391. #ifndef LN
  392. addl $1 * SIZE, CO1
  393. #endif
  394. #if defined(LT) || defined(RN)
  395. movl K, %eax
  396. subl KK, %eax
  397. leal (,%eax, SIZE), %eax
  398. leal (AA, %eax, 1), AA
  399. leal (BB, %eax, 4), BB
  400. #endif
  401. #ifdef LN
  402. subl $1, KK
  403. #endif
  404. #ifdef LT
  405. addl $1, KK
  406. #endif
  407. #ifdef RT
  408. movl K, %eax
  409. sall $BASE_SHIFT, %eax
  410. addl %eax, AORIG
  411. #endif
  412. ALIGN_4
  413. .L20:
  414. movl M, %ebx
  415. sarl $1, %ebx
  416. jle .L29
  417. ALIGN_4
  418. .L11:
  419. #ifdef LN
  420. movl K, %eax
  421. sall $1 + BASE_SHIFT, %eax
  422. subl %eax, AORIG
  423. #endif
  424. #if defined(LN) || defined(RT)
  425. movl KK, %eax
  426. movl AORIG, AA
  427. leal (, %eax, SIZE), %eax
  428. leal (AA, %eax, 2), AA
  429. #endif
  430. movl B, BB
  431. #if defined(LN) || defined(RT)
  432. movl KK, %eax
  433. sall $2 + BASE_SHIFT, %eax
  434. addl %eax, BB
  435. #endif
  436. leal (CO1, LDC, 2), %eax
  437. movaps -16 * SIZE(AA), %xmm0
  438. pxor %xmm2, %xmm2
  439. movaps -16 * SIZE(BB), %xmm1
  440. pxor %xmm3, %xmm3
  441. #ifdef LN
  442. pxor %xmm4, %xmm4
  443. prefetcht0 -2 * SIZE(CO1)
  444. pxor %xmm5, %xmm5
  445. prefetcht0 -2 * SIZE(CO1, LDC)
  446. pxor %xmm6, %xmm6
  447. prefetcht0 -2 * SIZE(%eax)
  448. pxor %xmm7, %xmm7
  449. prefetcht0 -2 * SIZE(%eax, LDC)
  450. #else
  451. pxor %xmm4, %xmm4
  452. prefetcht0 1 * SIZE(CO1)
  453. pxor %xmm5, %xmm5
  454. prefetcht0 1 * SIZE(CO1, LDC)
  455. pxor %xmm6, %xmm6
  456. prefetcht0 1 * SIZE(%eax)
  457. pxor %xmm7, %xmm7
  458. prefetcht0 1 * SIZE(%eax, LDC)
  459. #endif
  460. #if defined(LT) || defined(RN)
  461. movl KK, %eax
  462. #else
  463. movl K, %eax
  464. subl KK, %eax
  465. #endif
  466. sarl $3, %eax
  467. je .L15
  468. ALIGN_4
  469. .L12:
  470. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  471. addpd %xmm3, %xmm7
  472. movaps -14 * SIZE(BB), %xmm3
  473. addpd %xmm2, %xmm6
  474. pshufd $0x4e, %xmm1, %xmm2
  475. mulpd %xmm0, %xmm1
  476. mulpd %xmm0, %xmm2
  477. addpd %xmm1, %xmm5
  478. movaps -12 * SIZE(BB), %xmm1
  479. addpd %xmm2, %xmm4
  480. pshufd $0x4e, %xmm3, %xmm2
  481. mulpd %xmm0, %xmm3
  482. mulpd %xmm0, %xmm2
  483. movaps -14 * SIZE(AA), %xmm0
  484. addpd %xmm3, %xmm7
  485. movaps -10 * SIZE(BB), %xmm3
  486. addpd %xmm2, %xmm6
  487. pshufd $0x4e, %xmm1, %xmm2
  488. mulpd %xmm0, %xmm1
  489. mulpd %xmm0, %xmm2
  490. addpd %xmm1, %xmm5
  491. movaps -8 * SIZE(BB), %xmm1
  492. addpd %xmm2, %xmm4
  493. pshufd $0x4e, %xmm3, %xmm2
  494. mulpd %xmm0, %xmm3
  495. mulpd %xmm0, %xmm2
  496. movaps -12 * SIZE(AA), %xmm0
  497. addpd %xmm3, %xmm7
  498. movaps -6 * SIZE(BB), %xmm3
  499. addpd %xmm2, %xmm6
  500. pshufd $0x4e, %xmm1, %xmm2
  501. mulpd %xmm0, %xmm1
  502. mulpd %xmm0, %xmm2
  503. addpd %xmm1, %xmm5
  504. movaps -4 * SIZE(BB), %xmm1
  505. addpd %xmm2, %xmm4
  506. pshufd $0x4e, %xmm3, %xmm2
  507. mulpd %xmm0, %xmm3
  508. mulpd %xmm0, %xmm2
  509. movaps -10 * SIZE(AA), %xmm0
  510. addpd %xmm3, %xmm7
  511. movaps -2 * SIZE(BB), %xmm3
  512. addpd %xmm2, %xmm6
  513. pshufd $0x4e, %xmm1, %xmm2
  514. mulpd %xmm0, %xmm1
  515. mulpd %xmm0, %xmm2
  516. addpd %xmm1, %xmm5
  517. movaps 0 * SIZE(BB), %xmm1
  518. addpd %xmm2, %xmm4
  519. pshufd $0x4e, %xmm3, %xmm2
  520. mulpd %xmm0, %xmm3
  521. mulpd %xmm0, %xmm2
  522. movaps -8 * SIZE(AA), %xmm0
  523. PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
  524. addpd %xmm3, %xmm7
  525. movaps 2 * SIZE(BB), %xmm3
  526. addpd %xmm2, %xmm6
  527. pshufd $0x4e, %xmm1, %xmm2
  528. mulpd %xmm0, %xmm1
  529. mulpd %xmm0, %xmm2
  530. addpd %xmm1, %xmm5
  531. movaps 4 * SIZE(BB), %xmm1
  532. addpd %xmm2, %xmm4
  533. pshufd $0x4e, %xmm3, %xmm2
  534. mulpd %xmm0, %xmm3
  535. mulpd %xmm0, %xmm2
  536. movaps -6 * SIZE(AA), %xmm0
  537. addpd %xmm3, %xmm7
  538. movaps 6 * SIZE(BB), %xmm3
  539. addpd %xmm2, %xmm6
  540. pshufd $0x4e, %xmm1, %xmm2
  541. mulpd %xmm0, %xmm1
  542. mulpd %xmm0, %xmm2
  543. addpd %xmm1, %xmm5
  544. movaps 8 * SIZE(BB), %xmm1
  545. addpd %xmm2, %xmm4
  546. pshufd $0x4e, %xmm3, %xmm2
  547. mulpd %xmm0, %xmm3
  548. mulpd %xmm0, %xmm2
  549. movaps -4 * SIZE(AA), %xmm0
  550. addpd %xmm3, %xmm7
  551. movaps 10 * SIZE(BB), %xmm3
  552. addpd %xmm2, %xmm6
  553. pshufd $0x4e, %xmm1, %xmm2
  554. mulpd %xmm0, %xmm1
  555. mulpd %xmm0, %xmm2
  556. addpd %xmm1, %xmm5
  557. movaps 12 * SIZE(BB), %xmm1
  558. addpd %xmm2, %xmm4
  559. pshufd $0x4e, %xmm3, %xmm2
  560. mulpd %xmm0, %xmm3
  561. mulpd %xmm0, %xmm2
  562. movaps -2 * SIZE(AA), %xmm0
  563. addpd %xmm3, %xmm7
  564. movaps 14 * SIZE(BB), %xmm3
  565. addpd %xmm2, %xmm6
  566. pshufd $0x4e, %xmm1, %xmm2
  567. mulpd %xmm0, %xmm1
  568. mulpd %xmm0, %xmm2
  569. addpd %xmm1, %xmm5
  570. movaps 16 * SIZE(BB), %xmm1
  571. addpd %xmm2, %xmm4
  572. pshufd $0x4e, %xmm3, %xmm2
  573. mulpd %xmm0, %xmm3
  574. subl $-32 * SIZE, BB
  575. mulpd %xmm0, %xmm2
  576. movaps 0 * SIZE(AA), %xmm0
  577. subl $-16 * SIZE, AA
  578. subl $1, %eax
  579. jne .L12
  580. ALIGN_4
  581. .L15:
  582. #if defined(LT) || defined(RN)
  583. movl KK, %eax
  584. #else
  585. movl K, %eax
  586. subl KK, %eax
  587. #endif
  588. andl $7, %eax # remainder: k & 7
  589. BRANCH
  590. je .L18
  591. ALIGN_3
  592. .L16:
  593. addpd %xmm3, %xmm7
  594. movaps -14 * SIZE(BB), %xmm3
  595. addpd %xmm2, %xmm6
  596. pshufd $0x4e, %xmm1, %xmm2
  597. mulpd %xmm0, %xmm1
  598. mulpd %xmm0, %xmm2
  599. addpd %xmm1, %xmm5
  600. movaps -12 * SIZE(BB), %xmm1
  601. addpd %xmm2, %xmm4
  602. pshufd $0x4e, %xmm3, %xmm2
  603. mulpd %xmm0, %xmm3
  604. mulpd %xmm0, %xmm2
  605. movaps -14 * SIZE(AA), %xmm0
  606. addl $2 * SIZE, AA
  607. addl $4 * SIZE, BB
  608. decl %eax
  609. jg .L16
  610. ALIGN_4
  611. .L18:
  612. #if defined(LN) || defined(RT)
  613. movl KK, %eax
  614. #ifdef LN
  615. subl $2, %eax
  616. #else
  617. subl $4, %eax
  618. #endif
  619. movl AORIG, AA
  620. leal (, %eax, SIZE), %eax
  621. leal (AA, %eax, 2), AA
  622. leal (B, %eax, 4), BB
  623. #endif
  624. addpd %xmm2, %xmm6
  625. addpd %xmm3, %xmm7
  626. movaps %xmm4, %xmm0
  627. movsd %xmm5, %xmm4
  628. movsd %xmm0, %xmm5
  629. movaps %xmm6, %xmm0
  630. movsd %xmm7, %xmm6
  631. movsd %xmm0, %xmm7
  632. #if defined(LN) || defined(LT)
  633. movapd %xmm4, %xmm0
  634. unpcklpd %xmm5, %xmm4
  635. unpckhpd %xmm5, %xmm0
  636. movapd %xmm6, %xmm1
  637. unpcklpd %xmm7, %xmm6
  638. unpckhpd %xmm7, %xmm1
  639. movapd -16 * SIZE(BB), %xmm2
  640. movapd -14 * SIZE(BB), %xmm5
  641. movapd -12 * SIZE(BB), %xmm3
  642. movapd -10 * SIZE(BB), %xmm7
  643. subpd %xmm4, %xmm2
  644. subpd %xmm6, %xmm5
  645. subpd %xmm0, %xmm3
  646. subpd %xmm1, %xmm7
  647. #else
  648. movapd -16 * SIZE(AA), %xmm0
  649. movapd -14 * SIZE(AA), %xmm1
  650. movapd -12 * SIZE(AA), %xmm2
  651. movapd -10 * SIZE(AA), %xmm3
  652. subpd %xmm4, %xmm0
  653. subpd %xmm5, %xmm1
  654. subpd %xmm6, %xmm2
  655. subpd %xmm7, %xmm3
  656. #endif
  657. #ifdef LN
  658. movddup -13 * SIZE(AA), %xmm4
  659. mulpd %xmm4, %xmm3
  660. mulpd %xmm4, %xmm7
  661. movddup -14 * SIZE(AA), %xmm4
  662. movapd %xmm4, %xmm6
  663. mulpd %xmm3, %xmm4
  664. subpd %xmm4, %xmm2
  665. mulpd %xmm7, %xmm6
  666. subpd %xmm6, %xmm5
  667. movddup -16 * SIZE(AA), %xmm4
  668. mulpd %xmm4, %xmm2
  669. mulpd %xmm4, %xmm5
  670. #endif
  671. #ifdef LT
  672. movddup -16 * SIZE(AA), %xmm4
  673. mulpd %xmm4, %xmm2
  674. mulpd %xmm4, %xmm5
  675. movddup -15 * SIZE(AA), %xmm4
  676. movapd %xmm4, %xmm6
  677. mulpd %xmm2, %xmm4
  678. subpd %xmm4, %xmm3
  679. mulpd %xmm5, %xmm6
  680. subpd %xmm6, %xmm7
  681. movddup -13 * SIZE(AA), %xmm4
  682. mulpd %xmm4, %xmm3
  683. mulpd %xmm4, %xmm7
  684. #endif
  685. #ifdef RN
  686. movddup -16 * SIZE(BB), %xmm4
  687. mulpd %xmm4, %xmm0
  688. movddup -15 * SIZE(BB), %xmm4
  689. mulpd %xmm0, %xmm4
  690. subpd %xmm4, %xmm1
  691. movddup -14 * SIZE(BB), %xmm4
  692. mulpd %xmm0, %xmm4
  693. subpd %xmm4, %xmm2
  694. movddup -13 * SIZE(BB), %xmm4
  695. mulpd %xmm0, %xmm4
  696. subpd %xmm4, %xmm3
  697. movddup -11 * SIZE(BB), %xmm4
  698. mulpd %xmm4, %xmm1
  699. movddup -10 * SIZE(BB), %xmm4
  700. mulpd %xmm1, %xmm4
  701. subpd %xmm4, %xmm2
  702. movddup -9 * SIZE(BB), %xmm4
  703. mulpd %xmm1, %xmm4
  704. subpd %xmm4, %xmm3
  705. movddup -6 * SIZE(BB), %xmm4
  706. mulpd %xmm4, %xmm2
  707. movddup -5 * SIZE(BB), %xmm4
  708. mulpd %xmm2, %xmm4
  709. subpd %xmm4, %xmm3
  710. movddup -1 * SIZE(BB), %xmm4
  711. mulpd %xmm4, %xmm3
  712. #endif
  713. #ifdef RT
  714. movddup -1 * SIZE(BB), %xmm4
  715. mulpd %xmm4, %xmm3
  716. movddup -2 * SIZE(BB), %xmm4
  717. mulpd %xmm3, %xmm4
  718. subpd %xmm4, %xmm2
  719. movddup -3 * SIZE(BB), %xmm4
  720. mulpd %xmm3, %xmm4
  721. subpd %xmm4, %xmm1
  722. movddup -4 * SIZE(BB), %xmm4
  723. mulpd %xmm3, %xmm4
  724. subpd %xmm4, %xmm0
  725. movddup -6 * SIZE(BB), %xmm4
  726. mulpd %xmm4, %xmm2
  727. movddup -7 * SIZE(BB), %xmm4
  728. mulpd %xmm2, %xmm4
  729. subpd %xmm4, %xmm1
  730. movddup -8 * SIZE(BB), %xmm4
  731. mulpd %xmm2, %xmm4
  732. subpd %xmm4, %xmm0
  733. movddup -11 * SIZE(BB), %xmm4
  734. mulpd %xmm4, %xmm1
  735. movddup -12 * SIZE(BB), %xmm4
  736. mulpd %xmm1, %xmm4
  737. subpd %xmm4, %xmm0
  738. movddup -16 * SIZE(BB), %xmm4
  739. mulpd %xmm4, %xmm0
  740. #endif
  741. #if defined(LN) || defined(LT)
  742. movapd %xmm2, -16 * SIZE(BB)
  743. movapd %xmm5, -14 * SIZE(BB)
  744. movapd %xmm3, -12 * SIZE(BB)
  745. movapd %xmm7, -10 * SIZE(BB)
  746. #else
  747. movapd %xmm0, -16 * SIZE(AA)
  748. movapd %xmm1, -14 * SIZE(AA)
  749. movapd %xmm2, -12 * SIZE(AA)
  750. movapd %xmm3, -10 * SIZE(AA)
  751. #endif
  752. #ifdef LN
  753. subl $2 * SIZE, CO1
  754. #endif
  755. leal (LDC, LDC, 2), %eax
  756. #if defined(LN) || defined(LT)
  757. movsd %xmm2, 0 * SIZE(CO1)
  758. movsd %xmm3, 1 * SIZE(CO1)
  759. movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
  760. movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
  761. movsd %xmm5, 0 * SIZE(CO1, LDC, 2)
  762. movsd %xmm7, 1 * SIZE(CO1, LDC, 2)
  763. movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
  764. movhpd %xmm7, 1 * SIZE(CO1, %eax, 1)
  765. #else
  766. movsd %xmm0, 0 * SIZE(CO1)
  767. movhpd %xmm0, 1 * SIZE(CO1)
  768. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  769. movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
  770. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  771. movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
  772. movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
  773. movhpd %xmm3, 1 * SIZE(CO1, %eax, 1)
  774. #endif
  775. #ifndef LN
  776. addl $2 * SIZE, CO1
  777. #endif
  778. #if defined(LT) || defined(RN)
  779. movl K, %eax
  780. subl KK, %eax
  781. leal (,%eax, SIZE), %eax
  782. leal (AA, %eax, 2), AA
  783. leal (BB, %eax, 4), BB
  784. #endif
  785. #ifdef LN
  786. subl $2, KK
  787. #endif
  788. #ifdef LT
  789. addl $2, KK
  790. #endif
  791. #ifdef RT
  792. movl K, %eax
  793. sall $1 + BASE_SHIFT, %eax
  794. addl %eax, AORIG
  795. #endif
  796. decl %ebx # i --
  797. jg .L11
  798. ALIGN_4
  799. .L29:
  800. #ifdef LN
  801. movl K, %eax
  802. leal (, %eax, SIZE), %eax
  803. leal (B, %eax, 4), B
  804. #endif
  805. #if defined(LT) || defined(RN)
  806. movl BB, B
  807. #endif
  808. #ifdef RN
  809. addl $4, KK
  810. #endif
  811. #ifdef RT
  812. subl $4, KK
  813. #endif
  814. decl J # j --
  815. jg .L10
  816. ALIGN_4
  817. .L30:
  818. testl $2, N
  819. je .L60
  820. #if defined(LT) || defined(RN)
  821. movl A, AA
  822. #else
  823. movl A, %eax
  824. movl %eax, AORIG
  825. #endif
  826. #ifdef RT
  827. movl K, %eax
  828. sall $1 + BASE_SHIFT, %eax
  829. subl %eax, B
  830. #endif
  831. leal (, LDC, 2), %eax
  832. #ifdef RT
  833. subl %eax, C
  834. #endif
  835. movl C, CO1
  836. #ifndef RT
  837. addl %eax, C
  838. #endif
  839. #ifdef LN
  840. movl OFFSET, %eax
  841. addl M, %eax
  842. movl %eax, KK
  843. #endif
  844. #ifdef LT
  845. movl OFFSET, %eax
  846. movl %eax, KK
  847. #endif
  848. movl M, %ebx
  849. testl $1, %ebx
  850. jle .L50
  851. #ifdef LN
  852. movl K, %eax
  853. sall $BASE_SHIFT, %eax
  854. subl %eax, AORIG
  855. #endif
  856. #if defined(LN) || defined(RT)
  857. movl KK, %eax
  858. movl AORIG, AA
  859. leal (AA, %eax, SIZE), AA
  860. #endif
  861. movl B, BB
  862. #if defined(LN) || defined(RT)
  863. movl KK, %eax
  864. sall $1 + BASE_SHIFT, %eax
  865. addl %eax, BB
  866. #endif
  867. movsd -16 * SIZE(AA), %xmm0
  868. movhps -15 * SIZE(AA), %xmm0
  869. pxor %xmm4, %xmm4
  870. movsd -16 * SIZE(BB), %xmm2
  871. movhps -15 * SIZE(BB), %xmm2
  872. pxor %xmm5, %xmm5
  873. #if defined(LT) || defined(RN)
  874. movl KK, %eax
  875. #else
  876. movl K, %eax
  877. subl KK, %eax
  878. #endif
  879. sarl $3, %eax
  880. je .L55
  881. ALIGN_4
  882. .L52:
  883. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  884. pshufd $0x44, %xmm0, %xmm1
  885. mulpd %xmm1, %xmm2
  886. addpd %xmm2, %xmm4
  887. movaps -14 * SIZE(BB), %xmm2
  888. pshufd $0xee, %xmm0, %xmm1
  889. movaps -14 * SIZE(AA), %xmm0
  890. mulpd %xmm1, %xmm2
  891. addpd %xmm2, %xmm5
  892. movaps -12 * SIZE(BB), %xmm2
  893. pshufd $0x44, %xmm0, %xmm1
  894. mulpd %xmm1, %xmm2
  895. addpd %xmm2, %xmm4
  896. movaps -10 * SIZE(BB), %xmm2
  897. pshufd $0xee, %xmm0, %xmm1
  898. movaps -12 * SIZE(AA), %xmm0
  899. mulpd %xmm1, %xmm2
  900. addpd %xmm2, %xmm5
  901. movaps -8 * SIZE(BB), %xmm2
  902. pshufd $0x44, %xmm0, %xmm1
  903. mulpd %xmm1, %xmm2
  904. addpd %xmm2, %xmm4
  905. movaps -6 * SIZE(BB), %xmm2
  906. pshufd $0xee, %xmm0, %xmm1
  907. movaps -10 * SIZE(AA), %xmm0
  908. mulpd %xmm1, %xmm2
  909. addpd %xmm2, %xmm5
  910. movaps -4 * SIZE(BB), %xmm2
  911. pshufd $0x44, %xmm0, %xmm1
  912. mulpd %xmm1, %xmm2
  913. addpd %xmm2, %xmm4
  914. movaps -2 * SIZE(BB), %xmm2
  915. pshufd $0xee, %xmm0, %xmm1
  916. movaps -8 * SIZE(AA), %xmm0
  917. mulpd %xmm1, %xmm2
  918. addpd %xmm2, %xmm5
  919. movaps 0 * SIZE(BB), %xmm2
  920. subl $ -8 * SIZE, AA
  921. subl $-16 * SIZE, BB
  922. subl $1, %eax
  923. jne .L52
  924. ALIGN_4
  925. .L55:
  926. #if defined(LT) || defined(RN)
  927. movl KK, %eax
  928. #else
  929. movl K, %eax
  930. subl KK, %eax
  931. #endif
  932. andl $7, %eax
  933. BRANCH
  934. je .L58
  935. .L56:
  936. pshufd $0x44, %xmm0, %xmm1
  937. movsd -15 * SIZE(AA), %xmm0
  938. mulpd %xmm1, %xmm2
  939. addpd %xmm2, %xmm4
  940. movaps -14 * SIZE(BB), %xmm2
  941. addl $1 * SIZE, AA
  942. addl $2 * SIZE, BB
  943. decl %eax
  944. jg .L56
  945. ALIGN_4
  946. .L58:
  947. addpd %xmm5, %xmm4
  948. #if defined(LN) || defined(RT)
  949. movl KK, %eax
  950. #ifdef LN
  951. subl $1, %eax
  952. #else
  953. subl $2, %eax
  954. #endif
  955. movl AORIG, AA
  956. leal (, %eax, SIZE), %eax
  957. addl %eax, AA
  958. leal (B, %eax, 2), BB
  959. #endif
  960. #if defined(LN) || defined(LT)
  961. movapd -16 * SIZE(BB), %xmm0
  962. subpd %xmm4, %xmm0
  963. #else
  964. movapd -16 * SIZE(AA), %xmm1
  965. subpd %xmm4, %xmm1
  966. movapd %xmm1, %xmm0
  967. unpckhpd %xmm1, %xmm1
  968. #endif
  969. #ifdef LN
  970. movddup -16 * SIZE(AA), %xmm4
  971. mulpd %xmm4, %xmm0
  972. #endif
  973. #ifdef LT
  974. movddup -16 * SIZE(AA), %xmm4
  975. mulpd %xmm4, %xmm0
  976. #endif
  977. #ifdef RN
  978. movsd -16 * SIZE(BB), %xmm4
  979. mulsd %xmm4, %xmm0
  980. movsd -15 * SIZE(BB), %xmm4
  981. mulsd %xmm0, %xmm4
  982. subsd %xmm4, %xmm1
  983. movsd -13 * SIZE(BB), %xmm4
  984. mulsd %xmm4, %xmm1
  985. #endif
  986. #ifdef RT
  987. movsd -13 * SIZE(BB), %xmm4
  988. mulsd %xmm4, %xmm1
  989. movsd -14 * SIZE(BB), %xmm4
  990. mulsd %xmm1, %xmm4
  991. subsd %xmm4, %xmm0
  992. movsd -16 * SIZE(BB), %xmm4
  993. mulsd %xmm4, %xmm0
  994. #endif
  995. #if defined(LN) || defined(LT)
  996. movapd %xmm0, -16 * SIZE(BB)
  997. #else
  998. movsd %xmm0, -16 * SIZE(AA)
  999. movsd %xmm1, -15 * SIZE(AA)
  1000. #endif
  1001. #ifdef LN
  1002. subl $1 * SIZE, CO1
  1003. #endif
  1004. #if defined(LN) || defined(LT)
  1005. movsd %xmm0, 0 * SIZE(CO1)
  1006. movhpd %xmm0, 0 * SIZE(CO1, LDC, 1)
  1007. #else
  1008. movsd %xmm0, 0 * SIZE(CO1)
  1009. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1010. #endif
  1011. #ifndef LN
  1012. addl $1 * SIZE, CO1
  1013. #endif
  1014. #if defined(LT) || defined(RN)
  1015. movl K, %eax
  1016. subl KK, %eax
  1017. leal (,%eax, SIZE), %eax
  1018. leal (AA, %eax, 1), AA
  1019. leal (BB, %eax, 2), BB
  1020. #endif
  1021. #ifdef LN
  1022. subl $1, KK
  1023. #endif
  1024. #ifdef LT
  1025. addl $1, KK
  1026. #endif
  1027. #ifdef RT
  1028. movl K, %eax
  1029. sall $1 + BASE_SHIFT, %eax
  1030. addl %eax, AORIG
  1031. #endif
  1032. ALIGN_4
  1033. .L50:
  1034. movl M, %ebx
  1035. sarl $1, %ebx
  1036. jle .L59
  1037. ALIGN_4
  1038. .L41:
  1039. #ifdef LN
  1040. movl K, %eax
  1041. sall $1 + BASE_SHIFT, %eax
  1042. subl %eax, AORIG
  1043. #endif
  1044. #if defined(LN) || defined(RT)
  1045. movl KK, %eax
  1046. movl AORIG, AA
  1047. leal (, %eax, SIZE), %eax
  1048. leal (AA, %eax, 2), AA
  1049. #endif
  1050. movl B, BB
  1051. #if defined(LN) || defined(RT)
  1052. movl KK, %eax
  1053. sall $1 + BASE_SHIFT, %eax
  1054. addl %eax, BB
  1055. #endif
  1056. movaps -16 * SIZE(AA), %xmm0
  1057. pxor %xmm4, %xmm4
  1058. movaps -16 * SIZE(BB), %xmm1
  1059. pxor %xmm5, %xmm5
  1060. #ifdef LN
  1061. prefetcht0 -2 * SIZE(CO1)
  1062. pxor %xmm6, %xmm6
  1063. prefetcht0 -2 * SIZE(CO1, LDC)
  1064. pxor %xmm7, %xmm7
  1065. #else
  1066. prefetcht0 1 * SIZE(CO1)
  1067. pxor %xmm6, %xmm6
  1068. prefetcht0 1 * SIZE(CO1, LDC)
  1069. pxor %xmm7, %xmm7
  1070. #endif
  1071. #if defined(LT) || defined(RN)
  1072. movl KK, %eax
  1073. #else
  1074. movl K, %eax
  1075. subl KK, %eax
  1076. #endif
  1077. sarl $3, %eax
  1078. je .L45
  1079. ALIGN_4
  1080. .L42:
  1081. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1082. pshufd $0x4e, %xmm1, %xmm2
  1083. mulpd %xmm0, %xmm1
  1084. mulpd %xmm0, %xmm2
  1085. movaps -14 * SIZE(AA), %xmm0
  1086. addpd %xmm1, %xmm5
  1087. movaps -14 * SIZE(BB), %xmm1
  1088. addpd %xmm2, %xmm4
  1089. pshufd $0x4e, %xmm1, %xmm2
  1090. mulpd %xmm0, %xmm1
  1091. mulpd %xmm0, %xmm2
  1092. movaps -12 * SIZE(AA), %xmm0
  1093. addpd %xmm1, %xmm7
  1094. movaps -12 * SIZE(BB), %xmm1
  1095. addpd %xmm2, %xmm6
  1096. pshufd $0x4e, %xmm1, %xmm2
  1097. mulpd %xmm0, %xmm1
  1098. mulpd %xmm0, %xmm2
  1099. movaps -10 * SIZE(AA), %xmm0
  1100. addpd %xmm1, %xmm5
  1101. movaps -10 * SIZE(BB), %xmm1
  1102. addpd %xmm2, %xmm4
  1103. pshufd $0x4e, %xmm1, %xmm2
  1104. mulpd %xmm0, %xmm1
  1105. mulpd %xmm0, %xmm2
  1106. movaps -8 * SIZE(AA), %xmm0
  1107. addpd %xmm1, %xmm7
  1108. movaps -8 * SIZE(BB), %xmm1
  1109. addpd %xmm2, %xmm6
  1110. PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
  1111. pshufd $0x4e, %xmm1, %xmm2
  1112. mulpd %xmm0, %xmm1
  1113. mulpd %xmm0, %xmm2
  1114. movaps -6 * SIZE(AA), %xmm0
  1115. addpd %xmm1, %xmm5
  1116. movaps -6 * SIZE(BB), %xmm1
  1117. addpd %xmm2, %xmm4
  1118. pshufd $0x4e, %xmm1, %xmm2
  1119. mulpd %xmm0, %xmm1
  1120. mulpd %xmm0, %xmm2
  1121. movaps -4 * SIZE(AA), %xmm0
  1122. addpd %xmm1, %xmm7
  1123. movaps -4 * SIZE(BB), %xmm1
  1124. addpd %xmm2, %xmm6
  1125. pshufd $0x4e, %xmm1, %xmm2
  1126. mulpd %xmm0, %xmm1
  1127. mulpd %xmm0, %xmm2
  1128. movaps -2 * SIZE(AA), %xmm0
  1129. addpd %xmm1, %xmm5
  1130. movaps -2 * SIZE(BB), %xmm1
  1131. addpd %xmm2, %xmm4
  1132. pshufd $0x4e, %xmm1, %xmm2
  1133. mulpd %xmm0, %xmm1
  1134. mulpd %xmm0, %xmm2
  1135. movaps 0 * SIZE(AA), %xmm0
  1136. addpd %xmm1, %xmm7
  1137. movaps 0 * SIZE(BB), %xmm1
  1138. addpd %xmm2, %xmm6
  1139. subl $-16 * SIZE, AA
  1140. subl $-16 * SIZE, BB
  1141. subl $1, %eax
  1142. jne .L42
  1143. ALIGN_4
  1144. .L45:
  1145. #if defined(LT) || defined(RN)
  1146. movl KK, %eax
  1147. #else
  1148. movl K, %eax
  1149. subl KK, %eax
  1150. #endif
  1151. andl $7, %eax # remainder: k & 7
  1152. BRANCH
  1153. je .L48
  1154. ALIGN_3
  1155. .L46:
  1156. pshufd $0x4e, %xmm1, %xmm2
  1157. mulpd %xmm0, %xmm1
  1158. mulpd %xmm0, %xmm2
  1159. movaps -14 * SIZE(AA), %xmm0
  1160. addpd %xmm1, %xmm5
  1161. movaps -14 * SIZE(BB), %xmm1
  1162. addpd %xmm2, %xmm4
  1163. addl $2 * SIZE, AA
  1164. addl $2 * SIZE, BB
  1165. decl %eax
  1166. jg .L46
  1167. ALIGN_4
  1168. .L48:
  1169. addpd %xmm6, %xmm4
  1170. addpd %xmm7, %xmm5
  1171. movaps %xmm4, %xmm0
  1172. movsd %xmm5, %xmm4
  1173. movsd %xmm0, %xmm5
  1174. #if defined(LN) || defined(RT)
  1175. movl KK, %eax
  1176. #ifdef LN
  1177. subl $2, %eax
  1178. #else
  1179. subl $2, %eax
  1180. #endif
  1181. movl AORIG, AA
  1182. leal (, %eax, SIZE), %eax
  1183. leal (AA, %eax, 2), AA
  1184. leal (B, %eax, 2), BB
  1185. #endif
  1186. #if defined(LN) || defined(LT)
  1187. movapd %xmm4, %xmm0
  1188. unpcklpd %xmm5, %xmm4
  1189. unpckhpd %xmm5, %xmm0
  1190. movapd -16 * SIZE(BB), %xmm2
  1191. movapd -14 * SIZE(BB), %xmm3
  1192. subpd %xmm4, %xmm2
  1193. subpd %xmm0, %xmm3
  1194. #else
  1195. movapd -16 * SIZE(AA), %xmm0
  1196. movapd -14 * SIZE(AA), %xmm1
  1197. subpd %xmm4, %xmm0
  1198. subpd %xmm5, %xmm1
  1199. #endif
  1200. #ifdef LN
  1201. movddup -13 * SIZE(AA), %xmm4
  1202. mulpd %xmm4, %xmm3
  1203. movddup -14 * SIZE(AA), %xmm4
  1204. mulpd %xmm3, %xmm4
  1205. subpd %xmm4, %xmm2
  1206. movddup -16 * SIZE(AA), %xmm4
  1207. mulpd %xmm4, %xmm2
  1208. #endif
  1209. #ifdef LT
  1210. movddup -16 * SIZE(AA), %xmm4
  1211. mulpd %xmm4, %xmm2
  1212. movddup -15 * SIZE(AA), %xmm4
  1213. mulpd %xmm2, %xmm4
  1214. subpd %xmm4, %xmm3
  1215. movddup -13 * SIZE(AA), %xmm4
  1216. mulpd %xmm4, %xmm3
  1217. #endif
  1218. #ifdef RN
  1219. movddup -16 * SIZE(BB), %xmm4
  1220. mulpd %xmm4, %xmm0
  1221. movddup -15 * SIZE(BB), %xmm4
  1222. mulpd %xmm0, %xmm4
  1223. subpd %xmm4, %xmm1
  1224. movddup -13 * SIZE(BB), %xmm4
  1225. mulpd %xmm4, %xmm1
  1226. #endif
  1227. #ifdef RT
  1228. movddup -13 * SIZE(BB), %xmm4
  1229. mulpd %xmm4, %xmm1
  1230. movddup -14 * SIZE(BB), %xmm4
  1231. mulpd %xmm1, %xmm4
  1232. subpd %xmm4, %xmm0
  1233. movddup -16 * SIZE(BB), %xmm4
  1234. mulpd %xmm4, %xmm0
  1235. #endif
  1236. #if defined(LN) || defined(LT)
  1237. movapd %xmm2, -16 * SIZE(BB)
  1238. movapd %xmm3, -14 * SIZE(BB)
  1239. #else
  1240. movapd %xmm0, -16 * SIZE(AA)
  1241. movapd %xmm1, -14 * SIZE(AA)
  1242. #endif
  1243. #ifdef LN
  1244. subl $2 * SIZE, CO1
  1245. #endif
  1246. #if defined(LN) || defined(LT)
  1247. movsd %xmm2, 0 * SIZE(CO1)
  1248. movsd %xmm3, 1 * SIZE(CO1)
  1249. movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
  1250. movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
  1251. #else
  1252. movsd %xmm0, 0 * SIZE(CO1)
  1253. movhpd %xmm0, 1 * SIZE(CO1)
  1254. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1255. movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
  1256. #endif
  1257. #ifndef LN
  1258. addl $2 * SIZE, CO1
  1259. #endif
  1260. #if defined(LT) || defined(RN)
  1261. movl K, %eax
  1262. subl KK, %eax
  1263. leal (,%eax, SIZE), %eax
  1264. leal (AA, %eax, 2), AA
  1265. leal (BB, %eax, 2), BB
  1266. #endif
  1267. #ifdef LN
  1268. subl $2, KK
  1269. #endif
  1270. #ifdef LT
  1271. addl $2, KK
  1272. #endif
  1273. #ifdef RT
  1274. movl K, %eax
  1275. sall $1 + BASE_SHIFT, %eax
  1276. addl %eax, AORIG
  1277. #endif
  1278. decl %ebx # i --
  1279. jg .L41
  1280. ALIGN_4
  1281. .L59:
  1282. #ifdef LN
  1283. movl K, %eax
  1284. leal (, %eax, SIZE), %eax
  1285. leal (B, %eax, 2), B
  1286. #endif
  1287. #if defined(LT) || defined(RN)
  1288. movl BB, B
  1289. #endif
  1290. #ifdef RN
  1291. addl $2, KK
  1292. #endif
  1293. #ifdef RT
  1294. subl $2, KK
  1295. #endif
  1296. ALIGN_4
  1297. .L60:
  1298. testl $1, N
  1299. je .L999
  1300. #if defined(LT) || defined(RN)
  1301. movl A, AA
  1302. #else
  1303. movl A, %eax
  1304. movl %eax, AORIG
  1305. #endif
  1306. #ifdef RT
  1307. movl K, %eax
  1308. sall $BASE_SHIFT, %eax
  1309. subl %eax, B
  1310. #endif
  1311. #ifdef RT
  1312. subl LDC, C
  1313. #endif
  1314. movl C, CO1
  1315. #ifndef RT
  1316. addl LDC, C
  1317. #endif
  1318. #ifdef LN
  1319. movl OFFSET, %eax
  1320. addl M, %eax
  1321. movl %eax, KK
  1322. #endif
  1323. #ifdef LT
  1324. movl OFFSET, %eax
  1325. movl %eax, KK
  1326. #endif
  1327. movl M, %ebx
  1328. testl $1, %ebx
  1329. jle .L80
  1330. #ifdef LN
  1331. movl K, %eax
  1332. sall $BASE_SHIFT, %eax
  1333. subl %eax, AORIG
  1334. #endif
  1335. #if defined(LN) || defined(RT)
  1336. movl KK, %eax
  1337. movl AORIG, AA
  1338. leal (AA, %eax, SIZE), AA
  1339. #endif
  1340. movl B, BB
  1341. #if defined(LN) || defined(RT)
  1342. movl KK, %eax
  1343. sall $BASE_SHIFT, %eax
  1344. addl %eax, BB
  1345. #endif
  1346. movsd -16 * SIZE(AA), %xmm0
  1347. movhps -15 * SIZE(AA), %xmm0
  1348. pxor %xmm4, %xmm4
  1349. movsd -16 * SIZE(BB), %xmm2
  1350. movhps -15 * SIZE(BB), %xmm2
  1351. pxor %xmm5, %xmm5
  1352. #if defined(LT) || defined(RN)
  1353. movl KK, %eax
  1354. #else
  1355. movl K, %eax
  1356. subl KK, %eax
  1357. #endif
  1358. sarl $3, %eax
  1359. je .L85
  1360. ALIGN_4
  1361. .L82:
  1362. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1363. mulpd %xmm0, %xmm2
  1364. movaps -14 * SIZE(AA), %xmm0
  1365. addpd %xmm2, %xmm4
  1366. movaps -14 * SIZE(BB), %xmm2
  1367. mulpd %xmm0, %xmm2
  1368. movaps -12 * SIZE(AA), %xmm0
  1369. addpd %xmm2, %xmm5
  1370. movaps -12 * SIZE(BB), %xmm2
  1371. mulpd %xmm0, %xmm2
  1372. movaps -10 * SIZE(AA), %xmm0
  1373. addpd %xmm2, %xmm4
  1374. movaps -10 * SIZE(BB), %xmm2
  1375. mulpd %xmm0, %xmm2
  1376. movaps -8 * SIZE(AA), %xmm0
  1377. addpd %xmm2, %xmm5
  1378. movaps -8 * SIZE(BB), %xmm2
  1379. subl $-8 * SIZE, AA
  1380. subl $-8 * SIZE, BB
  1381. decl %eax
  1382. jne .L82
  1383. ALIGN_4
  1384. .L85:
  1385. #if defined(LT) || defined(RN)
  1386. movl KK, %eax
  1387. #else
  1388. movl K, %eax
  1389. subl KK, %eax
  1390. #endif
  1391. andl $7, %eax
  1392. BRANCH
  1393. je .L88
  1394. .L86:
  1395. mulsd %xmm0, %xmm2
  1396. movsd -15 * SIZE(AA), %xmm0
  1397. addsd %xmm2, %xmm4
  1398. movsd -15 * SIZE(BB), %xmm2
  1399. addl $1 * SIZE, AA
  1400. addl $1 * SIZE, BB
  1401. decl %eax
  1402. jg .L86
  1403. ALIGN_4
  1404. .L88:
  1405. addpd %xmm5, %xmm4
  1406. haddpd %xmm4, %xmm4
  1407. #if defined(LN) || defined(RT)
  1408. movl KK, %eax
  1409. #ifdef LN
  1410. subl $1, %eax
  1411. #else
  1412. subl $1, %eax
  1413. #endif
  1414. movl AORIG, AA
  1415. leal (, %eax, SIZE), %eax
  1416. addl %eax, AA
  1417. leal (B, %eax, 1), BB
  1418. #endif
  1419. #if defined(LN) || defined(LT)
  1420. movsd -16 * SIZE(BB), %xmm0
  1421. subsd %xmm4, %xmm0
  1422. #else
  1423. movsd -16 * SIZE(AA), %xmm0
  1424. subsd %xmm4, %xmm0
  1425. #endif
  1426. #ifdef LN
  1427. movsd -16 * SIZE(AA), %xmm4
  1428. mulsd %xmm4, %xmm0
  1429. #endif
  1430. #ifdef LT
  1431. movsd -16 * SIZE(AA), %xmm4
  1432. mulsd %xmm4, %xmm0
  1433. #endif
  1434. #ifdef RN
  1435. movsd -16 * SIZE(BB), %xmm4
  1436. mulsd %xmm4, %xmm0
  1437. #endif
  1438. #ifdef RT
  1439. movsd -16 * SIZE(BB), %xmm4
  1440. mulsd %xmm4, %xmm0
  1441. #endif
  1442. #if defined(LN) || defined(LT)
  1443. movsd %xmm0, -16 * SIZE(BB)
  1444. #else
  1445. movsd %xmm0, -16 * SIZE(AA)
  1446. #endif
  1447. #ifdef LN
  1448. subl $1 * SIZE, CO1
  1449. #endif
  1450. #if defined(LN) || defined(LT)
  1451. movsd %xmm0, 0 * SIZE(CO1)
  1452. #else
  1453. movsd %xmm0, 0 * SIZE(CO1)
  1454. #endif
  1455. #ifndef LN
  1456. addl $1 * SIZE, CO1
  1457. #endif
  1458. #if defined(LT) || defined(RN)
  1459. movl K, %eax
  1460. subl KK, %eax
  1461. leal (,%eax, SIZE), %eax
  1462. addl %eax, AA
  1463. addl %eax, BB
  1464. #endif
  1465. #ifdef LN
  1466. subl $1, KK
  1467. #endif
  1468. #ifdef LT
  1469. addl $1, KK
  1470. #endif
  1471. #ifdef RT
  1472. movl K, %eax
  1473. sall $BASE_SHIFT, %eax
  1474. addl %eax, AORIG
  1475. #endif
  1476. ALIGN_4
  1477. .L80:
  1478. movl M, %ebx
  1479. sarl $1, %ebx
  1480. jle .L89
  1481. ALIGN_4
  1482. .L71:
  1483. #ifdef LN
  1484. movl K, %eax
  1485. sall $1 + BASE_SHIFT, %eax
  1486. subl %eax, AORIG
  1487. #endif
  1488. #if defined(LN) || defined(RT)
  1489. movl KK, %eax
  1490. movl AORIG, AA
  1491. leal (, %eax, SIZE), %eax
  1492. leal (AA, %eax, 2), AA
  1493. #endif
  1494. movl B, BB
  1495. #if defined(LN) || defined(RT)
  1496. movl KK, %eax
  1497. sall $BASE_SHIFT, %eax
  1498. addl %eax, BB
  1499. #endif
  1500. movaps -16 * SIZE(AA), %xmm0
  1501. pxor %xmm4, %xmm4
  1502. movaps -16 * SIZE(BB), %xmm1
  1503. pxor %xmm5, %xmm5
  1504. #ifdef LN
  1505. prefetcht0 -2 * SIZE(CO1)
  1506. #else
  1507. prefetcht0 1 * SIZE(CO1)
  1508. #endif
  1509. #if defined(LT) || defined(RN)
  1510. movl KK, %eax
  1511. #else
  1512. movl K, %eax
  1513. subl KK, %eax
  1514. #endif
  1515. sarl $3, %eax
  1516. je .L75
  1517. ALIGN_4
  1518. .L72:
  1519. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1520. pshufd $0x44, %xmm1, %xmm2
  1521. mulpd %xmm0, %xmm2
  1522. movaps -14 * SIZE(AA), %xmm0
  1523. addpd %xmm2, %xmm4
  1524. pshufd $0xee, %xmm1, %xmm2
  1525. movaps -14 * SIZE(BB), %xmm1
  1526. mulpd %xmm0, %xmm2
  1527. movaps -12 * SIZE(AA), %xmm0
  1528. addpd %xmm2, %xmm5
  1529. pshufd $0x44, %xmm1, %xmm2
  1530. mulpd %xmm0, %xmm2
  1531. movaps -10 * SIZE(AA), %xmm0
  1532. addpd %xmm2, %xmm4
  1533. pshufd $0xee, %xmm1, %xmm2
  1534. movaps -12 * SIZE(BB), %xmm1
  1535. mulpd %xmm0, %xmm2
  1536. movaps -8 * SIZE(AA), %xmm0
  1537. addpd %xmm2, %xmm5
  1538. PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
  1539. pshufd $0x44, %xmm1, %xmm2
  1540. mulpd %xmm0, %xmm2
  1541. movaps -6 * SIZE(AA), %xmm0
  1542. addpd %xmm2, %xmm4
  1543. pshufd $0xee, %xmm1, %xmm2
  1544. movaps -10 * SIZE(BB), %xmm1
  1545. mulpd %xmm0, %xmm2
  1546. movaps -4 * SIZE(AA), %xmm0
  1547. addpd %xmm2, %xmm5
  1548. pshufd $0x44, %xmm1, %xmm2
  1549. mulpd %xmm0, %xmm2
  1550. movaps -2 * SIZE(AA), %xmm0
  1551. addpd %xmm2, %xmm4
  1552. pshufd $0xee, %xmm1, %xmm2
  1553. movaps -8 * SIZE(BB), %xmm1
  1554. mulpd %xmm0, %xmm2
  1555. movaps 0 * SIZE(AA), %xmm0
  1556. addpd %xmm2, %xmm5
  1557. subl $-16 * SIZE, AA
  1558. subl $ -8 * SIZE, BB
  1559. subl $1, %eax
  1560. jne .L72
  1561. ALIGN_4
  1562. .L75:
  1563. #if defined(LT) || defined(RN)
  1564. movl KK, %eax
  1565. #else
  1566. movl K, %eax
  1567. subl KK, %eax
  1568. #endif
  1569. andl $7, %eax # remainder: k & 7
  1570. BRANCH
  1571. je .L78
  1572. ALIGN_3
  1573. .L76:
  1574. pshufd $0x44, %xmm1, %xmm2
  1575. movsd -15 * SIZE(BB), %xmm1
  1576. mulpd %xmm0, %xmm2
  1577. movaps -14 * SIZE(AA), %xmm0
  1578. addpd %xmm2, %xmm4
  1579. addl $2 * SIZE, AA
  1580. addl $1 * SIZE, BB
  1581. decl %eax
  1582. jg .L76
  1583. ALIGN_4
  1584. .L78:
  1585. addpd %xmm5, %xmm4
  1586. #if defined(LN) || defined(RT)
  1587. movl KK, %eax
  1588. #ifdef LN
  1589. subl $2, %eax
  1590. #else
  1591. subl $1, %eax
  1592. #endif
  1593. movl AORIG, AA
  1594. leal (, %eax, SIZE), %eax
  1595. leal (AA, %eax, 2), AA
  1596. leal (B, %eax, 1), BB
  1597. #endif
  1598. #if defined(LN) || defined(LT)
  1599. movapd -16 * SIZE(BB), %xmm1
  1600. subpd %xmm4, %xmm1
  1601. movapd %xmm1, %xmm0
  1602. unpckhpd %xmm1, %xmm1
  1603. #else
  1604. movapd -16 * SIZE(AA), %xmm0
  1605. subpd %xmm4, %xmm0
  1606. #endif
  1607. #ifdef LN
  1608. movsd -13 * SIZE(AA), %xmm4
  1609. mulsd %xmm4, %xmm1
  1610. movsd -14 * SIZE(AA), %xmm4
  1611. mulsd %xmm1, %xmm4
  1612. subsd %xmm4, %xmm0
  1613. movsd -16 * SIZE(AA), %xmm4
  1614. mulsd %xmm4, %xmm0
  1615. #endif
  1616. #ifdef LT
  1617. movsd -16 * SIZE(AA), %xmm4
  1618. mulsd %xmm4, %xmm0
  1619. movsd -15 * SIZE(AA), %xmm4
  1620. mulsd %xmm0, %xmm4
  1621. subsd %xmm4, %xmm1
  1622. movsd -13 * SIZE(AA), %xmm4
  1623. mulsd %xmm4, %xmm1
  1624. #endif
  1625. #ifdef RN
  1626. movddup -16 * SIZE(BB), %xmm4
  1627. mulpd %xmm4, %xmm0
  1628. #endif
  1629. #ifdef RT
  1630. movddup -16 * SIZE(BB), %xmm4
  1631. mulpd %xmm4, %xmm0
  1632. #endif
  1633. #if defined(LN) || defined(LT)
  1634. movsd %xmm0, -16 * SIZE(BB)
  1635. movsd %xmm1, -15 * SIZE(BB)
  1636. #else
  1637. movapd %xmm0, -16 * SIZE(AA)
  1638. #endif
  1639. #ifdef LN
  1640. subl $2 * SIZE, CO1
  1641. #endif
  1642. #if defined(LN) || defined(LT)
  1643. movsd %xmm0, 0 * SIZE(CO1)
  1644. movsd %xmm1, 1 * SIZE(CO1)
  1645. #else
  1646. movsd %xmm0, 0 * SIZE(CO1)
  1647. movhpd %xmm0, 1 * SIZE(CO1)
  1648. #endif
  1649. #ifndef LN
  1650. addl $2 * SIZE, CO1
  1651. #endif
  1652. #if defined(LT) || defined(RN)
  1653. movl K, %eax
  1654. subl KK, %eax
  1655. leal (,%eax, SIZE), %eax
  1656. leal (AA, %eax, 2), AA
  1657. addl %eax, BB
  1658. #endif
  1659. #ifdef LN
  1660. subl $2, KK
  1661. #endif
  1662. #ifdef LT
  1663. addl $2, KK
  1664. #endif
  1665. #ifdef RT
  1666. movl K, %eax
  1667. sall $1 + BASE_SHIFT, %eax
  1668. addl %eax, AORIG
  1669. #endif
  1670. decl %ebx # i --
  1671. jg .L71
  1672. ALIGN_4
  1673. .L89:
  1674. #ifdef LN
  1675. movl K, %eax
  1676. leal (B, %eax, SIZE), B
  1677. #endif
  1678. #if defined(LT) || defined(RN)
  1679. movl BB, B
  1680. #endif
  1681. #ifdef RN
  1682. addl $1, KK
  1683. #endif
  1684. #ifdef RT
  1685. subl $1, KK
  1686. #endif
  1687. ALIGN_4
  1688. .L999:
  1689. popl %ebx
  1690. popl %esi
  1691. popl %edi
  1692. popl %ebp
  1693. addl $ARGS, %esp
  1694. ret
  1695. EPILOGUE
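For reference, the argument #defines near the top of the file place m, n, k, alpha (an 8-byte double), a, b, c, ldc and offset at successive stack offsets, which corresponds to a 32-bit cdecl entry point roughly like the prototype below. The symbol name and the int return type are assumptions made for illustration; the real name and the BLASLONG/FLOAT types come from OpenBLAS's build macros and common.h.

/* Hypothetical C-level prototype matching the argument #defines above,
 * assuming a 32-bit build where BLASLONG is long and FLOAT is double. */
int trsm_kernel_LN(long m, long n, long k, double alpha,
                   double *a, double *b, double *c,
                   long ldc, long offset);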