You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LN_2x4_sse3.S 37 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
/* ASSEMBLER is defined before including common.h so the header exposes its
   assembler-side definitions (SIZE, BASE_SHIFT, PROLOGUE, PROFCODE, ALIGN_*,
   BRANCH are all used below — presumably provided by common.h; confirm). */

/* Stack bookkeeping for the prologue below:
   ARGS  = bytes of local scratch reserved with `subl $ARGS, %esp`,
   STACK = bytes of callee-saved registers pushed afterwards
           (%ebp, %edi, %esi, %ebx = 4 * 4 bytes). */
  40. #define STACK 16
  41. #define ARGS 16

/* Incoming C-level arguments, addressed relative to %esp after the prologue.
   IA-32 passes arguments on the stack, so e.g. M expands to the operand
   4 + 16 + 16(%esp) == 36(%esp).  Note the 8-byte slot for ALPHA (16..23):
   it is a double, hence the jump from 16 to 24 for A. */
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 24 + STACK + ARGS(%esp)
  47. #define ARG_B 28 + STACK + ARGS(%esp)
  48. #define C 32 + STACK + ARGS(%esp)
  49. #define ARG_LDC 36 + STACK + ARGS(%esp)
  50. #define OFFSET 40 + STACK + ARGS(%esp)

/* Local scratch slots inside the ARGS area (just above the saved registers):
   J     = outer-loop counter over 4-column blocks of C (set from N >> 2,
           decremented at .L29/.L10),
   KK    = current diagonal/panel offset, updated per LN/LT/RN/RT variant,
   KKK   = not referenced in the visible portion of the file,
   AORIG = saved base pointer of the current A panel. */
  51. #define J 0 + STACK(%esp)
  52. #define KK 4 + STACK(%esp)
  53. #define KKK 8 + STACK(%esp)
  54. #define AORIG 12 + STACK(%esp)

/* Per-CPU software-prefetch selection: instruction and distance (in
   elements, scaled by SIZE at the use sites).  All targets visible here
   settle on prefetcht1 with a distance of 84. */
  55. #ifdef PENTIUM4
  56. #define PREFETCH prefetcht1
  57. #define PREFETCHSIZE 84
  58. #endif
  59. #if defined(PENRYN) || defined(DUNNINGTON)
  60. #define PREFETCH prefetcht1
  61. #define PREFETCHSIZE 84
  62. #endif
  63. #ifdef PENTIUMM
  64. #define PREFETCH prefetcht1
  65. #define PREFETCHSIZE 84
  66. #endif

/* Register role aliases used throughout the kernel:
   AA  = cursor into the current A panel,
   BB  = cursor into the current B panel,
   LDC = leading dimension of C, scaled to bytes in the prologue
         (`leal (, LDC, SIZE), LDC`),
   B   = base of the packed B buffer,
   CO1 = current output column pointer into C. */
  67. #define AA %edx
  68. #define BB %ecx
  69. #define LDC %ebp
  70. #define B %edi
  71. #define CO1 %esi
  72. PROLOGUE
  73. subl $ARGS, %esp
  74. pushl %ebp
  75. pushl %edi
  76. pushl %esi
  77. pushl %ebx
  78. PROFCODE
  79. movl ARG_B, B
  80. movl ARG_LDC, LDC
  81. movl OFFSET, %eax
  82. #ifdef RN
  83. negl %eax
  84. #endif
  85. movl %eax, KK
  86. leal (, LDC, SIZE), LDC
  87. #ifdef LN
  88. movl M, %eax
  89. leal (, %eax, SIZE), %eax
  90. addl %eax, C
  91. imull K, %eax
  92. addl %eax, A
  93. #endif
  94. #ifdef RT
  95. movl N, %eax
  96. leal (, %eax, SIZE), %eax
  97. imull K, %eax
  98. addl %eax, B
  99. movl N, %eax
  100. imull LDC, %eax
  101. addl %eax, C
  102. #endif
  103. #ifdef RT
  104. movl N, %eax
  105. subl OFFSET, %eax
  106. movl %eax, KK
  107. #endif
  108. movl N, %eax
  109. sarl $2, %eax
  110. movl %eax, J
  111. jle .L30
  112. ALIGN_2
  113. .L10:
  114. #if defined(LT) || defined(RN)
  115. movl A, AA
  116. #else
  117. movl A, %eax
  118. movl %eax, AORIG
  119. #endif
  120. #ifdef RT
  121. movl K, %eax
  122. sall $2 + BASE_SHIFT, %eax
  123. subl %eax, B
  124. #endif
  125. leal (, LDC, 4), %eax
  126. #ifdef RT
  127. subl %eax, C
  128. #endif
  129. movl C, CO1
  130. #ifndef RT
  131. addl %eax, C
  132. #endif
  133. #ifdef LN
  134. movl OFFSET, %eax
  135. addl M, %eax
  136. movl %eax, KK
  137. #endif
  138. #ifdef LT
  139. movl OFFSET, %eax
  140. movl %eax, KK
  141. #endif
  142. movl M, %ebx
  143. testl $1, %ebx # i = (m >> 2)
  144. jle .L20
  145. #ifdef LN
  146. movl K, %eax
  147. sall $BASE_SHIFT, %eax
  148. subl %eax, AORIG
  149. #endif
  150. #if defined(LN) || defined(RT)
  151. movl KK, %eax
  152. movl AORIG, AA
  153. leal (AA, %eax, SIZE), AA
  154. #endif
  155. movl B, BB
  156. #if defined(LN) || defined(RT)
  157. movl KK, %eax
  158. sall $2 + BASE_SHIFT, %eax
  159. addl %eax, BB
  160. #endif
  161. movddup 0 * SIZE(AA), %xmm0
  162. pxor %xmm4, %xmm4
  163. movddup 8 * SIZE(AA), %xmm1
  164. pxor %xmm5, %xmm5
  165. movapd 0 * SIZE(BB), %xmm2
  166. pxor %xmm6, %xmm6
  167. movapd 8 * SIZE(BB), %xmm3
  168. pxor %xmm7, %xmm7
  169. #if defined(LT) || defined(RN)
  170. movl KK, %eax
  171. #else
  172. movl K, %eax
  173. subl KK, %eax
  174. #endif
  175. sarl $4, %eax
  176. je .L25
  177. ALIGN_4
  178. .L22:
  179. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  180. mulpd %xmm0, %xmm2
  181. mulpd 2 * SIZE(BB), %xmm0
  182. addpd %xmm2, %xmm4
  183. movapd 4 * SIZE(BB), %xmm2
  184. addpd %xmm0, %xmm5
  185. movddup 1 * SIZE(AA), %xmm0
  186. mulpd %xmm0, %xmm2
  187. mulpd 6 * SIZE(BB), %xmm0
  188. addpd %xmm2, %xmm6
  189. movapd 16 * SIZE(BB), %xmm2
  190. addpd %xmm0, %xmm7
  191. movddup 2 * SIZE(AA), %xmm0
  192. mulpd %xmm0, %xmm3
  193. mulpd 10 * SIZE(BB), %xmm0
  194. addpd %xmm3, %xmm4
  195. movapd 12 * SIZE(BB), %xmm3
  196. addpd %xmm0, %xmm5
  197. movddup 3 * SIZE(AA), %xmm0
  198. mulpd %xmm0, %xmm3
  199. mulpd 14 * SIZE(BB), %xmm0
  200. addpd %xmm3, %xmm6
  201. movapd 24 * SIZE(BB), %xmm3
  202. addpd %xmm0, %xmm7
  203. movddup 4 * SIZE(AA), %xmm0
  204. mulpd %xmm0, %xmm2
  205. mulpd 18 * SIZE(BB), %xmm0
  206. addpd %xmm2, %xmm4
  207. movapd 20 * SIZE(BB), %xmm2
  208. addpd %xmm0, %xmm5
  209. movddup 5 * SIZE(AA), %xmm0
  210. mulpd %xmm0, %xmm2
  211. mulpd 22 * SIZE(BB), %xmm0
  212. addpd %xmm2, %xmm6
  213. movapd 32 * SIZE(BB), %xmm2
  214. addpd %xmm0, %xmm7
  215. movddup 6 * SIZE(AA), %xmm0
  216. mulpd %xmm0, %xmm3
  217. mulpd 26 * SIZE(BB), %xmm0
  218. addpd %xmm3, %xmm4
  219. movapd 28 * SIZE(BB), %xmm3
  220. addpd %xmm0, %xmm5
  221. movddup 7 * SIZE(AA), %xmm0
  222. mulpd %xmm0, %xmm3
  223. mulpd 30 * SIZE(BB), %xmm0
  224. addpd %xmm3, %xmm6
  225. movapd 40 * SIZE(BB), %xmm3
  226. addpd %xmm0, %xmm7
  227. movddup 16 * SIZE(AA), %xmm0
  228. mulpd %xmm1, %xmm2
  229. mulpd 34 * SIZE(BB), %xmm1
  230. addpd %xmm2, %xmm4
  231. movapd 36 * SIZE(BB), %xmm2
  232. addpd %xmm1, %xmm5
  233. movddup 9 * SIZE(AA), %xmm1
  234. mulpd %xmm1, %xmm2
  235. mulpd 38 * SIZE(BB), %xmm1
  236. addpd %xmm2, %xmm6
  237. movapd 48 * SIZE(BB), %xmm2
  238. addpd %xmm1, %xmm7
  239. movddup 10 * SIZE(AA), %xmm1
  240. mulpd %xmm1, %xmm3
  241. mulpd 42 * SIZE(BB), %xmm1
  242. addpd %xmm3, %xmm4
  243. movapd 44 * SIZE(BB), %xmm3
  244. addpd %xmm1, %xmm5
  245. movddup 11 * SIZE(AA), %xmm1
  246. mulpd %xmm1, %xmm3
  247. mulpd 46 * SIZE(BB), %xmm1
  248. addpd %xmm3, %xmm6
  249. movapd 56 * SIZE(BB), %xmm3
  250. addpd %xmm1, %xmm7
  251. movddup 12 * SIZE(AA), %xmm1
  252. mulpd %xmm1, %xmm2
  253. mulpd 50 * SIZE(BB), %xmm1
  254. addpd %xmm2, %xmm4
  255. movapd 52 * SIZE(BB), %xmm2
  256. addpd %xmm1, %xmm5
  257. movddup 13 * SIZE(AA), %xmm1
  258. mulpd %xmm1, %xmm2
  259. mulpd 54 * SIZE(BB), %xmm1
  260. addpd %xmm2, %xmm6
  261. movapd 64 * SIZE(BB), %xmm2
  262. addpd %xmm1, %xmm7
  263. movddup 14 * SIZE(AA), %xmm1
  264. mulpd %xmm1, %xmm3
  265. mulpd 58 * SIZE(BB), %xmm1
  266. addpd %xmm3, %xmm4
  267. movapd 60 * SIZE(BB), %xmm3
  268. addpd %xmm1, %xmm5
  269. movddup 15 * SIZE(AA), %xmm1
  270. mulpd %xmm1, %xmm3
  271. mulpd 62 * SIZE(BB), %xmm1
  272. addpd %xmm3, %xmm6
  273. movapd 72 * SIZE(BB), %xmm3
  274. addpd %xmm1, %xmm7
  275. movddup 24 * SIZE(AA), %xmm1
  276. addl $16 * SIZE, AA
  277. addl $64 * SIZE, BB
  278. decl %eax
  279. jne .L22
  280. ALIGN_4
  281. .L25:
  282. #if defined(LT) || defined(RN)
  283. movl KK, %eax
  284. #else
  285. movl K, %eax
  286. subl KK, %eax
  287. #endif
  288. andl $15, %eax # if (k & 1)
  289. BRANCH
  290. je .L28
  291. .L26:
  292. mulpd %xmm0, %xmm2
  293. mulpd 2 * SIZE(BB), %xmm0
  294. addpd %xmm2, %xmm4
  295. movapd 4 * SIZE(BB), %xmm2
  296. addpd %xmm0, %xmm5
  297. movddup 1 * SIZE(AA), %xmm0
  298. addl $1 * SIZE, AA
  299. addl $4 * SIZE, BB
  300. decl %eax
  301. jg .L26
  302. ALIGN_4
  303. .L28:
  304. addpd %xmm6, %xmm4
  305. addpd %xmm7, %xmm5
  306. #if defined(LN) || defined(RT)
  307. movl KK, %eax
  308. #ifdef LN
  309. subl $1, %eax
  310. #else
  311. subl $4, %eax
  312. #endif
  313. movl AORIG, AA
  314. leal (, %eax, SIZE), %eax
  315. leal (AA, %eax, 1), AA
  316. leal (B, %eax, 4), BB
  317. #endif
  318. #if defined(LN) || defined(LT)
  319. movapd 0 * SIZE(BB), %xmm0
  320. movapd 2 * SIZE(BB), %xmm1
  321. subpd %xmm4, %xmm0
  322. subpd %xmm5, %xmm1
  323. #else
  324. movapd 0 * SIZE(AA), %xmm1
  325. movapd 2 * SIZE(AA), %xmm3
  326. subpd %xmm4, %xmm1
  327. subpd %xmm5, %xmm3
  328. movapd %xmm1, %xmm0
  329. unpckhpd %xmm1, %xmm1
  330. movapd %xmm3, %xmm2
  331. unpckhpd %xmm3, %xmm3
  332. #endif
  333. #ifdef LN
  334. movddup 0 * SIZE(AA), %xmm4
  335. mulpd %xmm4, %xmm0
  336. mulpd %xmm4, %xmm1
  337. #endif
  338. #ifdef LT
  339. movddup 0 * SIZE(AA), %xmm4
  340. mulpd %xmm4, %xmm0
  341. mulpd %xmm4, %xmm1
  342. #endif
  343. #ifdef RN
  344. movsd 0 * SIZE(BB), %xmm4
  345. mulsd %xmm4, %xmm0
  346. movsd 1 * SIZE(BB), %xmm4
  347. mulsd %xmm0, %xmm4
  348. subsd %xmm4, %xmm1
  349. movsd 2 * SIZE(BB), %xmm4
  350. mulsd %xmm0, %xmm4
  351. subsd %xmm4, %xmm2
  352. movsd 3 * SIZE(BB), %xmm4
  353. mulsd %xmm0, %xmm4
  354. subsd %xmm4, %xmm3
  355. movsd 5 * SIZE(BB), %xmm4
  356. mulsd %xmm4, %xmm1
  357. movsd 6 * SIZE(BB), %xmm4
  358. mulsd %xmm1, %xmm4
  359. subsd %xmm4, %xmm2
  360. movsd 7 * SIZE(BB), %xmm4
  361. mulsd %xmm1, %xmm4
  362. subsd %xmm4, %xmm3
  363. movsd 10 * SIZE(BB), %xmm4
  364. mulsd %xmm4, %xmm2
  365. movsd 11 * SIZE(BB), %xmm4
  366. mulsd %xmm2, %xmm4
  367. subsd %xmm4, %xmm3
  368. movsd 15 * SIZE(BB), %xmm4
  369. mulsd %xmm4, %xmm3
  370. #endif
  371. #ifdef RT
  372. movsd 15 * SIZE(BB), %xmm4
  373. mulsd %xmm4, %xmm3
  374. movsd 14 * SIZE(BB), %xmm4
  375. mulsd %xmm3, %xmm4
  376. subsd %xmm4, %xmm2
  377. movsd 13 * SIZE(BB), %xmm4
  378. mulsd %xmm3, %xmm4
  379. subsd %xmm4, %xmm1
  380. movsd 12 * SIZE(BB), %xmm4
  381. mulsd %xmm3, %xmm4
  382. subsd %xmm4, %xmm0
  383. movsd 10 * SIZE(BB), %xmm4
  384. mulsd %xmm4, %xmm2
  385. movsd 9 * SIZE(BB), %xmm4
  386. mulsd %xmm2, %xmm4
  387. subsd %xmm4, %xmm1
  388. movsd 8 * SIZE(BB), %xmm4
  389. mulsd %xmm2, %xmm4
  390. subsd %xmm4, %xmm0
  391. movsd 5 * SIZE(BB), %xmm4
  392. mulsd %xmm4, %xmm1
  393. movsd 4 * SIZE(BB), %xmm4
  394. mulsd %xmm1, %xmm4
  395. subsd %xmm4, %xmm0
  396. movsd 0 * SIZE(BB), %xmm4
  397. mulsd %xmm4, %xmm0
  398. #endif
  399. #if defined(LN) || defined(LT)
  400. movapd %xmm0, 0 * SIZE(BB)
  401. movapd %xmm1, 2 * SIZE(BB)
  402. #else
  403. movsd %xmm0, 0 * SIZE(AA)
  404. movsd %xmm1, 1 * SIZE(AA)
  405. movsd %xmm2, 2 * SIZE(AA)
  406. movsd %xmm3, 3 * SIZE(AA)
  407. #endif
  408. #ifdef LN
  409. subl $1 * SIZE, CO1
  410. #endif
  411. leal (LDC, LDC, 2), %eax
  412. #if defined(LN) || defined(LT)
  413. movsd %xmm0, 0 * SIZE(CO1)
  414. movhpd %xmm0, 0 * SIZE(CO1, LDC, 1)
  415. movsd %xmm1, 0 * SIZE(CO1, LDC, 2)
  416. movhpd %xmm1, 0 * SIZE(CO1, %eax, 1)
  417. #else
  418. movsd %xmm0, 0 * SIZE(CO1)
  419. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  420. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  421. movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
  422. #endif
  423. #ifndef LN
  424. addl $1 * SIZE, CO1
  425. #endif
  426. #if defined(LT) || defined(RN)
  427. movl K, %eax
  428. subl KK, %eax
  429. leal (,%eax, SIZE), %eax
  430. leal (AA, %eax, 1), AA
  431. leal (BB, %eax, 4), BB
  432. #endif
  433. #ifdef LN
  434. subl $1, KK
  435. #endif
  436. #ifdef LT
  437. addl $1, KK
  438. #endif
  439. #ifdef RT
  440. movl K, %eax
  441. sall $BASE_SHIFT, %eax
  442. addl %eax, AORIG
  443. #endif
  444. ALIGN_4
  445. .L20:
  446. movl M, %ebx
  447. sarl $1, %ebx # i = (m >> 2)
  448. jle .L29
  449. ALIGN_4
  450. .L11:
  451. #ifdef LN
  452. movl K, %eax
  453. sall $1 + BASE_SHIFT, %eax
  454. subl %eax, AORIG
  455. #endif
  456. #if defined(LN) || defined(RT)
  457. movl KK, %eax
  458. movl AORIG, AA
  459. leal (, %eax, SIZE), %eax
  460. leal (AA, %eax, 2), AA
  461. #endif
  462. movl B, BB
  463. #if defined(LN) || defined(RT)
  464. movl KK, %eax
  465. sall $2 + BASE_SHIFT, %eax
  466. addl %eax, BB
  467. #endif
  468. movapd 0 * SIZE(AA), %xmm0
  469. pxor %xmm4, %xmm4
  470. movapd 8 * SIZE(AA), %xmm1
  471. pxor %xmm5, %xmm5
  472. movddup 0 * SIZE(BB), %xmm2
  473. pxor %xmm6, %xmm6
  474. movddup 8 * SIZE(BB), %xmm3
  475. pxor %xmm7, %xmm7
  476. leal (LDC, LDC, 2), %eax
  477. #ifdef LN
  478. prefetchnta -2 * SIZE(CO1)
  479. prefetchnta -2 * SIZE(CO1, LDC, 1)
  480. prefetchnta -2 * SIZE(CO1, LDC, 2)
  481. prefetchnta -2 * SIZE(CO1, %eax, 1)
  482. #else
  483. prefetchnta 2 * SIZE(CO1)
  484. prefetchnta 2 * SIZE(CO1, LDC, 1)
  485. prefetchnta 2 * SIZE(CO1, LDC, 2)
  486. prefetchnta 2 * SIZE(CO1, %eax, 1)
  487. #endif
  488. #if defined(LT) || defined(RN)
  489. movl KK, %eax
  490. #else
  491. movl K, %eax
  492. subl KK, %eax
  493. #endif
  494. sarl $3, %eax
  495. je .L15
  496. ALIGN_4
  497. .L12:
  498. mulpd %xmm0, %xmm2
  499. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  500. addpd %xmm2, %xmm4
  501. movddup 1 * SIZE(BB), %xmm2
  502. mulpd %xmm0, %xmm2
  503. addpd %xmm2, %xmm5
  504. movddup 2 * SIZE(BB), %xmm2
  505. mulpd %xmm0, %xmm2
  506. addpd %xmm2, %xmm6
  507. movddup 3 * SIZE(BB), %xmm2
  508. mulpd %xmm0, %xmm2
  509. movapd 2 * SIZE(AA), %xmm0
  510. addpd %xmm2, %xmm7
  511. movddup 4 * SIZE(BB), %xmm2
  512. mulpd %xmm0, %xmm2
  513. addpd %xmm2, %xmm4
  514. movddup 5 * SIZE(BB), %xmm2
  515. mulpd %xmm0, %xmm2
  516. addpd %xmm2, %xmm5
  517. movddup 6 * SIZE(BB), %xmm2
  518. mulpd %xmm0, %xmm2
  519. addpd %xmm2, %xmm6
  520. movddup 7 * SIZE(BB), %xmm2
  521. mulpd %xmm0, %xmm2
  522. movapd 4 * SIZE(AA), %xmm0
  523. addpd %xmm2, %xmm7
  524. movddup 16 * SIZE(BB), %xmm2
  525. mulpd %xmm0, %xmm3
  526. addpd %xmm3, %xmm4
  527. movddup 9 * SIZE(BB), %xmm3
  528. mulpd %xmm0, %xmm3
  529. addpd %xmm3, %xmm5
  530. movddup 10 * SIZE(BB), %xmm3
  531. mulpd %xmm0, %xmm3
  532. addpd %xmm3, %xmm6
  533. movddup 11 * SIZE(BB), %xmm3
  534. mulpd %xmm0, %xmm3
  535. movapd 6 * SIZE(AA), %xmm0
  536. addpd %xmm3, %xmm7
  537. movddup 12 * SIZE(BB), %xmm3
  538. mulpd %xmm0, %xmm3
  539. addpd %xmm3, %xmm4
  540. movddup 13 * SIZE(BB), %xmm3
  541. mulpd %xmm0, %xmm3
  542. addpd %xmm3, %xmm5
  543. movddup 14 * SIZE(BB), %xmm3
  544. mulpd %xmm0, %xmm3
  545. addpd %xmm3, %xmm6
  546. movddup 15 * SIZE(BB), %xmm3
  547. mulpd %xmm0, %xmm3
  548. movapd 16 * SIZE(AA), %xmm0
  549. addpd %xmm3, %xmm7
  550. movddup 24 * SIZE(BB), %xmm3
  551. mulpd %xmm1, %xmm2
  552. addpd %xmm2, %xmm4
  553. movddup 17 * SIZE(BB), %xmm2
  554. mulpd %xmm1, %xmm2
  555. addpd %xmm2, %xmm5
  556. movddup 18 * SIZE(BB), %xmm2
  557. mulpd %xmm1, %xmm2
  558. addpd %xmm2, %xmm6
  559. movddup 19 * SIZE(BB), %xmm2
  560. mulpd %xmm1, %xmm2
  561. movapd 10 * SIZE(AA), %xmm1
  562. addpd %xmm2, %xmm7
  563. movddup 20 * SIZE(BB), %xmm2
  564. mulpd %xmm1, %xmm2
  565. addpd %xmm2, %xmm4
  566. movddup 21 * SIZE(BB), %xmm2
  567. mulpd %xmm1, %xmm2
  568. addpd %xmm2, %xmm5
  569. movddup 22 * SIZE(BB), %xmm2
  570. mulpd %xmm1, %xmm2
  571. addpd %xmm2, %xmm6
  572. movddup 23 * SIZE(BB), %xmm2
  573. mulpd %xmm1, %xmm2
  574. movapd 12 * SIZE(AA), %xmm1
  575. addpd %xmm2, %xmm7
  576. movddup 32 * SIZE(BB), %xmm2
  577. mulpd %xmm1, %xmm3
  578. addpd %xmm3, %xmm4
  579. movddup 25 * SIZE(BB), %xmm3
  580. mulpd %xmm1, %xmm3
  581. addpd %xmm3, %xmm5
  582. movddup 26 * SIZE(BB), %xmm3
  583. mulpd %xmm1, %xmm3
  584. addpd %xmm3, %xmm6
  585. movddup 27 * SIZE(BB), %xmm3
  586. mulpd %xmm1, %xmm3
  587. movapd 14 * SIZE(AA), %xmm1
  588. addpd %xmm3, %xmm7
  589. movddup 28 * SIZE(BB), %xmm3
  590. mulpd %xmm1, %xmm3
  591. addpd %xmm3, %xmm4
  592. movddup 29 * SIZE(BB), %xmm3
  593. mulpd %xmm1, %xmm3
  594. addpd %xmm3, %xmm5
  595. movddup 30 * SIZE(BB), %xmm3
  596. mulpd %xmm1, %xmm3
  597. addpd %xmm3, %xmm6
  598. movddup 31 * SIZE(BB), %xmm3
  599. mulpd %xmm1, %xmm3
  600. movapd 24 * SIZE(AA), %xmm1
  601. addpd %xmm3, %xmm7
  602. movddup 40 * SIZE(BB), %xmm3
  603. addl $32 * SIZE, BB
  604. addl $16 * SIZE, AA
  605. decl %eax
  606. jne .L12
  607. ALIGN_4
  608. .L15:
  609. #if defined(LT) || defined(RN)
  610. movl KK, %eax
  611. #else
  612. movl K, %eax
  613. subl KK, %eax
  614. #endif
  615. andl $7, %eax # if (k & 1)
  616. BRANCH
  617. je .L18
  618. ALIGN_3
  619. .L16:
  620. mulpd %xmm0, %xmm2
  621. addpd %xmm2, %xmm4
  622. movddup 1 * SIZE(BB), %xmm2
  623. mulpd %xmm0, %xmm2
  624. addpd %xmm2, %xmm5
  625. movddup 2 * SIZE(BB), %xmm2
  626. mulpd %xmm0, %xmm2
  627. addpd %xmm2, %xmm6
  628. movddup 3 * SIZE(BB), %xmm2
  629. mulpd %xmm0, %xmm2
  630. movapd 2 * SIZE(AA), %xmm0
  631. addpd %xmm2, %xmm7
  632. movddup 4 * SIZE(BB), %xmm2
  633. addl $2 * SIZE, AA
  634. addl $4 * SIZE, BB
  635. decl %eax
  636. jg .L16
  637. ALIGN_4
  638. .L18:
  639. #if defined(LN) || defined(RT)
  640. movl KK, %eax
  641. #ifdef LN
  642. subl $2, %eax
  643. #else
  644. subl $4, %eax
  645. #endif
  646. movl AORIG, AA
  647. leal (, %eax, SIZE), %eax
  648. leal (AA, %eax, 2), AA
  649. leal (B, %eax, 4), BB
  650. #endif
  651. #if defined(LN) || defined(LT)
  652. movapd %xmm4, %xmm0
  653. unpcklpd %xmm5, %xmm4
  654. unpckhpd %xmm5, %xmm0
  655. movapd %xmm6, %xmm1
  656. unpcklpd %xmm7, %xmm6
  657. unpckhpd %xmm7, %xmm1
  658. movapd 0 * SIZE(BB), %xmm2
  659. movapd 2 * SIZE(BB), %xmm5
  660. movapd 4 * SIZE(BB), %xmm3
  661. movapd 6 * SIZE(BB), %xmm7
  662. subpd %xmm4, %xmm2
  663. subpd %xmm6, %xmm5
  664. subpd %xmm0, %xmm3
  665. subpd %xmm1, %xmm7
  666. #else
  667. movapd 0 * SIZE(AA), %xmm0
  668. movapd 2 * SIZE(AA), %xmm1
  669. movapd 4 * SIZE(AA), %xmm2
  670. movapd 6 * SIZE(AA), %xmm3
  671. subpd %xmm4, %xmm0
  672. subpd %xmm5, %xmm1
  673. subpd %xmm6, %xmm2
  674. subpd %xmm7, %xmm3
  675. #endif
  676. #ifdef LN
  677. movddup 3 * SIZE(AA), %xmm4
  678. mulpd %xmm4, %xmm3
  679. mulpd %xmm4, %xmm7
  680. movddup 2 * SIZE(AA), %xmm4
  681. movapd %xmm4, %xmm6
  682. mulpd %xmm3, %xmm4
  683. subpd %xmm4, %xmm2
  684. mulpd %xmm7, %xmm6
  685. subpd %xmm6, %xmm5
  686. movddup 0 * SIZE(AA), %xmm4
  687. mulpd %xmm4, %xmm2
  688. mulpd %xmm4, %xmm5
  689. #endif
  690. #ifdef LT
  691. movddup 0 * SIZE(AA), %xmm4
  692. mulpd %xmm4, %xmm2
  693. mulpd %xmm4, %xmm5
  694. movddup 1 * SIZE(AA), %xmm4
  695. movapd %xmm4, %xmm6
  696. mulpd %xmm2, %xmm4
  697. subpd %xmm4, %xmm3
  698. mulpd %xmm5, %xmm6
  699. subpd %xmm6, %xmm7
  700. movddup 3 * SIZE(AA), %xmm4
  701. mulpd %xmm4, %xmm3
  702. mulpd %xmm4, %xmm7
  703. #endif
  704. #ifdef RN
  705. movddup 0 * SIZE(BB), %xmm4
  706. mulpd %xmm4, %xmm0
  707. movddup 1 * SIZE(BB), %xmm4
  708. mulpd %xmm0, %xmm4
  709. subpd %xmm4, %xmm1
  710. movddup 2 * SIZE(BB), %xmm4
  711. mulpd %xmm0, %xmm4
  712. subpd %xmm4, %xmm2
  713. movddup 3 * SIZE(BB), %xmm4
  714. mulpd %xmm0, %xmm4
  715. subpd %xmm4, %xmm3
  716. movddup 5 * SIZE(BB), %xmm4
  717. mulpd %xmm4, %xmm1
  718. movddup 6 * SIZE(BB), %xmm4
  719. mulpd %xmm1, %xmm4
  720. subpd %xmm4, %xmm2
  721. movddup 7 * SIZE(BB), %xmm4
  722. mulpd %xmm1, %xmm4
  723. subpd %xmm4, %xmm3
  724. movddup 10 * SIZE(BB), %xmm4
  725. mulpd %xmm4, %xmm2
  726. movddup 11 * SIZE(BB), %xmm4
  727. mulpd %xmm2, %xmm4
  728. subpd %xmm4, %xmm3
  729. movddup 15 * SIZE(BB), %xmm4
  730. mulpd %xmm4, %xmm3
  731. #endif
  732. #ifdef RT
  733. movddup 15 * SIZE(BB), %xmm4
  734. mulpd %xmm4, %xmm3
  735. movddup 14 * SIZE(BB), %xmm4
  736. mulpd %xmm3, %xmm4
  737. subpd %xmm4, %xmm2
  738. movddup 13 * SIZE(BB), %xmm4
  739. mulpd %xmm3, %xmm4
  740. subpd %xmm4, %xmm1
  741. movddup 12 * SIZE(BB), %xmm4
  742. mulpd %xmm3, %xmm4
  743. subpd %xmm4, %xmm0
  744. movddup 10 * SIZE(BB), %xmm4
  745. mulpd %xmm4, %xmm2
  746. movddup 9 * SIZE(BB), %xmm4
  747. mulpd %xmm2, %xmm4
  748. subpd %xmm4, %xmm1
  749. movddup 8 * SIZE(BB), %xmm4
  750. mulpd %xmm2, %xmm4
  751. subpd %xmm4, %xmm0
  752. movddup 5 * SIZE(BB), %xmm4
  753. mulpd %xmm4, %xmm1
  754. movddup 4 * SIZE(BB), %xmm4
  755. mulpd %xmm1, %xmm4
  756. subpd %xmm4, %xmm0
  757. movddup 0 * SIZE(BB), %xmm4
  758. mulpd %xmm4, %xmm0
  759. #endif
  760. #if defined(LN) || defined(LT)
  761. movapd %xmm2, 0 * SIZE(BB)
  762. movapd %xmm5, 2 * SIZE(BB)
  763. movapd %xmm3, 4 * SIZE(BB)
  764. movapd %xmm7, 6 * SIZE(BB)
  765. #else
  766. movapd %xmm0, 0 * SIZE(AA)
  767. movapd %xmm1, 2 * SIZE(AA)
  768. movapd %xmm2, 4 * SIZE(AA)
  769. movapd %xmm3, 6 * SIZE(AA)
  770. #endif
  771. #ifdef LN
  772. subl $2 * SIZE, CO1
  773. #endif
  774. leal (LDC, LDC, 2), %eax
  775. #if defined(LN) || defined(LT)
  776. movsd %xmm2, 0 * SIZE(CO1)
  777. movsd %xmm3, 1 * SIZE(CO1)
  778. movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
  779. movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
  780. movsd %xmm5, 0 * SIZE(CO1, LDC, 2)
  781. movsd %xmm7, 1 * SIZE(CO1, LDC, 2)
  782. movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
  783. movhpd %xmm7, 1 * SIZE(CO1, %eax, 1)
  784. #else
  785. movsd %xmm0, 0 * SIZE(CO1)
  786. movhpd %xmm0, 1 * SIZE(CO1)
  787. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  788. movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
  789. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  790. movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
  791. movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
  792. movhpd %xmm3, 1 * SIZE(CO1, %eax, 1)
  793. #endif
  794. #ifndef LN
  795. addl $2 * SIZE, CO1
  796. #endif
  797. #if defined(LT) || defined(RN)
  798. movl K, %eax
  799. subl KK, %eax
  800. leal (,%eax, SIZE), %eax
  801. leal (AA, %eax, 2), AA
  802. leal (BB, %eax, 4), BB
  803. #endif
  804. #ifdef LN
  805. subl $2, KK
  806. #endif
  807. #ifdef LT
  808. addl $2, KK
  809. #endif
  810. #ifdef RT
  811. movl K, %eax
  812. sall $1 + BASE_SHIFT, %eax
  813. addl %eax, AORIG
  814. #endif
  815. decl %ebx # i --
  816. jg .L11
  817. ALIGN_4
  818. .L29:
  819. #ifdef LN
  820. movl K, %eax
  821. leal (, %eax, SIZE), %eax
  822. leal (B, %eax, 4), B
  823. #endif
  824. #if defined(LT) || defined(RN)
  825. movl BB, B
  826. #endif
  827. #ifdef RN
  828. addl $4, KK
  829. #endif
  830. #ifdef RT
  831. subl $4, KK
  832. #endif
  833. decl J # j --
  834. jg .L10
  835. ALIGN_4
/* ===================================================================
   .L30: N & 2 pass — process a remaining 2-wide column panel of B/C.
   NOTE(review): macro names (A, B, C, AA, BB, CO1, K, KK, M, N, LDC,
   OFFSET, AORIG, SIZE, BASE_SHIFT, PREFETCH, PREFETCHSIZE, BRANCH,
   ALIGN_*) are defined in the file header above this excerpt; the
   comments below describe only what these lines visibly do. */
  836. .L30:
  837. testl $2, N
/* no 2-column remainder -> go straight to the N & 1 pass */
  838. je .L60
  839. #if defined(LT) || defined(RN)
  840. movl A, AA
  841. #else
/* LN/RT iterate backwards through A: keep the original base in AORIG */
  842. movl A, %eax
  843. movl %eax, AORIG
  844. #endif
  845. #ifdef RT
/* RT: step B back by one 2-wide panel, i.e. K * 2 * SIZE bytes */
  846. movl K, %eax
  847. sall $1 + BASE_SHIFT, %eax
  848. subl %eax, B
  849. #endif
/* eax = 2 * LDC = byte span of the two C columns handled here */
  850. leal (, LDC, 2), %eax
  851. #ifdef RT
  852. subl %eax, C
  853. #endif
  854. movl C, CO1
  855. #ifndef RT
  856. addl %eax, C
  857. #endif
/* initialize the running TRSM offset KK for this column panel */
  858. #ifdef LN
  859. movl OFFSET, %eax
  860. addl M, %eax
  861. movl %eax, KK
  862. #endif
  863. #ifdef LT
  864. movl OFFSET, %eax
  865. movl %eax, KK
  866. #endif
/* ---------------- M & 1: single-row (1x2) micro-kernel ------------- */
  867. movl M, %ebx
  868. testl $1, %ebx # i = (m >> 2)
  869. jle .L50
  870. #ifdef LN
/* LN: step AORIG back one row (K * SIZE bytes) */
  871. movl K, %eax
  872. sall $BASE_SHIFT, %eax
  873. subl %eax, AORIG
  874. #endif
  875. #if defined(LN) || defined(RT)
/* position AA at offset KK inside the packed A panel */
  876. movl KK, %eax
  877. movl AORIG, AA
  878. leal (AA, %eax, SIZE), AA
  879. #endif
  880. movl B, BB
  881. #if defined(LN) || defined(RT)
/* position BB at offset KK * 2 inside the packed B panel */
  882. movl KK, %eax
  883. sall $1 + BASE_SHIFT, %eax
  884. addl %eax, BB
  885. #endif
/* preload first A broadcasts / B pairs; xmm4..xmm7 are four
   independent accumulators, cleared here */
  886. movddup 0 * SIZE(AA), %xmm0
  887. pxor %xmm4, %xmm4
  888. movddup 8 * SIZE(AA), %xmm1
  889. pxor %xmm5, %xmm5
  890. movapd 0 * SIZE(BB), %xmm2
  891. pxor %xmm6, %xmm6
  892. movapd 8 * SIZE(BB), %xmm3
  893. pxor %xmm7, %xmm7
/* k trip count: LT/RN run KK iterations, LN/RT run K - KK */
  894. #if defined(LT) || defined(RN)
  895. movl KK, %eax
  896. #else
  897. movl K, %eax
  898. subl KK, %eax
  899. #endif
  900. sarl $4, %eax
  901. je .L55
  902. ALIGN_4
/* .L52: k-loop unrolled 16x.  Each step broadcasts one A element
   (movddup) and multiplies it against one 2-wide B row; partial sums
   alternate across xmm4..xmm7 to hide multiply latency. */
  903. .L52:
  904. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  905. mulpd %xmm0, %xmm2
  906. movddup 1 * SIZE(AA), %xmm0
  907. addpd %xmm2, %xmm4
  908. mulpd 2 * SIZE(BB), %xmm0
  909. movapd 16 * SIZE(BB), %xmm2
  910. addpd %xmm0, %xmm5
  911. movddup 2 * SIZE(AA), %xmm0
  912. mulpd 4 * SIZE(BB), %xmm0
  913. addpd %xmm0, %xmm6
  914. movddup 3 * SIZE(AA), %xmm0
  915. mulpd 6 * SIZE(BB), %xmm0
  916. addpd %xmm0, %xmm7
  917. movddup 4 * SIZE(AA), %xmm0
  918. mulpd %xmm0, %xmm3
  919. movddup 5 * SIZE(AA), %xmm0
  920. addpd %xmm3, %xmm4
  921. mulpd 10 * SIZE(BB), %xmm0
  922. movapd 24 * SIZE(BB), %xmm3
  923. addpd %xmm0, %xmm5
  924. movddup 6 * SIZE(AA), %xmm0
  925. mulpd 12 * SIZE(BB), %xmm0
  926. addpd %xmm0, %xmm6
  927. movddup 7 * SIZE(AA), %xmm0
  928. mulpd 14 * SIZE(BB), %xmm0
  929. addpd %xmm0, %xmm7
  930. movddup 16 * SIZE(AA), %xmm0
/* second half of the unroll: xmm1 carries A elements 8..15 */
  931. mulpd %xmm1, %xmm2
  932. movddup 9 * SIZE(AA), %xmm1
  933. addpd %xmm2, %xmm4
  934. mulpd 18 * SIZE(BB), %xmm1
  935. movapd 32 * SIZE(BB), %xmm2
  936. addpd %xmm1, %xmm5
  937. movddup 10 * SIZE(AA), %xmm1
  938. mulpd 20 * SIZE(BB), %xmm1
  939. addpd %xmm1, %xmm6
  940. movddup 11 * SIZE(AA), %xmm1
  941. mulpd 22 * SIZE(BB), %xmm1
  942. addpd %xmm1, %xmm7
  943. movddup 12 * SIZE(AA), %xmm1
  944. mulpd %xmm1, %xmm3
  945. movddup 13 * SIZE(AA), %xmm1
  946. addpd %xmm3, %xmm4
  947. mulpd 26 * SIZE(BB), %xmm1
  948. movapd 40 * SIZE(BB), %xmm3
  949. addpd %xmm1, %xmm5
  950. movddup 14 * SIZE(AA), %xmm1
  951. mulpd 28 * SIZE(BB), %xmm1
  952. addpd %xmm1, %xmm6
  953. movddup 15 * SIZE(AA), %xmm1
  954. mulpd 30 * SIZE(BB), %xmm1
  955. addpd %xmm1, %xmm7
  956. movddup 24 * SIZE(AA), %xmm1
/* consumed 16 A elements and 16 B rows (2 doubles each) */
  957. addl $16 * SIZE, AA
  958. addl $32 * SIZE, BB
  959. decl %eax
  960. jne .L52
  961. ALIGN_4
/* .L55/.L56: handle the k % 16 leftover iterations one at a time */
  962. .L55:
  963. #if defined(LT) || defined(RN)
  964. movl KK, %eax
  965. #else
  966. movl K, %eax
  967. subl KK, %eax
  968. #endif
  969. andl $15, %eax # if (k & 1)
  970. BRANCH
  971. je .L58
  972. .L56:
  973. mulpd %xmm0, %xmm2
  974. movddup 1 * SIZE(AA), %xmm0
  975. addpd %xmm2, %xmm4
  976. movapd 2 * SIZE(BB), %xmm2
  977. addl $1 * SIZE, AA
  978. addl $2 * SIZE, BB
  979. decl %eax
  980. jg .L56
  981. ALIGN_4
/* .L58: fold the four partial accumulators into xmm4, then perform
   the 1x2 triangular solve and write back */
  982. .L58:
  983. addpd %xmm5, %xmm4
  984. addpd %xmm7, %xmm6
  985. addpd %xmm6, %xmm4
  986. #if defined(LN) || defined(RT)
/* re-derive AA/BB for the solve: step back by this tile's extent
   (1 row for LN, 2 columns for RT) */
  987. movl KK, %eax
  988. #ifdef LN
  989. subl $1, %eax
  990. #else
  991. subl $2, %eax
  992. #endif
  993. movl AORIG, AA
  994. leal (, %eax, SIZE), %eax
  995. addl %eax, AA
  996. leal (B, %eax, 2), BB
  997. #endif
/* rhs = packed panel value minus the accumulated dot products */
  998. #if defined(LN) || defined(LT)
  999. movapd 0 * SIZE(BB), %xmm0
  1000. subpd %xmm4, %xmm0
  1001. #else
  1002. movapd 0 * SIZE(AA), %xmm1
  1003. subpd %xmm4, %xmm1
  1004. movapd %xmm1, %xmm0
  1005. unpckhpd %xmm1, %xmm1
  1006. #endif
/* NOTE(review): the solve multiplies by the stored diagonal entries;
   this assumes the packing stage stored reciprocals of the diagonal
   (usual OpenBLAS trsm convention) — confirm against the pack code. */
  1007. #ifdef LN
  1008. movddup 0 * SIZE(AA), %xmm4
  1009. mulpd %xmm4, %xmm0
  1010. #endif
  1011. #ifdef LT
  1012. movddup 0 * SIZE(AA), %xmm4
  1013. mulpd %xmm4, %xmm0
  1014. #endif
  1015. #ifdef RN
/* forward substitution through the 2x2 triangle stored in BB */
  1016. movsd 0 * SIZE(BB), %xmm4
  1017. mulsd %xmm4, %xmm0
  1018. movsd 1 * SIZE(BB), %xmm4
  1019. mulsd %xmm0, %xmm4
  1020. subsd %xmm4, %xmm1
  1021. movsd 3 * SIZE(BB), %xmm4
  1022. mulsd %xmm4, %xmm1
  1023. #endif
  1024. #ifdef RT
/* backward substitution through the 2x2 triangle stored in BB */
  1025. movsd 3 * SIZE(BB), %xmm4
  1026. mulsd %xmm4, %xmm1
  1027. movsd 2 * SIZE(BB), %xmm4
  1028. mulsd %xmm1, %xmm4
  1029. subsd %xmm4, %xmm0
  1030. movsd 0 * SIZE(BB), %xmm4
  1031. mulsd %xmm4, %xmm0
  1032. #endif
/* write the solved values back into the packed panel ... */
  1033. #if defined(LN) || defined(LT)
  1034. movapd %xmm0, 0 * SIZE(BB)
  1035. #else
  1036. movsd %xmm0, 0 * SIZE(AA)
  1037. movsd %xmm1, 1 * SIZE(AA)
  1038. #endif
  1039. #ifdef LN
  1040. subl $1 * SIZE, CO1
  1041. #endif
/* ... and into C (one element in each of the two columns) */
  1042. #if defined(LN) || defined(LT)
  1043. movsd %xmm0, 0 * SIZE(CO1)
  1044. movhpd %xmm0, 0 * SIZE(CO1, LDC, 1)
  1045. #else
  1046. movsd %xmm0, 0 * SIZE(CO1)
  1047. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1048. #endif
  1049. #ifndef LN
  1050. addl $1 * SIZE, CO1
  1051. #endif
/* advance AA/BB past the untouched K - KK tail of the panels */
  1052. #if defined(LT) || defined(RN)
  1053. movl K, %eax
  1054. subl KK, %eax
  1055. leal (,%eax, SIZE), %eax
  1056. leal (AA, %eax, 1), AA
  1057. leal (BB, %eax, 2), BB
  1058. #endif
/* update KK by this tile's single row */
  1059. #ifdef LN
  1060. subl $1, KK
  1061. #endif
  1062. #ifdef LT
  1063. addl $1, KK
  1064. #endif
  1065. #ifdef RT
  1066. movl K, %eax
  1067. sall $1 + BASE_SHIFT, %eax
  1068. addl %eax, AORIG
  1069. #endif
  1070. ALIGN_4
/* -------------------------------------------------------------------
   .L50/.L41: 2x2 micro-kernel — loop over M/2 two-row tiles of the
   2-wide column panel.  Same structure as the 1x2 case above: GEMM
   accumulation (.L42/.L46) followed by a 2x2 triangular solve (.L48). */
  1071. .L50:
  1072. movl M, %ebx
  1073. sarl $1, %ebx # i = (m >> 2)
  1074. jle .L59
  1075. ALIGN_4
  1076. .L41:
  1077. #ifdef LN
/* LN: step AORIG back one 2-row tile (K * 2 * SIZE bytes) */
  1078. movl K, %eax
  1079. sall $1 + BASE_SHIFT, %eax
  1080. subl %eax, AORIG
  1081. #endif
  1082. #if defined(LN) || defined(RT)
/* position AA at row-offset KK inside the packed A panel */
  1083. movl KK, %eax
  1084. movl AORIG, AA
  1085. leal (, %eax, SIZE), %eax
  1086. leal (AA, %eax, 2), AA
  1087. #endif
  1088. movl B, BB
  1089. #if defined(LN) || defined(RT)
/* position BB at offset KK * 2 inside the packed B panel */
  1090. movl KK, %eax
  1091. sall $1 + BASE_SHIFT, %eax
  1092. addl %eax, BB
  1093. #endif
/* preload A pairs / B broadcasts; clear the four accumulators */
  1094. movapd 0 * SIZE(AA), %xmm0
  1095. pxor %xmm4, %xmm4
  1096. movapd 8 * SIZE(AA), %xmm1
  1097. pxor %xmm5, %xmm5
  1098. movddup 0 * SIZE(BB), %xmm2
  1099. pxor %xmm6, %xmm6
  1100. movddup 8 * SIZE(BB), %xmm3
  1101. pxor %xmm7, %xmm7
/* warm the C destinations (LN writes leftwards, hence negative disp) */
  1102. #ifdef LN
  1103. prefetchnta -2 * SIZE(CO1)
  1104. prefetchnta -2 * SIZE(CO1, LDC, 1)
  1105. #else
  1106. prefetchnta 2 * SIZE(CO1)
  1107. prefetchnta 2 * SIZE(CO1, LDC, 1)
  1108. #endif
/* k trip count: LT/RN run KK iterations, LN/RT run K - KK */
  1109. #if defined(LT) || defined(RN)
  1110. movl KK, %eax
  1111. #else
  1112. movl K, %eax
  1113. subl KK, %eax
  1114. #endif
  1115. sarl $3, %eax
  1116. je .L45
  1117. ALIGN_4
/* .L42: k-loop unrolled 8x; each step is (2-row A column) *
   (broadcast B element), rotating over the accumulators */
  1118. .L42:
  1119. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1120. mulpd %xmm0, %xmm2
  1121. addpd %xmm2, %xmm4
  1122. movddup 1 * SIZE(BB), %xmm2
  1123. mulpd %xmm0, %xmm2
  1124. movapd 2 * SIZE(AA), %xmm0
  1125. addpd %xmm2, %xmm5
  1126. movddup 2 * SIZE(BB), %xmm2
  1127. mulpd %xmm0, %xmm2
  1128. addpd %xmm2, %xmm6
  1129. movddup 3 * SIZE(BB), %xmm2
  1130. mulpd %xmm0, %xmm2
  1131. movapd 4 * SIZE(AA), %xmm0
  1132. addpd %xmm2, %xmm7
  1133. movddup 4 * SIZE(BB), %xmm2
  1134. mulpd %xmm0, %xmm2
  1135. addpd %xmm2, %xmm4
  1136. movddup 5 * SIZE(BB), %xmm2
  1137. mulpd %xmm0, %xmm2
  1138. movapd 6 * SIZE(AA), %xmm0
  1139. addpd %xmm2, %xmm5
  1140. movddup 6 * SIZE(BB), %xmm2
  1141. mulpd %xmm0, %xmm2
  1142. addpd %xmm2, %xmm6
  1143. movddup 7 * SIZE(BB), %xmm2
  1144. mulpd %xmm0, %xmm2
  1145. movapd 16 * SIZE(AA), %xmm0
  1146. addpd %xmm2, %xmm7
  1147. movddup 16 * SIZE(BB), %xmm2
/* second half of the unroll: xmm1/xmm3 cover elements 8..15 */
  1148. mulpd %xmm1, %xmm3
  1149. addpd %xmm3, %xmm4
  1150. movddup 9 * SIZE(BB), %xmm3
  1151. mulpd %xmm1, %xmm3
  1152. movapd 10 * SIZE(AA), %xmm1
  1153. addpd %xmm3, %xmm5
  1154. movddup 10 * SIZE(BB), %xmm3
  1155. mulpd %xmm1, %xmm3
  1156. addpd %xmm3, %xmm6
  1157. movddup 11 * SIZE(BB), %xmm3
  1158. mulpd %xmm1, %xmm3
  1159. movapd 12 * SIZE(AA), %xmm1
  1160. addpd %xmm3, %xmm7
  1161. movddup 12 * SIZE(BB), %xmm3
  1162. mulpd %xmm1, %xmm3
  1163. addpd %xmm3, %xmm4
  1164. movddup 13 * SIZE(BB), %xmm3
  1165. mulpd %xmm1, %xmm3
  1166. movapd 14 * SIZE(AA), %xmm1
  1167. addpd %xmm3, %xmm5
  1168. movddup 14 * SIZE(BB), %xmm3
  1169. mulpd %xmm1, %xmm3
  1170. addpd %xmm3, %xmm6
  1171. movddup 15 * SIZE(BB), %xmm3
  1172. mulpd %xmm1, %xmm3
  1173. movapd 24 * SIZE(AA), %xmm1
  1174. addpd %xmm3, %xmm7
  1175. movddup 24 * SIZE(BB), %xmm3
  1176. addl $16 * SIZE, AA
  1177. addl $16 * SIZE, BB
  1178. decl %eax
  1179. jne .L42
  1180. ALIGN_4
/* .L45/.L46: k % 8 leftover iterations, one k-step at a time */
  1181. .L45:
  1182. #if defined(LT) || defined(RN)
  1183. movl KK, %eax
  1184. #else
  1185. movl K, %eax
  1186. subl KK, %eax
  1187. #endif
  1188. andl $7, %eax # if (k & 1)
  1189. BRANCH
  1190. je .L48
  1191. ALIGN_3
  1192. .L46:
  1193. mulpd %xmm0, %xmm2
  1194. addpd %xmm2, %xmm4
  1195. movddup 1 * SIZE(BB), %xmm2
  1196. mulpd %xmm0, %xmm2
  1197. movapd 2 * SIZE(AA), %xmm0
  1198. addpd %xmm2, %xmm5
  1199. movddup 2 * SIZE(BB), %xmm2
  1200. addl $2 * SIZE, AA
  1201. addl $2 * SIZE, BB
  1202. decl %eax
  1203. jg .L46
  1204. ALIGN_4
/* .L48: fold partial sums, then the 2x2 triangular solve */
  1205. .L48:
  1206. addpd %xmm6, %xmm4
  1207. addpd %xmm7, %xmm5
  1208. #if defined(LN) || defined(RT)
/* NOTE(review): both branches subtract 2 (tile is 2x2 either way);
   the #ifdef is redundant but harmless */
  1209. movl KK, %eax
  1210. #ifdef LN
  1211. subl $2, %eax
  1212. #else
  1213. subl $2, %eax
  1214. #endif
  1215. movl AORIG, AA
  1216. leal (, %eax, SIZE), %eax
  1217. leal (AA, %eax, 2), AA
  1218. leal (B, %eax, 2), BB
  1219. #endif
/* rhs: transpose the accumulators for LN/LT (panel is row-major
   there), then subtract from the packed values */
  1220. #if defined(LN) || defined(LT)
  1221. movapd %xmm4, %xmm0
  1222. unpcklpd %xmm5, %xmm4
  1223. unpckhpd %xmm5, %xmm0
  1224. movapd 0 * SIZE(BB), %xmm2
  1225. movapd 2 * SIZE(BB), %xmm3
  1226. subpd %xmm4, %xmm2
  1227. subpd %xmm0, %xmm3
  1228. #else
  1229. movapd 0 * SIZE(AA), %xmm0
  1230. movapd 2 * SIZE(AA), %xmm1
  1231. subpd %xmm4, %xmm0
  1232. subpd %xmm5, %xmm1
  1233. #endif
/* 2x2 substitution; diagonal entries are multiplied, which assumes
   they were stored pre-inverted by the packing stage (see note in the
   1x2 kernel above) */
  1234. #ifdef LN
  1235. movddup 3 * SIZE(AA), %xmm4
  1236. mulpd %xmm4, %xmm3
  1237. movddup 2 * SIZE(AA), %xmm4
  1238. mulpd %xmm3, %xmm4
  1239. subpd %xmm4, %xmm2
  1240. movddup 0 * SIZE(AA), %xmm4
  1241. mulpd %xmm4, %xmm2
  1242. #endif
  1243. #ifdef LT
  1244. movddup 0 * SIZE(AA), %xmm4
  1245. mulpd %xmm4, %xmm2
  1246. movddup 1 * SIZE(AA), %xmm4
  1247. mulpd %xmm2, %xmm4
  1248. subpd %xmm4, %xmm3
  1249. movddup 3 * SIZE(AA), %xmm4
  1250. mulpd %xmm4, %xmm3
  1251. #endif
  1252. #ifdef RN
  1253. movddup 0 * SIZE(BB), %xmm4
  1254. mulpd %xmm4, %xmm0
  1255. movddup 1 * SIZE(BB), %xmm4
  1256. mulpd %xmm0, %xmm4
  1257. subpd %xmm4, %xmm1
  1258. movddup 3 * SIZE(BB), %xmm4
  1259. mulpd %xmm4, %xmm1
  1260. #endif
  1261. #ifdef RT
  1262. movddup 3 * SIZE(BB), %xmm4
  1263. mulpd %xmm4, %xmm1
  1264. movddup 2 * SIZE(BB), %xmm4
  1265. mulpd %xmm1, %xmm4
  1266. subpd %xmm4, %xmm0
  1267. movddup 0 * SIZE(BB), %xmm4
  1268. mulpd %xmm4, %xmm0
  1269. #endif
/* write the solved 2x2 block back into the packed panel ... */
  1270. #if defined(LN) || defined(LT)
  1271. movapd %xmm2, 0 * SIZE(BB)
  1272. movapd %xmm3, 2 * SIZE(BB)
  1273. #else
  1274. movapd %xmm0, 0 * SIZE(AA)
  1275. movapd %xmm1, 2 * SIZE(AA)
  1276. #endif
  1277. #ifdef LN
  1278. subl $2 * SIZE, CO1
  1279. #endif
/* ... and into the two C columns */
  1280. #if defined(LN) || defined(LT)
  1281. movsd %xmm2, 0 * SIZE(CO1)
  1282. movsd %xmm3, 1 * SIZE(CO1)
  1283. movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
  1284. movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
  1285. #else
  1286. movsd %xmm0, 0 * SIZE(CO1)
  1287. movhpd %xmm0, 1 * SIZE(CO1)
  1288. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1289. movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
  1290. #endif
  1291. #ifndef LN
  1292. addl $2 * SIZE, CO1
  1293. #endif
/* advance AA/BB past the untouched K - KK tail */
  1294. #if defined(LT) || defined(RN)
  1295. movl K, %eax
  1296. subl KK, %eax
  1297. leal (,%eax, SIZE), %eax
  1298. leal (AA, %eax, 2), AA
  1299. leal (BB, %eax, 2), BB
  1300. #endif
  1301. #ifdef LN
  1302. subl $2, KK
  1303. #endif
  1304. #ifdef LT
  1305. addl $2, KK
  1306. #endif
  1307. #ifdef RT
  1308. movl K, %eax
  1309. sall $1 + BASE_SHIFT, %eax
  1310. addl %eax, AORIG
  1311. #endif
  1312. decl %ebx # i --
  1313. jg .L41
  1314. ALIGN_4
/* .L59: end of the N & 2 pass — advance B and KK past the panel */
  1315. .L59:
  1316. #ifdef LN
  1317. movl K, %eax
  1318. leal (, %eax, SIZE), %eax
  1319. leal (B, %eax, 2), B
  1320. #endif
  1321. #if defined(LT) || defined(RN)
  1322. movl BB, B
  1323. #endif
  1324. #ifdef RN
  1325. addl $2, KK
  1326. #endif
  1327. #ifdef RT
  1328. subl $2, KK
  1329. #endif
  1330. ALIGN_4
/* ===================================================================
   .L60: N & 1 pass — solve the final single column of B/C.
   Mirrors the N & 2 pass above with panel width 1. */
  1331. .L60:
  1332. testl $1, N
/* no remaining column -> function epilogue */
  1333. je .L999
  1334. #if defined(LT) || defined(RN)
  1335. movl A, AA
  1336. #else
/* LN/RT iterate backwards through A: keep the base in AORIG */
  1337. movl A, %eax
  1338. movl %eax, AORIG
  1339. #endif
  1340. #ifdef RT
/* RT: step B back by one 1-wide panel (K * SIZE bytes) */
  1341. movl K, %eax
  1342. sall $BASE_SHIFT, %eax
  1343. subl %eax, B
  1344. #endif
  1345. #ifdef RT
  1346. subl LDC, C
  1347. #endif
  1348. movl C, CO1
  1349. #ifndef RT
  1350. addl LDC, C
  1351. #endif
/* reset the TRSM offset KK for this column */
  1352. #ifdef LN
  1353. movl OFFSET, %eax
  1354. addl M, %eax
  1355. movl %eax, KK
  1356. #endif
  1357. #ifdef LT
  1358. movl OFFSET, %eax
  1359. movl %eax, KK
  1360. #endif
/* ---------------- M & 1: scalar (1x1) micro-kernel ----------------- */
  1361. movl M, %ebx
  1362. testl $1, %ebx # i = (m >> 2)
  1363. jle .L80
  1364. #ifdef LN
  1365. movl K, %eax
  1366. sall $BASE_SHIFT, %eax
  1367. subl %eax, AORIG
  1368. #endif
  1369. #if defined(LN) || defined(RT)
/* position AA at offset KK inside the packed A panel */
  1370. movl KK, %eax
  1371. movl AORIG, AA
  1372. leal (AA, %eax, SIZE), AA
  1373. #endif
  1374. movl B, BB
  1375. #if defined(LN) || defined(RT)
  1376. movl KK, %eax
  1377. sall $BASE_SHIFT, %eax
  1378. addl %eax, BB
  1379. #endif
/* load unaligned pairs via movsd+movhpd; clear the accumulators */
  1380. movsd 0 * SIZE(AA), %xmm0
  1381. movhpd 1 * SIZE(AA), %xmm0
  1382. pxor %xmm4, %xmm4
  1383. movsd 8 * SIZE(AA), %xmm1
  1384. movhpd 9 * SIZE(AA), %xmm1
  1385. pxor %xmm5, %xmm5
  1386. movsd 0 * SIZE(BB), %xmm2
  1387. movhpd 1 * SIZE(BB), %xmm2
  1388. pxor %xmm6, %xmm6
  1389. movsd 8 * SIZE(BB), %xmm3
  1390. movhpd 9 * SIZE(BB), %xmm3
  1391. pxor %xmm7, %xmm7
/* k trip count: LT/RN run KK iterations, LN/RT run K - KK */
  1392. #if defined(LT) || defined(RN)
  1393. movl KK, %eax
  1394. #else
  1395. movl K, %eax
  1396. subl KK, %eax
  1397. #endif
  1398. sarl $4, %eax
  1399. je .L85
  1400. ALIGN_4
/* .L82: dot product of the A and B columns, unrolled 16x, two
   k-steps per packed multiply */
  1401. .L82:
  1402. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1403. mulpd %xmm0, %xmm2
  1404. movapd 2 * SIZE(AA), %xmm0
  1405. addpd %xmm2, %xmm4
  1406. mulpd 2 * SIZE(BB), %xmm0
  1407. movapd 16 * SIZE(BB), %xmm2
  1408. addpd %xmm0, %xmm5
  1409. movapd 4 * SIZE(AA), %xmm0
  1410. mulpd 4 * SIZE(BB), %xmm0
  1411. addpd %xmm0, %xmm6
  1412. movapd 6 * SIZE(AA), %xmm0
  1413. mulpd 6 * SIZE(BB), %xmm0
  1414. addpd %xmm0, %xmm7
  1415. movapd 16 * SIZE(AA), %xmm0
  1416. mulpd %xmm1, %xmm3
  1417. movapd 10 * SIZE(AA), %xmm1
  1418. addpd %xmm3, %xmm4
  1419. mulpd 10 * SIZE(BB), %xmm1
  1420. movapd 24 * SIZE(BB), %xmm3
  1421. addpd %xmm1, %xmm5
  1422. movapd 12 * SIZE(AA), %xmm1
  1423. mulpd 12 * SIZE(BB), %xmm1
  1424. addpd %xmm1, %xmm6
  1425. movapd 14 * SIZE(AA), %xmm1
  1426. mulpd 14 * SIZE(BB), %xmm1
  1427. addpd %xmm1, %xmm7
  1428. movapd 24 * SIZE(AA), %xmm1
  1429. addl $16 * SIZE, AA
  1430. addl $16 * SIZE, BB
  1431. decl %eax
  1432. jne .L82
  1433. ALIGN_4
/* .L85/.L86: scalar loop for the k % 16 leftover iterations */
  1434. .L85:
  1435. #if defined(LT) || defined(RN)
  1436. movl KK, %eax
  1437. #else
  1438. movl K, %eax
  1439. subl KK, %eax
  1440. #endif
  1441. andl $15, %eax # if (k & 1)
  1442. BRANCH
  1443. je .L88
  1444. .L86:
  1445. mulsd %xmm0, %xmm2
  1446. movsd 1 * SIZE(AA), %xmm0
  1447. addsd %xmm2, %xmm4
  1448. movsd 1 * SIZE(BB), %xmm2
  1449. addl $1 * SIZE, AA
  1450. addl $1 * SIZE, BB
  1451. decl %eax
  1452. jg .L86
  1453. ALIGN_4
/* .L88: reduce all partial sums to one scalar in xmm4 */
  1454. .L88:
  1455. addpd %xmm5, %xmm4
  1456. addpd %xmm7, %xmm6
  1457. addpd %xmm6, %xmm4
  1458. haddpd %xmm4, %xmm4
  1459. #if defined(LN) || defined(RT)
/* NOTE(review): both branches subtract 1 (1x1 tile); the #ifdef is
   redundant but harmless */
  1460. movl KK, %eax
  1461. #ifdef LN
  1462. subl $1, %eax
  1463. #else
  1464. subl $1, %eax
  1465. #endif
  1466. movl AORIG, AA
  1467. leal (, %eax, SIZE), %eax
  1468. addl %eax, AA
  1469. leal (B, %eax, 1), BB
  1470. #endif
/* rhs = packed value - dot product */
  1471. #if defined(LN) || defined(LT)
  1472. movsd 0 * SIZE(BB), %xmm0
  1473. subsd %xmm4, %xmm0
  1474. #else
  1475. movsd 0 * SIZE(AA), %xmm0
  1476. subsd %xmm4, %xmm0
  1477. #endif
/* 1x1 "solve": multiply by the stored diagonal entry (assumed
   pre-inverted — see note in the 1x2 kernel) */
  1478. #ifdef LN
  1479. movsd 0 * SIZE(AA), %xmm4
  1480. mulsd %xmm4, %xmm0
  1481. #endif
  1482. #ifdef LT
  1483. movsd 0 * SIZE(AA), %xmm4
  1484. mulsd %xmm4, %xmm0
  1485. #endif
  1486. #ifdef RN
  1487. movsd 0 * SIZE(BB), %xmm4
  1488. mulsd %xmm4, %xmm0
  1489. #endif
  1490. #ifdef RT
  1491. movsd 0 * SIZE(BB), %xmm4
  1492. mulsd %xmm4, %xmm0
  1493. #endif
/* write back to the packed panel and to C */
  1494. #if defined(LN) || defined(LT)
  1495. movsd %xmm0, 0 * SIZE(BB)
  1496. #else
  1497. movsd %xmm0, 0 * SIZE(AA)
  1498. #endif
  1499. #ifdef LN
  1500. subl $1 * SIZE, CO1
  1501. #endif
  1502. #if defined(LN) || defined(LT)
  1503. movsd %xmm0, 0 * SIZE(CO1)
  1504. #else
  1505. movsd %xmm0, 0 * SIZE(CO1)
  1506. #endif
  1507. #ifndef LN
  1508. addl $1 * SIZE, CO1
  1509. #endif
/* advance AA/BB past the untouched K - KK tail */
  1510. #if defined(LT) || defined(RN)
  1511. movl K, %eax
  1512. subl KK, %eax
  1513. leal (,%eax, SIZE), %eax
  1514. addl %eax, AA
  1515. addl %eax, BB
  1516. #endif
  1517. #ifdef LN
  1518. subl $1, KK
  1519. #endif
  1520. #ifdef LT
  1521. addl $1, KK
  1522. #endif
  1523. #ifdef RT
  1524. movl K, %eax
  1525. sall $BASE_SHIFT, %eax
  1526. addl %eax, AORIG
  1527. #endif
  1528. ALIGN_4
/* -------------------------------------------------------------------
   .L80/.L71: 2x1 micro-kernel — loop over M/2 two-row tiles of the
   final single column.  GEMM accumulation (.L72/.L76) followed by a
   2x2 substitution in A (LN/LT) or a scalar scale (RN/RT). */
  1529. .L80:
  1530. movl M, %ebx
  1531. sarl $1, %ebx # i = (m >> 2)
  1532. jle .L89
  1533. ALIGN_4
  1534. .L71:
  1535. #ifdef LN
/* LN: step AORIG back one 2-row tile (K * 2 * SIZE bytes) */
  1536. movl K, %eax
  1537. sall $1 + BASE_SHIFT, %eax
  1538. subl %eax, AORIG
  1539. #endif
  1540. #if defined(LN) || defined(RT)
/* position AA at row-offset KK inside the packed A panel */
  1541. movl KK, %eax
  1542. movl AORIG, AA
  1543. leal (, %eax, SIZE), %eax
  1544. leal (AA, %eax, 2), AA
  1545. #endif
  1546. movl B, BB
  1547. #if defined(LN) || defined(RT)
  1548. movl KK, %eax
  1549. sall $BASE_SHIFT, %eax
  1550. addl %eax, BB
  1551. #endif
/* preload A pairs / B broadcasts; clear the accumulators */
  1552. movapd 0 * SIZE(AA), %xmm0
  1553. pxor %xmm4, %xmm4
  1554. movapd 8 * SIZE(AA), %xmm1
  1555. pxor %xmm5, %xmm5
  1556. movddup 0 * SIZE(BB), %xmm2
  1557. pxor %xmm6, %xmm6
  1558. movddup 4 * SIZE(BB), %xmm3
  1559. pxor %xmm7, %xmm7
/* warm the C destination (LN writes leftwards) */
  1560. #ifdef LN
  1561. prefetchnta -2 * SIZE(CO1)
  1562. #else
  1563. prefetchnta 2 * SIZE(CO1)
  1564. #endif
/* k trip count: LT/RN run KK iterations, LN/RT run K - KK */
  1565. #if defined(LT) || defined(RN)
  1566. movl KK, %eax
  1567. #else
  1568. movl K, %eax
  1569. subl KK, %eax
  1570. #endif
  1571. sarl $3, %eax
  1572. je .L75
  1573. ALIGN_4
/* .L72: k-loop unrolled 8x; (2-row A column) * (broadcast B element)
   per step, 16 A / 8 B elements consumed per iteration */
  1574. .L72:
  1575. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1576. mulpd %xmm2, %xmm0
  1577. movddup 1 * SIZE(BB), %xmm2
  1578. addpd %xmm0, %xmm4
  1579. movapd 16 * SIZE(AA), %xmm0
  1580. mulpd 2 * SIZE(AA), %xmm2
  1581. addpd %xmm2, %xmm5
  1582. movddup 2 * SIZE(BB), %xmm2
  1583. mulpd 4 * SIZE(AA), %xmm2
  1584. addpd %xmm2, %xmm6
  1585. movddup 3 * SIZE(BB), %xmm2
  1586. mulpd 6 * SIZE(AA), %xmm2
  1587. addpd %xmm2, %xmm7
  1588. movddup 8 * SIZE(BB), %xmm2
  1589. mulpd %xmm3, %xmm1
  1590. movddup 5 * SIZE(BB), %xmm3
  1591. addpd %xmm1, %xmm4
  1592. movapd 24 * SIZE(AA), %xmm1
  1593. mulpd 10 * SIZE(AA), %xmm3
  1594. addpd %xmm3, %xmm5
  1595. movddup 6 * SIZE(BB), %xmm3
  1596. mulpd 12 * SIZE(AA), %xmm3
  1597. addpd %xmm3, %xmm6
  1598. movddup 7 * SIZE(BB), %xmm3
  1599. mulpd 14 * SIZE(AA), %xmm3
  1600. addpd %xmm3, %xmm7
  1601. movddup 12 * SIZE(BB), %xmm3
  1602. addl $16 * SIZE, AA
  1603. addl $ 8 * SIZE, BB
  1604. decl %eax
  1605. jne .L72
  1606. ALIGN_4
/* .L75/.L76: k % 8 leftover iterations, one k-step at a time */
  1607. .L75:
  1608. #if defined(LT) || defined(RN)
  1609. movl KK, %eax
  1610. #else
  1611. movl K, %eax
  1612. subl KK, %eax
  1613. #endif
  1614. andl $7, %eax # if (k & 1)
  1615. BRANCH
  1616. je .L78
  1617. ALIGN_3
  1618. .L76:
  1619. mulpd %xmm2, %xmm0
  1620. movddup 1 * SIZE(BB), %xmm2
  1621. addpd %xmm0, %xmm4
  1622. movapd 2 * SIZE(AA), %xmm0
  1623. addl $2 * SIZE, AA
  1624. addl $1 * SIZE, BB
  1625. decl %eax
  1626. jg .L76
  1627. ALIGN_4
/* .L78: fold partial sums, then solve this 2x1 tile */
  1628. .L78:
  1629. addpd %xmm5, %xmm4
  1630. addpd %xmm7, %xmm6
  1631. addpd %xmm6, %xmm4
  1632. #if defined(LN) || defined(RT)
/* step back by this tile's extent (2 rows for LN, 1 column for RT) */
  1633. movl KK, %eax
  1634. #ifdef LN
  1635. subl $2, %eax
  1636. #else
  1637. subl $1, %eax
  1638. #endif
  1639. movl AORIG, AA
  1640. leal (, %eax, SIZE), %eax
  1641. leal (AA, %eax, 2), AA
  1642. leal (B, %eax, 1), BB
  1643. #endif
/* rhs = packed value - accumulated products; LN/LT split the pair
   into the low (xmm0) and high (xmm1) scalars */
  1644. #if defined(LN) || defined(LT)
  1645. movapd 0 * SIZE(BB), %xmm1
  1646. subpd %xmm4, %xmm1
  1647. movapd %xmm1, %xmm0
  1648. unpckhpd %xmm1, %xmm1
  1649. #else
  1650. movapd 0 * SIZE(AA), %xmm0
  1651. subpd %xmm4, %xmm0
  1652. #endif
/* 2x2 scalar substitution in A (LN backward, LT forward); diagonal
   entries are multiplied, assumed pre-inverted (see earlier note) */
  1653. #ifdef LN
  1654. movsd 3 * SIZE(AA), %xmm4
  1655. mulsd %xmm4, %xmm1
  1656. movsd 2 * SIZE(AA), %xmm4
  1657. mulsd %xmm1, %xmm4
  1658. subsd %xmm4, %xmm0
  1659. movsd 0 * SIZE(AA), %xmm4
  1660. mulsd %xmm4, %xmm0
  1661. #endif
  1662. #ifdef LT
  1663. movsd 0 * SIZE(AA), %xmm4
  1664. mulsd %xmm4, %xmm0
  1665. movsd 1 * SIZE(AA), %xmm4
  1666. mulsd %xmm0, %xmm4
  1667. subsd %xmm4, %xmm1
  1668. movsd 3 * SIZE(AA), %xmm4
  1669. mulsd %xmm4, %xmm1
  1670. #endif
/* RN/RT: single diagonal scale applied to both rows at once */
  1671. #ifdef RN
  1672. movddup 0 * SIZE(BB), %xmm4
  1673. mulpd %xmm4, %xmm0
  1674. #endif
  1675. #ifdef RT
  1676. movddup 0 * SIZE(BB), %xmm4
  1677. mulpd %xmm4, %xmm0
  1678. #endif
/* write back to the packed panel ... */
  1679. #if defined(LN) || defined(LT)
  1680. movsd %xmm0, 0 * SIZE(BB)
  1681. movsd %xmm1, 1 * SIZE(BB)
  1682. #else
  1683. movapd %xmm0, 0 * SIZE(AA)
  1684. #endif
  1685. #ifdef LN
  1686. subl $2 * SIZE, CO1
  1687. #endif
/* ... and into the single C column (two rows) */
  1688. #if defined(LN) || defined(LT)
  1689. movsd %xmm0, 0 * SIZE(CO1)
  1690. movsd %xmm1, 1 * SIZE(CO1)
  1691. #else
  1692. movsd %xmm0, 0 * SIZE(CO1)
  1693. movhpd %xmm0, 1 * SIZE(CO1)
  1694. #endif
  1695. #ifndef LN
  1696. addl $2 * SIZE, CO1
  1697. #endif
/* advance AA/BB past the untouched K - KK tail */
  1698. #if defined(LT) || defined(RN)
  1699. movl K, %eax
  1700. subl KK, %eax
  1701. leal (,%eax, SIZE), %eax
  1702. leal (AA, %eax, 2), AA
  1703. addl %eax, BB
  1704. #endif
  1705. #ifdef LN
  1706. subl $2, KK
  1707. #endif
  1708. #ifdef LT
  1709. addl $2, KK
  1710. #endif
  1711. #ifdef RT
  1712. movl K, %eax
  1713. sall $1 + BASE_SHIFT, %eax
  1714. addl %eax, AORIG
  1715. #endif
  1716. decl %ebx # i --
  1717. jg .L71
  1718. ALIGN_4
/* -------------------------------------------------------------------
   .L89: end of the N & 1 pass — advance B and KK past the 1-wide
   panel, falling through to the function epilogue. */
  1719. .L89:
  1720. #ifdef LN
/* LN: B += K * SIZE (skip the single-column packed panel) */
  1721. movl K, %eax
  1722. leal (B, %eax, SIZE), B
  1723. #endif
  1724. #if defined(LT) || defined(RN)
  1725. movl BB, B
  1726. #endif
  1727. #ifdef RN
  1728. addl $1, KK
  1729. #endif
  1730. #ifdef RT
  1731. subl $1, KK
  1732. #endif
  1733. ALIGN_4
/* .L999: function exit — restore the callee-saved registers pushed in
   the prologue (above this excerpt), release the argument scratch
   area, and return.  EPILOGUE is the project macro closing the
   function (size/ident directives). */
  1734. .L999:
  1735. popl %ebx
  1736. popl %esi
  1737. popl %edi
  1738. popl %ebp
  1739. addl $ARGS, %esp
  1740. ret
  1741. EPILOGUE