
trsm_kernel_LN_2x4_sse2.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
#define OLD_ALPHA 16 + STACK + ARGS(%esi)
#define OLD_A 24 + STACK + ARGS(%esi)
#define OLD_B 28 + STACK + ARGS(%esi)
#define OLD_C 32 + STACK + ARGS(%esi)
#define OLD_LDC 36 + STACK + ARGS(%esi)
#define OLD_OFFT 40 + STACK + ARGS(%esi)

#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
#define A 28(%esp)
#define C 32(%esp)
#define J 36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET 44(%esp)
#define KK 48(%esp)
#define KKK 52(%esp)
#define AORIG 56(%esp)
#define BORIG 60(%esp)
#define BUFFER 128(%esp)

#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif

#define B %edi
#define AA %edx
#define BB %ecx
#define LDC %ebp
#define CO1 %esi
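
/* KERNEL1..KERNEL8 below each compute one k-step of the 2x4 update,
   multiplying a pair of A values against four duplicated B values and
   accumulating into %xmm4-%xmm7; together they unroll the inner loop
   by eight. */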

#define KERNEL1(address) \
	mulpd %xmm0, %xmm2; \
	addpd %xmm2, %xmm4; \
	PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
	movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm0, %xmm2; \
	addpd %xmm2, %xmm5; \
	movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm0, %xmm2; \
	mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm6; \
	movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm7; \
	movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
	mulpd %xmm0, %xmm3; \
	addpd %xmm3, %xmm4; \
	movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm0, %xmm3; \
	addpd %xmm3, %xmm5; \
	movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm0, %xmm3; \
	mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd %xmm3, %xmm6; \
	movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd %xmm0, %xmm7; \
	movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
	mulpd %xmm0, %xmm2; \
	addpd %xmm2, %xmm4; \
	movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm0, %xmm2; \
	addpd %xmm2, %xmm5; \
	movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm0, %xmm2; \
	mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm6; \
	movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm7; \
	movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
	mulpd %xmm0, %xmm3; \
	addpd %xmm3, %xmm4; \
	movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm0, %xmm3; \
	addpd %xmm3, %xmm5; \
	movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm0, %xmm3; \
	mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd %xmm3, %xmm6; \
	movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd %xmm0, %xmm7; \
	movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
	PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
	mulpd %xmm1, %xmm2; \
	addpd %xmm2, %xmm4; \
	movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm1, %xmm2; \
	addpd %xmm2, %xmm5; \
	movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm1, %xmm2; \
	mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd %xmm2, %xmm6; \
	movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd %xmm1, %xmm7; \
	movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
	mulpd %xmm1, %xmm3; \
	addpd %xmm3, %xmm4; \
	movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm1, %xmm3; \
	addpd %xmm3, %xmm5; \
	movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm1, %xmm3; \
	mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm6; \
	movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm7; \
	movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
	mulpd %xmm1, %xmm2; \
	addpd %xmm2, %xmm4; \
	movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm1, %xmm2; \
	addpd %xmm2, %xmm5; \
	movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm1, %xmm2; \
	mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd %xmm2, %xmm6; \
	movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd %xmm1, %xmm7; \
	movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
	mulpd %xmm1, %xmm3; \
	addpd %xmm3, %xmm4; \
	movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm1, %xmm3; \
	addpd %xmm3, %xmm5; \
	movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm1, %xmm3; \
	mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm6; \
	movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm7; \
	movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
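
/* Entry: save callee-saved registers, then switch to a local
   STACK_ALIGN-aligned frame that holds copies of the arguments and
   the packed-B BUFFER. */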

	PROLOGUE

	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx

	PROFCODE

	EMMS

	movl %esp, %esi # save old stack
	subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl $-STACK_ALIGN, %esp
	addl $STACK_OFFSET, %esp

	STACK_TOUCHING

	movl OLD_M, %ebx
	movl OLD_N, %eax
	movl OLD_K, %ecx
	movl OLD_A, %edx

	movl %ebx, M
	movl %eax, N
	movl %ecx, K
	movl %edx, A
	movl %esi, OLD_STACK
	movd OLD_OFFT, %mm4

	movl OLD_B, B
	movl OLD_C, %ebx
	movl %ebx, C
	movl OLD_LDC, LDC
	movd %mm4, OFFSET
	movd %mm4, KK

	leal (, LDC, SIZE), LDC

#ifdef LN
	movl M, %eax
	leal (, %eax, SIZE), %eax
	addl %eax, C
	imull K, %eax
	addl %eax, A
#endif

#ifdef RT
	movl N, %eax
	leal (, %eax, SIZE), %eax
	imull K, %eax
	addl %eax, B
	movl N, %eax
	imull LDC, %eax
	addl %eax, C
#endif

#ifdef RN
	negl KK
#endif

#ifdef RT
	movl N, %eax
	subl OFFSET, %eax
	movl %eax, KK
#endif

	movl N, %eax
	sarl $2, %eax
	movl %eax, J
	jle .L30
	ALIGN_2
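
/* .L01: outer loop over 4-wide column panels of B (j = n >> 2). */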
.L01:
#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif

	leal BUFFER, BB

#ifdef RT
	movl K, %eax
	sall $2 + BASE_SHIFT, %eax
	subl %eax, B
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl B, BORIG
	leal (, %eax, SIZE), %eax
	leal (B, %eax, 4), B
	leal (BB, %eax, 8), BB
#endif

#ifdef LT
	movl OFFSET, %eax
	movl %eax, KK
#endif

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $1, %eax
	jle .L05
	ALIGN_4
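
/* .L02: pack the B panel into BUFFER, duplicating each value into an
   adjacent pair so movapd can load it as a ready-made broadcast
   operand; two k-iterations (eight values) are copied per pass. */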
.L02:
#define COPYPREFETCH 40

	prefetchnta (COPYPREFETCH) * SIZE(B)

	movq 0 * SIZE(B), %mm0
	movq 1 * SIZE(B), %mm1
	movq 2 * SIZE(B), %mm2
	movq 3 * SIZE(B), %mm3
	movq 4 * SIZE(B), %mm4
	movq 5 * SIZE(B), %mm5
	movq 6 * SIZE(B), %mm6
	movq 7 * SIZE(B), %mm7

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)
	movq %mm1, 2 * SIZE(BB)
	movq %mm1, 3 * SIZE(BB)
	movq %mm2, 4 * SIZE(BB)
	movq %mm2, 5 * SIZE(BB)
	movq %mm3, 6 * SIZE(BB)
	movq %mm3, 7 * SIZE(BB)
	movq %mm4, 8 * SIZE(BB)
	movq %mm4, 9 * SIZE(BB)
	movq %mm5, 10 * SIZE(BB)
	movq %mm5, 11 * SIZE(BB)
	movq %mm6, 12 * SIZE(BB)
	movq %mm6, 13 * SIZE(BB)
	movq %mm7, 14 * SIZE(BB)
	movq %mm7, 15 * SIZE(BB)

	addl $ 8 * SIZE, B
	addl $16 * SIZE, BB
	decl %eax
	jne .L02
	ALIGN_2

.L05:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $1, %eax
	BRANCH
	jle .L10

	movq 0 * SIZE(B), %mm0
	movq 1 * SIZE(B), %mm1
	movq 2 * SIZE(B), %mm2
	movq 3 * SIZE(B), %mm3

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)
	movq %mm1, 2 * SIZE(BB)
	movq %mm1, 3 * SIZE(BB)
	movq %mm2, 4 * SIZE(BB)
	movq %mm2, 5 * SIZE(BB)
	movq %mm3, 6 * SIZE(BB)
	movq %mm3, 7 * SIZE(BB)

	addl $4 * SIZE, B
	ALIGN_4

.L10:
#if defined(LT) || defined(RN)
	movl A, AA
#else
	movl A, %eax
	movl %eax, AORIG
#endif

	leal (, LDC, 4), %eax

#ifdef RT
	subl %eax, C
#endif
	movl C, CO1
#ifndef RT
	addl %eax, C
#endif

	movl M, %ebx
	testl $1, %ebx # m & 1
	jle .L20
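
/* M remainder: process a single row of A (m & 1) against the 4-wide
   panel. */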
#ifdef LN
	movl K, %eax
	sall $BASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (AA, %eax, SIZE), AA
#endif

	leal BUFFER, BB

#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $3 + BASE_SHIFT, %eax
	addl %eax, BB
#endif

	pxor %xmm4, %xmm4
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7

	movlpd 0 * SIZE(AA), %xmm0
	movlpd 4 * SIZE(AA), %xmm1
	movlpd 0 * SIZE(BB), %xmm2
	movlpd 8 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L25
	ALIGN_4
.L22:
	mulsd %xmm0, %xmm2
	addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movlpd 2 * SIZE(BB), %xmm2
	mulsd %xmm0, %xmm2
	addsd %xmm2, %xmm5
	movlpd 4 * SIZE(BB), %xmm2
	mulsd %xmm0, %xmm2
	mulsd 6 * SIZE(BB), %xmm0
	addsd %xmm2, %xmm6
	movlpd 16 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm7
	movlpd 1 * SIZE(AA), %xmm0
	mulsd %xmm0, %xmm3
	addsd %xmm3, %xmm4
	movlpd 10 * SIZE(BB), %xmm3
	mulsd %xmm0, %xmm3
	addsd %xmm3, %xmm5
	movlpd 12 * SIZE(BB), %xmm3
	mulsd %xmm0, %xmm3
	mulsd 14 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm6
	movlpd 24 * SIZE(BB), %xmm3
	addsd %xmm0, %xmm7
	movlpd 2 * SIZE(AA), %xmm0
	mulsd %xmm0, %xmm2
	addsd %xmm2, %xmm4
	movlpd 18 * SIZE(BB), %xmm2
	mulsd %xmm0, %xmm2
	addsd %xmm2, %xmm5
	movlpd 20 * SIZE(BB), %xmm2
	mulsd %xmm0, %xmm2
	mulsd 22 * SIZE(BB), %xmm0
	addsd %xmm2, %xmm6
	movlpd 32 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm7
	movlpd 3 * SIZE(AA), %xmm0
	mulsd %xmm0, %xmm3
	addsd %xmm3, %xmm4
	movlpd 26 * SIZE(BB), %xmm3
	mulsd %xmm0, %xmm3
	addsd %xmm3, %xmm5
	movlpd 28 * SIZE(BB), %xmm3
	mulsd %xmm0, %xmm3
	mulsd 30 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm6
	movlpd 40 * SIZE(BB), %xmm3
	addsd %xmm0, %xmm7
	movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
	PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
	mulsd %xmm1, %xmm2
	addsd %xmm2, %xmm4
	movlpd 34 * SIZE(BB), %xmm2
	mulsd %xmm1, %xmm2
	addsd %xmm2, %xmm5
	movlpd 36 * SIZE(BB), %xmm2
	mulsd %xmm1, %xmm2
	mulsd 38 * SIZE(BB), %xmm1
	addsd %xmm2, %xmm6
	movlpd 48 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm7
	movlpd 5 * SIZE(AA), %xmm1
	mulsd %xmm1, %xmm3
	addsd %xmm3, %xmm4
	movlpd 42 * SIZE(BB), %xmm3
	mulsd %xmm1, %xmm3
	addsd %xmm3, %xmm5
	movlpd 44 * SIZE(BB), %xmm3
	mulsd %xmm1, %xmm3
	mulsd 46 * SIZE(BB), %xmm1
	addsd %xmm3, %xmm6
	movlpd 56 * SIZE(BB), %xmm3
	addsd %xmm1, %xmm7
	movlpd 6 * SIZE(AA), %xmm1
	mulsd %xmm1, %xmm2
	addsd %xmm2, %xmm4
	movlpd 50 * SIZE(BB), %xmm2
	mulsd %xmm1, %xmm2
	addsd %xmm2, %xmm5
	movlpd 52 * SIZE(BB), %xmm2
	mulsd %xmm1, %xmm2
	mulsd 54 * SIZE(BB), %xmm1
	addsd %xmm2, %xmm6
	movlpd 64 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm7
	movlpd 7 * SIZE(AA), %xmm1
	mulsd %xmm1, %xmm3
	addsd %xmm3, %xmm4
	movlpd 58 * SIZE(BB), %xmm3
	mulsd %xmm1, %xmm3
	addsd %xmm3, %xmm5
	movlpd 60 * SIZE(BB), %xmm3
	mulsd %xmm1, %xmm3
	mulsd 62 * SIZE(BB), %xmm1
	addsd %xmm3, %xmm6
	movlpd 72 * SIZE(BB), %xmm3
	addl $64 * SIZE, BB
	addsd %xmm1, %xmm7
	movlpd 12 * SIZE(AA), %xmm1
	addl $8 * SIZE, AA
	decl %eax
	jne .L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # k & 7
	BRANCH
	je .L28
.L26:
	mulsd %xmm0, %xmm2
	addsd %xmm2, %xmm4
	movlpd 2 * SIZE(BB), %xmm2
	mulsd %xmm0, %xmm2
	addsd %xmm2, %xmm5
	movlpd 4 * SIZE(BB), %xmm2
	mulsd %xmm0, %xmm2
	mulsd 6 * SIZE(BB), %xmm0
	addsd %xmm2, %xmm6
	movlpd 8 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm7
	movlpd 1 * SIZE(AA), %xmm0
	addl $1 * SIZE, AA
	addl $8 * SIZE, BB
	decl %eax
	jg .L26
	ALIGN_4

.L28:
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $1, %eax
#else
	subl $4, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	leal (, %eax, SIZE), %eax
	addl %eax, AA
	leal (B, %eax, 4), B
	leal (BB, %eax, 8), BB
#endif

#if defined(LN) || defined(LT)
	unpcklpd %xmm5, %xmm4
	unpcklpd %xmm7, %xmm6
	movapd 0 * SIZE(B), %xmm2
	movapd 2 * SIZE(B), %xmm5
	subpd %xmm4, %xmm2
	subpd %xmm6, %xmm5
#else
	movlpd 0 * SIZE(AA), %xmm0
	movlpd 1 * SIZE(AA), %xmm1
	movlpd 2 * SIZE(AA), %xmm2
	movlpd 3 * SIZE(AA), %xmm3
	subsd %xmm4, %xmm0
	subsd %xmm5, %xmm1
	subsd %xmm6, %xmm2
	subsd %xmm7, %xmm3
#endif

#ifdef LN
	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4
	mulpd %xmm4, %xmm2
	mulpd %xmm4, %xmm5
#endif

#ifdef LT
	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4
	mulpd %xmm4, %xmm2
	mulpd %xmm4, %xmm5
#endif

#ifdef RN
	movlpd 0 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm0
	movlpd 1 * SIZE(B), %xmm4
	mulsd %xmm0, %xmm4
	subsd %xmm4, %xmm1
	movlpd 2 * SIZE(B), %xmm4
	mulsd %xmm0, %xmm4
	subsd %xmm4, %xmm2
	movlpd 3 * SIZE(B), %xmm4
	mulsd %xmm0, %xmm4
	subsd %xmm4, %xmm3
	movlpd 5 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm1
	movlpd 6 * SIZE(B), %xmm4
	mulsd %xmm1, %xmm4
	subsd %xmm4, %xmm2
	movlpd 7 * SIZE(B), %xmm4
	mulsd %xmm1, %xmm4
	subsd %xmm4, %xmm3
	movlpd 10 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm2
	movlpd 11 * SIZE(B), %xmm4
	mulsd %xmm2, %xmm4
	subsd %xmm4, %xmm3
	movlpd 15 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm3
#endif

#ifdef RT
	movlpd 15 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm3
	movlpd 14 * SIZE(B), %xmm4
	mulsd %xmm3, %xmm4
	subsd %xmm4, %xmm2
	movlpd 13 * SIZE(B), %xmm4
	mulsd %xmm3, %xmm4
	subsd %xmm4, %xmm1
	movlpd 12 * SIZE(B), %xmm4
	mulsd %xmm3, %xmm4
	subsd %xmm4, %xmm0
	movlpd 10 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm2
	movlpd 9 * SIZE(B), %xmm4
	mulsd %xmm2, %xmm4
	subsd %xmm4, %xmm1
	movlpd 8 * SIZE(B), %xmm4
	mulsd %xmm2, %xmm4
	subsd %xmm4, %xmm0
	movlpd 5 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm1
	movlpd 4 * SIZE(B), %xmm4
	mulsd %xmm1, %xmm4
	subsd %xmm4, %xmm0
	movlpd 0 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movapd %xmm2, 0 * SIZE(B)
	movapd %xmm5, 2 * SIZE(B)

	movlpd %xmm2, 0 * SIZE(BB)
	movlpd %xmm2, 1 * SIZE(BB)
	movhpd %xmm2, 2 * SIZE(BB)
	movhpd %xmm2, 3 * SIZE(BB)
	movlpd %xmm5, 4 * SIZE(BB)
	movlpd %xmm5, 5 * SIZE(BB)
	movhpd %xmm5, 6 * SIZE(BB)
	movhpd %xmm5, 7 * SIZE(BB)
#else
	movlpd %xmm0, 0 * SIZE(AA)
	movlpd %xmm1, 1 * SIZE(AA)
	movlpd %xmm2, 2 * SIZE(AA)
	movlpd %xmm3, 3 * SIZE(AA)
#endif

#ifdef LN
	subl $1 * SIZE, CO1
#endif

	leal (LDC, LDC, 2), %eax

#if defined(LN) || defined(LT)
	movlpd %xmm2, 0 * SIZE(CO1)
	movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
	movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
	movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
#else
	movlpd %xmm0, 0 * SIZE(CO1)
	movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
	movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
	movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
#endif

#ifndef LN
	addl $1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (AA, %eax, SIZE), AA
#ifdef LT
	addl $4 * SIZE, B
#endif
#endif

#ifdef LN
	subl $1, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $1, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4

.L20:
	movl M, %ebx
	sarl $1, %ebx # i = (m >> 1)
	jle .L29
	ALIGN_4
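
/* .L11: main 2x4 micro-kernel, two rows of A by four columns of B,
   unrolled by eight over k. */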
.L11:
#ifdef LN
	movl K, %eax
	sall $1 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#endif

	leal BUFFER, BB

#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $3 + BASE_SHIFT, %eax
	addl %eax, BB
#endif

	pxor %xmm4, %xmm4
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7

	movapd 0 * SIZE(AA), %xmm0
	movapd 8 * SIZE(AA), %xmm1
	movapd 0 * SIZE(BB), %xmm2
	movapd 8 * SIZE(BB), %xmm3

	leal (LDC, LDC, 2), %eax

#ifdef LN
	prefetchw -2 * SIZE(CO1)
	prefetchw -2 * SIZE(CO1, LDC)
	prefetchw -2 * SIZE(CO1, LDC, 2)
	prefetchw -2 * SIZE(CO1, %eax)
#else
	prefetchw 1 * SIZE(CO1)
	prefetchw 1 * SIZE(CO1, LDC)
	prefetchw 1 * SIZE(CO1, LDC, 2)
	prefetchw 1 * SIZE(CO1, %eax)
#endif

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif

#if 1
	andl $-8, %eax
	sall $4, %eax
	je .L15

.L1X:
	KERNEL1(16 * 0)
	KERNEL2(16 * 0)
	KERNEL3(16 * 0)
	KERNEL4(16 * 0)
	KERNEL5(16 * 0)
	KERNEL6(16 * 0)
	KERNEL7(16 * 0)
	KERNEL8(16 * 0)
	cmpl $128 * 1, %eax
	jle .L12
	KERNEL1(16 * 1)
	KERNEL2(16 * 1)
	KERNEL3(16 * 1)
	KERNEL4(16 * 1)
	KERNEL5(16 * 1)
	KERNEL6(16 * 1)
	KERNEL7(16 * 1)
	KERNEL8(16 * 1)
	cmpl $128 * 2, %eax
	jle .L12
	KERNEL1(16 * 2)
	KERNEL2(16 * 2)
	KERNEL3(16 * 2)
	KERNEL4(16 * 2)
	KERNEL5(16 * 2)
	KERNEL6(16 * 2)
	KERNEL7(16 * 2)
	KERNEL8(16 * 2)
	cmpl $128 * 3, %eax
	jle .L12
	KERNEL1(16 * 3)
	KERNEL2(16 * 3)
	KERNEL3(16 * 3)
	KERNEL4(16 * 3)
	KERNEL5(16 * 3)
	KERNEL6(16 * 3)
	KERNEL7(16 * 3)
	KERNEL8(16 * 3)
	cmpl $128 * 4, %eax
	jle .L12
	KERNEL1(16 * 4)
	KERNEL2(16 * 4)
	KERNEL3(16 * 4)
	KERNEL4(16 * 4)
	KERNEL5(16 * 4)
	KERNEL6(16 * 4)
	KERNEL7(16 * 4)
	KERNEL8(16 * 4)
	cmpl $128 * 5, %eax
	jle .L12
	KERNEL1(16 * 5)
	KERNEL2(16 * 5)
	KERNEL3(16 * 5)
	KERNEL4(16 * 5)
	KERNEL5(16 * 5)
	KERNEL6(16 * 5)
	KERNEL7(16 * 5)
	KERNEL8(16 * 5)
	cmpl $128 * 6, %eax
	jle .L12
	KERNEL1(16 * 6)
	KERNEL2(16 * 6)
	KERNEL3(16 * 6)
	KERNEL4(16 * 6)
	KERNEL5(16 * 6)
	KERNEL6(16 * 6)
	KERNEL7(16 * 6)
	KERNEL8(16 * 6)
	cmpl $128 * 7, %eax
	jle .L12
	KERNEL1(16 * 7)
	KERNEL2(16 * 7)
	KERNEL3(16 * 7)
	KERNEL4(16 * 7)
	KERNEL5(16 * 7)
	KERNEL6(16 * 7)
	KERNEL7(16 * 7)
	KERNEL8(16 * 7)

	addl $128 * 4 * SIZE, BB
	addl $128 * 1 * SIZE, AA
	subl $128 * 8, %eax
	jg .L1X
	jmp .L15

.L12:
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 4), BB
	ALIGN_4
#else
	sarl $3, %eax
	je .L15
	ALIGN_4

.L12:
	KERNEL1(16 * 0)
	KERNEL2(16 * 0)
	KERNEL3(16 * 0)
	KERNEL4(16 * 0)
	KERNEL5(16 * 0)
	KERNEL6(16 * 0)
	KERNEL7(16 * 0)
	KERNEL8(16 * 0)

	addl $64 * SIZE, BB
	addl $16 * SIZE, AA
	decl %eax
	jne .L12
	ALIGN_4
#endif
.L15:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # k & 7
	BRANCH
	je .L18
	ALIGN_3

.L16:
	mulpd %xmm0, %xmm2
	addpd %xmm2, %xmm4
	movapd 2 * SIZE(BB), %xmm2
	mulpd %xmm0, %xmm2
	addpd %xmm2, %xmm5
	movapd 4 * SIZE(BB), %xmm2
	mulpd %xmm0, %xmm2
	mulpd 6 * SIZE(BB), %xmm0
	addpd %xmm2, %xmm6
	movapd 8 * SIZE(BB), %xmm2
	addpd %xmm0, %xmm7
	movapd 2 * SIZE(AA), %xmm0
	addl $2 * SIZE, AA
	addl $8 * SIZE, BB
	decl %eax
	jg .L16
	ALIGN_4

.L18:
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $2, %eax
#else
	subl $4, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 4), B
	leal (BB, %eax, 8), BB
#endif

#if defined(LN) || defined(LT)
	movapd %xmm4, %xmm0
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm0
	movapd %xmm6, %xmm1
	unpcklpd %xmm7, %xmm6
	unpckhpd %xmm7, %xmm1
	movapd 0 * SIZE(B), %xmm2
	movapd 2 * SIZE(B), %xmm5
	movapd 4 * SIZE(B), %xmm3
	movapd 6 * SIZE(B), %xmm7
	subpd %xmm4, %xmm2
	subpd %xmm6, %xmm5
	subpd %xmm0, %xmm3
	subpd %xmm1, %xmm7
#else
	movapd 0 * SIZE(AA), %xmm0
	movapd 2 * SIZE(AA), %xmm1
	movapd 4 * SIZE(AA), %xmm2
	movapd 6 * SIZE(AA), %xmm3
	subpd %xmm4, %xmm0
	subpd %xmm5, %xmm1
	subpd %xmm6, %xmm2
	subpd %xmm7, %xmm3
#endif
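
/* Triangular solve for the block. The diagonal entries of the packed
   triangular factor are stored pre-inverted (an OpenBLAS convention),
   so each step multiplies instead of divides, then eliminates the
   remaining entries. */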
#ifdef LN
	movlpd 3 * SIZE(AA), %xmm4
	movhpd 3 * SIZE(AA), %xmm4
	mulpd %xmm4, %xmm3
	mulpd %xmm4, %xmm7
	movlpd 2 * SIZE(AA), %xmm4
	movhpd 2 * SIZE(AA), %xmm4
	movapd %xmm4, %xmm6
	mulpd %xmm3, %xmm4
	subpd %xmm4, %xmm2
	mulpd %xmm7, %xmm6
	subpd %xmm6, %xmm5
	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4
	mulpd %xmm4, %xmm2
	mulpd %xmm4, %xmm5
#endif

#ifdef LT
	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4
	mulpd %xmm4, %xmm2
	mulpd %xmm4, %xmm5
	movlpd 1 * SIZE(AA), %xmm4
	movhpd 1 * SIZE(AA), %xmm4
	movapd %xmm4, %xmm6
	mulpd %xmm2, %xmm4
	subpd %xmm4, %xmm3
	mulpd %xmm5, %xmm6
	subpd %xmm6, %xmm7
	movlpd 3 * SIZE(AA), %xmm4
	movhpd 3 * SIZE(AA), %xmm4
	mulpd %xmm4, %xmm3
	mulpd %xmm4, %xmm7
#endif

#ifdef RN
	movlpd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm0
	movlpd 1 * SIZE(B), %xmm4
	movhpd 1 * SIZE(B), %xmm4
	mulpd %xmm0, %xmm4
	subpd %xmm4, %xmm1
	movlpd 2 * SIZE(B), %xmm4
	movhpd 2 * SIZE(B), %xmm4
	mulpd %xmm0, %xmm4
	subpd %xmm4, %xmm2
	movlpd 3 * SIZE(B), %xmm4
	movhpd 3 * SIZE(B), %xmm4
	mulpd %xmm0, %xmm4
	subpd %xmm4, %xmm3
	movlpd 5 * SIZE(B), %xmm4
	movhpd 5 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm1
	movlpd 6 * SIZE(B), %xmm4
	movhpd 6 * SIZE(B), %xmm4
	mulpd %xmm1, %xmm4
	subpd %xmm4, %xmm2
	movlpd 7 * SIZE(B), %xmm4
	movhpd 7 * SIZE(B), %xmm4
	mulpd %xmm1, %xmm4
	subpd %xmm4, %xmm3
	movlpd 10 * SIZE(B), %xmm4
	movhpd 10 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm2
	movlpd 11 * SIZE(B), %xmm4
	movhpd 11 * SIZE(B), %xmm4
	mulpd %xmm2, %xmm4
	subpd %xmm4, %xmm3
	movlpd 15 * SIZE(B), %xmm4
	movhpd 15 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm3
#endif

#ifdef RT
	movlpd 15 * SIZE(B), %xmm4
	movhpd 15 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm3
	movlpd 14 * SIZE(B), %xmm4
	movhpd 14 * SIZE(B), %xmm4
	mulpd %xmm3, %xmm4
	subpd %xmm4, %xmm2
	movlpd 13 * SIZE(B), %xmm4
	movhpd 13 * SIZE(B), %xmm4
	mulpd %xmm3, %xmm4
	subpd %xmm4, %xmm1
	movlpd 12 * SIZE(B), %xmm4
	movhpd 12 * SIZE(B), %xmm4
	mulpd %xmm3, %xmm4
	subpd %xmm4, %xmm0
	movlpd 10 * SIZE(B), %xmm4
	movhpd 10 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm2
	movlpd 9 * SIZE(B), %xmm4
	movhpd 9 * SIZE(B), %xmm4
	mulpd %xmm2, %xmm4
	subpd %xmm4, %xmm1
	movlpd 8 * SIZE(B), %xmm4
	movhpd 8 * SIZE(B), %xmm4
	mulpd %xmm2, %xmm4
	subpd %xmm4, %xmm0
	movlpd 5 * SIZE(B), %xmm4
	movhpd 5 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm1
	movlpd 4 * SIZE(B), %xmm4
	movhpd 4 * SIZE(B), %xmm4
	mulpd %xmm1, %xmm4
	subpd %xmm4, %xmm0
	movlpd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movapd %xmm2, 0 * SIZE(B)
	movapd %xmm5, 2 * SIZE(B)
	movapd %xmm3, 4 * SIZE(B)
	movapd %xmm7, 6 * SIZE(B)

	movlpd %xmm2, 0 * SIZE(BB)
	movlpd %xmm2, 1 * SIZE(BB)
	movhpd %xmm2, 2 * SIZE(BB)
	movhpd %xmm2, 3 * SIZE(BB)
	movlpd %xmm5, 4 * SIZE(BB)
	movlpd %xmm5, 5 * SIZE(BB)
	movhpd %xmm5, 6 * SIZE(BB)
	movhpd %xmm5, 7 * SIZE(BB)
	movlpd %xmm3, 8 * SIZE(BB)
	movlpd %xmm3, 9 * SIZE(BB)
	movhpd %xmm3, 10 * SIZE(BB)
	movhpd %xmm3, 11 * SIZE(BB)
	movlpd %xmm7, 12 * SIZE(BB)
	movlpd %xmm7, 13 * SIZE(BB)
	movhpd %xmm7, 14 * SIZE(BB)
	movhpd %xmm7, 15 * SIZE(BB)
#else
	movapd %xmm0, 0 * SIZE(AA)
	movapd %xmm1, 2 * SIZE(AA)
	movapd %xmm2, 4 * SIZE(AA)
	movapd %xmm3, 6 * SIZE(AA)
#endif

#ifdef LN
	subl $2 * SIZE, CO1
#endif

	leal (LDC, LDC, 2), %eax

#if defined(LN) || defined(LT)
	movlpd %xmm2, 0 * SIZE(CO1)
	movlpd %xmm3, 1 * SIZE(CO1)
	movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
	movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
	movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
	movlpd %xmm7, 1 * SIZE(CO1, LDC, 2)
	movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
	movhpd %xmm7, 1 * SIZE(CO1, %eax, 1)
#else
	movlpd %xmm0, 0 * SIZE(CO1)
	movhpd %xmm0, 1 * SIZE(CO1)
	movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
	movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
	movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
	movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
	movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
	movhpd %xmm3, 1 * SIZE(CO1, %eax, 1)
#endif

#ifndef LN
	addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#ifdef LT
	addl $8 * SIZE, B
#endif
#endif

#ifdef LN
	subl $2, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $2, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $1 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif

	decl %ebx # i --
	jg .L11
	ALIGN_4

.L29:
#ifdef LN
	movl K, %eax
	leal (, %eax, SIZE), %eax
	leal (B, %eax, 4), B
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (B, %eax, 4), B
#endif

#ifdef RN
	addl $4, KK
#endif
#ifdef RT
	subl $4, KK
#endif

	decl J # j --
	jg .L01
	ALIGN_4
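
/* .L30: N remainder, 2-wide column panel (n & 2). */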
.L30:
	testl $2, N
	je .L60

#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif

	leal BUFFER, BB

#ifdef RT
	movl K, %eax
	sall $1 + BASE_SHIFT, %eax
	subl %eax, B
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl B, BORIG
	leal (, %eax, SIZE), %eax
	leal (B, %eax, 2), B
	leal (BB, %eax, 4), BB
#endif

#ifdef LT
	movl OFFSET, %eax
	movl %eax, KK
#endif

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $2, %eax
	jle .L35
	ALIGN_4

.L32:
#define COPYPREFETCH 40

	prefetchnta (COPYPREFETCH) * SIZE(B)

	movq 0 * SIZE(B), %mm0
	movq 1 * SIZE(B), %mm1
	movq 2 * SIZE(B), %mm2
	movq 3 * SIZE(B), %mm3
	movq 4 * SIZE(B), %mm4
	movq 5 * SIZE(B), %mm5
	movq 6 * SIZE(B), %mm6
	movq 7 * SIZE(B), %mm7

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)
	movq %mm1, 2 * SIZE(BB)
	movq %mm1, 3 * SIZE(BB)
	movq %mm2, 4 * SIZE(BB)
	movq %mm2, 5 * SIZE(BB)
	movq %mm3, 6 * SIZE(BB)
	movq %mm3, 7 * SIZE(BB)
	movq %mm4, 8 * SIZE(BB)
	movq %mm4, 9 * SIZE(BB)
	movq %mm5, 10 * SIZE(BB)
	movq %mm5, 11 * SIZE(BB)
	movq %mm6, 12 * SIZE(BB)
	movq %mm6, 13 * SIZE(BB)
	movq %mm7, 14 * SIZE(BB)
	movq %mm7, 15 * SIZE(BB)

	addl $ 8 * SIZE, B
	addl $16 * SIZE, BB
	decl %eax
	jne .L32
	ALIGN_2

.L35:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $3, %eax
	BRANCH
	jle .L40
	ALIGN_2

.L36:
	movq 0 * SIZE(B), %mm0
	movq 1 * SIZE(B), %mm1

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)
	movq %mm1, 2 * SIZE(BB)
	movq %mm1, 3 * SIZE(BB)

	addl $2 * SIZE, B
	addl $4 * SIZE, BB
	decl %eax
	jne .L36
	ALIGN_4

.L40:
#if defined(LT) || defined(RN)
	movl A, AA
#else
	movl A, %eax
	movl %eax, AORIG
#endif

	leal (, LDC, 2), %eax

#ifdef RT
	subl %eax, C
#endif
	movl C, CO1
#ifndef RT
	addl %eax, C
#endif

	movl M, %ebx
	testl $1, %ebx # m & 1
	jle .L50
#ifdef LN
	movl K, %eax
	sall $BASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (AA, %eax, SIZE), AA
#endif

	leal BUFFER, BB

#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $2 + BASE_SHIFT, %eax
	addl %eax, BB
#endif

	pxor %xmm4, %xmm4
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7

	movlpd 0 * SIZE(AA), %xmm0
	movlpd 4 * SIZE(AA), %xmm1
	movlpd 0 * SIZE(BB), %xmm2
	movlpd 8 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L55
	ALIGN_4

.L52:
	mulsd %xmm0, %xmm2
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	mulsd 2 * SIZE(BB), %xmm0
	addsd %xmm2, %xmm4
	movlpd 4 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm5
	movlpd 1 * SIZE(AA), %xmm0
	mulsd %xmm0, %xmm2
	mulsd 6 * SIZE(BB), %xmm0
	addsd %xmm2, %xmm6
	movlpd 16 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm7
	movlpd 2 * SIZE(AA), %xmm0
	mulsd %xmm0, %xmm3
	mulsd 10 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm4
	movlpd 12 * SIZE(BB), %xmm3
	addsd %xmm0, %xmm5
	movlpd 3 * SIZE(AA), %xmm0
	mulsd %xmm0, %xmm3
	mulsd 14 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm6
	movlpd 24 * SIZE(BB), %xmm3
	addsd %xmm0, %xmm7
	movlpd 8 * SIZE(AA), %xmm0
	mulsd %xmm1, %xmm2
	mulsd 18 * SIZE(BB), %xmm1
	addsd %xmm2, %xmm4
	movlpd 20 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm5
	movlpd 5 * SIZE(AA), %xmm1
	mulsd %xmm1, %xmm2
	mulsd 22 * SIZE(BB), %xmm1
	addsd %xmm2, %xmm6
	movlpd 32 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm7
	movlpd 6 * SIZE(AA), %xmm1
	mulsd %xmm1, %xmm3
	mulsd 26 * SIZE(BB), %xmm1
	addsd %xmm3, %xmm4
	movlpd 28 * SIZE(BB), %xmm3
	addsd %xmm1, %xmm5
	movlpd 7 * SIZE(AA), %xmm1
	mulsd %xmm1, %xmm3
	mulsd 30 * SIZE(BB), %xmm1
	addsd %xmm3, %xmm6
	movlpd 40 * SIZE(BB), %xmm3
	addsd %xmm1, %xmm7
	movlpd 12 * SIZE(AA), %xmm1
	addl $ 8 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L52
	ALIGN_4

.L55:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # k & 7
	BRANCH
	je .L58

.L56:
	mulsd %xmm0, %xmm2
	mulsd 2 * SIZE(BB), %xmm0
	addsd %xmm2, %xmm4
	movlpd 4 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm5
	movlpd 1 * SIZE(AA), %xmm0
	addl $1 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L56
	ALIGN_4

.L58:
	addsd %xmm6, %xmm4
	addsd %xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $1, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	leal (, %eax, SIZE), %eax
	addl %eax, AA
	leal (B, %eax, 2), B
	leal (BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	unpcklpd %xmm5, %xmm4
	movapd 0 * SIZE(B), %xmm2
	subpd %xmm4, %xmm2
#else
	movlpd 0 * SIZE(AA), %xmm0
	movlpd 1 * SIZE(AA), %xmm1
	subsd %xmm4, %xmm0
	subsd %xmm5, %xmm1
#endif

#ifdef LN
	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4
	mulpd %xmm4, %xmm2
#endif

#ifdef LT
	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4
	mulpd %xmm4, %xmm2
#endif

#ifdef RN
	movlpd 0 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm0
	movlpd 1 * SIZE(B), %xmm4
	mulsd %xmm0, %xmm4
	subsd %xmm4, %xmm1
	movlpd 3 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm1
#endif

#ifdef RT
	movlpd 3 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm1
	movlpd 2 * SIZE(B), %xmm4
	mulsd %xmm1, %xmm4
	subsd %xmm4, %xmm0
	movlpd 0 * SIZE(B), %xmm4
	mulsd %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movapd %xmm2, 0 * SIZE(B)

	movlpd %xmm2, 0 * SIZE(BB)
	movlpd %xmm2, 1 * SIZE(BB)
	movhpd %xmm2, 2 * SIZE(BB)
	movhpd %xmm2, 3 * SIZE(BB)
#else
	movlpd %xmm0, 0 * SIZE(AA)
	movlpd %xmm1, 1 * SIZE(AA)
#endif

#ifdef LN
	subl $1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlpd %xmm2, 0 * SIZE(CO1)
	movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
#else
	movlpd %xmm0, 0 * SIZE(CO1)
	movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
#endif

#ifndef LN
	addl $1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (AA, %eax, SIZE), AA
#ifdef LT
	addl $2 * SIZE, B
#endif
#endif

#ifdef LN
	subl $1, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $1, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4

.L50:
	movl M, %ebx
	sarl $1, %ebx # i = (m >> 1)
	jle .L59
	ALIGN_4
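
/* .L41: 2x2 micro-kernel for the 2-wide panel. */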
  1434. .L41:
  1435. #ifdef LN
  1436. movl K, %eax
  1437. sall $1 + BASE_SHIFT, %eax
  1438. subl %eax, AORIG
  1439. #endif
  1440. #if defined(LN) || defined(RT)
  1441. movl KK, %eax
  1442. movl AORIG, AA
  1443. leal (, %eax, SIZE), %eax
  1444. leal (AA, %eax, 2), AA
  1445. #endif
  1446. leal BUFFER, BB
  1447. #if defined(LN) || defined(RT)
  1448. movl KK, %eax
  1449. sall $2 + BASE_SHIFT, %eax
  1450. addl %eax, BB
  1451. #endif
  1452. pxor %xmm4, %xmm4
  1453. pxor %xmm5, %xmm5
  1454. pxor %xmm6, %xmm6
  1455. pxor %xmm7, %xmm7
  1456. movapd 0 * SIZE(AA), %xmm0
  1457. movapd 8 * SIZE(AA), %xmm1
  1458. movapd 0 * SIZE(BB), %xmm2
  1459. movapd 8 * SIZE(BB), %xmm3
  1460. #ifdef LN
  1461. prefetchw -2 * SIZE(CO1)
  1462. prefetchw -2 * SIZE(CO1, LDC)
  1463. #else
  1464. prefetchw 1 * SIZE(CO1)
  1465. prefetchw 1 * SIZE(CO1, LDC)
  1466. #endif
  1467. #if defined(LT) || defined(RN)
  1468. movl KK, %eax
  1469. #else
  1470. movl K, %eax
  1471. subl KK, %eax
  1472. #endif
  1473. sarl $3, %eax
  1474. je .L45
  1475. ALIGN_4
  1476. .L42:
  1477. mulpd %xmm0, %xmm2
  1478. #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
  1479. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  1480. #endif
  1481. mulpd 2 * SIZE(BB), %xmm0
  1482. addpd %xmm2, %xmm4
  1483. movapd 4 * SIZE(BB), %xmm2
  1484. addpd %xmm0, %xmm5
  1485. movapd 2 * SIZE(AA), %xmm0
  1486. mulpd %xmm0, %xmm2
  1487. mulpd 6 * SIZE(BB), %xmm0
  1488. addpd %xmm2, %xmm6
  1489. movapd 16 * SIZE(BB), %xmm2
  1490. addpd %xmm0, %xmm7
  1491. movapd 4 * SIZE(AA), %xmm0
  1492. mulpd %xmm0, %xmm3
  1493. mulpd 10 * SIZE(BB), %xmm0
  1494. addpd %xmm3, %xmm4
  1495. movapd 12 * SIZE(BB), %xmm3
  1496. addpd %xmm0, %xmm5
  1497. movapd 6 * SIZE(AA), %xmm0
  1498. mulpd %xmm0, %xmm3
  1499. mulpd 14 * SIZE(BB), %xmm0
  1500. addpd %xmm3, %xmm6
  1501. movapd 24 * SIZE(BB), %xmm3
  1502. addpd %xmm0, %xmm7
  1503. movapd 16 * SIZE(AA), %xmm0
  1504. #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
  1505. prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
  1506. #endif
  1507. mulpd %xmm1, %xmm2
  1508. mulpd 18 * SIZE(BB), %xmm1
  1509. addpd %xmm2, %xmm4
  1510. movapd 20 * SIZE(BB), %xmm2
  1511. addpd %xmm1, %xmm5
  1512. movapd 10 * SIZE(AA), %xmm1
  1513. mulpd %xmm1, %xmm2
  1514. mulpd 22 * SIZE(BB), %xmm1
  1515. addpd %xmm2, %xmm6
  1516. movapd 32 * SIZE(BB), %xmm2
  1517. addpd %xmm1, %xmm7
  1518. movapd 12 * SIZE(AA), %xmm1
  1519. mulpd %xmm1, %xmm3
  1520. mulpd 26 * SIZE(BB), %xmm1
  1521. addpd %xmm3, %xmm4
  1522. movapd 28 * SIZE(BB), %xmm3
  1523. addpd %xmm1, %xmm5
  1524. movapd 14 * SIZE(AA), %xmm1
  1525. mulpd %xmm1, %xmm3
  1526. mulpd 30 * SIZE(BB), %xmm1
  1527. addpd %xmm3, %xmm6
  1528. movapd 40 * SIZE(BB), %xmm3
  1529. addpd %xmm1, %xmm7
  1530. movapd 24 * SIZE(AA), %xmm1
  1531. addl $16 * SIZE, AA
  1532. addl $32 * SIZE, BB
  1533. decl %eax
  1534. jne .L42
  1535. ALIGN_4
  1536. .L45:
  1537. #if defined(LT) || defined(RN)
  1538. movl KK, %eax
  1539. #else
  1540. movl K, %eax
  1541. subl KK, %eax
  1542. #endif
  1543. andl $7, %eax # if (k & 1)
  1544. BRANCH
  1545. je .L48
  1546. ALIGN_3
  1547. .L46:
  1548. mulpd %xmm0, %xmm2
  1549. mulpd 2 * SIZE(BB), %xmm0
  1550. addpd %xmm2, %xmm4
  1551. movapd 4 * SIZE(BB), %xmm2
  1552. addpd %xmm0, %xmm5
  1553. movapd 2 * SIZE(AA), %xmm0
  1554. addl $2 * SIZE, AA
  1555. addl $4 * SIZE, BB
  1556. decl %eax
  1557. jg .L46
  1558. ALIGN_4
  1559. .L48:
  1560. addpd %xmm6, %xmm4
  1561. addpd %xmm7, %xmm5
  1562. #if defined(LN) || defined(RT)
  1563. movl KK, %eax
  1564. #ifdef LN
  1565. subl $2, %eax
  1566. #else
  1567. subl $2, %eax
  1568. #endif
  1569. movl AORIG, AA
  1570. movl BORIG, B
  1571. leal BUFFER, BB
  1572. leal (, %eax, SIZE), %eax
  1573. leal (AA, %eax, 2), AA
  1574. leal (B, %eax, 2), B
  1575. leal (BB, %eax, 4), BB
  1576. #endif
  1577. #if defined(LN) || defined(LT)
  1578. movapd %xmm4, %xmm0
  1579. unpcklpd %xmm5, %xmm4
  1580. unpckhpd %xmm5, %xmm0
  1581. movapd 0 * SIZE(B), %xmm2
  1582. movapd 2 * SIZE(B), %xmm3
  1583. subpd %xmm4, %xmm2
  1584. subpd %xmm0, %xmm3
  1585. #else
  1586. movapd 0 * SIZE(AA), %xmm0
  1587. movapd 2 * SIZE(AA), %xmm1
  1588. subpd %xmm4, %xmm0
  1589. subpd %xmm5, %xmm1
  1590. #endif
  1591. #ifdef LN
  1592. movlpd 3 * SIZE(AA), %xmm4
  1593. movhpd 3 * SIZE(AA), %xmm4
  1594. mulpd %xmm4, %xmm3
  1595. movlpd 2 * SIZE(AA), %xmm4
  1596. movhpd 2 * SIZE(AA), %xmm4
  1597. mulpd %xmm3, %xmm4
  1598. subpd %xmm4, %xmm2
  1599. movlpd 0 * SIZE(AA), %xmm4
  1600. movhpd 0 * SIZE(AA), %xmm4
  1601. mulpd %xmm4, %xmm2
  1602. #endif
  1603. #ifdef LT
  1604. movlpd 0 * SIZE(AA), %xmm4
  1605. movhpd 0 * SIZE(AA), %xmm4
  1606. mulpd %xmm4, %xmm2
  1607. movlpd 1 * SIZE(AA), %xmm4
  1608. movhpd 1 * SIZE(AA), %xmm4
  1609. mulpd %xmm2, %xmm4
  1610. subpd %xmm4, %xmm3
  1611. movlpd 3 * SIZE(AA), %xmm4
  1612. movhpd 3 * SIZE(AA), %xmm4
  1613. mulpd %xmm4, %xmm3
  1614. #endif
  1615. #ifdef RN
  1616. movlpd 0 * SIZE(B), %xmm4
  1617. movhpd 0 * SIZE(B), %xmm4
  1618. mulpd %xmm4, %xmm0
  1619. movlpd 1 * SIZE(B), %xmm4
  1620. movhpd 1 * SIZE(B), %xmm4
  1621. mulpd %xmm0, %xmm4
  1622. subpd %xmm4, %xmm1
  1623. movlpd 3 * SIZE(B), %xmm4
  1624. movhpd 3 * SIZE(B), %xmm4
  1625. mulpd %xmm4, %xmm1
  1626. #endif
  1627. #ifdef RT
  1628. movlpd 3 * SIZE(B), %xmm4
  1629. movhpd 3 * SIZE(B), %xmm4
  1630. mulpd %xmm4, %xmm1
  1631. movlpd 2 * SIZE(B), %xmm4
  1632. movhpd 2 * SIZE(B), %xmm4
  1633. mulpd %xmm1, %xmm4
  1634. subpd %xmm4, %xmm0
  1635. movlpd 0 * SIZE(B), %xmm4
  1636. movhpd 0 * SIZE(B), %xmm4
  1637. mulpd %xmm4, %xmm0
  1638. #endif
#if defined(LN) || defined(LT)
        movapd  %xmm2, 0 * SIZE(B)
        movapd  %xmm3, 2 * SIZE(B)

        movlpd  %xmm2, 0 * SIZE(BB)
        movlpd  %xmm2, 1 * SIZE(BB)
        movhpd  %xmm2, 2 * SIZE(BB)
        movhpd  %xmm2, 3 * SIZE(BB)
        movlpd  %xmm3, 4 * SIZE(BB)
        movlpd  %xmm3, 5 * SIZE(BB)
        movhpd  %xmm3, 6 * SIZE(BB)
        movhpd  %xmm3, 7 * SIZE(BB)
#else
        movapd  %xmm0, 0 * SIZE(AA)
        movapd  %xmm1, 2 * SIZE(AA)
#endif

#ifdef LN
        subl    $2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
        movlpd  %xmm2, 0 * SIZE(CO1)
        movlpd  %xmm3, 1 * SIZE(CO1)
        movhpd  %xmm2, 0 * SIZE(CO1, LDC, 1)
        movhpd  %xmm3, 1 * SIZE(CO1, LDC, 1)
#else
        movlpd  %xmm0, 0 * SIZE(CO1)
        movhpd  %xmm0, 1 * SIZE(CO1)
        movlpd  %xmm1, 0 * SIZE(CO1, LDC, 1)
        movhpd  %xmm1, 1 * SIZE(CO1, LDC, 1)
#endif

#ifndef LN
        addl    $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl    K, %eax
        subl    KK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (AA, %eax, 2), AA
#ifdef LT
        addl    $4 * SIZE, B
#endif
#endif

#ifdef LN
        subl    $2, KK
        movl    BORIG, B
#endif

#ifdef LT
        addl    $2, KK
#endif

#ifdef RT
        movl    K, %eax
        movl    BORIG, B
        sall    $1 + BASE_SHIFT, %eax
        addl    %eax, AORIG
#endif

        decl    %ebx                    # i --
        jg      .L41
        ALIGN_4
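/* End of the two-column block: advance B past the consumed panel and
   update KK according to the TRSM variant. */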
.L59:
#ifdef LN
        movl    K, %eax
        leal    (, %eax, SIZE), %eax
        leal    (B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
        movl    K, %eax
        subl    KK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (B, %eax, 2), B
#endif

#ifdef RN
        addl    $2, KK
#endif

#ifdef RT
        subl    $2, KK
#endif
        ALIGN_4
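/* Remaining single column of B when N is odd. */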
.L60:
        testl   $1, N
        je      .L999

#ifdef LN
        movl    OFFSET, %eax
        addl    M, %eax
        movl    %eax, KK
#endif

        leal    BUFFER, BB

#ifdef RT
        movl    K, %eax
        sall    $BASE_SHIFT, %eax
        subl    %eax, B
#endif

#if defined(LN) || defined(RT)
        movl    KK, %eax
        movl    B, BORIG
        leal    (, %eax, SIZE), %eax
        leal    (B,  %eax, 1), B
        leal    (BB, %eax, 2), BB
#endif

#ifdef LT
        movl    OFFSET, %eax
        movl    %eax, KK
#endif

#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        sarl    $3, %eax
        jle     .L65
        ALIGN_4
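/* Copy loop: each b value is stored twice into BUFFER so the kernels
   below can address operand pairs with a fixed stride of two.  As a
   rough C sketch (names are illustrative only):

       for (i = 0; i < k; i++) {
           bb[2 * i + 0] = b[i];
           bb[2 * i + 1] = b[i];
       }

   This loop moves eight elements per iteration through MMX registers;
   the tail loop at .L66 copies one element at a time. */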
.L62:
#define COPYPREFETCH 40

        prefetchnta (COPYPREFETCH) * SIZE(B)

        movq    0 * SIZE(B), %mm0
        movq    1 * SIZE(B), %mm1
        movq    2 * SIZE(B), %mm2
        movq    3 * SIZE(B), %mm3
        movq    4 * SIZE(B), %mm4
        movq    5 * SIZE(B), %mm5
        movq    6 * SIZE(B), %mm6
        movq    7 * SIZE(B), %mm7

        movq    %mm0,  0 * SIZE(BB)
        movq    %mm0,  1 * SIZE(BB)
        movq    %mm1,  2 * SIZE(BB)
        movq    %mm1,  3 * SIZE(BB)
        movq    %mm2,  4 * SIZE(BB)
        movq    %mm2,  5 * SIZE(BB)
        movq    %mm3,  6 * SIZE(BB)
        movq    %mm3,  7 * SIZE(BB)
        movq    %mm4,  8 * SIZE(BB)
        movq    %mm4,  9 * SIZE(BB)
        movq    %mm5, 10 * SIZE(BB)
        movq    %mm5, 11 * SIZE(BB)
        movq    %mm6, 12 * SIZE(BB)
        movq    %mm6, 13 * SIZE(BB)
        movq    %mm7, 14 * SIZE(BB)
        movq    %mm7, 15 * SIZE(BB)

        addl    $ 8 * SIZE, B
        addl    $16 * SIZE, BB
        decl    %eax
        jne     .L62
        ALIGN_2

.L65:
#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        andl    $7, %eax
        BRANCH
        jle     .L70
        ALIGN_2
.L66:
        movq    0 * SIZE(B), %mm0

        movq    %mm0, 0 * SIZE(BB)
        movq    %mm0, 1 * SIZE(BB)

        addl    $1 * SIZE, B
        addl    $2 * SIZE, BB
        decl    %eax
        jne     .L66
        ALIGN_4

.L70:
#if defined(LT) || defined(RN)
        movl    A, AA
#else
        movl    A, %eax
        movl    %eax, AORIG
#endif

#ifdef RT
        subl    LDC, C
#endif
        movl    C, CO1
#ifndef RT
        addl    LDC, C
#endif

        movl    M, %ebx
        testl   $1, %ebx                # m & 1
        jle     .L80
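/* Leftover single row when M is odd: a 1x1 tile of C against the
   single remaining column. */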
#ifdef LN
        movl    K, %eax
        sall    $BASE_SHIFT, %eax
        subl    %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl    KK, %eax
        movl    AORIG, AA
        leal    (AA, %eax, SIZE), AA
#endif

        leal    BUFFER, BB

#if defined(LN) || defined(RT)
        movl    KK, %eax
        sall    $1 + BASE_SHIFT, %eax
        addl    %eax, BB
#endif

        pxor    %xmm4, %xmm4
        pxor    %xmm5, %xmm5
        pxor    %xmm6, %xmm6
        pxor    %xmm7, %xmm7

        movlpd  0 * SIZE(AA), %xmm0
        movlpd  4 * SIZE(AA), %xmm1
        movlpd  0 * SIZE(BB), %xmm2
        movlpd  8 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        sarl    $3, %eax
        je      .L85
        ALIGN_4
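/* Main dot-product loop, unrolled eight times; the products are
   spread over four scalar accumulators (xmm4-xmm7) to hide the addsd
   latency, and reduced at .L88. */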
.L82:
        mulsd   %xmm0, %xmm2
        prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
        movlpd  1 * SIZE(AA), %xmm0
        mulsd   2 * SIZE(BB), %xmm0
        addsd   %xmm2, %xmm4
        movlpd  16 * SIZE(BB), %xmm2
        addsd   %xmm0, %xmm5
        movlpd  2 * SIZE(AA), %xmm0
        mulsd   4 * SIZE(BB), %xmm0
        addsd   %xmm0, %xmm6
        movlpd  3 * SIZE(AA), %xmm0
        mulsd   6 * SIZE(BB), %xmm0
        addsd   %xmm0, %xmm7
        movlpd  8 * SIZE(AA), %xmm0
        mulsd   %xmm1, %xmm3
        movlpd  5 * SIZE(AA), %xmm1
        mulsd   10 * SIZE(BB), %xmm1
        addsd   %xmm3, %xmm4
        movlpd  24 * SIZE(BB), %xmm3
        addsd   %xmm1, %xmm5
        movlpd  6 * SIZE(AA), %xmm1
        mulsd   12 * SIZE(BB), %xmm1
        addsd   %xmm1, %xmm6
        movlpd  7 * SIZE(AA), %xmm1
        mulsd   14 * SIZE(BB), %xmm1
        addsd   %xmm1, %xmm7
        movlpd  12 * SIZE(AA), %xmm1

        addl    $ 8 * SIZE, AA
        addl    $16 * SIZE, BB
        decl    %eax
        jne     .L82
        ALIGN_4
.L85:
#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        andl    $7, %eax                # k & 7
        BRANCH
        je      .L88

.L86:
        mulsd   %xmm0, %xmm2
        addsd   %xmm2, %xmm4
        movlpd  2 * SIZE(BB), %xmm2
        movlpd  1 * SIZE(AA), %xmm0

        addl    $1 * SIZE, AA
        addl    $2 * SIZE, BB
        decl    %eax
        jg      .L86
        ALIGN_4

.L88:
        addsd   %xmm5, %xmm4
        addsd   %xmm7, %xmm6
        addsd   %xmm6, %xmm4
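/* The four partial sums are now reduced into xmm4.  The 1x1 "solve"
   degenerates to a single multiply: assuming the diagonal entry is
   stored pre-inverted, x = (rhs - sum) * inv(diag). */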
#if defined(LN) || defined(RT)
        movl    KK, %eax
#ifdef LN
        subl    $1, %eax
#else
        subl    $1, %eax
#endif
        movl    AORIG, AA
        movl    BORIG, B
        leal    BUFFER, BB

        leal    (, %eax, SIZE), %eax
        addl    %eax, AA
        addl    %eax, B
        leal    (BB, %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
        movlpd  0 * SIZE(B), %xmm2
        subsd   %xmm4, %xmm2
#else
        movlpd  0 * SIZE(AA), %xmm0
        subsd   %xmm4, %xmm0
#endif

#ifdef LN
        movlpd  0 * SIZE(AA), %xmm4
        mulsd   %xmm4, %xmm2
#endif

#ifdef LT
        movlpd  0 * SIZE(AA), %xmm4
        mulsd   %xmm4, %xmm2
#endif

#ifdef RN
        movlpd  0 * SIZE(B), %xmm4
        mulsd   %xmm4, %xmm0
#endif

#ifdef RT
        movlpd  0 * SIZE(B), %xmm4
        mulsd   %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
        movlpd  %xmm2, 0 * SIZE(B)

        movlpd  %xmm2, 0 * SIZE(BB)
        movlpd  %xmm2, 1 * SIZE(BB)
#else
        movlpd  %xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
        subl    $1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
        movlpd  %xmm2, 0 * SIZE(CO1)
#else
        movlpd  %xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
        addl    $1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl    K, %eax
        subl    KK, %eax
        leal    (AA, %eax, SIZE), AA
#ifdef LT
        addl    $1 * SIZE, B
#endif
#endif

#ifdef LN
        subl    $1, KK
        movl    BORIG, B
#endif

#ifdef LT
        addl    $1, KK
#endif

#ifdef RT
        movl    K, %eax
        movl    BORIG, B
        sall    $BASE_SHIFT, %eax
        addl    %eax, AORIG
#endif
        ALIGN_4
.L80:
        movl    M, %ebx
        sarl    $1, %ebx                # i = (m >> 1)
        jle     .L99
        ALIGN_4
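/* 2x1 tiles: two rows of A against the single remaining column. */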
.L71:
#ifdef LN
        movl    K, %eax
        sall    $1 + BASE_SHIFT, %eax
        subl    %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl    KK, %eax
        movl    AORIG, AA
        leal    (, %eax, SIZE), %eax
        leal    (AA, %eax, 2), AA
#endif

        leal    BUFFER, BB

#if defined(LN) || defined(RT)
        movl    KK, %eax
        sall    $1 + BASE_SHIFT, %eax
        addl    %eax, BB
#endif

        pxor    %xmm4, %xmm4
        pxor    %xmm5, %xmm5
        pxor    %xmm6, %xmm6
        pxor    %xmm7, %xmm7

        movapd  0 * SIZE(AA), %xmm0
        movapd  8 * SIZE(AA), %xmm1
        movapd  0 * SIZE(BB), %xmm2
        movapd  8 * SIZE(BB), %xmm3

#ifdef LN
        prefetchw -2 * SIZE(CO1)
#else
        prefetchw  1 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        sarl    $3, %eax
        je      .L75
        ALIGN_4
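/* Main loop for the 2x1 tile, unrolled eight times; every product is
   folded into the single packed accumulator xmm4. */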
.L72:
        mulpd   %xmm0, %xmm2
        addpd   %xmm2, %xmm4
        prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
        movapd  16 * SIZE(BB), %xmm2
        movapd   2 * SIZE(AA), %xmm0
        mulpd    2 * SIZE(BB), %xmm0
        addpd   %xmm0, %xmm4
        movapd   4 * SIZE(AA), %xmm0
        mulpd    4 * SIZE(BB), %xmm0
        addpd   %xmm0, %xmm4
        movapd   6 * SIZE(AA), %xmm0
        mulpd    6 * SIZE(BB), %xmm0
        addpd   %xmm0, %xmm4
        movapd  16 * SIZE(AA), %xmm0
        prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
        mulpd   %xmm1, %xmm3
        addpd   %xmm3, %xmm4
        movapd  24 * SIZE(BB), %xmm3
        movapd  10 * SIZE(AA), %xmm1
        mulpd   10 * SIZE(BB), %xmm1
        addpd   %xmm1, %xmm4
        movapd  12 * SIZE(AA), %xmm1
        mulpd   12 * SIZE(BB), %xmm1
        addpd   %xmm1, %xmm4
        movapd  14 * SIZE(AA), %xmm1
        mulpd   14 * SIZE(BB), %xmm1
        addpd   %xmm1, %xmm4
        movapd  24 * SIZE(AA), %xmm1

        addl    $16 * SIZE, AA
        addl    $16 * SIZE, BB
        decl    %eax
        jne     .L72
        ALIGN_4
.L75:
#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        andl    $7, %eax                # k & 7
        BRANCH
        je      .L78
        ALIGN_3

.L76:
        mulpd   %xmm0, %xmm2
        addpd   %xmm2, %xmm4
        movapd  2 * SIZE(AA), %xmm0
        movapd  2 * SIZE(BB), %xmm2

        addl    $2 * SIZE, AA
        addl    $2 * SIZE, BB
        decl    %eax
        jg      .L76
        ALIGN_4

.L78:
#if defined(LN) || defined(RT)
        movl    KK, %eax
#ifdef LN
        subl    $2, %eax
#else
        subl    $1, %eax
#endif
        movl    AORIG, AA
        movl    BORIG, B
        leal    BUFFER, BB

        leal    (, %eax, SIZE), %eax
        leal    (AA, %eax, 2), AA
        leal    (B,  %eax, 1), B
        leal    (BB, %eax, 2), BB
#endif
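/* Substitution for the 2x1 tile.  LN and LT work on scalars: the two
   packed values are split apart with unpckhpd, solved one after the
   other against the 2x2 triangle, and re-packed with unpcklpd.  RN
   and RT only scale by the (pre-inverted) 1x1 diagonal of B. */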
#if defined(LN) || defined(LT)
        movapd  0 * SIZE(B), %xmm2
        subpd   %xmm4, %xmm2
#else
        movapd  0 * SIZE(AA), %xmm0
        subpd   %xmm4, %xmm0
#endif

#ifdef LN
        movapd  %xmm2, %xmm3
        unpckhpd %xmm3, %xmm3

        movlpd  3 * SIZE(AA), %xmm4
        mulsd   %xmm4, %xmm3

        movlpd  2 * SIZE(AA), %xmm4
        mulsd   %xmm3, %xmm4
        subsd   %xmm4, %xmm2

        movlpd  0 * SIZE(AA), %xmm4
        mulsd   %xmm4, %xmm2

        unpcklpd %xmm3, %xmm2
#endif

#ifdef LT
        movapd  %xmm2, %xmm3
        unpckhpd %xmm3, %xmm3

        movlpd  0 * SIZE(AA), %xmm4
        mulsd   %xmm4, %xmm2

        movlpd  1 * SIZE(AA), %xmm4
        mulsd   %xmm2, %xmm4
        subsd   %xmm4, %xmm3

        movlpd  3 * SIZE(AA), %xmm4
        mulsd   %xmm4, %xmm3

        unpcklpd %xmm3, %xmm2
#endif

#ifdef RN
        movlpd  0 * SIZE(B), %xmm4
        movhpd  0 * SIZE(B), %xmm4
        mulpd   %xmm4, %xmm0
#endif

#ifdef RT
        movlpd  0 * SIZE(B), %xmm4
        movhpd  0 * SIZE(B), %xmm4
        mulpd   %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
        movapd  %xmm2, 0 * SIZE(B)

        movlpd  %xmm2, 0 * SIZE(BB)
        movlpd  %xmm2, 1 * SIZE(BB)
        movhpd  %xmm2, 2 * SIZE(BB)
        movhpd  %xmm2, 3 * SIZE(BB)
#else
        movapd  %xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
        subl    $2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
        movlpd  %xmm2, 0 * SIZE(CO1)
        movhpd  %xmm2, 1 * SIZE(CO1)
#else
        movlpd  %xmm0, 0 * SIZE(CO1)
        movhpd  %xmm0, 1 * SIZE(CO1)
#endif

#ifndef LN
        addl    $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl    K, %eax
        subl    KK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (AA, %eax, 2), AA
#ifdef LT
        addl    $2 * SIZE, B
#endif
#endif

#ifdef LN
        subl    $2, KK
        movl    BORIG, B
#endif

#ifdef LT
        addl    $2, KK
#endif

#ifdef RT
        movl    K, %eax
        movl    BORIG, B
        sall    $1 + BASE_SHIFT, %eax
        addl    %eax, AORIG
#endif

        decl    %ebx                    # i --
        jg      .L71
        ALIGN_4

.L99:
#ifdef LN
        movl    K, %eax
        leal    (B, %eax, SIZE), B
#endif

#if defined(LT) || defined(RN)
        movl    K, %eax
        subl    KK, %eax
        leal    (B, %eax, SIZE), B
#endif

#ifdef RN
        addl    $1, KK
#endif

#ifdef RT
        subl    $1, KK
#endif
        ALIGN_4
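/* Epilogue: restore the caller's stack pointer saved in OLD_STACK,
   clear the MMX state left by the movq copy loops with EMMS, and pop
   the callee-saved registers. */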
.L999:
        movl    OLD_STACK, %esp

        EMMS

        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret

        EPILOGUE