
trsm_kernel_LN_4x2_sse2.S 43 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #if !defined(HAVE_SSE2) || !defined(HAVE_MMX)
  41. #error You have to check your configuration.
  42. #endif
  43. #define STACK 16
  44. #define ARGS 0
  45. #define STACK_M 4 + STACK + ARGS(%esi)
  46. #define STACK_N 8 + STACK + ARGS(%esi)
  47. #define STACK_K 12 + STACK + ARGS(%esi)
  48. #define STACK_ALPHA 16 + STACK + ARGS(%esi)
  49. #define STACK_A 24 + STACK + ARGS(%esi)
  50. #define STACK_B 28 + STACK + ARGS(%esi)
  51. #define STACK_C 32 + STACK + ARGS(%esi)
  52. #define STACK_LDC 36 + STACK + ARGS(%esi)
  53. #define STACK_OFFT 40 + STACK + ARGS(%esi)
  54. #define ALPHA 0(%esp)
  55. #define K 16(%esp)
  56. #define N 20(%esp)
  57. #define M 24(%esp)
  58. #define A 28(%esp)
  59. #define C 32(%esp)
  60. #define J 36(%esp)
  61. #define OLD_STACK 40(%esp)
  62. #define OFFSET 44(%esp)
  63. #define KK 48(%esp)
  64. #define KKK 52(%esp)
  65. #define AORIG 56(%esp)
  66. #define BORIG 60(%esp)
  67. #define BUFFER 128(%esp)
  68. #define B %edi
  69. #define LDC %ebp
  70. #define STACK_ALIGN 4096
  71. #define STACK_OFFSET 1024
  72. #define AA %edx
  73. #define BB %ecx
  74. #define PREFETCHSIZE (8 * 4)
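/*
   Note: the defines above name the caller's stack arguments (STACK_*,
   addressed through %esi, which preserves the old stack pointer), the
   scratch slots of the local frame (ALPHA .. BUFFER), and the register
   roles used throughout: B = %edi, LDC = %ebp, AA = %edx (current A
   block), BB = %ecx (current position in the duplicated-B BUFFER).
   The KERNEL1..KERNEL8 macros below unroll eight k-iterations of the
   4x2 double-precision update; the movq loads into %mm2 appear to serve
   only as prefetches of A (the %mm2 value is never consumed).
*/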
  75. #define KERNEL1(address) \
  76. movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \
  77. mulpd %xmm0, %xmm2; \
  78. mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
  79. addpd %xmm2, %xmm4; \
  80. movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \
  81. addpd %xmm0, %xmm5; \
  82. movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \
  83. mulpd %xmm0, %xmm2; \
  84. mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
  85. addpd %xmm2, %xmm6; \
  86. movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
  87. addpd %xmm0, %xmm7; \
  88. movapd 4 * SIZE + (address) * SIZE(AA), %xmm0
  89. #define KERNEL2(address) \
  90. mulpd %xmm0, %xmm2; \
  91. mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
  92. addpd %xmm2, %xmm4; \
  93. movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
  94. addpd %xmm0, %xmm5; \
  95. movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \
  96. mulpd %xmm0, %xmm2; \
  97. mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
  98. addpd %xmm2, %xmm6; \
  99. movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
  100. addpd %xmm0, %xmm7; \
  101. movapd 16 * SIZE + (address) * SIZE(AA), %xmm0
  102. #define KERNEL3(address) \
  103. movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \
  104. mulpd %xmm1, %xmm3; \
  105. mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
  106. addpd %xmm3, %xmm4; \
  107. movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \
  108. addpd %xmm1, %xmm5; \
  109. movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \
  110. mulpd %xmm1, %xmm3; \
  111. mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
  112. addpd %xmm3, %xmm6; \
  113. movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
  114. addpd %xmm1, %xmm7; \
  115. movapd 12 * SIZE + (address) * SIZE(AA), %xmm1
  116. #define KERNEL4(address) \
  117. mulpd %xmm1, %xmm3; \
  118. mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
  119. addpd %xmm3, %xmm4; \
  120. movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
  121. addpd %xmm1, %xmm5; \
  122. movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \
  123. mulpd %xmm1, %xmm3; \
  124. mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
  125. addpd %xmm3, %xmm6; \
  126. movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
  127. addpd %xmm1, %xmm7; \
  128. movapd 24 * SIZE + (address) * SIZE(AA), %xmm1
  129. #define KERNEL5(address) \
  130. movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \
  131. mulpd %xmm0, %xmm2; \
  132. mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
  133. addpd %xmm2, %xmm4; \
  134. movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
  135. addpd %xmm0, %xmm5; \
  136. movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \
  137. mulpd %xmm0, %xmm2; \
  138. mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
  139. addpd %xmm2, %xmm6; \
  140. movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
  141. addpd %xmm0, %xmm7; \
  142. movapd 20 * SIZE + (address) * SIZE(AA), %xmm0
  143. #define KERNEL6(address) \
  144. mulpd %xmm0, %xmm2; \
  145. mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
  146. addpd %xmm2, %xmm4; \
  147. movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
  148. addpd %xmm0, %xmm5; \
  149. movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \
  150. mulpd %xmm0, %xmm2; \
  151. mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
  152. addpd %xmm2, %xmm6; \
  153. movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \
  154. addpd %xmm0, %xmm7; \
  155. movapd 32 * SIZE + (address) * SIZE(AA), %xmm0
  156. #define KERNEL7(address) \
  157. movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \
  158. mulpd %xmm1, %xmm3; \
  159. mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
  160. addpd %xmm3, %xmm4; \
  161. movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
  162. addpd %xmm1, %xmm5; \
  163. movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \
  164. mulpd %xmm1, %xmm3; \
  165. mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
  166. addpd %xmm3, %xmm6; \
  167. movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
  168. addpd %xmm1, %xmm7; \
  169. movapd 28 * SIZE + (address) * SIZE(AA), %xmm1
  170. #define KERNEL8(address) \
  171. mulpd %xmm1, %xmm3; \
  172. mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
  173. addpd %xmm3, %xmm4; \
  174. movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
  175. addpd %xmm1, %xmm5; \
  176. movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \
  177. mulpd %xmm1, %xmm3; \
  178. mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
  179. addpd %xmm3, %xmm6; \
  180. movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \
  181. addpd %xmm1, %xmm7; \
  182. movapd 40 * SIZE + (address) * SIZE(AA), %xmm1
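/*
   Note: in each KERNEL step xmm0/xmm1 hold pairs of consecutive A
   elements and xmm2/xmm3 hold single B values duplicated into both
   lanes (see the BUFFER copy loop below), so the accumulators collect
       xmm4 += a[0:2]*b0    xmm5 += a[0:2]*b1
       xmm6 += a[2:4]*b0    xmm7 += a[2:4]*b1
   i.e. one rank-1 update of a 4x2 block per k iteration.
*/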
  183. PROLOGUE
  184. pushl %ebp
  185. pushl %edi
  186. pushl %esi
  187. pushl %ebx
  188. PROFCODE
  189. EMMS
  190. movl %esp, %esi # save old stack
  191. subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
  192. andl $-STACK_ALIGN, %esp
  193. addl $STACK_OFFSET, %esp
  194. STACK_TOUCHING
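/*
   Note: %esi keeps the caller's %esp, so the incoming arguments stay
   reachable through the STACK_* offsets while execution switches to a
   STACK_ALIGN-aligned local frame holding the scalar copies (K, M, N,
   ...) and the BUFFER area for the expanded B panel.
*/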
  195. movd STACK_M, %mm0
  196. movl STACK_N, %eax
  197. movd STACK_K, %mm1
  198. movd STACK_A, %mm2
  199. movl STACK_B, B
  200. movd STACK_C, %mm3
  201. movl STACK_LDC, LDC
  202. movd STACK_OFFT, %mm4
  203. movd %mm1, K
  204. movl %eax, N
  205. movd %mm0, M
  206. movd %mm2, A
  207. movd %mm3, C
  208. movl %esi, OLD_STACK
  209. movd %mm4, OFFSET
  210. movd %mm4, KK
  211. sall $BASE_SHIFT, LDC
  212. #ifdef LN
  213. movl M, %eax
  214. leal (, %eax, SIZE), %eax
  215. addl %eax, C
  216. imull K, %eax
  217. addl %eax, A
  218. #endif
  219. #ifdef RT
  220. movl N, %eax
  221. leal (, %eax, SIZE), %eax
  222. imull K, %eax
  223. addl %eax, B
  224. movl N, %eax
  225. imull LDC, %eax
  226. addl %eax, C
  227. #endif
  228. #ifdef RN
  229. negl KK
  230. #endif
  231. #ifdef RT
  232. movl N, %eax
  233. subl OFFSET, %eax
  234. movl %eax, KK
  235. #endif
  236. movl N, %eax
  237. sarl $1, %eax # j = (n >> 1)
  238. movl %eax, J
  239. jle .L100
  240. ALIGN_2
  241. .L01:
  242. /* Copying to Sub Buffer */
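/*
   Note: each element of the current K x 2 panel of B is loaded as a
   scalar and duplicated into both halves of a 16-byte slot (unpcklpd
   with itself), so the compute loops can use aligned movapd/mulpd on B
   without further shuffling.
*/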
  243. #ifdef LN
  244. movl OFFSET, %eax
  245. addl M, %eax
  246. movl %eax, KK
  247. #endif
  248. leal BUFFER, %ecx
  249. #ifdef RT
  250. movl K, %eax
  251. sall $1 + BASE_SHIFT, %eax
  252. subl %eax, B
  253. #endif
  254. #if defined(LN) || defined(RT)
  255. movl KK, %eax
  256. movl B, BORIG
  257. leal (, %eax, SIZE), %eax
  258. leal (B, %eax, 2), B
  259. leal (BB, %eax, 4), BB
  260. #endif
  261. #ifdef LT
  262. movl OFFSET, %eax
  263. movl %eax, KK
  264. #endif
  265. #if defined(LT) || defined(RN)
  266. movl KK, %eax
  267. #else
  268. movl K, %eax
  269. subl KK, %eax
  270. #endif
  271. sarl $2, %eax
  272. jle .L03
  273. ALIGN_2
  274. .L02:
  275. movsd 0 * SIZE(B), %xmm0
  276. movsd 1 * SIZE(B), %xmm1
  277. movsd 2 * SIZE(B), %xmm2
  278. movsd 3 * SIZE(B), %xmm3
  279. movsd 4 * SIZE(B), %xmm4
  280. movsd 5 * SIZE(B), %xmm5
  281. movsd 6 * SIZE(B), %xmm6
  282. movsd 7 * SIZE(B), %xmm7
  283. unpcklpd %xmm0, %xmm0
  284. unpcklpd %xmm1, %xmm1
  285. unpcklpd %xmm2, %xmm2
  286. unpcklpd %xmm3, %xmm3
  287. unpcklpd %xmm4, %xmm4
  288. unpcklpd %xmm5, %xmm5
  289. unpcklpd %xmm6, %xmm6
  290. unpcklpd %xmm7, %xmm7
  291. movapd %xmm0, 0 * SIZE(%ecx)
  292. movapd %xmm1, 2 * SIZE(%ecx)
  293. movapd %xmm2, 4 * SIZE(%ecx)
  294. movapd %xmm3, 6 * SIZE(%ecx)
  295. movapd %xmm4, 8 * SIZE(%ecx)
  296. movapd %xmm5, 10 * SIZE(%ecx)
  297. movapd %xmm6, 12 * SIZE(%ecx)
  298. movapd %xmm7, 14 * SIZE(%ecx)
  299. prefetcht0 104 * SIZE(B)
  300. addl $ 8 * SIZE, B
  301. addl $16 * SIZE, %ecx
  302. decl %eax
  303. jne .L02
  304. ALIGN_2
  305. .L03:
  306. #if defined(LT) || defined(RN)
  307. movl KK, %eax
  308. #else
  309. movl K, %eax
  310. subl KK, %eax
  311. #endif
  312. andl $3, %eax
  313. BRANCH
  314. jle .L05
  315. ALIGN_4
  316. .L04:
  317. movsd 0 * SIZE(B), %xmm0
  318. movsd 1 * SIZE(B), %xmm1
  319. unpcklpd %xmm0, %xmm0
  320. unpcklpd %xmm1, %xmm1
  321. movapd %xmm0, 0 * SIZE(%ecx)
  322. movapd %xmm1, 2 * SIZE(%ecx)
  323. addl $2 * SIZE, B
  324. addl $4 * SIZE, %ecx
  325. decl %eax
  326. jne .L04
  327. ALIGN_4
  328. .L05:
  329. #if defined(LT) || defined(RN)
  330. movl A, AA
  331. #else
  332. movl A, %eax
  333. movl %eax, AORIG
  334. #endif
  335. leal (, LDC, 2), %eax
  336. #ifdef RT
  337. subl %eax, C
  338. #endif
  339. movl C, %esi # coffset = c
  340. #ifndef RT
  341. addl %eax, C
  342. #endif
  343. movl M, %ebx
  344. testl $1, %ebx
  345. jle .L30
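/* Note: this block handles one leftover row of C (M & 1) against the
   current two columns, using scalar movsd/mulsd arithmetic. */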
  346. #ifdef LN
  347. movl K, %eax
  348. sall $0 + BASE_SHIFT, %eax
  349. subl %eax, AORIG
  350. #endif
  351. #if defined(LN) || defined(RT)
  352. movl KK, %eax
  353. movl AORIG, AA
  354. leal (, %eax, SIZE), %eax
  355. leal (AA, %eax, 1), AA
  356. #endif
  357. leal BUFFER, %ecx
  358. #if defined(LN) || defined(RT)
  359. movl KK, %eax
  360. sall $1 + BASE_SHIFT, %eax
  361. leal (BB, %eax, 2), BB
  362. #endif
  363. movsd 0 * SIZE(BB), %xmm2
  364. pxor %xmm4, %xmm4
  365. movsd 0 * SIZE(AA), %xmm0
  366. pxor %xmm5, %xmm5
  367. movsd 8 * SIZE(BB), %xmm3
  368. pxor %xmm6, %xmm6
  369. movsd 4 * SIZE(AA), %xmm1
  370. pxor %xmm7, %xmm7
  371. #if defined(LT) || defined(RN)
  372. movl KK, %eax
  373. #else
  374. movl K, %eax
  375. subl KK, %eax
  376. #endif
  377. sarl $3, %eax
  378. je .L52
  379. .L51:
  380. mulsd %xmm0, %xmm2
  381. mulsd 2 * SIZE(BB), %xmm0
  382. addsd %xmm2, %xmm4
  383. movsd 4 * SIZE(BB), %xmm2
  384. addsd %xmm0, %xmm5
  385. movsd 1 * SIZE(AA), %xmm0
  386. mulsd %xmm0, %xmm2
  387. mulsd 6 * SIZE(BB), %xmm0
  388. addsd %xmm2, %xmm4
  389. movsd 16 * SIZE(BB), %xmm2
  390. addsd %xmm0, %xmm5
  391. movsd 2 * SIZE(AA), %xmm0
  392. mulsd %xmm0, %xmm3
  393. mulsd 10 * SIZE(BB), %xmm0
  394. addsd %xmm3, %xmm4
  395. movsd 12 * SIZE(BB), %xmm3
  396. addsd %xmm0, %xmm5
  397. movsd 3 * SIZE(AA), %xmm0
  398. mulsd %xmm0, %xmm3
  399. mulsd 14 * SIZE(BB), %xmm0
  400. addsd %xmm3, %xmm4
  401. movsd 24 * SIZE(BB), %xmm3
  402. addsd %xmm0, %xmm5
  403. movsd 8 * SIZE(AA), %xmm0
  404. mulsd %xmm1, %xmm2
  405. mulsd 18 * SIZE(BB), %xmm1
  406. addsd %xmm2, %xmm4
  407. movsd 20 * SIZE(BB), %xmm2
  408. addsd %xmm1, %xmm5
  409. movsd 5 * SIZE(AA), %xmm1
  410. mulsd %xmm1, %xmm2
  411. mulsd 22 * SIZE(BB), %xmm1
  412. addsd %xmm2, %xmm4
  413. movsd 32 * SIZE(BB), %xmm2
  414. addsd %xmm1, %xmm5
  415. movsd 6 * SIZE(AA), %xmm1
  416. mulsd %xmm1, %xmm3
  417. mulsd 26 * SIZE(BB), %xmm1
  418. addsd %xmm3, %xmm4
  419. movsd 28 * SIZE(BB), %xmm3
  420. addsd %xmm1, %xmm5
  421. movsd 7 * SIZE(AA), %xmm1
  422. mulsd %xmm1, %xmm3
  423. mulsd 30 * SIZE(BB), %xmm1
  424. addsd %xmm3, %xmm4
  425. movsd 40 * SIZE(BB), %xmm3
  426. addsd %xmm1, %xmm5
  427. movsd 12 * SIZE(AA), %xmm1
  428. addl $ 8 * SIZE, AA
  429. addl $32 * SIZE, BB
  430. BRANCH
  431. decl %eax
  432. jne .L51
  433. .L52:
  434. #if defined(LT) || defined(RN)
  435. movl KK, %eax
  436. #else
  437. movl K, %eax
  438. subl KK, %eax
  439. #endif
  440. andl $7, %eax # remaining iterations (k & 7)
  441. BRANCH
  442. je .L54
  443. .L53:
  444. mulsd %xmm0, %xmm2
  445. mulsd 2 * SIZE(BB), %xmm0
  446. addsd %xmm2, %xmm4
  447. movsd 4 * SIZE(BB), %xmm2
  448. addsd %xmm0, %xmm5
  449. movsd 1 * SIZE(AA), %xmm0
  450. addl $1 * SIZE, AA # aoffset += 1 * SIZE
  451. addl $4 * SIZE, BB # boffset1 += 4 * SIZE
  452. decl %eax
  453. BRANCH
  454. jg .L53
  455. ALIGN_4
  456. .L54:
  457. addsd %xmm6, %xmm4
  458. addsd %xmm7, %xmm5
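/*
   Note: with the product accumulated, the code below loads the packed
   right-hand side (from B for LN/LT, from A for RN/RT), subtracts the
   accumulated update, and applies the small triangular solve.  Only
   multiplications by diagonal entries appear here, which relies on the
   packing routines storing the diagonal already inverted (the usual
   GotoBLAS/OpenBLAS trsm convention); results are written back to the
   packed buffers and to C.
*/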
  459. #if defined(LN) || defined(RT)
  460. movl KK, %eax
  461. #ifdef LN
  462. subl $1, %eax
  463. #else
  464. subl $2, %eax
  465. #endif
  466. movl AORIG, AA
  467. movl BORIG, B
  468. leal BUFFER, BB
  469. leal (, %eax, SIZE), %eax
  470. leal (AA, %eax, 1), AA
  471. leal (B, %eax, 2), B
  472. leal (BB, %eax, 4), BB
  473. #endif
  474. #if defined(LN) || defined(LT)
  475. movsd 0 * SIZE(B), %xmm0
  476. movsd 1 * SIZE(B), %xmm1
  477. #else
  478. movsd 0 * SIZE(AA), %xmm0
  479. movsd 1 * SIZE(AA), %xmm1
  480. #endif
  481. subsd %xmm4, %xmm0
  482. subsd %xmm5, %xmm1
  483. #if defined(LN) || defined(LT)
  484. movsd 0 * SIZE(AA), %xmm2
  485. mulsd %xmm2, %xmm0
  486. mulsd %xmm2, %xmm1
  487. #endif
  488. #ifdef RN
  489. mulsd 0 * SIZE(B), %xmm0
  490. movsd 1 * SIZE(B), %xmm4
  491. mulsd %xmm0, %xmm4
  492. subsd %xmm4, %xmm1
  493. mulsd 3 * SIZE(B), %xmm1
  494. #endif
  495. #ifdef RT
  496. mulsd 3 * SIZE(B), %xmm1
  497. movsd 2 * SIZE(B), %xmm4
  498. mulsd %xmm1, %xmm4
  499. subsd %xmm4, %xmm0
  500. mulsd 0 * SIZE(B), %xmm0
  501. #endif
  502. #if defined(LN) || defined(LT)
  503. movsd %xmm0, 0 * SIZE(B)
  504. movsd %xmm1, 1 * SIZE(B)
  505. movsd %xmm0, 0 * SIZE(BB)
  506. movsd %xmm0, 1 * SIZE(BB)
  507. movsd %xmm1, 2 * SIZE(BB)
  508. movsd %xmm1, 3 * SIZE(BB)
  509. #else
  510. movsd %xmm0, 0 * SIZE(AA)
  511. movsd %xmm1, 1 * SIZE(AA)
  512. #endif
  513. #ifdef LN
  514. subl $1 * SIZE, %esi
  515. #endif
  516. movsd %xmm0, 0 * SIZE(%esi)
  517. movsd %xmm1, 0 * SIZE(%esi, LDC)
  518. #ifndef LN
  519. addl $1 * SIZE, %esi
  520. #endif
  521. #if defined(LT) || defined(RN)
  522. movl K, %eax
  523. subl KK, %eax
  524. leal (,%eax, SIZE), %eax
  525. leal (AA, %eax, 1), AA
  526. #ifdef LT
  527. addl $2 * SIZE, B
  528. #endif
  529. #endif
  530. #ifdef LN
  531. subl $1, KK
  532. movl BORIG, B
  533. #endif
  534. #ifdef LT
  535. addl $1, KK
  536. #endif
  537. #ifdef RT
  538. movl K, %eax
  539. movl BORIG, B
  540. sall $0 + BASE_SHIFT, %eax
  541. addl %eax, AORIG
  542. #endif
  543. ALIGN_2
  544. .L30:
  545. movl M, %ebx
  546. testl $2, %ebx
  547. jle .L50
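/* Note: this block handles two leftover rows of C (M & 2) against the
   current two columns with 2-wide packed arithmetic, mirroring the
   single-row case above. */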
  548. #ifdef LN
  549. movl K, %eax
  550. sall $1 + BASE_SHIFT, %eax
  551. subl %eax, AORIG
  552. #endif
  553. #if defined(LN) || defined(RT)
  554. movl KK, %eax
  555. movl AORIG, AA
  556. leal (, %eax, SIZE), %eax
  557. leal (AA, %eax, 2), AA
  558. #endif
  559. leal BUFFER, BB
  560. #if defined(LN) || defined(RT)
  561. movl KK, %eax
  562. sall $1 + BASE_SHIFT, %eax
  563. leal (BB, %eax, 2), BB
  564. #endif
  565. movapd 0 * SIZE(BB), %xmm2
  566. pxor %xmm4, %xmm4
  567. movapd 0 * SIZE(AA), %xmm0
  568. pxor %xmm5, %xmm5
  569. movapd 8 * SIZE(BB), %xmm3
  570. pxor %xmm6, %xmm6
  571. movapd 8 * SIZE(AA), %xmm1
  572. pxor %xmm7, %xmm7
  573. #if defined(LT) || defined(RN)
  574. movl KK, %eax
  575. #else
  576. movl K, %eax
  577. subl KK, %eax
  578. #endif
  579. sarl $3, %eax
  580. je .L32
  581. .L31:
  582. mulpd %xmm0, %xmm2
  583. mulpd 2 * SIZE(BB), %xmm0
  584. addpd %xmm2, %xmm4
  585. movapd 4 * SIZE(BB), %xmm2
  586. addpd %xmm0, %xmm5
  587. movapd 2 * SIZE(AA), %xmm0
  588. mulpd %xmm0, %xmm2
  589. mulpd 6 * SIZE(BB), %xmm0
  590. addpd %xmm2, %xmm6
  591. movapd 16 * SIZE(BB), %xmm2
  592. addpd %xmm0, %xmm7
  593. movapd 4 * SIZE(AA), %xmm0
  594. mulpd %xmm0, %xmm3
  595. mulpd 10 * SIZE(BB), %xmm0
  596. addpd %xmm3, %xmm4
  597. movapd 12 * SIZE(BB), %xmm3
  598. addpd %xmm0, %xmm5
  599. movapd 6 * SIZE(AA), %xmm0
  600. mulpd %xmm0, %xmm3
  601. mulpd 14 * SIZE(BB), %xmm0
  602. addpd %xmm3, %xmm6
  603. movapd 24 * SIZE(BB), %xmm3
  604. addpd %xmm0, %xmm7
  605. movapd 16 * SIZE(AA), %xmm0
  606. mulpd %xmm1, %xmm2
  607. mulpd 18 * SIZE(BB), %xmm1
  608. addpd %xmm2, %xmm4
  609. movapd 20 * SIZE(BB), %xmm2
  610. addpd %xmm1, %xmm5
  611. movapd 10 * SIZE(AA), %xmm1
  612. mulpd %xmm1, %xmm2
  613. mulpd 22 * SIZE(BB), %xmm1
  614. addpd %xmm2, %xmm6
  615. movapd 32 * SIZE(BB), %xmm2
  616. addpd %xmm1, %xmm7
  617. movapd 12 * SIZE(AA), %xmm1
  618. mulpd %xmm1, %xmm3
  619. mulpd 26 * SIZE(BB), %xmm1
  620. addpd %xmm3, %xmm4
  621. movapd 28 * SIZE(BB), %xmm3
  622. addpd %xmm1, %xmm5
  623. movapd 14 * SIZE(AA), %xmm1
  624. mulpd %xmm1, %xmm3
  625. mulpd 30 * SIZE(BB), %xmm1
  626. addpd %xmm3, %xmm6
  627. movapd 40 * SIZE(BB), %xmm3
  628. addpd %xmm1, %xmm7
  629. movapd 24 * SIZE(AA), %xmm1
  630. addl $16 * SIZE, AA
  631. addl $32 * SIZE, BB
  632. BRANCH
  633. decl %eax
  634. jne .L31
  635. .L32:
  636. #if defined(LT) || defined(RN)
  637. movl KK, %eax
  638. #else
  639. movl K, %eax
  640. subl KK, %eax
  641. #endif
  642. andl $7, %eax # remaining iterations (k & 7)
  643. BRANCH
  644. je .L34
  645. .L33:
  646. mulpd %xmm0, %xmm2
  647. mulpd 2 * SIZE(BB), %xmm0
  648. addpd %xmm2, %xmm4
  649. movapd 4 * SIZE(BB), %xmm2
  650. addpd %xmm0, %xmm5
  651. movapd 2 * SIZE(AA), %xmm0
  652. addl $2 * SIZE, AA # aoffset += 2 * SIZE
  653. addl $4 * SIZE, BB # boffset1 += 4 * SIZE
  654. decl %eax
  655. BRANCH
  656. jg .L33
  657. ALIGN_4
  658. .L34:
  659. addpd %xmm6, %xmm4
  660. addpd %xmm7, %xmm5
  661. #if defined(LN) || defined(RT)
  662. movl KK, %eax
  663. #ifdef LN
  664. subl $2, %eax
  665. #else
  666. subl $2, %eax
  667. #endif
  668. movl AORIG, AA
  669. movl BORIG, B
  670. leal BUFFER, BB
  671. leal (, %eax, SIZE), %eax
  672. leal (AA, %eax, 2), AA
  673. leal (B, %eax, 2), B
  674. leal (BB, %eax, 4), BB
  675. #endif
  676. #if defined(LN) || defined(LT)
  677. movapd %xmm4, %xmm0
  678. unpcklpd %xmm5, %xmm4
  679. unpckhpd %xmm5, %xmm0
  680. movapd 0 * SIZE(B), %xmm2
  681. movapd 2 * SIZE(B), %xmm3
  682. subpd %xmm4, %xmm2
  683. subpd %xmm0, %xmm3
  684. #else
  685. movapd 0 * SIZE(AA), %xmm0
  686. movapd 2 * SIZE(AA), %xmm1
  687. subpd %xmm4, %xmm0
  688. subpd %xmm5, %xmm1
  689. #endif
  690. #ifdef LN
  691. movsd 3 * SIZE(AA), %xmm0
  692. movhpd 3 * SIZE(AA), %xmm0
  693. mulpd %xmm0, %xmm3
  694. movsd 2 * SIZE(AA), %xmm0
  695. movhpd 2 * SIZE(AA), %xmm0
  696. mulpd %xmm3, %xmm0
  697. subpd %xmm0, %xmm2
  698. movsd 0 * SIZE(AA), %xmm0
  699. movhpd 0 * SIZE(AA), %xmm0
  700. mulpd %xmm0, %xmm2
  701. #endif
  702. #ifdef LT
  703. movsd 0 * SIZE(AA), %xmm0
  704. movhpd 0 * SIZE(AA), %xmm0
  705. mulpd %xmm0, %xmm2
  706. movsd 1 * SIZE(AA), %xmm0
  707. movhpd 1 * SIZE(AA), %xmm0
  708. mulpd %xmm2, %xmm0
  709. subpd %xmm0, %xmm3
  710. movsd 3 * SIZE(AA), %xmm0
  711. movhpd 3 * SIZE(AA), %xmm0
  712. mulpd %xmm0, %xmm3
  713. #endif
  714. #ifdef RN
  715. movsd 0 * SIZE(B), %xmm4
  716. movhpd 0 * SIZE(B), %xmm4
  717. mulpd %xmm4, %xmm0
  718. movsd 1 * SIZE(B), %xmm4
  719. movhpd 1 * SIZE(B), %xmm4
  720. mulpd %xmm0, %xmm4
  721. subpd %xmm4, %xmm1
  722. movsd 3 * SIZE(B), %xmm4
  723. movhpd 3 * SIZE(B), %xmm4
  724. mulpd %xmm4, %xmm1
  725. #endif
  726. #ifdef RT
  727. movsd 3 * SIZE(B), %xmm4
  728. movhpd 3 * SIZE(B), %xmm4
  729. mulpd %xmm4, %xmm1
  730. movsd 2 * SIZE(B), %xmm4
  731. movhpd 2 * SIZE(B), %xmm4
  732. mulpd %xmm1, %xmm4
  733. subpd %xmm4, %xmm0
  734. movsd 0 * SIZE(B), %xmm4
  735. movhpd 0 * SIZE(B), %xmm4
  736. mulpd %xmm4, %xmm0
  737. #endif
  738. #if defined(LN) || defined(LT)
  739. movapd %xmm2, 0 * SIZE(B)
  740. movapd %xmm3, 2 * SIZE(B)
  741. movsd %xmm2, 0 * SIZE(BB)
  742. movsd %xmm2, 1 * SIZE(BB)
  743. movhpd %xmm2, 2 * SIZE(BB)
  744. movhpd %xmm2, 3 * SIZE(BB)
  745. movsd %xmm3, 4 * SIZE(BB)
  746. movsd %xmm3, 5 * SIZE(BB)
  747. movhpd %xmm3, 6 * SIZE(BB)
  748. movhpd %xmm3, 7 * SIZE(BB)
  749. #else
  750. movapd %xmm0, 0 * SIZE(AA)
  751. movapd %xmm1, 2 * SIZE(AA)
  752. #endif
  753. #ifdef LN
  754. subl $2 * SIZE, %esi
  755. #endif
  756. #if defined(LN) || defined(LT)
  757. movsd %xmm2, 0 * SIZE(%esi)
  758. movsd %xmm3, 1 * SIZE(%esi)
  759. movhpd %xmm2, 0 * SIZE(%esi, LDC)
  760. movhpd %xmm3, 1 * SIZE(%esi, LDC)
  761. #else
  762. movsd %xmm0, 0 * SIZE(%esi)
  763. movhpd %xmm0, 1 * SIZE(%esi)
  764. movsd %xmm1, 0 * SIZE(%esi, LDC)
  765. movhpd %xmm1, 1 * SIZE(%esi, LDC)
  766. #endif
  767. #ifndef LN
  768. addl $2 * SIZE, %esi
  769. #endif
  770. #if defined(LT) || defined(RN)
  771. movl K, %eax
  772. subl KK, %eax
  773. leal (,%eax, SIZE), %eax
  774. leal (AA, %eax, 2), AA
  775. #ifdef LT
  776. addl $4 * SIZE, B
  777. #endif
  778. #endif
  779. #ifdef LN
  780. subl $2, KK
  781. movl BORIG, B
  782. #endif
  783. #ifdef LT
  784. addl $2, KK
  785. #endif
  786. #ifdef RT
  787. movl K, %eax
  788. movl BORIG, B
  789. sall $1 + BASE_SHIFT, %eax
  790. addl %eax, AORIG
  791. #endif
  792. ALIGN_2
  793. .L50:
  794. movl M, %ebx
  795. sarl $2, %ebx # i = (m >> 2)
  796. jle .L99
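/*
   Note: .L10 is the main loop over i = M >> 2: an 8x-unrolled 4x2 GEMM
   accumulation built from the KERNEL1..8 macros (the PENTIUM4 path runs
   a longer software-pipelined sequence of them), followed by the 4x2
   triangular solve and the write-back to the packed buffers and to C.
*/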
  797. ALIGN_4
  798. .L10:
  799. #ifdef LN
  800. movl K, %eax
  801. sall $2 + BASE_SHIFT, %eax
  802. subl %eax, AORIG
  803. #endif
  804. #if defined(LN) || defined(RT)
  805. movl KK, %eax
  806. movl AORIG, AA
  807. leal (, %eax, SIZE), %eax
  808. leal (AA, %eax, 4), AA
  809. #endif
  810. leal BUFFER, BB
  811. #if defined(LN) || defined(RT)
  812. movl KK, %eax
  813. sall $1 + BASE_SHIFT, %eax
  814. leal (BB, %eax, 2), BB
  815. #endif
  816. movapd 0 * SIZE(BB), %xmm2
  817. pxor %xmm4, %xmm4
  818. movapd 0 * SIZE(AA), %xmm0
  819. pxor %xmm5, %xmm5
  820. movapd 8 * SIZE(BB), %xmm3
  821. pxor %xmm6, %xmm6
  822. movapd 8 * SIZE(AA), %xmm1
  823. pxor %xmm7, %xmm7
  824. #ifdef LN
  825. prefetcht2 -4 * SIZE(%esi)
  826. prefetcht2 -4 * SIZE(%esi, LDC)
  827. #else
  828. prefetcht2 4 * SIZE(%esi)
  829. prefetcht2 4 * SIZE(%esi, LDC)
  830. #endif
  831. #if defined(LT) || defined(RN)
  832. movl KK, %eax
  833. #else
  834. movl K, %eax
  835. subl KK, %eax
  836. #endif
  837. #ifdef PENTIUM4
  838. andl $-8, %eax
  839. NOBRANCH
  840. je .L12
  841. sall $3, %eax
  842. .L1X:
  843. KERNEL1(32 * 0)
  844. KERNEL2(32 * 0)
  845. KERNEL3(32 * 0)
  846. KERNEL4(32 * 0)
  847. KERNEL5(32 * 0)
  848. KERNEL6(32 * 0)
  849. KERNEL7(32 * 0)
  850. KERNEL8(32 * 0)
  851. cmpl $64 * 1, %eax
  852. NOBRANCH
  853. jle .L11
  854. KERNEL1(32 * 1)
  855. KERNEL2(32 * 1)
  856. KERNEL3(32 * 1)
  857. KERNEL4(32 * 1)
  858. KERNEL5(32 * 1)
  859. KERNEL6(32 * 1)
  860. KERNEL7(32 * 1)
  861. KERNEL8(32 * 1)
  862. cmpl $64 * 2, %eax
  863. NOBRANCH
  864. jle .L11
  865. KERNEL1(32 * 2)
  866. KERNEL2(32 * 2)
  867. KERNEL3(32 * 2)
  868. KERNEL4(32 * 2)
  869. KERNEL5(32 * 2)
  870. KERNEL6(32 * 2)
  871. KERNEL7(32 * 2)
  872. KERNEL8(32 * 2)
  873. cmpl $64 * 3, %eax
  874. NOBRANCH
  875. jle .L11
  876. KERNEL1(32 * 3)
  877. KERNEL2(32 * 3)
  878. KERNEL3(32 * 3)
  879. KERNEL4(32 * 3)
  880. KERNEL5(32 * 3)
  881. KERNEL6(32 * 3)
  882. KERNEL7(32 * 3)
  883. KERNEL8(32 * 3)
  884. cmpl $64 * 4, %eax
  885. NOBRANCH
  886. jle .L11
  887. KERNEL1(32 * 4)
  888. KERNEL2(32 * 4)
  889. KERNEL3(32 * 4)
  890. KERNEL4(32 * 4)
  891. KERNEL5(32 * 4)
  892. KERNEL6(32 * 4)
  893. KERNEL7(32 * 4)
  894. KERNEL8(32 * 4)
  895. cmpl $64 * 5, %eax
  896. NOBRANCH
  897. jle .L11
  898. KERNEL1(32 * 5)
  899. KERNEL2(32 * 5)
  900. KERNEL3(32 * 5)
  901. KERNEL4(32 * 5)
  902. KERNEL5(32 * 5)
  903. KERNEL6(32 * 5)
  904. KERNEL7(32 * 5)
  905. KERNEL8(32 * 5)
  906. cmpl $64 * 6, %eax
  907. NOBRANCH
  908. jle .L11
  909. KERNEL1(32 * 6)
  910. KERNEL2(32 * 6)
  911. KERNEL3(32 * 6)
  912. KERNEL4(32 * 6)
  913. KERNEL5(32 * 6)
  914. KERNEL6(32 * 6)
  915. KERNEL7(32 * 6)
  916. KERNEL8(32 * 6)
  917. cmpl $64 * 7, %eax
  918. NOBRANCH
  919. jle .L11
  920. KERNEL1(32 * 7)
  921. KERNEL2(32 * 7)
  922. KERNEL3(32 * 7)
  923. KERNEL4(32 * 7)
  924. KERNEL5(32 * 7)
  925. KERNEL6(32 * 7)
  926. KERNEL7(32 * 7)
  927. KERNEL8(32 * 7)
  928. addl $64 * 4 * SIZE, AA
  929. addl $64 * 4 * SIZE, BB
  930. subl $64 * 8, %eax
  931. BRANCH
  932. jg .L1X
  933. .L11:
  934. leal (AA, %eax, 4), AA
  935. leal (BB, %eax, 4), BB
  936. #else
  937. sarl $3, %eax
  938. je .L12
  939. .L11:
  940. KERNEL1(32 * 0)
  941. KERNEL2(32 * 0)
  942. KERNEL3(32 * 0)
  943. KERNEL4(32 * 0)
  944. KERNEL5(32 * 0)
  945. KERNEL6(32 * 0)
  946. KERNEL7(32 * 0)
  947. KERNEL8(32 * 0)
  948. addl $32 * SIZE, %ecx
  949. addl $32 * SIZE, %edx
  950. decl %eax
  951. jne .L11
  952. #endif
  953. .L12:
  954. #if defined(LT) || defined(RN)
  955. movl KK, %eax
  956. #else
  957. movl K, %eax
  958. subl KK, %eax
  959. #endif
  960. andl $7, %eax # remaining iterations (k & 7)
  961. BRANCH
  962. je .L14
  963. .L13:
  964. mulpd %xmm0, %xmm2
  965. mulpd 2 * SIZE(BB), %xmm0
  966. addpd %xmm2, %xmm4
  967. movapd 0 * SIZE(BB), %xmm2
  968. addpd %xmm0, %xmm5
  969. movapd 2 * SIZE(AA), %xmm0
  970. mulpd %xmm0, %xmm2
  971. mulpd 2 * SIZE(BB), %xmm0
  972. addpd %xmm2, %xmm6
  973. movapd 4 * SIZE(BB), %xmm2
  974. addpd %xmm0, %xmm7
  975. movapd 4 * SIZE(AA), %xmm0
  976. addl $4 * SIZE, AA # aoffset += 4 * SIZE
  977. addl $4 * SIZE, BB # boffset1 += 4 * SIZE
  978. subl $1, %eax
  979. jg .L13
  980. ALIGN_4
  981. .L14:
  982. #if defined(LN) || defined(RT)
  983. movl KK, %eax
  984. #ifdef LN
  985. subl $4, %eax
  986. #else
  987. subl $2, %eax
  988. #endif
  989. movl AORIG, AA
  990. movl BORIG, B
  991. leal BUFFER, BB
  992. leal (, %eax, SIZE), %eax
  993. leal (AA, %eax, 4), AA
  994. leal (B, %eax, 2), B
  995. leal (BB, %eax, 4), BB
  996. #endif
  997. #if defined(LN) || defined(LT)
  998. movapd %xmm4, %xmm0
  999. unpcklpd %xmm5, %xmm4
  1000. unpckhpd %xmm5, %xmm0
  1001. movapd %xmm6, %xmm1
  1002. unpcklpd %xmm7, %xmm6
  1003. unpckhpd %xmm7, %xmm1
  1004. movapd 0 * SIZE(B), %xmm2
  1005. movapd 2 * SIZE(B), %xmm3
  1006. movapd 4 * SIZE(B), %xmm5
  1007. movapd 6 * SIZE(B), %xmm7
  1008. subpd %xmm4, %xmm2
  1009. subpd %xmm0, %xmm3
  1010. subpd %xmm6, %xmm5
  1011. subpd %xmm1, %xmm7
  1012. #else
  1013. movapd 0 * SIZE(AA), %xmm0
  1014. movapd 2 * SIZE(AA), %xmm1
  1015. movapd 4 * SIZE(AA), %xmm2
  1016. movapd 6 * SIZE(AA), %xmm3
  1017. subpd %xmm4, %xmm0
  1018. subpd %xmm6, %xmm1
  1019. subpd %xmm5, %xmm2
  1020. subpd %xmm7, %xmm3
  1021. #endif
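/*
   Note: the blocks below perform the substitution over the four rows:
   LN eliminates from the last row upward, LT from the first row
   downward, each step multiplying by the (pre-inverted) diagonal entry
   of A and subtracting that row's contribution from the remaining rows;
   RN/RT apply the analogous 2x2 solve on the B side to both column
   vectors.
*/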
  1022. #ifdef LN
  1023. movsd 15 * SIZE(AA), %xmm0
  1024. movhpd 15 * SIZE(AA), %xmm0
  1025. mulpd %xmm0, %xmm7
  1026. movsd 14 * SIZE(AA), %xmm0
  1027. movhpd 14 * SIZE(AA), %xmm0
  1028. mulpd %xmm7, %xmm0
  1029. subpd %xmm0, %xmm5
  1030. movsd 13 * SIZE(AA), %xmm0
  1031. movhpd 13 * SIZE(AA), %xmm0
  1032. mulpd %xmm7, %xmm0
  1033. subpd %xmm0, %xmm3
  1034. movsd 12 * SIZE(AA), %xmm0
  1035. movhpd 12 * SIZE(AA), %xmm0
  1036. mulpd %xmm7, %xmm0
  1037. subpd %xmm0, %xmm2
  1038. movsd 10 * SIZE(AA), %xmm0
  1039. movhpd 10 * SIZE(AA), %xmm0
  1040. mulpd %xmm0, %xmm5
  1041. movsd 9 * SIZE(AA), %xmm0
  1042. movhpd 9 * SIZE(AA), %xmm0
  1043. mulpd %xmm5, %xmm0
  1044. subpd %xmm0, %xmm3
  1045. movsd 8 * SIZE(AA), %xmm0
  1046. movhpd 8 * SIZE(AA), %xmm0
  1047. mulpd %xmm5, %xmm0
  1048. subpd %xmm0, %xmm2
  1049. movsd 5 * SIZE(AA), %xmm0
  1050. movhpd 5 * SIZE(AA), %xmm0
  1051. mulpd %xmm0, %xmm3
  1052. movsd 4 * SIZE(AA), %xmm0
  1053. movhpd 4 * SIZE(AA), %xmm0
  1054. mulpd %xmm3, %xmm0
  1055. subpd %xmm0, %xmm2
  1056. movsd 0 * SIZE(AA), %xmm0
  1057. movhpd 0 * SIZE(AA), %xmm0
  1058. mulpd %xmm0, %xmm2
  1059. #endif
  1060. #ifdef LT
  1061. movsd 0 * SIZE(AA), %xmm0
  1062. movhpd 0 * SIZE(AA), %xmm0
  1063. mulpd %xmm0, %xmm2
  1064. movsd 1 * SIZE(AA), %xmm0
  1065. movhpd 1 * SIZE(AA), %xmm0
  1066. mulpd %xmm2, %xmm0
  1067. subpd %xmm0, %xmm3
  1068. movsd 2 * SIZE(AA), %xmm0
  1069. movhpd 2 * SIZE(AA), %xmm0
  1070. mulpd %xmm2, %xmm0
  1071. subpd %xmm0, %xmm5
  1072. movsd 3 * SIZE(AA), %xmm0
  1073. movhpd 3 * SIZE(AA), %xmm0
  1074. mulpd %xmm2, %xmm0
  1075. subpd %xmm0, %xmm7
  1076. movsd 5 * SIZE(AA), %xmm0
  1077. movhpd 5 * SIZE(AA), %xmm0
  1078. mulpd %xmm0, %xmm3
  1079. movsd 6 * SIZE(AA), %xmm0
  1080. movhpd 6 * SIZE(AA), %xmm0
  1081. mulpd %xmm3, %xmm0
  1082. subpd %xmm0, %xmm5
  1083. movsd 7 * SIZE(AA), %xmm0
  1084. movhpd 7 * SIZE(AA), %xmm0
  1085. mulpd %xmm3, %xmm0
  1086. subpd %xmm0, %xmm7
  1087. movsd 10 * SIZE(AA), %xmm0
  1088. movhpd 10 * SIZE(AA), %xmm0
  1089. mulpd %xmm0, %xmm5
  1090. movsd 11 * SIZE(AA), %xmm0
  1091. movhpd 11 * SIZE(AA), %xmm0
  1092. mulpd %xmm5, %xmm0
  1093. subpd %xmm0, %xmm7
  1094. movsd 15 * SIZE(AA), %xmm0
  1095. movhpd 15 * SIZE(AA), %xmm0
  1096. mulpd %xmm0, %xmm7
  1097. #endif
  1098. #ifdef RN
  1099. movsd 0 * SIZE(B), %xmm4
  1100. movhpd 0 * SIZE(B), %xmm4
  1101. mulpd %xmm4, %xmm0
  1102. mulpd %xmm4, %xmm1
  1103. movsd 1 * SIZE(B), %xmm4
  1104. movhpd 1 * SIZE(B), %xmm4
  1105. mulpd %xmm0, %xmm4
  1106. subpd %xmm4, %xmm2
  1107. movsd 1 * SIZE(B), %xmm4
  1108. movhpd 1 * SIZE(B), %xmm4
  1109. mulpd %xmm1, %xmm4
  1110. subpd %xmm4, %xmm3
  1111. movsd 3 * SIZE(B), %xmm4
  1112. movhpd 3 * SIZE(B), %xmm4
  1113. mulpd %xmm4, %xmm2
  1114. mulpd %xmm4, %xmm3
  1115. #endif
  1116. #ifdef RT
  1117. movsd 3 * SIZE(B), %xmm4
  1118. movhpd 3 * SIZE(B), %xmm4
  1119. mulpd %xmm4, %xmm2
  1120. mulpd %xmm4, %xmm3
  1121. movsd 2 * SIZE(B), %xmm4
  1122. movhpd 2 * SIZE(B), %xmm4
  1123. mulpd %xmm2, %xmm4
  1124. subpd %xmm4, %xmm0
  1125. movsd 2 * SIZE(B), %xmm4
  1126. movhpd 2 * SIZE(B), %xmm4
  1127. mulpd %xmm3, %xmm4
  1128. subpd %xmm4, %xmm1
  1129. movsd 0 * SIZE(B), %xmm4
  1130. movhpd 0 * SIZE(B), %xmm4
  1131. mulpd %xmm4, %xmm0
  1132. mulpd %xmm4, %xmm1
  1133. #endif
  1134. #if defined(LN) || defined(LT)
  1135. movapd %xmm2, 0 * SIZE(B)
  1136. movapd %xmm3, 2 * SIZE(B)
  1137. movapd %xmm5, 4 * SIZE(B)
  1138. movapd %xmm7, 6 * SIZE(B)
  1139. movsd %xmm2, 0 * SIZE(BB)
  1140. movsd %xmm2, 1 * SIZE(BB)
  1141. movhpd %xmm2, 2 * SIZE(BB)
  1142. movhpd %xmm2, 3 * SIZE(BB)
  1143. movsd %xmm3, 4 * SIZE(BB)
  1144. movsd %xmm3, 5 * SIZE(BB)
  1145. movhpd %xmm3, 6 * SIZE(BB)
  1146. movhpd %xmm3, 7 * SIZE(BB)
  1147. movsd %xmm5, 8 * SIZE(BB)
  1148. movsd %xmm5, 9 * SIZE(BB)
  1149. movhpd %xmm5, 10 * SIZE(BB)
  1150. movhpd %xmm5, 11 * SIZE(BB)
  1151. movsd %xmm7, 12 * SIZE(BB)
  1152. movsd %xmm7, 13 * SIZE(BB)
  1153. movhpd %xmm7, 14 * SIZE(BB)
  1154. movhpd %xmm7, 15 * SIZE(BB)
  1155. #else
  1156. movapd %xmm0, 0 * SIZE(AA)
  1157. movapd %xmm1, 2 * SIZE(AA)
  1158. movapd %xmm2, 4 * SIZE(AA)
  1159. movapd %xmm3, 6 * SIZE(AA)
  1160. #endif
  1161. #ifdef LN
  1162. subl $4 * SIZE, %esi
  1163. #endif
  1164. #if defined(LN) || defined(LT)
  1165. movsd %xmm2, 0 * SIZE(%esi)
  1166. movsd %xmm3, 1 * SIZE(%esi)
  1167. movsd %xmm5, 2 * SIZE(%esi)
  1168. movsd %xmm7, 3 * SIZE(%esi)
  1169. movhpd %xmm2, 0 * SIZE(%esi, LDC)
  1170. movhpd %xmm3, 1 * SIZE(%esi, LDC)
  1171. movhpd %xmm5, 2 * SIZE(%esi, LDC)
  1172. movhpd %xmm7, 3 * SIZE(%esi, LDC)
  1173. #else
  1174. movsd %xmm0, 0 * SIZE(%esi)
  1175. movhpd %xmm0, 1 * SIZE(%esi)
  1176. movsd %xmm1, 2 * SIZE(%esi)
  1177. movhpd %xmm1, 3 * SIZE(%esi)
  1178. movsd %xmm2, 0 * SIZE(%esi, LDC)
  1179. movhpd %xmm2, 1 * SIZE(%esi, LDC)
  1180. movsd %xmm3, 2 * SIZE(%esi, LDC)
  1181. movhpd %xmm3, 3 * SIZE(%esi, LDC)
  1182. #endif
  1183. #ifndef LN
  1184. addl $4 * SIZE, %esi
  1185. #endif
  1186. #if defined(LT) || defined(RN)
  1187. movl K, %eax
  1188. subl KK, %eax
  1189. leal (,%eax, SIZE), %eax
  1190. leal (AA, %eax, 4), AA
  1191. #ifdef LT
  1192. addl $8 * SIZE, B
  1193. #endif
  1194. #endif
  1195. #ifdef LN
  1196. subl $4, KK
  1197. movl BORIG, B
  1198. #endif
  1199. #ifdef LT
  1200. addl $4, KK
  1201. #endif
  1202. #ifdef RT
  1203. movl K, %eax
  1204. movl BORIG, B
  1205. sall $2 + BASE_SHIFT, %eax
  1206. addl %eax, AORIG
  1207. #endif
  1208. decl %ebx # i --
  1209. jg .L10
  1210. ALIGN_2
  1211. .L99:
  1212. #ifdef LN
  1213. movl K, %eax
  1214. leal (, %eax, SIZE), %eax
  1215. leal (B, %eax, 2), B
  1216. #endif
  1217. #if defined(LT) || defined(RN)
  1218. movl K, %eax
  1219. subl KK, %eax
  1220. leal (,%eax, SIZE), %eax
  1221. leal (B, %eax, 2), B
  1222. #endif
  1223. #ifdef RN
  1224. addl $2, KK
  1225. #endif
  1226. #ifdef RT
  1227. subl $2, KK
  1228. #endif
  1229. decl J # j --
  1230. jg .L01
  1231. ALIGN_2
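/*
   Note: .L100 handles the final single column when N is odd, repeating
   the same structure (pack B, M & 1 and M & 2 remainders, 4-wide main
   loop) with a one-column panel.
*/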
  1232. .L100:
  1233. movl N, %eax
  1234. testl $1, %eax
  1235. jle .L999
  1236. ALIGN_2
  1237. .L101:
  1238. /* Copying to Sub Buffer */
  1239. #ifdef LN
  1240. movl OFFSET, %eax
  1241. addl M, %eax
  1242. movl %eax, KK
  1243. #endif
  1244. leal BUFFER, %ecx
  1245. #ifdef RT
  1246. movl K, %eax
  1247. sall $0 + BASE_SHIFT, %eax
  1248. subl %eax, B
  1249. #endif
  1250. #if defined(LN) || defined(RT)
  1251. movl KK, %eax
  1252. movl B, BORIG
  1253. leal (, %eax, SIZE), %eax
  1254. leal (B, %eax, 1), B
  1255. leal (BB, %eax, 2), BB
  1256. #endif
  1257. #ifdef LT
  1258. movl OFFSET, %eax
  1259. movl %eax, KK
  1260. #endif
  1261. #if defined(LT) || defined(RN)
  1262. movl KK, %eax
  1263. #else
  1264. movl K, %eax
  1265. subl KK, %eax
  1266. #endif
  1267. sarl $3, %eax
  1268. jle .L103
  1269. ALIGN_4
  1270. .L102:
  1271. movsd 0 * SIZE(B), %xmm0
  1272. movsd 1 * SIZE(B), %xmm1
  1273. movsd 2 * SIZE(B), %xmm2
  1274. movsd 3 * SIZE(B), %xmm3
  1275. movsd 4 * SIZE(B), %xmm4
  1276. movsd 5 * SIZE(B), %xmm5
  1277. movsd 6 * SIZE(B), %xmm6
  1278. movsd 7 * SIZE(B), %xmm7
  1279. unpcklpd %xmm0, %xmm0
  1280. unpcklpd %xmm1, %xmm1
  1281. unpcklpd %xmm2, %xmm2
  1282. unpcklpd %xmm3, %xmm3
  1283. unpcklpd %xmm4, %xmm4
  1284. unpcklpd %xmm5, %xmm5
  1285. unpcklpd %xmm6, %xmm6
  1286. unpcklpd %xmm7, %xmm7
  1287. movapd %xmm0, 0 * SIZE(%ecx)
  1288. movapd %xmm1, 2 * SIZE(%ecx)
  1289. movapd %xmm2, 4 * SIZE(%ecx)
  1290. movapd %xmm3, 6 * SIZE(%ecx)
  1291. movapd %xmm4, 8 * SIZE(%ecx)
  1292. movapd %xmm5, 10 * SIZE(%ecx)
  1293. movapd %xmm6, 12 * SIZE(%ecx)
  1294. movapd %xmm7, 14 * SIZE(%ecx)
  1295. prefetcht0 104 * SIZE(B)
  1296. addl $ 8 * SIZE, B
  1297. addl $16 * SIZE, %ecx
  1298. decl %eax
  1299. BRANCH
  1300. jne .L102
  1301. ALIGN_2
  1302. .L103:
  1303. #if defined(LT) || defined(RN)
  1304. movl KK, %eax
  1305. #else
  1306. movl K, %eax
  1307. subl KK, %eax
  1308. #endif
  1309. andl $7, %eax
  1310. BRANCH
  1311. jle .L105
  1312. ALIGN_2
  1313. .L104:
  1314. movsd 0 * SIZE(B), %xmm0
  1315. unpcklpd %xmm0, %xmm0
  1316. movapd %xmm0, 0 * SIZE(%ecx)
  1317. addl $1 * SIZE, B
  1318. addl $2 * SIZE, %ecx
  1319. decl %eax
  1320. jne .L104
  1321. ALIGN_4
  1322. .L105:
  1323. #if defined(LT) || defined(RN)
  1324. movl A, AA
  1325. #else
  1326. movl A, %eax
  1327. movl %eax, AORIG
  1328. #endif
  1329. #ifdef RT
  1330. subl LDC, C
  1331. #endif
  1332. movl C, %esi # coffset = c
  1333. #ifndef RT
  1334. addl LDC, C
  1335. #endif
  1336. movl M, %ebx
  1337. testl $1, %ebx
  1338. jle .L130
  1339. #ifdef LN
  1340. movl K, %eax
  1341. sall $0 + BASE_SHIFT, %eax
  1342. subl %eax, AORIG
  1343. #endif
  1344. #if defined(LN) || defined(RT)
  1345. movl KK, %eax
  1346. movl AORIG, AA
  1347. leal (, %eax, SIZE), %eax
  1348. leal (AA, %eax, 1), AA
  1349. #endif
  1350. leal BUFFER, BB
  1351. movsd 0 * SIZE(BB), %xmm2
  1352. pxor %xmm4, %xmm4
  1353. movsd 0 * SIZE(AA), %xmm0
  1354. pxor %xmm5, %xmm5
  1355. movsd 8 * SIZE(BB), %xmm3
  1356. pxor %xmm6, %xmm6
  1357. movsd 4 * SIZE(AA), %xmm1
  1358. pxor %xmm7, %xmm7
  1359. #ifdef LN
  1360. prefetcht2 -4 * SIZE(%esi)
  1361. #else
  1362. prefetcht2 4 * SIZE(%esi)
  1363. #endif
  1364. #if defined(LN) || defined(RT)
  1365. movl KK, %eax
  1366. sall $0 + BASE_SHIFT, %eax
  1367. leal (BB, %eax, 2), BB
  1368. #endif
  1369. #if defined(LT) || defined(RN)
  1370. movl KK, %eax
  1371. #else
  1372. movl K, %eax
  1373. subl KK, %eax
  1374. #endif
  1375. sarl $3, %eax
  1376. je .L152
  1377. .L151:
  1378. mulsd %xmm0, %xmm2
  1379. movsd 1 * SIZE(AA), %xmm0
  1380. addsd %xmm2, %xmm4
  1381. mulsd 2 * SIZE(BB), %xmm0
  1382. movsd 16 * SIZE(BB), %xmm2
  1383. addsd %xmm0, %xmm4
  1384. movsd 2 * SIZE(AA), %xmm0
  1385. mulsd 4 * SIZE(BB), %xmm0
  1386. addsd %xmm0, %xmm4
  1387. movsd 3 * SIZE(AA), %xmm0
  1388. mulsd 6 * SIZE(BB), %xmm0
  1389. addsd %xmm0, %xmm4
  1390. movsd 8 * SIZE(AA), %xmm0
  1391. mulsd %xmm1, %xmm3
  1392. movsd 5 * SIZE(AA), %xmm1
  1393. addsd %xmm3, %xmm4
  1394. mulsd 10 * SIZE(BB), %xmm1
  1395. movsd 24 * SIZE(BB), %xmm3
  1396. addsd %xmm1, %xmm4
  1397. movsd 6 * SIZE(AA), %xmm1
  1398. mulsd 12 * SIZE(BB), %xmm1
  1399. addsd %xmm1, %xmm4
  1400. movsd 7 * SIZE(AA), %xmm1
  1401. mulsd 14 * SIZE(BB), %xmm1
  1402. addsd %xmm1, %xmm4
  1403. movsd 12 * SIZE(AA), %xmm1
  1404. addl $ 8 * SIZE, AA
  1405. addl $16 * SIZE, BB
  1406. BRANCH
  1407. decl %eax
  1408. jne .L151
  1409. .L152:
  1410. #if defined(LT) || defined(RN)
  1411. movl KK, %eax
  1412. #else
  1413. movl K, %eax
  1414. subl KK, %eax
  1415. #endif
  1416. andl $7, %eax # remaining iterations (k & 7)
  1417. BRANCH
  1418. je .L154
  1419. .L153:
  1420. movsd 0 * SIZE(AA), %xmm0
  1421. mulsd 0 * SIZE(BB), %xmm0
  1422. addsd %xmm0, %xmm4
  1423. addl $1 * SIZE, AA # aoffset += 1 * SIZE
  1424. addl $2 * SIZE, BB # boffset1 += 2 * SIZE
  1425. decl %eax
  1426. BRANCH
  1427. jg .L153
  1428. ALIGN_4
  1429. .L154:
  1430. addsd %xmm6, %xmm4
  1431. addsd %xmm7, %xmm5
  1432. #if defined(LN) || defined(RT)
  1433. movl KK, %eax
  1434. subl $1, %eax
  1435. movl AORIG, AA
  1436. movl BORIG, B
  1437. leal BUFFER, BB
  1438. leal (, %eax, SIZE), %eax
  1439. leal (AA, %eax, 1), AA
  1440. leal (B, %eax, 1), B
  1441. leal (BB, %eax, 2), BB
  1442. #endif
  1443. #if defined(LN) || defined(LT)
  1444. movsd 0 * SIZE(B), %xmm0
  1445. #else
  1446. movsd 0 * SIZE(AA), %xmm0
  1447. #endif
  1448. subsd %xmm4, %xmm0
  1449. #if defined(LN) || defined(LT)
  1450. mulsd 0 * SIZE(AA), %xmm0
  1451. #endif
  1452. #if defined(RN) || defined(RT)
  1453. mulsd 0 * SIZE(B), %xmm0
  1454. #endif
  1455. #if defined(LN) || defined(LT)
  1456. movsd %xmm0, 0 * SIZE(B)
  1457. movsd %xmm0, 0 * SIZE(BB)
  1458. movsd %xmm0, 1 * SIZE(BB)
  1459. #else
  1460. movsd %xmm0, 0 * SIZE(AA)
  1461. #endif
  1462. #ifdef LN
  1463. subl $1 * SIZE, %esi
  1464. #endif
  1465. movsd %xmm0, 0 * SIZE(%esi)
  1466. #ifndef LN
  1467. addl $1 * SIZE, %esi
  1468. #endif
  1469. #if defined(LT) || defined(RN)
  1470. movl K, %eax
  1471. subl KK, %eax
  1472. leal (,%eax, SIZE), %eax
  1473. leal (AA, %eax, 1), AA
  1474. #ifdef LT
  1475. addl $1 * SIZE, B
  1476. #endif
  1477. #endif
  1478. #ifdef LN
  1479. subl $1, KK
  1480. movl BORIG, B
  1481. #endif
  1482. #ifdef LT
  1483. addl $1, KK
  1484. #endif
  1485. #ifdef RT
  1486. movl K, %eax
  1487. movl BORIG, B
  1488. sall $0 + BASE_SHIFT, %eax
  1489. addl %eax, AORIG
  1490. #endif
  1491. ALIGN_2
  1492. .L130:
  1493. movl M, %ebx
  1494. testl $2, %ebx
  1495. jle .L150
  1496. #ifdef LN
  1497. movl K, %eax
  1498. sall $1 + BASE_SHIFT, %eax
  1499. subl %eax, AORIG
  1500. #endif
  1501. #if defined(LN) || defined(RT)
  1502. movl KK, %eax
  1503. movl AORIG, AA
  1504. leal (, %eax, SIZE), %eax
  1505. leal (AA, %eax, 2), AA
  1506. #endif
  1507. leal BUFFER, BB
  1508. movapd 0 * SIZE(BB), %xmm2
  1509. pxor %xmm4, %xmm4
  1510. movapd 0 * SIZE(AA), %xmm0
  1511. pxor %xmm5, %xmm5
  1512. movapd 8 * SIZE(BB), %xmm3
  1513. pxor %xmm6, %xmm6
  1514. movapd 8 * SIZE(AA), %xmm1
  1515. pxor %xmm7, %xmm7
  1516. #if defined(LN) || defined(RT)
  1517. movl KK, %eax
  1518. sall $0 + BASE_SHIFT, %eax
  1519. leal (BB, %eax, 2), BB
  1520. #endif
  1521. #if defined(LT) || defined(RN)
  1522. movl KK, %eax
  1523. #else
  1524. movl K, %eax
  1525. subl KK, %eax
  1526. #endif
  1527. sarl $3, %eax
  1528. je .L132
  1529. .L131:
  1530. mulpd %xmm0, %xmm2
  1531. movapd 2 * SIZE(AA), %xmm0
  1532. addpd %xmm2, %xmm4
  1533. mulpd 2 * SIZE(BB), %xmm0
  1534. movapd 16 * SIZE(BB), %xmm2
  1535. addpd %xmm0, %xmm5
  1536. movapd 4 * SIZE(AA), %xmm0
  1537. mulpd 4 * SIZE(BB), %xmm0
  1538. addpd %xmm0, %xmm6
  1539. movapd 6 * SIZE(AA), %xmm0
  1540. mulpd 6 * SIZE(BB), %xmm0
  1541. addpd %xmm0, %xmm7
  1542. movapd 16 * SIZE(AA), %xmm0
  1543. mulpd %xmm1, %xmm3
  1544. movapd 10 * SIZE(AA), %xmm1
  1545. addpd %xmm3, %xmm4
  1546. mulpd 10 * SIZE(BB), %xmm1
  1547. movapd 24 * SIZE(BB), %xmm3
  1548. addpd %xmm1, %xmm5
  1549. movapd 12 * SIZE(AA), %xmm1
  1550. mulpd 12 * SIZE(BB), %xmm1
  1551. addpd %xmm1, %xmm6
  1552. movapd 14 * SIZE(AA), %xmm1
  1553. mulpd 14 * SIZE(BB), %xmm1
  1554. addpd %xmm1, %xmm7
  1555. movapd 24 * SIZE(AA), %xmm1
  1556. addl $16 * SIZE, AA
  1557. addl $16 * SIZE, BB
  1558. BRANCH
  1559. decl %eax
  1560. jne .L131
  1561. .L132:
  1562. #if defined(LT) || defined(RN)
  1563. movl KK, %eax
  1564. #else
  1565. movl K, %eax
  1566. subl KK, %eax
  1567. #endif
  1568. andl $7, %eax # remaining iterations (k & 7)
  1569. BRANCH
  1570. je .L134
  1571. .L133:
  1572. movapd 0 * SIZE(AA), %xmm0
  1573. mulpd 0 * SIZE(BB), %xmm0
  1574. addpd %xmm0, %xmm4
  1575. addl $2 * SIZE, AA # aoffset += 2 * SIZE
  1576. addl $2 * SIZE, BB # boffset1 += 2 * SIZE
  1577. decl %eax
  1578. BRANCH
  1579. jg .L133
  1580. ALIGN_4
  1581. .L134:
  1582. addpd %xmm5, %xmm4
  1583. addpd %xmm7, %xmm6
  1584. addpd %xmm6, %xmm4
  1585. #if defined(LN) || defined(RT)
  1586. movl KK, %eax
  1587. #ifdef LN
  1588. subl $2, %eax
  1589. #else
  1590. subl $1, %eax
  1591. #endif
  1592. movl AORIG, AA
  1593. movl BORIG, B
  1594. leal BUFFER, BB
  1595. leal (, %eax, SIZE), %eax
  1596. leal (AA, %eax, 2), AA
  1597. leal (B, %eax, 1), B
  1598. leal (BB, %eax, 2), BB
  1599. #endif
  1600. #if defined(LN) || defined(LT)
  1601. movapd 0 * SIZE(B), %xmm0
  1602. #else
  1603. movapd 0 * SIZE(AA), %xmm0
  1604. #endif
  1605. subpd %xmm4, %xmm0
  1606. #ifdef LN
  1607. movapd %xmm0, %xmm2
  1608. unpckhpd %xmm2, %xmm2
  1609. movsd 3 * SIZE(AA), %xmm4
  1610. mulsd %xmm4, %xmm2
  1611. movsd 2 * SIZE(AA), %xmm5
  1612. mulsd %xmm2, %xmm5
  1613. subsd %xmm5, %xmm0
  1614. movsd 0 * SIZE(AA), %xmm4
  1615. mulsd %xmm4, %xmm0
  1616. unpcklpd %xmm2, %xmm0
  1617. #endif
  1618. #ifdef LT
  1619. movapd %xmm0, %xmm2
  1620. unpckhpd %xmm2, %xmm2
  1621. movsd 0 * SIZE(AA), %xmm4
  1622. mulsd %xmm4, %xmm0
  1623. movsd 1 * SIZE(AA), %xmm5
  1624. mulsd %xmm0, %xmm5
  1625. subsd %xmm5, %xmm2
  1626. movsd 3 * SIZE(AA), %xmm4
  1627. mulsd %xmm4, %xmm2
  1628. unpcklpd %xmm2, %xmm0
  1629. #endif
  1630. #if defined(RN) || defined(RT)
  1631. movsd 0 * SIZE(B), %xmm4
  1632. movhpd 0 * SIZE(B), %xmm4
  1633. mulpd %xmm4, %xmm0
  1634. #endif
  1635. #if defined(LN) || defined(LT)
  1636. movapd %xmm0, 0 * SIZE(B)
  1637. movsd %xmm0, 0 * SIZE(BB)
  1638. movsd %xmm0, 1 * SIZE(BB)
  1639. movhpd %xmm0, 2 * SIZE(BB)
  1640. movhpd %xmm0, 3 * SIZE(BB)
  1641. #else
  1642. movapd %xmm0, 0 * SIZE(AA)
  1643. #endif
  1644. #ifdef LN
  1645. subl $2 * SIZE, %esi
  1646. #endif
  1647. movsd %xmm0, 0 * SIZE(%esi)
  1648. movhpd %xmm0, 1 * SIZE(%esi)
  1649. #ifndef LN
  1650. addl $2 * SIZE, %esi
  1651. #endif
  1652. #if defined(LT) || defined(RN)
  1653. movl K, %eax
  1654. subl KK, %eax
  1655. leal (,%eax, SIZE), %eax
  1656. leal (AA, %eax, 2), AA
  1657. #ifdef LT
  1658. addl $2 * SIZE, B
  1659. #endif
  1660. #endif
  1661. #ifdef LN
  1662. subl $2, KK
  1663. movl BORIG, B
  1664. #endif
  1665. #ifdef LT
  1666. addl $2, KK
  1667. #endif
  1668. #ifdef RT
  1669. movl K, %eax
  1670. movl BORIG, B
  1671. sall $1 + BASE_SHIFT, %eax
  1672. addl %eax, AORIG
  1673. #endif
  1674. ALIGN_2
  1675. .L150:
  1676. movl M, %ebx
  1677. sarl $2, %ebx # i = (m >> 2)
  1678. jle .L159
  1679. ALIGN_4
  1680. .L110:
  1681. #ifdef LN
  1682. movl K, %eax
  1683. sall $2 + BASE_SHIFT, %eax
  1684. subl %eax, AORIG
  1685. #endif
  1686. #if defined(LN) || defined(RT)
  1687. movl KK, %eax
  1688. movl AORIG, AA
  1689. leal (, %eax, SIZE), %eax
  1690. leal (AA, %eax, 4), AA
  1691. #endif
  1692. leal BUFFER, BB
  1693. #if defined(LN) || defined(RT)
  1694. movl KK, %eax
  1695. sall $0 + BASE_SHIFT, %eax
  1696. leal (BB, %eax, 2), BB
  1697. #endif
  1698. movapd 0 * SIZE(BB), %xmm2
  1699. pxor %xmm4, %xmm4
  1700. movapd 0 * SIZE(AA), %xmm0
  1701. pxor %xmm5, %xmm5
  1702. movapd 8 * SIZE(BB), %xmm3
  1703. pxor %xmm6, %xmm6
  1704. movapd 8 * SIZE(AA), %xmm1
  1705. pxor %xmm7, %xmm7
  1706. #if defined(LT) || defined(RN)
  1707. movl KK, %eax
  1708. #else
  1709. movl K, %eax
  1710. subl KK, %eax
  1711. #endif
  1712. sarl $3, %eax
  1713. je .L112
  1714. .L111:
  1715. mulpd %xmm2, %xmm0
  1716. mulpd 2 * SIZE(AA), %xmm2
  1717. addpd %xmm0, %xmm4
  1718. movapd 4 * SIZE(AA), %xmm0
  1719. addpd %xmm2, %xmm6
  1720. movapd 2 * SIZE(BB), %xmm2
  1721. mulpd %xmm2, %xmm0
  1722. mulpd 6 * SIZE(AA), %xmm2
  1723. addpd %xmm0, %xmm5
  1724. movapd 16 * SIZE(AA), %xmm0
  1725. addpd %xmm2, %xmm7
  1726. movapd 4 * SIZE(BB), %xmm2
  1727. mulpd %xmm2, %xmm1
  1728. mulpd 10 * SIZE(AA), %xmm2
  1729. addpd %xmm1, %xmm4
  1730. movapd 12 * SIZE(AA), %xmm1
  1731. addpd %xmm2, %xmm6
  1732. movapd 6 * SIZE(BB), %xmm2
  1733. mulpd %xmm2, %xmm1
  1734. mulpd 14 * SIZE(AA), %xmm2
  1735. addpd %xmm1, %xmm5
  1736. movapd 24 * SIZE(AA), %xmm1
  1737. addpd %xmm2, %xmm7
  1738. movapd 16 * SIZE(BB), %xmm2
  1739. mulpd %xmm3, %xmm0
  1740. mulpd 18 * SIZE(AA), %xmm3
  1741. addpd %xmm0, %xmm4
  1742. movapd 20 * SIZE(AA), %xmm0
  1743. addpd %xmm3, %xmm6
  1744. movapd 10 * SIZE(BB), %xmm3
  1745. mulpd %xmm3, %xmm0
  1746. mulpd 22 * SIZE(AA), %xmm3
  1747. addpd %xmm0, %xmm5
  1748. movapd 32 * SIZE(AA), %xmm0
  1749. addpd %xmm3, %xmm7
  1750. movapd 12 * SIZE(BB), %xmm3
  1751. mulpd %xmm3, %xmm1
  1752. mulpd 26 * SIZE(AA), %xmm3
  1753. addpd %xmm1, %xmm4
  1754. movapd 28 * SIZE(AA), %xmm1
  1755. addpd %xmm3, %xmm6
  1756. movapd 14 * SIZE(BB), %xmm3
  1757. mulpd %xmm3, %xmm1
  1758. mulpd 30 * SIZE(AA), %xmm3
  1759. addpd %xmm1, %xmm5
  1760. movapd 40 * SIZE(AA), %xmm1
  1761. addpd %xmm3, %xmm7
  1762. movapd 24 * SIZE(BB), %xmm3
  1763. addl $32 * SIZE, AA
  1764. addl $16 * SIZE, BB
  1765. decl %eax
  1766. jne .L111
  1767. .L112:
  1768. #if defined(LT) || defined(RN)
  1769. movl KK, %eax
  1770. #else
  1771. movl K, %eax
  1772. subl KK, %eax
  1773. #endif
  1774. andl $7, %eax # remaining iterations (k & 7)
  1775. BRANCH
  1776. je .L114
  1777. .L113:
  1778. mulpd %xmm2, %xmm0
  1779. mulpd 2 * SIZE(AA), %xmm2
  1780. addpd %xmm0, %xmm4
  1781. movapd 4 * SIZE(AA), %xmm0
  1782. addpd %xmm2, %xmm6
  1783. movapd 2 * SIZE(BB), %xmm2
  1784. addl $4 * SIZE, AA # aoffset += 4 * SIZE
  1785. addl $2 * SIZE, BB # boffset1 += 2 * SIZE
  1786. subl $1, %eax
  1787. jg .L113
  1788. ALIGN_4
  1789. .L114:
  1790. addpd %xmm5, %xmm4
  1791. addpd %xmm7, %xmm6
  1792. #if defined(LN) || defined(RT)
  1793. movl KK, %eax
  1794. #ifdef LN
  1795. subl $4, %eax
  1796. #else
  1797. subl $1, %eax
  1798. #endif
  1799. movl AORIG, AA
  1800. movl BORIG, B
  1801. leal BUFFER, BB
  1802. leal (, %eax, SIZE), %eax
  1803. leal (AA, %eax, 4), AA
  1804. leal (B, %eax, 1), B
  1805. leal (BB, %eax, 2), BB
  1806. #endif
  1807. #if defined(LN) || defined(LT)
  1808. movapd 0 * SIZE(B), %xmm0
  1809. movapd 2 * SIZE(B), %xmm1
  1810. #else
  1811. movapd 0 * SIZE(AA), %xmm0
  1812. movapd 2 * SIZE(AA), %xmm1
  1813. #endif
  1814. subpd %xmm4, %xmm0
  1815. subpd %xmm6, %xmm1
  1816. #ifdef LN
  1817. movapd %xmm0, %xmm2
  1818. unpckhpd %xmm2, %xmm2
  1819. movapd %xmm1, %xmm3
  1820. unpckhpd %xmm3, %xmm3
  1821. movsd 15 * SIZE(AA), %xmm4
  1822. mulsd %xmm4, %xmm3
  1823. movsd 14 * SIZE(AA), %xmm5
  1824. mulsd %xmm3, %xmm5
  1825. subsd %xmm5, %xmm1
  1826. movsd 13 * SIZE(AA), %xmm6
  1827. mulsd %xmm3, %xmm6
  1828. subsd %xmm6, %xmm2
  1829. movsd 12 * SIZE(AA), %xmm7
  1830. mulsd %xmm3, %xmm7
  1831. subsd %xmm7, %xmm0
  1832. movsd 10 * SIZE(AA), %xmm4
  1833. mulsd %xmm4, %xmm1
  1834. movsd 9 * SIZE(AA), %xmm5
  1835. mulsd %xmm1, %xmm5
  1836. subsd %xmm5, %xmm2
  1837. movsd 8 * SIZE(AA), %xmm6
  1838. mulsd %xmm1, %xmm6
  1839. subsd %xmm6, %xmm0
  1840. movsd 5 * SIZE(AA), %xmm4
  1841. mulsd %xmm4, %xmm2
  1842. movsd 4 * SIZE(AA), %xmm5
  1843. mulsd %xmm2, %xmm5
  1844. subsd %xmm5, %xmm0
  1845. movsd 0 * SIZE(AA), %xmm4
  1846. mulsd %xmm4, %xmm0
  1847. unpcklpd %xmm2, %xmm0
  1848. unpcklpd %xmm3, %xmm1
  1849. #endif
  1850. #ifdef LT
  1851. movapd %xmm0, %xmm2
  1852. unpckhpd %xmm2, %xmm2
  1853. movapd %xmm1, %xmm3
  1854. unpckhpd %xmm3, %xmm3
  1855. movsd 0 * SIZE(AA), %xmm4
  1856. mulsd %xmm4, %xmm0
  1857. movsd 1 * SIZE(AA), %xmm5
  1858. mulsd %xmm0, %xmm5
  1859. subsd %xmm5, %xmm2
  1860. movsd 2 * SIZE(AA), %xmm6
  1861. mulsd %xmm0, %xmm6
  1862. subsd %xmm6, %xmm1
  1863. movsd 3 * SIZE(AA), %xmm7
  1864. mulsd %xmm0, %xmm7
  1865. subsd %xmm7, %xmm3
  1866. movsd 5 * SIZE(AA), %xmm4
  1867. mulsd %xmm4, %xmm2
  1868. movsd 6 * SIZE(AA), %xmm5
  1869. mulsd %xmm2, %xmm5
  1870. subsd %xmm5, %xmm1
  1871. movsd 7 * SIZE(AA), %xmm6
  1872. mulsd %xmm2, %xmm6
  1873. subsd %xmm6, %xmm3
  1874. movsd 10 * SIZE(AA), %xmm4
  1875. mulsd %xmm4, %xmm1
  1876. movsd 11 * SIZE(AA), %xmm5
  1877. mulsd %xmm1, %xmm5
  1878. subsd %xmm5, %xmm3
  1879. movsd 15 * SIZE(AA), %xmm4
  1880. mulsd %xmm4, %xmm3
  1881. unpcklpd %xmm2, %xmm0
  1882. unpcklpd %xmm3, %xmm1
  1883. #endif
  1884. #if defined(RN) || defined(RT)
  1885. movsd 0 * SIZE(B), %xmm4
  1886. movhpd 0 * SIZE(B), %xmm4
  1887. mulpd %xmm4, %xmm0
  1888. mulpd %xmm4, %xmm1
  1889. #endif
  1890. #if defined(LN) || defined(LT)
  1891. movapd %xmm0, 0 * SIZE(B)
  1892. movapd %xmm1, 2 * SIZE(B)
  1893. movsd %xmm0, 0 * SIZE(BB)
  1894. movsd %xmm0, 1 * SIZE(BB)
  1895. movhpd %xmm0, 2 * SIZE(BB)
  1896. movhpd %xmm0, 3 * SIZE(BB)
  1897. movsd %xmm1, 4 * SIZE(BB)
  1898. movsd %xmm1, 5 * SIZE(BB)
  1899. movhpd %xmm1, 6 * SIZE(BB)
  1900. movhpd %xmm1, 7 * SIZE(BB)
  1901. #else
  1902. movapd %xmm0, 0 * SIZE(AA)
  1903. movapd %xmm1, 2 * SIZE(AA)
  1904. #endif
  1905. #ifdef LN
  1906. subl $4 * SIZE, %esi
  1907. #endif
  1908. movsd %xmm0, 0 * SIZE(%esi)
  1909. movhpd %xmm0, 1 * SIZE(%esi)
  1910. movsd %xmm1, 2 * SIZE(%esi)
  1911. movhpd %xmm1, 3 * SIZE(%esi)
  1912. #ifndef LN
  1913. addl $4 * SIZE, %esi
  1914. #endif
  1915. #if defined(LT) || defined(RN)
  1916. movl K, %eax
  1917. subl KK, %eax
  1918. leal (,%eax, SIZE), %eax
  1919. leal (AA, %eax, 4), AA
  1920. #ifdef LT
  1921. addl $4 * SIZE, B
  1922. #endif
  1923. #endif
  1924. #ifdef LN
  1925. subl $4, KK
  1926. movl BORIG, B
  1927. #endif
  1928. #ifdef LT
  1929. addl $4, KK
  1930. #endif
  1931. #ifdef RT
  1932. movl K, %eax
  1933. movl BORIG, B
  1934. sall $2 + BASE_SHIFT, %eax
  1935. addl %eax, AORIG
  1936. #endif
  1937. BRANCH
  1938. decl %ebx # i --
  1939. jg .L110
  1940. ALIGN_2
  1941. .L159:
  1942. #ifdef LN
  1943. movl K, %eax
  1944. leal (, %eax, SIZE), %eax
  1945. leal (B, %eax, 1), B
  1946. #endif
  1947. #if defined(LT) || defined(RN)
  1948. movl K, %eax
  1949. subl KK, %eax
  1950. leal (,%eax, SIZE), %eax
  1951. leal (B, %eax, 1), B
  1952. #endif
  1953. #ifdef RN
  1954. addl $1, KK
  1955. #endif
  1956. #ifdef RT
  1957. subl $1, KK
  1958. #endif
  1959. ALIGN_2
  1960. .L999:
  1961. movl OLD_STACK, %esp
  1962. EMMS
  1963. popl %ebx
  1964. popl %esi
  1965. popl %edi
  1966. popl %ebp
  1967. ret
  1968. ALIGN_2
  1969. EPILOGUE