
gemm_kernel_8x2_sse.S 57 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#if !defined(HAVE_SSE) || !defined(HAVE_MMX)
#error You have to check your configuration.
#endif
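
/*
 * SGEMM/STRMM microkernel for 32-bit x86 with SSE (MMX is only used
 * to shuttle the stack arguments). The kernel works on 8x2 tiles of
 * C: eight packed rows of A against two columns of B that have been
 * pre-broadcast into BUFFER. Each tile is accumulated over K, scaled
 * by alpha, and, unless built as TRMMKERNEL, added to the existing C.
 */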
#define STACK 16
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
#define STACK_ALPHA 16 + STACK + ARGS(%esi)
#define STACK_A 20 + STACK + ARGS(%esi)
#define STACK_B 24 + STACK + ARGS(%esi)
#define STACK_C 28 + STACK + ARGS(%esi)
#define STACK_LDC 32 + STACK + ARGS(%esi)
#define STACK_OFFT 36 + STACK + ARGS(%esi)

#define ALPHA 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
#define A 28(%esp)
#define C 32(%esp)
#define J 36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET 44(%esp)
#define KK 48(%esp)
#define KKK 52(%esp)
#define BUFFER 128(%esp)

#define B %edi
#define LDC %ebp

#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#define PREFETCHSIZE 48 /* for PIII */

#define AA %edx
#define BB %ecx

#if !defined(HAVE_SSE2) || defined(OPTERON)
#define movsd movlps
#endif

#ifdef HAVE_SSE2
#define xorps pxor
#endif
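
/*
 * KERNEL1..KERNEL8 together unroll eight k-iterations of the 8x2
 * tile. Each macro multiplies the current 8 A values (xmm0 or xmm1)
 * by the two broadcast B values, accumulates into xmm4..xmm7, and
 * streams in the next A/B vectors with movaps; the (address) argument
 * offsets into the block consumed by the unrolled Pentium 4 loop.
 */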
#define KERNEL1(address) \
mulps %xmm0, %xmm2; \
mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
addps %xmm2, %xmm4; \
movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
addps %xmm0, %xmm5; \
movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
mulps %xmm0, %xmm2; \
mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL2(address) \
mulps %xmm0, %xmm2; \
mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
addps %xmm2, %xmm4; \
movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
addps %xmm0, %xmm5; \
movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
mulps %xmm0, %xmm2; \
mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL3(address) \
mulps %xmm1, %xmm3; \
mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
addps %xmm3, %xmm4; \
movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
addps %xmm1, %xmm5; \
movaps 20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
mulps %xmm1, %xmm3; \
mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1

#define KERNEL4(address) \
mulps %xmm1, %xmm3; \
mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
addps %xmm3, %xmm4; \
movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
addps %xmm1, %xmm5; \
movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
mulps %xmm1, %xmm3; \
mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1

#define KERNEL5(address) \
mulps %xmm0, %xmm2; \
mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
addps %xmm2, %xmm4; \
movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
addps %xmm0, %xmm5; \
movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
mulps %xmm0, %xmm2; \
mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 40 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL6(address) \
mulps %xmm0, %xmm2; \
mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
addps %xmm2, %xmm4; \
movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
addps %xmm0, %xmm5; \
movaps 44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
mulps %xmm0, %xmm2; \
mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL7(address) \
mulps %xmm1, %xmm3; \
mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
addps %xmm3, %xmm4; \
movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
addps %xmm1, %xmm5; \
movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
mulps %xmm1, %xmm3; \
mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1

#define KERNEL8(address) \
mulps %xmm1, %xmm3; \
mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
addps %xmm3, %xmm4; \
movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
addps %xmm1, %xmm5; \
movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
mulps %xmm1, %xmm3; \
mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1
PROLOGUE

pushl %ebp
pushl %edi
pushl %esi
pushl %ebx

PROFCODE

EMMS

movl %esp, %esi # save old stack
subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
andl $-STACK_ALIGN, %esp
addl $STACK_OFFSET, %esp

STACK_TOUCHING
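
/*
 * %esi still holds the caller's stack pointer (saved to OLD_STACK
 * below); %esp has been rounded down to a STACK_ALIGN boundary and
 * biased by STACK_OFFSET, giving an aligned frame for the locals
 * defined above (ALPHA .. BUFFER).
 */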
movd STACK_M, %mm0
movl STACK_N, %eax
movd STACK_K, %mm1
movd STACK_A, %mm2
movd STACK_ALPHA, %mm7
movl STACK_B, B
movd STACK_C, %mm3
movl STACK_LDC, LDC
#ifdef TRMMKERNEL
movd STACK_OFFT, %mm4
#endif

movd %mm7, 0 * SIZE + ALPHA
movd %mm7, 1 * SIZE + ALPHA
movd %mm7, 2 * SIZE + ALPHA
movd %mm7, 3 * SIZE + ALPHA

movd %mm1, K
movl %eax, N
movd %mm0, M
movd %mm2, A
movd %mm3, C
movl %esi, OLD_STACK

#ifdef TRMMKERNEL
movd %mm4, OFFSET
movd %mm4, KK
#ifndef LEFT
negl KK
#endif
#endif

leal (, LDC, SIZE), LDC

sarl $1, %eax # j = (n >> 1)
movl %eax, J
jle .L100
ALIGN_2

.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
#endif

/* Copying to Sub Buffer */
movl K, %eax
leal BUFFER, %ecx
sarl $2, %eax
jle .L03
ALIGN_4
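
/*
 * .L02 packs a 2-column panel of B into BUFFER, broadcasting each
 * scalar across all four SSE lanes with shufps $0 so the compute
 * loops can use aligned movaps loads; 4 k-iterations per pass, with
 * .L04 mopping up the remaining k & 3.
 */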
.L02:
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
movss 2 * SIZE(B), %xmm2
movss 3 * SIZE(B), %xmm3
movss 4 * SIZE(B), %xmm4
movss 5 * SIZE(B), %xmm5
movss 6 * SIZE(B), %xmm6
movss 7 * SIZE(B), %xmm7
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
shufps $0, %xmm2, %xmm2
shufps $0, %xmm3, %xmm3
shufps $0, %xmm4, %xmm4
shufps $0, %xmm5, %xmm5
shufps $0, %xmm6, %xmm6
shufps $0, %xmm7, %xmm7
movaps %xmm0, 0 * SIZE(%ecx)
movaps %xmm1, 4 * SIZE(%ecx)
movaps %xmm2, 8 * SIZE(%ecx)
movaps %xmm3, 12 * SIZE(%ecx)
movaps %xmm4, 16 * SIZE(%ecx)
movaps %xmm5, 20 * SIZE(%ecx)
movaps %xmm6, 24 * SIZE(%ecx)
movaps %xmm7, 28 * SIZE(%ecx)
prefetcht0 104 * SIZE(B)
addl $ 8 * SIZE, B
addl $32 * SIZE, %ecx
decl %eax
BRANCH
jne .L02
ALIGN_2

.L03:
movl K, %eax
andl $3, %eax
BRANCH
jle .L05
ALIGN_2

.L04:
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
addl $2 * SIZE, B
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
movaps %xmm0, 0 * SIZE(%ecx)
movaps %xmm1, 4 * SIZE(%ecx)
addl $8 * SIZE, %ecx
decl %eax
jne .L04
ALIGN_4

.L05:
movl C, %esi # coffset = c
movl A, AA # aoffset = a
movl M, %ebx
sarl $3, %ebx # i = (m >> 3)
jle .L30
ALIGN_4
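
/*
 * .L10 is the main loop over 8-row blocks of M; each pass produces
 * one 8x2 tile of C. Leftover rows (m & 7) fall through to .L30.
 */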
.L10:
#ifdef PENTIUM4
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB
movaps 0 * SIZE + BUFFER, %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 16 * SIZE + BUFFER, %xmm3
xorps %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#else
leal BUFFER, BB
movl KK, %eax
leal (, %eax, 8), %eax
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB /* because it's doubled */
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 16 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#endif

prefetchnta 7 * SIZE(%esi)
prefetchnta 7 * SIZE(%esi, %ebp)

#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $8, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
andl $-8, %eax
NOBRANCH
je .L12
sall $3, %eax
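
/*
 * Pentium 4 path: %eax = 8 * (k & ~7), and each KERNEL1..KERNEL8
 * group below retires 8 k-iterations. The cmpl/jle ladder runs up to
 * 64 iterations between taken branches before looping back to .L1X.
 */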
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
KERNEL4(32 * 0)
KERNEL5(32 * 0)
KERNEL6(32 * 0)
KERNEL7(32 * 0)
KERNEL8(32 * 0)
cmpl $64 * 1, %eax
NOBRANCH
jle .L11
KERNEL1(32 * 1)
KERNEL2(32 * 1)
KERNEL3(32 * 1)
KERNEL4(32 * 1)
KERNEL5(32 * 1)
KERNEL6(32 * 1)
KERNEL7(32 * 1)
KERNEL8(32 * 1)
cmpl $64 * 2, %eax
NOBRANCH
jle .L11
KERNEL1(32 * 2)
KERNEL2(32 * 2)
KERNEL3(32 * 2)
KERNEL4(32 * 2)
KERNEL5(32 * 2)
KERNEL6(32 * 2)
KERNEL7(32 * 2)
KERNEL8(32 * 2)
cmpl $64 * 3, %eax
NOBRANCH
jle .L11
KERNEL1(32 * 3)
KERNEL2(32 * 3)
KERNEL3(32 * 3)
KERNEL4(32 * 3)
KERNEL5(32 * 3)
KERNEL6(32 * 3)
KERNEL7(32 * 3)
KERNEL8(32 * 3)
cmpl $64 * 4, %eax
NOBRANCH
jle .L11
KERNEL1(32 * 4)
KERNEL2(32 * 4)
KERNEL3(32 * 4)
KERNEL4(32 * 4)
KERNEL5(32 * 4)
KERNEL6(32 * 4)
KERNEL7(32 * 4)
KERNEL8(32 * 4)
cmpl $64 * 5, %eax
NOBRANCH
jle .L11
KERNEL1(32 * 5)
KERNEL2(32 * 5)
KERNEL3(32 * 5)
KERNEL4(32 * 5)
KERNEL5(32 * 5)
KERNEL6(32 * 5)
KERNEL7(32 * 5)
KERNEL8(32 * 5)
cmpl $64 * 6, %eax
NOBRANCH
jle .L11
KERNEL1(32 * 6)
KERNEL2(32 * 6)
KERNEL3(32 * 6)
KERNEL4(32 * 6)
KERNEL5(32 * 6)
KERNEL6(32 * 6)
KERNEL7(32 * 6)
KERNEL8(32 * 6)
cmpl $64 * 7, %eax
NOBRANCH
jle .L11
KERNEL1(32 * 7)
KERNEL2(32 * 7)
KERNEL3(32 * 7)
KERNEL4(32 * 7)
KERNEL5(32 * 7)
KERNEL6(32 * 7)
KERNEL7(32 * 7)
KERNEL8(32 * 7)
addl $64 * 8 * SIZE, AA
addl $64 * 8 * SIZE, BB
subl $64 * 8, %eax
BRANCH
jg .L1X

.L11:
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB
#else
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB
movaps 0 * SIZE + BUFFER, %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 8 * SIZE + BUFFER, %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#else
leal BUFFER, BB
movl KK, %eax
leal (, %eax, 8), %eax
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB /* because it's doubled */
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 8 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#endif

prefetchnta 8 * SIZE(%esi)
prefetchnta 8 * SIZE(%esi, %ebp)

#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $8, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L12
ALIGN_2

.L11:
#ifdef CORE_KATMAI
prefetcht0 PREFETCHSIZE * SIZE(AA)
#endif
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 0 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 16 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 16 * SIZE(AA), %xmm0
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
mulps 12 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 8 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 12 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 12 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 24 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 24 * SIZE(AA), %xmm1
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm0, %xmm2
mulps 20 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 16 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 20 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 20 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 32 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
mulps 28 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 24 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 28 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 28 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 40 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 40 * SIZE(AA), %xmm1
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA)
#endif
mulps %xmm0, %xmm2
mulps 36 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 32 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 36 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 36 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 48 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 48 * SIZE(AA), %xmm0
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
mulps 44 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 40 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 44 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 44 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 56 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 56 * SIZE(AA), %xmm1
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA)
#endif
mulps %xmm0, %xmm2
mulps 52 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 48 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 52 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 52 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 64 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 64 * SIZE(AA), %xmm0
#ifdef CORE_KATMAI
prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
mulps 60 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 56 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 60 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 60 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 72 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 72 * SIZE(AA), %xmm1
addl $64 * SIZE, BB
addl $64 * SIZE, AA
decl %eax
jne .L11
ALIGN_2
#endif
.L12:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
movaps ALPHA, %xmm3
andl $7, %eax # if (k & 7)
BRANCH
je .L14

.L13:
movaps 4 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 0 * SIZE(BB), %xmm2
mulps %xmm0, %xmm1
movaps 4 * SIZE(AA), %xmm0
addps %xmm1, %xmm5
movaps 4 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
addps %xmm2, %xmm6
movaps 8 * SIZE(BB), %xmm2
mulps %xmm0, %xmm1
movaps 8 * SIZE(AA), %xmm0
addps %xmm1, %xmm7
addl $8 * SIZE, AA
addl $8 * SIZE, BB
subl $1, %eax
jg .L13
ALIGN_4

.L14:
mulps %xmm3, %xmm4
mulps %xmm3, %xmm5
mulps %xmm3, %xmm6
mulps %xmm3, %xmm7

#ifndef TRMMKERNEL
shufps $0xe4, %xmm4, %xmm4
shufps $0xe4, %xmm5, %xmm5
movsd 0 * SIZE(%esi), %xmm0
movhps 2 * SIZE(%esi), %xmm0
movsd 4 * SIZE(%esi), %xmm1
movhps 6 * SIZE(%esi), %xmm1
shufps $0xe4, %xmm6, %xmm6
shufps $0xe4, %xmm7, %xmm7
movsd 0 * SIZE(%esi, LDC), %xmm2
movhps 2 * SIZE(%esi, LDC), %xmm2
movsd 4 * SIZE(%esi, LDC), %xmm3
movhps 6 * SIZE(%esi, LDC), %xmm3
addps %xmm0, %xmm4
addps %xmm1, %xmm6
addps %xmm2, %xmm5
addps %xmm3, %xmm7
#endif

movsd %xmm4, 0 * SIZE(%esi)
movhps %xmm4, 2 * SIZE(%esi)
movsd %xmm6, 4 * SIZE(%esi)
movhps %xmm6, 6 * SIZE(%esi)
movsd %xmm5, 0 * SIZE(%esi, LDC)
movhps %xmm5, 2 * SIZE(%esi, LDC)
movsd %xmm7, 4 * SIZE(%esi, LDC)
movhps %xmm7, 6 * SIZE(%esi, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl K, %eax
subl KKK, %eax
leal (,%eax, 8), %eax
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addl $8, KK
#endif

addl $8 * SIZE, %esi
BRANCH
decl %ebx # i --
jg .L10
ALIGN_2
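
/*
 * Tail of the M loop for this 2-column panel: the remaining m & 7
 * rows are handled with progressively narrower kernels (4 rows, then
 * 2, then 1).
 */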
.L30:
movl M, %ebx
andl $7, %ebx
jle .L99

testl $4, %ebx
jle .L50

#if (L1_DATA_LINESIZE == 64)
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB
movaps 0 * SIZE + BUFFER, %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 16 * SIZE + BUFFER, %xmm3
xorps %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#else
leal BUFFER, BB
movl KK, %eax
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB /* because it's doubled */
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 16 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#endif

#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $4, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L32
ALIGN_2

.L31:
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 8 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 12 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 32 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 8 * SIZE(AA), %xmm0
mulps %xmm0, %xmm3
mulps 20 * SIZE(BB), %xmm0
addps %xmm3, %xmm4
movaps 24 * SIZE(BB), %xmm3
addps %xmm0, %xmm5
movaps 12 * SIZE(AA), %xmm0
mulps %xmm0, %xmm3
mulps 28 * SIZE(BB), %xmm0
addps %xmm3, %xmm6
movaps 48 * SIZE(BB), %xmm3
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
mulps 36 * SIZE(BB), %xmm1
addps %xmm2, %xmm4
movaps 40 * SIZE(BB), %xmm2
addps %xmm1, %xmm5
movaps 20 * SIZE(AA), %xmm1
mulps %xmm1, %xmm2
mulps 44 * SIZE(BB), %xmm1
addps %xmm2, %xmm6
movaps 64 * SIZE(BB), %xmm2
addps %xmm1, %xmm7
movaps 24 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 52 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 56 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 28 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 60 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 80 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 48 * SIZE(AA), %xmm1
addl $32 * SIZE, AA
addl $64 * SIZE, BB
decl %eax
jne .L31
ALIGN_2
#else
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB
movaps 0 * SIZE + BUFFER, %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 8 * SIZE + BUFFER, %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#else
leal BUFFER, BB
movl KK, %eax
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB /* because it's doubled */
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 8 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#endif

#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $4, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L32
ALIGN_2

.L31:
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 16 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
mulps %xmm0, %xmm3
mulps 12 * SIZE(BB), %xmm0
addps %xmm3, %xmm6
movaps 24 * SIZE(BB), %xmm3
addps %xmm0, %xmm7
movaps 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
mulps 20 * SIZE(BB), %xmm1
addps %xmm2, %xmm4
movaps 32 * SIZE(BB), %xmm2
addps %xmm1, %xmm5
movaps 12 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 28 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 40 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 24 * SIZE(AA), %xmm1
mulps %xmm0, %xmm2
mulps 36 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 48 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 20 * SIZE(AA), %xmm0
mulps %xmm0, %xmm3
mulps 44 * SIZE(BB), %xmm0
addps %xmm3, %xmm6
movaps 56 * SIZE(BB), %xmm3
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
mulps 52 * SIZE(BB), %xmm1
addps %xmm2, %xmm4
movaps 64 * SIZE(BB), %xmm2
addps %xmm1, %xmm5
movaps 28 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 60 * SIZE(BB), %xmm1
addps %xmm3, %xmm6
movaps 72 * SIZE(BB), %xmm3
addps %xmm1, %xmm7
movaps 40 * SIZE(AA), %xmm1
addl $32 * SIZE, AA
addl $64 * SIZE, BB
decl %eax
jne .L31
ALIGN_2
#endif
.L32:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
movaps ALPHA, %xmm3
andl $7, %eax # if (k & 7)
BRANCH
je .L34

.L33:
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 8 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
addl $4 * SIZE, AA
addl $8 * SIZE, BB
decl %eax
jg .L33
ALIGN_4

.L34:
addps %xmm6, %xmm4
addps %xmm7, %xmm5
mulps %xmm3, %xmm4
mulps %xmm3, %xmm5

#ifndef TRMMKERNEL
movsd 0 * SIZE(%esi), %xmm0
movhps 2 * SIZE(%esi), %xmm0
addps %xmm0, %xmm4
movsd 0 * SIZE(%esi, LDC), %xmm0
movhps 2 * SIZE(%esi, LDC), %xmm0
addps %xmm0, %xmm5
#endif

#ifdef HAVE_SSE2
movsd %xmm4, 0 * SIZE(%esi)
unpckhpd %xmm4, %xmm4
movsd %xmm4, 2 * SIZE(%esi)
movsd %xmm5, 0 * SIZE(%esi, LDC)
unpckhpd %xmm5, %xmm5
movsd %xmm5, 2 * SIZE(%esi, LDC)
#else
movlps %xmm4, 0 * SIZE(%esi)
movhps %xmm4, 2 * SIZE(%esi)
movlps %xmm5, 0 * SIZE(%esi, LDC)
movhps %xmm5, 2 * SIZE(%esi, LDC)
#endif

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl K, %eax
subl KKK, %eax
leal (,%eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addl $4, KK
#endif

addl $4 * SIZE, %esi
ALIGN_2
.L50:
testl $2, %ebx
jle .L70

#if (L1_DATA_LINESIZE == 64)
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB
movaps 0 * SIZE + BUFFER, %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 16 * SIZE + BUFFER, %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#else
leal BUFFER, BB
movl KK, %eax
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB /* because it's doubled */
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 16 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#endif

#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $2, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L52
ALIGN_2

.L51:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 2 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
movaps 8 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm6
movaps 12 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 4 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
movaps 32 * SIZE(BB), %xmm2
mulps %xmm0, %xmm3
addps %xmm3, %xmm4
movaps 20 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 6 * SIZE(AA), %xmm0
addps %xmm3, %xmm5
movaps 24 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
addps %xmm3, %xmm6
movaps 28 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 16 * SIZE(AA), %xmm0
addps %xmm3, %xmm7
movaps 48 * SIZE(BB), %xmm3
mulps %xmm1, %xmm2
addps %xmm2, %xmm4
movaps 36 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 10 * SIZE(AA), %xmm1
addps %xmm2, %xmm5
movaps 40 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
addps %xmm2, %xmm6
movaps 44 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 12 * SIZE(AA), %xmm1
addps %xmm2, %xmm7
movaps 64 * SIZE(BB), %xmm2
mulps %xmm1, %xmm3
addps %xmm3, %xmm4
movaps 52 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 14 * SIZE(AA), %xmm1
addps %xmm3, %xmm5
movaps 56 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
addps %xmm3, %xmm6
movaps 60 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 24 * SIZE(AA), %xmm1
addps %xmm3, %xmm7
movaps 80 * SIZE(BB), %xmm3
addl $16 * SIZE, AA
addl $64 * SIZE, BB
decl %eax
jne .L51
ALIGN_2
#else
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB
movaps 0 * SIZE + BUFFER, %xmm2
xorps %xmm4, %xmm4
movsd 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 8 * SIZE + BUFFER, %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#else
leal BUFFER, BB
movl KK, %eax
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB /* because it's doubled */
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movaps 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movaps 8 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#endif

#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $2, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L52
ALIGN_2

.L51:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 2 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
movaps 16 * SIZE(BB), %xmm2
mulps %xmm0, %xmm3
addps %xmm3, %xmm6
movaps 12 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 4 * SIZE(AA), %xmm0
addps %xmm3, %xmm7
movaps 24 * SIZE(BB), %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 20 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 6 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
movaps 32 * SIZE(BB), %xmm2
mulps %xmm0, %xmm3
addps %xmm3, %xmm6
movaps 28 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 16 * SIZE(AA), %xmm0
addps %xmm3, %xmm7
movaps 40 * SIZE(BB), %xmm3
mulps %xmm1, %xmm2
addps %xmm2, %xmm4
movaps 36 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 10 * SIZE(AA), %xmm1
addps %xmm2, %xmm5
movaps 48 * SIZE(BB), %xmm2
mulps %xmm1, %xmm3
addps %xmm3, %xmm6
movaps 44 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 12 * SIZE(AA), %xmm1
addps %xmm3, %xmm7
movaps 56 * SIZE(BB), %xmm3
mulps %xmm1, %xmm2
addps %xmm2, %xmm4
movaps 52 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 14 * SIZE(AA), %xmm1
addps %xmm2, %xmm5
movaps 64 * SIZE(BB), %xmm2
mulps %xmm1, %xmm3
addps %xmm3, %xmm6
movaps 60 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 24 * SIZE(AA), %xmm1
addps %xmm3, %xmm7
movaps 72 * SIZE(BB), %xmm3
addl $16 * SIZE, AA
addl $64 * SIZE, BB
decl %eax
jne .L51
ALIGN_2
#endif
.L52:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
movaps ALPHA, %xmm3
andl $7, %eax # if (k & 7)
BRANCH
je .L54

.L53:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 2 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
movaps 8 * SIZE(BB), %xmm2
addl $2 * SIZE, AA
addl $8 * SIZE, BB
decl %eax
jg .L53
ALIGN_4

.L54:
addps %xmm6, %xmm4
addps %xmm7, %xmm5
mulps %xmm3, %xmm4
mulps %xmm3, %xmm5

#ifndef TRMMKERNEL
#ifdef movsd
xorps %xmm0, %xmm0
#endif
movsd 0 * SIZE(%esi), %xmm0
addps %xmm0, %xmm4
#ifdef movsd
xorps %xmm0, %xmm0
#endif
movsd 0 * SIZE(%esi, LDC), %xmm0
addps %xmm0, %xmm5
#endif

movlps %xmm4, 0 * SIZE(%esi)
movlps %xmm5, 0 * SIZE(%esi, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl K, %eax
subl KKK, %eax
leal (,%eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addl $2, KK
#endif

addl $2 * SIZE, %esi
ALIGN_2
.L70:
testl $1, %ebx
jle .L99

#if (L1_DATA_LINESIZE == 64)
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB
movss 0 * SIZE + BUFFER, %xmm2
xorps %xmm4, %xmm4
movss 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movss 16 * SIZE + BUFFER, %xmm3
xorps %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#else
leal BUFFER, BB
movl KK, %eax
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB /* because it's doubled */
movss 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movss 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movss 16 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#endif

#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $1, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L72
ALIGN_2

.L71:
mulss %xmm0, %xmm2
mulss 4 * SIZE(BB), %xmm0
addss %xmm2, %xmm4
movss 8 * SIZE(BB), %xmm2
addss %xmm0, %xmm5
movss 1 * SIZE(AA), %xmm0
mulss %xmm0, %xmm2
mulss 12 * SIZE(BB), %xmm0
addss %xmm2, %xmm6
movss 32 * SIZE(BB), %xmm2
addss %xmm0, %xmm7
movss 2 * SIZE(AA), %xmm0
mulss %xmm0, %xmm3
mulss 20 * SIZE(BB), %xmm0
addss %xmm3, %xmm4
movss 24 * SIZE(BB), %xmm3
addss %xmm0, %xmm5
movss 3 * SIZE(AA), %xmm0
mulss %xmm0, %xmm3
mulss 28 * SIZE(BB), %xmm0
addss %xmm3, %xmm6
movss 48 * SIZE(BB), %xmm3
addss %xmm0, %xmm7
movss 8 * SIZE(AA), %xmm0
mulss %xmm1, %xmm2
mulss 36 * SIZE(BB), %xmm1
addss %xmm2, %xmm4
movss 40 * SIZE(BB), %xmm2
addss %xmm1, %xmm5
movss 5 * SIZE(AA), %xmm1
mulss %xmm1, %xmm2
mulss 44 * SIZE(BB), %xmm1
addss %xmm2, %xmm6
movss 64 * SIZE(BB), %xmm2
addss %xmm1, %xmm7
movss 6 * SIZE(AA), %xmm1
mulss %xmm1, %xmm3
mulss 52 * SIZE(BB), %xmm1
addss %xmm3, %xmm4
movss 56 * SIZE(BB), %xmm3
addss %xmm1, %xmm5
movss 7 * SIZE(AA), %xmm1
mulss %xmm1, %xmm3
mulss 60 * SIZE(BB), %xmm1
addss %xmm3, %xmm6
movss 80 * SIZE(BB), %xmm3
addss %xmm1, %xmm7
movss 12 * SIZE(AA), %xmm1
addl $ 8 * SIZE, AA
addl $64 * SIZE, BB
decl %eax
jne .L71
ALIGN_2
#else
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB
movss 0 * SIZE + BUFFER, %xmm2
xorps %xmm4, %xmm4
movss 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movss 8 * SIZE + BUFFER, %xmm3
xorps %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#else
leal BUFFER, BB
movl KK, %eax
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB /* because it's doubled */
movss 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
movss 0 * SIZE(AA), %xmm0
xorps %xmm5, %xmm5
movss 8 * SIZE(BB), %xmm3
xorps %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
#endif

#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $1, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L72
ALIGN_2

.L71:
mulss %xmm0, %xmm2
mulss 4 * SIZE(BB), %xmm0
addss %xmm2, %xmm4
movss 16 * SIZE(BB), %xmm2
addss %xmm0, %xmm5
movss 1 * SIZE(AA), %xmm0
mulss %xmm0, %xmm3
mulss 12 * SIZE(BB), %xmm0
addss %xmm3, %xmm6
movss 24 * SIZE(BB), %xmm3
addss %xmm0, %xmm7
movss 2 * SIZE(AA), %xmm0
mulss %xmm0, %xmm2
mulss 20 * SIZE(BB), %xmm0
addss %xmm2, %xmm4
movss 32 * SIZE(BB), %xmm2
addss %xmm0, %xmm5
movss 3 * SIZE(AA), %xmm0
mulss %xmm0, %xmm3
mulss 28 * SIZE(BB), %xmm0
addss %xmm3, %xmm6
movss 40 * SIZE(BB), %xmm3
addss %xmm0, %xmm7
movss 8 * SIZE(AA), %xmm0
mulss %xmm1, %xmm2
mulss 36 * SIZE(BB), %xmm1
addss %xmm2, %xmm4
movss 48 * SIZE(BB), %xmm2
addss %xmm1, %xmm5
movss 5 * SIZE(AA), %xmm1
mulss %xmm1, %xmm3
mulss 44 * SIZE(BB), %xmm1
addss %xmm3, %xmm6
movss 56 * SIZE(BB), %xmm3
addss %xmm1, %xmm7
movss 6 * SIZE(AA), %xmm1
mulss %xmm1, %xmm2
mulss 52 * SIZE(BB), %xmm1
addss %xmm2, %xmm4
movss 64 * SIZE(BB), %xmm2
addss %xmm1, %xmm5
movss 7 * SIZE(AA), %xmm1
mulss %xmm1, %xmm3
mulss 60 * SIZE(BB), %xmm1
addss %xmm3, %xmm6
movss 72 * SIZE(BB), %xmm3
addss %xmm1, %xmm7
movss 12 * SIZE(AA), %xmm1
addl $ 8 * SIZE, AA
addl $64 * SIZE, BB
decl %eax
jne .L71
ALIGN_2
#endif
.L72:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
movss ALPHA, %xmm3
andl $7, %eax # if (k & 7)
BRANCH
je .L74

.L73:
mulss %xmm0, %xmm2
mulss 4 * SIZE(BB), %xmm0
addss %xmm2, %xmm4
movss 8 * SIZE(BB), %xmm2
addss %xmm0, %xmm5
movss 1 * SIZE(AA), %xmm0
addl $1 * SIZE, AA
addl $8 * SIZE, BB
decl %eax
jg .L73
ALIGN_4

.L74:
addss %xmm6, %xmm4
addss %xmm7, %xmm5
mulss %xmm3, %xmm4
mulss %xmm3, %xmm5

#ifndef TRMMKERNEL
addss 0 * SIZE(%esi), %xmm4
addss 0 * SIZE(%esi, LDC), %xmm5
#endif

movss %xmm4, 0 * SIZE(%esi)
movss %xmm5, 0 * SIZE(%esi, LDC)
addl $1 * SIZE, %esi

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl K, %eax
subl KKK, %eax
leal (,%eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addl $1, KK
#endif
ALIGN_2

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
#endif
leal (, LDC, 2), %eax
addl %eax, C # c += 2 * ldc
BRANCH
decl J # j --
jg .L01
ALIGN_2
.L100:
movl N, %eax
testl $1, %eax
jle .L999
ALIGN_2
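
/*
 * n is odd: one column of B remains. Same structure as the 2-column
 * case, with a 1-wide B panel; .L102 broadcasts 8 B values per pass.
 */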
.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
#endif

/* Copying to Sub Buffer */
movl K, %eax
leal BUFFER, %ecx
sarl $3, %eax
jle .L103
ALIGN_4

.L102:
prefetchnta 96 * SIZE(B)
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
movss 2 * SIZE(B), %xmm2
movss 3 * SIZE(B), %xmm3
movss 4 * SIZE(B), %xmm4
movss 5 * SIZE(B), %xmm5
movss 6 * SIZE(B), %xmm6
movss 7 * SIZE(B), %xmm7
addl $ 8 * SIZE, B
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
shufps $0, %xmm2, %xmm2
shufps $0, %xmm3, %xmm3
shufps $0, %xmm4, %xmm4
shufps $0, %xmm5, %xmm5
shufps $0, %xmm6, %xmm6
shufps $0, %xmm7, %xmm7
movaps %xmm0, 0 * SIZE(%ecx)
movaps %xmm1, 4 * SIZE(%ecx)
movaps %xmm2, 8 * SIZE(%ecx)
movaps %xmm3, 12 * SIZE(%ecx)
movaps %xmm4, 16 * SIZE(%ecx)
movaps %xmm5, 20 * SIZE(%ecx)
movaps %xmm6, 24 * SIZE(%ecx)
movaps %xmm7, 28 * SIZE(%ecx)
addl $32 * SIZE, %ecx
decl %eax
BRANCH
jne .L102
ALIGN_2

.L103:
movl K, %eax
andl $7, %eax
BRANCH
jle .L105
ALIGN_2

.L104:
movss 0 * SIZE(B), %xmm0
addl $1 * SIZE, B
shufps $0, %xmm0, %xmm0
movaps %xmm0, 0 * SIZE(%ecx)
addl $4 * SIZE, %ecx
decl %eax
jne .L104
ALIGN_4

.L105:
movl C, %esi # coffset = c
movl A, AA # aoffset = a
movl M, %ebx
sarl $3, %ebx # i = (m >> 3)
jle .L130
ALIGN_4
.L110:
#if (L1_DATA_LINESIZE == 64)
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#else
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 2), BB	/* because it's doubled */
	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#endif
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$8, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L112
	ALIGN_2
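/* 8 x 1 inner loop, unrolled 8 times: each pass consumes 64 floats of A
   and 32 floats of BUFFER (eight broadcast B values). */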
.L111:
	mulps	%xmm2, %xmm0
	mulps	 4 * SIZE(AA), %xmm2
	addps	%xmm0, %xmm4
	movaps	 8 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm2, %xmm0
	mulps	12 * SIZE(AA), %xmm2
	addps	%xmm0, %xmm6
	movaps	32 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	 8 * SIZE(BB), %xmm2
	mulps	%xmm2, %xmm1
	mulps	20 * SIZE(AA), %xmm2
	addps	%xmm1, %xmm4
	movaps	24 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm2, %xmm1
	mulps	28 * SIZE(AA), %xmm2
	addps	%xmm1, %xmm6
	movaps	48 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm3, %xmm0
	mulps	36 * SIZE(AA), %xmm3
	addps	%xmm0, %xmm4
	movaps	40 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm3, %xmm0
	mulps	44 * SIZE(AA), %xmm3
	addps	%xmm0, %xmm6
	movaps	64 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm7
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm3, %xmm1
	mulps	52 * SIZE(AA), %xmm3
	addps	%xmm1, %xmm4
	movaps	56 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm3, %xmm1
	mulps	60 * SIZE(AA), %xmm3
	addps	%xmm1, %xmm6
	movaps	80 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3
	addl	$64 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L111
	ALIGN_2
#else
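/* Variant for L1 lines other than 64 bytes (presumably 32): the same
   8 x 1 kernel, but the second register pair is preloaded only
   8 floats ahead instead of 16. */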
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#else
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 2), BB	/* because it's doubled */
	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#endif
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$8, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L112
	ALIGN_2
.L111:
	mulps	%xmm2, %xmm0
	mulps	 4 * SIZE(AA), %xmm2
	addps	%xmm0, %xmm4
	movaps	16 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm2, %xmm1
	mulps	12 * SIZE(AA), %xmm2
	addps	%xmm1, %xmm6
	movaps	24 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	16 * SIZE(BB), %xmm2
	mulps	%xmm3, %xmm0
	mulps	20 * SIZE(AA), %xmm3
	addps	%xmm0, %xmm4
	movaps	32 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	12 * SIZE(BB), %xmm3
	mulps	%xmm3, %xmm1
	mulps	28 * SIZE(AA), %xmm3
	addps	%xmm1, %xmm6
	movaps	40 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm2, %xmm0
	mulps	36 * SIZE(AA), %xmm2
	addps	%xmm0, %xmm4
	movaps	48 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	20 * SIZE(BB), %xmm2
	mulps	%xmm2, %xmm1
	mulps	44 * SIZE(AA), %xmm2
	addps	%xmm1, %xmm6
	movaps	56 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm3, %xmm0
	mulps	52 * SIZE(AA), %xmm3
	addps	%xmm0, %xmm4
	movaps	64 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm3, %xmm1
	mulps	60 * SIZE(AA), %xmm3
	addps	%xmm1, %xmm6
	movaps	72 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	40 * SIZE(BB), %xmm3
	addl	$64 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L111
	ALIGN_2
#endif
.L112:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA, %xmm3
	andl	$7, %eax	# if (k & 7)
	BRANCH
	je	.L114
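/* k-remainder: one 8 x 1 rank-1 update per leftover iteration. */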
.L113:
	movaps	0 * SIZE(BB), %xmm2
	movaps	0 * SIZE(AA), %xmm0
	mulps	%xmm2, %xmm0
	addps	%xmm0, %xmm4
	mulps	4 * SIZE(AA), %xmm2
	addps	%xmm2, %xmm5
	addl	$8 * SIZE, AA
	addl	$4 * SIZE, BB
	subl	$1, %eax
	jg	.L113
	ALIGN_4
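/* Fold the four accumulators into two and scale by alpha;
   plain GEMM also adds the existing C values. */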
.L114:
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5
	mulps	%xmm3, %xmm4
	mulps	%xmm3, %xmm5
#ifndef TRMMKERNEL
	movsd	0 * SIZE(%esi), %xmm0
	movhps	2 * SIZE(%esi), %xmm0
	addps	%xmm0, %xmm4
	movsd	4 * SIZE(%esi), %xmm0
	movhps	6 * SIZE(%esi), %xmm0
	addps	%xmm0, %xmm5
#endif
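/* Store the eight results: movsd/unpckhpd when SSE2 is available,
   movlps/movhps pairs otherwise. */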
#ifdef HAVE_SSE2
	movsd	%xmm4, 0 * SIZE(%esi)
	unpckhpd	%xmm4, %xmm4
	movsd	%xmm4, 2 * SIZE(%esi)
	movsd	%xmm5, 4 * SIZE(%esi)
	unpckhpd	%xmm5, %xmm5
	movsd	%xmm5, 6 * SIZE(%esi)
#else
	movlps	%xmm4, 0 * SIZE(%esi)
	movhps	%xmm4, 2 * SIZE(%esi)
	movlps	%xmm5, 4 * SIZE(%esi)
	movhps	%xmm5, 6 * SIZE(%esi)
#endif
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 2), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$8, KK
#endif
	addl	$8 * SIZE, %esi
	BRANCH
	decl	%ebx		# i--
	jg	.L110
	ALIGN_2
.L130:
	movl	M, %ebx
	andl	$7, %ebx
	jle	.L999
	testl	$4, %ebx
	jle	.L150
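/* m & 4 tail: one 4-wide tile, same structure as the 8-wide kernel. */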
#if (L1_DATA_LINESIZE == 64)
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#else
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB	/* because it's doubled */
	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#endif
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L132
	ALIGN_2
.L131:
	mulps	%xmm0, %xmm2
	movaps	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	mulps	 4 * SIZE(BB), %xmm0
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 8 * SIZE(AA), %xmm0
	mulps	 8 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm6
	movaps	12 * SIZE(AA), %xmm0
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm3
	movaps	20 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	mulps	20 * SIZE(BB), %xmm1
	movaps	48 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm5
	movaps	24 * SIZE(AA), %xmm1
	mulps	24 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm6
	movaps	28 * SIZE(AA), %xmm1
	mulps	28 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm7
	movaps	48 * SIZE(AA), %xmm1
	addl	$32 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L131
	ALIGN_2
#else
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#else
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB	/* because it's doubled */
	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#endif
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L132
	ALIGN_2
.L131:
	mulps	%xmm0, %xmm2
	movaps	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	mulps	 4 * SIZE(BB), %xmm0
	movaps	16 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	16 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm3
	movaps	12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm6
	mulps	12 * SIZE(BB), %xmm1
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1
	mulps	%xmm0, %xmm2
	movaps	20 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	mulps	20 * SIZE(BB), %xmm0
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	32 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm3
	movaps	28 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm6
	mulps	28 * SIZE(BB), %xmm1
	movaps	40 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	40 * SIZE(AA), %xmm1
	addl	$32 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L131
	ALIGN_2
#endif
.L132:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA, %xmm3
	andl	$7, %eax	# if (k & 7)
	BRANCH
	je	.L134
.L133:
	movaps	0 * SIZE(BB), %xmm2
	movaps	0 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L133
	ALIGN_4
.L134:
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	%xmm3, %xmm4
#ifndef TRMMKERNEL
	movsd	0 * SIZE(%esi), %xmm0
	movhps	2 * SIZE(%esi), %xmm0
	addps	%xmm0, %xmm4
#endif
	movlps	%xmm4, 0 * SIZE(%esi)
	movhps	%xmm4, 2 * SIZE(%esi)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$4, KK
#endif
	addl	$4 * SIZE, %esi
	ALIGN_2
.L150:
	testl	$2, %ebx
	jle	.L170
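/* m & 2 tail: 2-wide tile; A is loaded pairwise with movsd. */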
#if (L1_DATA_LINESIZE == 64)
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#else
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB	/* because it's doubled */
	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#endif
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L152
	ALIGN_2
.L151:
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3
	addl	$16 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L151
	ALIGN_2
#else
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#else
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB	/* because it's doubled */
	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#endif
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L152
	ALIGN_2
.L151:
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	16 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm3
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm6
	movaps	12 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm7
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm4
	movaps	20 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	40 * SIZE(BB), %xmm3
	addl	$16 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L151
	ALIGN_2
#endif
.L152:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA, %xmm3
	andl	$7, %eax	# if (k & 7)
	BRANCH
	je	.L154
.L153:
	mulps	%xmm0, %xmm2
	movsd	2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	4 * SIZE(BB), %xmm2
	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L153
	ALIGN_4
.L154:
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	%xmm3, %xmm4
#ifndef TRMMKERNEL
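/* Some targets redefine movsd as a macro (e.g. to movlps, which leaves
   the upper lanes unchanged); clear %xmm0 first so its high half is
   known to be zero. */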
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	0 * SIZE(%esi), %xmm0
	addps	%xmm0, %xmm4
#endif
	movlps	%xmm4, 0 * SIZE(%esi)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif
	addl	$2 * SIZE, %esi
	ALIGN_2
.L170:
	testl	$1, %ebx
	jle	.L999
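/* m & 1 tail: scalar kernel for the last row. */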
#if (L1_DATA_LINESIZE == 64)
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB
	movss	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movss	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movss	16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movss	 4 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#else
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 4), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */
	movss	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movss	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movss	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movss	 4 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#endif
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L172
	ALIGN_2
.L171:
	mulss	%xmm0, %xmm2
	movss	 1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	mulss	 4 * SIZE(BB), %xmm0
	movss	32 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 2 * SIZE(AA), %xmm0
	mulss	 8 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm6
	movss	 3 * SIZE(AA), %xmm0
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm3
	movss	 5 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm4
	mulss	20 * SIZE(BB), %xmm1
	movss	48 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm5
	movss	 6 * SIZE(AA), %xmm1
	mulss	24 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm6
	movss	 7 * SIZE(AA), %xmm1
	mulss	28 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1
	addl	$ 8 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L171
	ALIGN_2
#else
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB
	movss	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movss	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movss	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movss	 4 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#else
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 4), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */
	movss	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movss	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movss	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movss	 4 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7
#endif
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L172
	ALIGN_2
.L171:
	mulss	%xmm0, %xmm2
	movss	 1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	mulss	 4 * SIZE(BB), %xmm0
	movss	16 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 2 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	movss	 3 * SIZE(AA), %xmm0
	addss	%xmm3, %xmm6
	mulss	12 * SIZE(BB), %xmm0
	movss	24 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm2
	movss	 5 * SIZE(AA), %xmm1
	addss	%xmm2, %xmm4
	mulss	20 * SIZE(BB), %xmm1
	movss	32 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm5
	movss	 6 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	movss	 7 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm6
	mulss	28 * SIZE(BB), %xmm1
	movss	40 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1
	addl	$ 8 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L171
	ALIGN_2
#endif
.L172:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movss	ALPHA, %xmm3
	andl	$7, %eax	# if (k & 7)
	BRANCH
	je	.L174
.L173:
	movss	0 * SIZE(AA), %xmm0
	movss	0 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
	addss	%xmm2, %xmm4
	addl	$1 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L173
	ALIGN_4
.L174:
	addss	%xmm5, %xmm4
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4
	mulss	%xmm3, %xmm4
#ifndef TRMMKERNEL
	addss	0 * SIZE(%esi), %xmm4
#endif
	movss	%xmm4, 0 * SIZE(%esi)
	ALIGN_2
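/* Epilogue: restore the caller's stack pointer and the callee-saved
   registers, then return. */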
.L999:
	movl	OLD_STACK, %esp
	EMMS
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
	ALIGN_2
	EPILOGUE