
gemm_kernel_4x8_nano.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
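/* SGEMM kernel using a 4x8 register blocking (4 rows of M by 8
   columns of N per inner tile) for SSE3-class x86-64 cores; the file
   name suggests it was tuned for the VIA Nano. Computes
   C := alpha * A * B + C, with TRMM variants selected by the
   TRMMKERNEL/LEFT/TRANSA macros. Each column panel of B is first
   repacked into an on-stack BUFFER before the multiply loops run. */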
#define ASSEMBLER
#include "common.h"
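/* Integer register roles: the incoming argument registers rdi/rsi
   (OLD_M/OLD_N) are copied into callee-saved r13/r14 (M/N) by the
   prologue so rdi/rsi can be reused as the AO/BO stream pointers. */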
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define N %r14
#define K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define CO2 %rbp
#define BB %r12
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)
#else
#define STACKSIZE 256
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#endif
#define ALPHA 0(%rsp)
#define J 16(%rsp)
#define OFFSET 24(%rsp)
#define KK 32(%rsp)
#define KKK 40(%rsp)
#define BUFFER 256(%rsp)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define RPREFETCHSIZE (16 * 4)
#define PREFETCHSIZE (16 * 8 + 8)
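/* Prologue: allocate the frame, save the callee-saved registers (plus
   rdi/rsi and xmm6-xmm15 under the Windows ABI), broadcast alpha into
   ALPHA, and align the stack to a 4 KB boundary for the BUFFER
   scratch area that holds the packed copy of B. */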
        PROLOGUE
        PROFCODE
        subq $STACKSIZE, %rsp
        movq %rbx, 0(%rsp)
        movq %rbp, 8(%rsp)
        movq %r12, 16(%rsp)
        movq %r13, 24(%rsp)
        movq %r14, 32(%rsp)
        movq %r15, 40(%rsp)
#ifdef WINDOWS_ABI
        movq %rdi, 48(%rsp)
        movq %rsi, 56(%rsp)
        movups %xmm6, 64(%rsp)
        movups %xmm7, 80(%rsp)
        movups %xmm8, 96(%rsp)
        movups %xmm9, 112(%rsp)
        movups %xmm10, 128(%rsp)
        movups %xmm11, 144(%rsp)
        movups %xmm12, 160(%rsp)
        movups %xmm13, 176(%rsp)
        movups %xmm14, 192(%rsp)
        movups %xmm15, 208(%rsp)
        movq ARG1, OLD_M
        movq ARG2, OLD_N
        movq ARG3, K
        movq OLD_A, A
        movq OLD_B, B
        movq OLD_C, C
        movq OLD_LDC, LDC
#ifdef TRMMKERNEL
        movsd OLD_OFFSET, %xmm4
#endif
        movaps %xmm3, %xmm0
#else
        movq OLD_LDC, LDC
#ifdef TRMMKERNEL
        movsd OLD_OFFSET, %xmm4
#endif
#endif
        movq %rsp, %rbx # save old stack
        subq $256 + LOCAL_BUFFER_SIZE, %rsp
        andq $-4096, %rsp # align stack
        STACK_TOUCHING
        movq OLD_M, M
        movq OLD_N, N
        shufps $0, %xmm0, %xmm0
        movaps %xmm0, ALPHA
#ifdef TRMMKERNEL
        movsd %xmm4, OFFSET
        movsd %xmm4, KK
#ifndef LEFT
        negq KK
#endif
#endif
        subq $-32 * SIZE, A
        salq $BASE_SHIFT, LDC
        movq N, J
        sarq $3, J
        jle .L40
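/* J loop (.L01): process N in panels of 8 columns. .L02/.L03 repack
   the B panel into BUFFER: pshufd $0x50 and $0xfa expand each movaps
   load [b0 b1 b2 b3] into [b0 b0 b1 b1] and [b2 b2 b3 b3], so every
   B element is stored twice for the pair-wise mulps in the kernels. */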
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
        movq OFFSET, %rax
        movq %rax, KK
#endif
        leaq 32 * SIZE + BUFFER, BO
        movaps 0 * SIZE(B), %xmm1
        movaps 4 * SIZE(B), %xmm3
        movaps 8 * SIZE(B), %xmm5
        movaps 12 * SIZE(B), %xmm7
        movq K, %rax
        sarq $1, %rax
        jle .L03
        ALIGN_4
.L02:
        PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
        pshufd $0x50, %xmm1, %xmm0
        movaps %xmm0, -32 * SIZE(BO)
        pshufd $0xfa, %xmm1, %xmm1
        movaps %xmm1, -28 * SIZE(BO)
        movaps 16 * SIZE(B), %xmm1
        pshufd $0x50, %xmm3, %xmm2
        movaps %xmm2, -24 * SIZE(BO)
        pshufd $0xfa, %xmm3, %xmm3
        movaps %xmm3, -20 * SIZE(BO)
        movaps 20 * SIZE(B), %xmm3
        pshufd $0x50, %xmm5, %xmm4
        movaps %xmm4, -16 * SIZE(BO)
        pshufd $0xfa, %xmm5, %xmm5
        movaps %xmm5, -12 * SIZE(BO)
        movaps 24 * SIZE(B), %xmm5
        pshufd $0x50, %xmm7, %xmm6
        movaps %xmm6, -8 * SIZE(BO)
        pshufd $0xfa, %xmm7, %xmm7
        movaps %xmm7, -4 * SIZE(BO)
        movaps 28 * SIZE(B), %xmm7
        addq $16 * SIZE, B
        addq $32 * SIZE, BO
        decq %rax
        jne .L02
        ALIGN_4
.L03:
        movq K, %rax
        andq $1, %rax
        BRANCH
        jle .L10
        pshufd $0x50, %xmm1, %xmm0
        movaps %xmm0, -32 * SIZE(BO)
        pshufd $0xfa, %xmm1, %xmm1
        movaps %xmm1, -28 * SIZE(BO)
        pshufd $0x50, %xmm3, %xmm2
        movaps %xmm2, -24 * SIZE(BO)
        pshufd $0xfa, %xmm3, %xmm3
        movaps %xmm3, -20 * SIZE(BO)
        addq $ 8 * SIZE, B
        subq $-16 * SIZE, BO
        ALIGN_4
.L10:
        movq C, CO1
        leaq (C, LDC, 4), CO2
        movq A, AO
        leaq (RPREFETCHSIZE + 0) * SIZE(B), BB
        movq M, I
        sarq $2, I
        jle .L20
        ALIGN_4
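/* .L11: 4x8 micro-kernel, unrolled four deep over K. xmm8-xmm15
   accumulate the 4x8 C tile; because of the duplicated B layout and
   the pshufd $0x4e half-swap, each accumulator holds rows 0-1 of one
   column in its low half and rows 2-3 of a neighbouring column in its
   high half, which the crossed movlps/movhps loads and stores at .L18
   undo. */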
.L11:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq 32 * SIZE + BUFFER, BO
#else
        leaq 32 * SIZE + BUFFER, BO
        movq KK, %rax
        salq $BASE_SHIFT + 1, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 8), BO
#endif
        leaq (LDC, LDC, 2), %rax
        movaps -32 * SIZE(AO), %xmm0
        movaps -32 * SIZE(BO), %xmm1
        pxor %xmm8, %xmm8
        PREFETCHW 3 * SIZE(CO1)
        pxor %xmm9, %xmm9
        PREFETCHW 5 * SIZE(CO1, LDC, 1)
        pxor %xmm10, %xmm10
        PREFETCHW 3 * SIZE(CO1, LDC, 2)
        pxor %xmm11, %xmm11
        PREFETCHW 5 * SIZE(CO1, %rax)
        pxor %xmm12, %xmm12
        PREFETCHW 3 * SIZE(CO2)
        pxor %xmm13, %xmm13
        PREFETCHW 5 * SIZE(CO2, LDC, 1)
        pxor %xmm14, %xmm14
        PREFETCHW 3 * SIZE(CO2, LDC, 2)
        pxor %xmm15, %xmm15
        PREFETCHW 5 * SIZE(CO2, %rax)
        PREFETCH -32 * SIZE(BB)
        addq $16 * SIZE, BB
#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $4, %rax
#else
        addq $8, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L16
        ALIGN_3
.L12:
        PREFETCH (PREFETCHSIZE + 0)(AO)
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        movaps %xmm0, %xmm2
        movaps -16 * SIZE(AO), %xmm0
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm10
        movaps -24 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm11
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm12
        movaps -20 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm13
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm14
        movaps -16 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        movaps -28 * SIZE(AO), %xmm2
        addps %xmm3, %xmm15
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm8
        movaps -12 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm10
        movaps -8 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm11
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm12
        movaps -4 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm13
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm14
        movaps 0 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        movaps -24 * SIZE(AO), %xmm2
        addps %xmm3, %xmm15
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm8
        movaps 4 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm10
        movaps 8 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm11
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm12
        movaps 12 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm13
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm14
        movaps 16 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        movaps -20 * SIZE(AO), %xmm2
        addps %xmm3, %xmm15
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm8
        movaps 20 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm10
        movaps 24 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm11
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm12
        movaps 28 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm13
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm14
        movaps 32 * SIZE(BO), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm15
        subq $-16 * SIZE, AO
        addq $ 64 * SIZE, BO
        decq %rax
        BRANCH
        jg .L12
.L16:
        movaps ALPHA, %xmm7
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax
        je .L18
        ALIGN_4
.L17:
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm10
        movaps -24 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm11
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm12
        movaps -20 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm13
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm14
        movaps -16 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        movaps -28 * SIZE(AO), %xmm0
        addps %xmm3, %xmm15
        addq $ 4 * SIZE, AO
        subq $-16 * SIZE, BO
        decq %rax
        jg .L17
        ALIGN_4
.L18:
        leaq (LDC, LDC, 2), %rax
        mulps %xmm7, %xmm8
        mulps %xmm7, %xmm9
        mulps %xmm7, %xmm10
        mulps %xmm7, %xmm11
#ifndef TRMMKERNEL
        movsd 0 * SIZE(CO1), %xmm0
        movhps 2 * SIZE(CO1, LDC, 1), %xmm0
        movsd 0 * SIZE(CO1, LDC, 1), %xmm1
        movhps 2 * SIZE(CO1), %xmm1
        movsd 0 * SIZE(CO1, LDC, 2), %xmm2
        movhps 2 * SIZE(CO1, %rax), %xmm2
        movsd 0 * SIZE(CO1, %rax), %xmm3
        movhps 2 * SIZE(CO1, LDC, 2), %xmm3
        addps %xmm0, %xmm8
        addps %xmm1, %xmm9
        addps %xmm2, %xmm10
        addps %xmm3, %xmm11
#endif
        mulps %xmm7, %xmm12
        mulps %xmm7, %xmm13
        mulps %xmm7, %xmm14
        mulps %xmm7, %xmm15
#ifndef TRMMKERNEL
        movsd 0 * SIZE(CO2), %xmm4
        movhps 2 * SIZE(CO2, LDC, 1), %xmm4
        movsd 0 * SIZE(CO2, LDC, 1), %xmm5
        movhps 2 * SIZE(CO2), %xmm5
        movsd 0 * SIZE(CO2, LDC, 2), %xmm6
        movhps 2 * SIZE(CO2, %rax), %xmm6
        movsd 0 * SIZE(CO2, %rax), %xmm7
        movhps 2 * SIZE(CO2, LDC, 2), %xmm7
        addps %xmm4, %xmm12
        addps %xmm5, %xmm13
        addps %xmm6, %xmm14
        addps %xmm7, %xmm15
#endif
        movlps %xmm8, 0 * SIZE(CO1)
        movhps %xmm8, 2 * SIZE(CO1, LDC, 1)
        movlps %xmm9, 0 * SIZE(CO1, LDC, 1)
        movhps %xmm9, 2 * SIZE(CO1)
        movlps %xmm10, 0 * SIZE(CO1, LDC, 2)
        movhps %xmm10, 2 * SIZE(CO1, %rax)
        movlps %xmm11, 0 * SIZE(CO1, %rax)
        movhps %xmm11, 2 * SIZE(CO1, LDC, 2)
        movlps %xmm12, 0 * SIZE(CO2)
        movhps %xmm12, 2 * SIZE(CO2, LDC, 1)
        movlps %xmm13, 0 * SIZE(CO2, LDC, 1)
        movhps %xmm13, 2 * SIZE(CO2)
        movlps %xmm14, 0 * SIZE(CO2, LDC, 2)
        movhps %xmm14, 2 * SIZE(CO2, %rax)
        movlps %xmm15, 0 * SIZE(CO2, %rax)
        movhps %xmm15, 2 * SIZE(CO2, LDC, 2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT + 1, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 8), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $4, KK
#endif
        addq $4 * SIZE, CO1
        addq $4 * SIZE, CO2
        decq I
        jg .L11
        ALIGN_4
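/* .L20: two remaining rows of M (M & 2). movddup broadcasts a pair of
   A values, so each of xmm8-xmm11 accumulates a 2x2 sub-tile and the
   four registers together cover the 8 columns. */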
.L20:
        testq $2, M
        je .L30
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq 32 * SIZE + BUFFER, BO
#else
        leaq 32 * SIZE + BUFFER, BO
        movq KK, %rax
        salq $BASE_SHIFT + 1, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 8), BO
#endif
        movddup -32 * SIZE(AO), %xmm0
        pxor %xmm8, %xmm8
        movaps -32 * SIZE(BO), %xmm1
        pxor %xmm9, %xmm9
        pxor %xmm10, %xmm10
        pxor %xmm11, %xmm11
#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $2, %rax
#else
        addq $8, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L26
        ALIGN_3
.L22:
        PREFETCH (PREFETCHSIZE + 0)(AO)
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps -24 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps -20 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps -16 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movddup -30 * SIZE(AO), %xmm0
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -12 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps -8 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps -4 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps 0 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movddup -28 * SIZE(AO), %xmm0
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps 4 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps 8 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps 12 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps 16 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movddup -26 * SIZE(AO), %xmm0
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps 20 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps 24 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps 28 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps 32 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movddup -24 * SIZE(AO), %xmm0
        subq $-8 * SIZE, AO
        addq $64 * SIZE, BO
        decq %rax
        BRANCH
        jg .L22
.L26:
        movaps ALPHA, %xmm7
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax
        je .L28
        ALIGN_4
.L27:
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps -24 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps -20 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps -16 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movddup -30 * SIZE(AO), %xmm0
        addq $ 2 * SIZE, AO
        subq $-16 * SIZE, BO
        decq %rax
        jg .L27
        ALIGN_4
.L28:
        leaq (LDC, LDC, 2), %rax
        mulps %xmm7, %xmm8
        mulps %xmm7, %xmm9
        mulps %xmm7, %xmm10
        mulps %xmm7, %xmm11
#ifndef TRMMKERNEL
        movsd (CO1), %xmm0
        movhps (CO1, LDC, 1), %xmm0
        movsd (CO1, LDC, 2), %xmm1
        movhps (CO1, %rax), %xmm1
        movsd (CO2), %xmm2
        movhps (CO2, LDC, 1), %xmm2
        movsd (CO2, LDC, 2), %xmm3
        movhps (CO2, %rax), %xmm3
        addps %xmm0, %xmm8
        addps %xmm1, %xmm9
        addps %xmm2, %xmm10
        addps %xmm3, %xmm11
#endif
        movlps %xmm8, (CO1)
        movhps %xmm8, (CO1, LDC, 1)
        movlps %xmm9, (CO1, LDC, 2)
        movhps %xmm9, (CO1, %rax)
        movlps %xmm10, (CO2)
        movhps %xmm10, (CO2, LDC, 1)
        movlps %xmm11, (CO2, LDC, 2)
        movhps %xmm11, (CO2, %rax)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT + 1, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 8), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $2, KK
#endif
        addq $2 * SIZE, CO1
        addq $2 * SIZE, CO2
        ALIGN_4
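/* .L30: last row of M (M & 1). shufps $0 broadcasts a single A value;
   movhlps splits each accumulator so the eight column results can be
   written back with scalar addss/movss. */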
.L30:
        testq $1, M
        je .L39
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq 32 * SIZE + BUFFER, BO
#else
        leaq 32 * SIZE + BUFFER, BO
        movq KK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        addq %rax, %rax
        leaq (BO, %rax, 8), BO
#endif
        movss -32 * SIZE(AO), %xmm0
        pxor %xmm8, %xmm8
        movaps -32 * SIZE(BO), %xmm1
        pxor %xmm9, %xmm9
        pxor %xmm10, %xmm10
        pxor %xmm11, %xmm11
#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $1, %rax
#else
        addq $8, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L36
        ALIGN_3
.L32:
        PREFETCH (PREFETCHSIZE + 0)(AO)
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps -24 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps -20 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps -16 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movss -31 * SIZE(AO), %xmm0
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -12 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps -8 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps -4 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps 0 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movss -30 * SIZE(AO), %xmm0
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps 4 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps 8 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps 12 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps 16 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movss -29 * SIZE(AO), %xmm0
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps 20 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps 24 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps 28 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps 32 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movss -28 * SIZE(AO), %xmm0
        subq $-4 * SIZE, AO
        addq $64 * SIZE, BO
        decq %rax
        BRANCH
        jg .L32
.L36:
        movaps ALPHA, %xmm7
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax
        je .L38
        ALIGN_4
.L37:
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm9
        movaps -24 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        mulps -20 * SIZE(BO), %xmm0
        addps %xmm1, %xmm10
        movaps -16 * SIZE(BO), %xmm1
        addps %xmm0, %xmm11
        movss -31 * SIZE(AO), %xmm0
        addq $ 1 * SIZE, AO
        subq $-16 * SIZE, BO
        decq %rax
        jg .L37
        ALIGN_4
.L38:
        leaq (LDC, LDC, 2), %rax
        mulps %xmm7, %xmm8
        mulps %xmm7, %xmm9
        mulps %xmm7, %xmm10
        mulps %xmm7, %xmm11
        movhlps %xmm8, %xmm12
        movhlps %xmm9, %xmm13
        movhlps %xmm10, %xmm14
        movhlps %xmm11, %xmm15
#ifndef TRMMKERNEL
        addss (CO1), %xmm8
        addss (CO1, LDC, 1), %xmm12
        addss (CO1, LDC, 2), %xmm9
        addss (CO1, %rax), %xmm13
        addss (CO2), %xmm10
        addss (CO2, LDC, 1), %xmm14
        addss (CO2, LDC, 2), %xmm11
        addss (CO2, %rax), %xmm15
#endif
        movss %xmm8, (CO1)
        movss %xmm12, (CO1, LDC, 1)
        movss %xmm9, (CO1, LDC, 2)
        movss %xmm13, (CO1, %rax)
        movss %xmm10, (CO2)
        movss %xmm14, (CO2, LDC, 1)
        movss %xmm11, (CO2, LDC, 2)
        movss %xmm15, (CO2, %rax)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        addq %rax, %rax
        leaq (BO, %rax, 8), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $1, KK
#endif
        ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl $8, KK
#endif
        leaq (C, LDC, 8), C
        decq J
        jg .L01
        ALIGN_4
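/* .L40: same scheme for a remaining 4-column panel (N & 4): repack B,
   then split M into 4-, 2- and 1-row cases (.L51/.L60/.L70). */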
.L40:
        testq $4, N
        jle .L80
#if defined(TRMMKERNEL) && defined(LEFT)
        movq OFFSET, %rax
        movq %rax, KK
#endif
        leaq 32 * SIZE + BUFFER, BO
        movaps 0 * SIZE(B), %xmm1
        movaps 4 * SIZE(B), %xmm3
        movaps 8 * SIZE(B), %xmm5
        movaps 12 * SIZE(B), %xmm7
        movq K, %rax
        sarq $2, %rax
        jle .L43
        ALIGN_4
.L42:
        PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
        pshufd $0x50, %xmm1, %xmm0
        movaps %xmm0, -32 * SIZE(BO)
        pshufd $0xfa, %xmm1, %xmm1
        movaps %xmm1, -28 * SIZE(BO)
        movaps 16 * SIZE(B), %xmm1
        pshufd $0x50, %xmm3, %xmm2
        movaps %xmm2, -24 * SIZE(BO)
        pshufd $0xfa, %xmm3, %xmm3
        movaps %xmm3, -20 * SIZE(BO)
        movaps 20 * SIZE(B), %xmm3
        pshufd $0x50, %xmm5, %xmm4
        movaps %xmm4, -16 * SIZE(BO)
        pshufd $0xfa, %xmm5, %xmm5
        movaps %xmm5, -12 * SIZE(BO)
        movaps 24 * SIZE(B), %xmm5
        pshufd $0x50, %xmm7, %xmm6
        movaps %xmm6, -8 * SIZE(BO)
        pshufd $0xfa, %xmm7, %xmm7
        movaps %xmm7, -4 * SIZE(BO)
        movaps 28 * SIZE(B), %xmm7
        addq $16 * SIZE, B
        addq $32 * SIZE, BO
        decq %rax
        jne .L42
        ALIGN_4
.L43:
        movq K, %rax
        andq $3, %rax
        BRANCH
        jle .L50
        ALIGN_4
.L45:
        pshufd $0x50, %xmm1, %xmm0
        movaps %xmm0, -32 * SIZE(BO)
        pshufd $0xfa, %xmm1, %xmm1
        movaps %xmm1, -28 * SIZE(BO)
        movaps 4 * SIZE(B), %xmm1
        addq $ 4 * SIZE, B
        subq $-8 * SIZE, BO
        decq %rax
        jne .L45
        ALIGN_4
.L50:
        movq C, CO1
        leaq (C, LDC, 2), CO2
        movq A, AO
        movq M, I
        sarq $2, I
        jle .L60
        ALIGN_4
.L51:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq 32 * SIZE + BUFFER, BO
#else
        leaq 32 * SIZE + BUFFER, BO
        movq KK, %rax
        salq $BASE_SHIFT + 1, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 4), BO
#endif
        movaps -32 * SIZE(AO), %xmm0
        movaps -32 * SIZE(BO), %xmm1
        pxor %xmm8, %xmm8
        PREFETCHW 3 * SIZE(CO1)
        pxor %xmm9, %xmm9
        PREFETCHW 5 * SIZE(CO1, LDC)
        pxor %xmm10, %xmm10
        PREFETCHW 3 * SIZE(CO2)
        pxor %xmm11, %xmm11
        PREFETCHW 5 * SIZE(CO2, LDC)
#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $4, %rax
#else
        addq $4, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L56
        ALIGN_3
.L52:
        PREFETCH (PREFETCHSIZE + 0)(AO)
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm10
        movaps -24 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        movaps -28 * SIZE(AO), %xmm0
        addps %xmm3, %xmm11
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -20 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm10
        movaps -16 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        movaps -24 * SIZE(AO), %xmm0
        addps %xmm3, %xmm11
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -12 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm10
        movaps -8 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        movaps -20 * SIZE(AO), %xmm0
        addps %xmm3, %xmm11
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -4 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm10
        movaps 0 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        movaps -16 * SIZE(AO), %xmm0
        addps %xmm3, %xmm11
        subq $-16 * SIZE, AO
        subq $-32 * SIZE, BO
        decq %rax
        BRANCH
        jg .L52
.L56:
        movaps ALPHA, %xmm7
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax
        je .L58
        ALIGN_4
.L57:
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm8
        movaps -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm9
        pshufd $0x4e, %xmm1, %xmm3
        mulps %xmm0, %xmm1
        addps %xmm1, %xmm10
        movaps -24 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm3
        movaps -28 * SIZE(AO), %xmm0
        addps %xmm3, %xmm11
        addq $ 4 * SIZE, AO
        subq $-8 * SIZE, BO
        decq %rax
        jg .L57
        ALIGN_4
.L58:
        mulps %xmm7, %xmm8
        mulps %xmm7, %xmm9
        mulps %xmm7, %xmm10
        mulps %xmm7, %xmm11
#ifndef TRMMKERNEL
        movsd 0 * SIZE(CO1), %xmm0
        movhps 2 * SIZE(CO1, LDC), %xmm0
        movsd 0 * SIZE(CO1, LDC), %xmm1
        movhps 2 * SIZE(CO1), %xmm1
        movsd 0 * SIZE(CO2), %xmm2
        movhps 2 * SIZE(CO2, LDC), %xmm2
        movsd 0 * SIZE(CO2, LDC), %xmm3
        movhps 2 * SIZE(CO2), %xmm3
        addps %xmm0, %xmm8
        addps %xmm1, %xmm9
        addps %xmm2, %xmm10
        addps %xmm3, %xmm11
#endif
        movlps %xmm8, 0 * SIZE(CO1)
        movhps %xmm8, 2 * SIZE(CO1, LDC)
        movlps %xmm9, 0 * SIZE(CO1, LDC)
        movhps %xmm9, 2 * SIZE(CO1)
        movlps %xmm10, 0 * SIZE(CO2)
        movhps %xmm10, 2 * SIZE(CO2, LDC)
        movlps %xmm11, 0 * SIZE(CO2, LDC)
        movhps %xmm11, 2 * SIZE(CO2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT + 1, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 4), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $4, KK
#endif
        addq $4 * SIZE, CO1
        addq $4 * SIZE, CO2
        decq I
        jg .L51
        ALIGN_4
.L60:
        testq $2, M
        je .L70
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq 32 * SIZE + BUFFER, BO
#else
        leaq 32 * SIZE + BUFFER, BO
        movq KK, %rax
        salq $BASE_SHIFT + 1, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 4), BO
#endif
        movddup -32 * SIZE(AO), %xmm0
        pxor %xmm8, %xmm8
        movaps -32 * SIZE(BO), %xmm1
        pxor %xmm9, %xmm9
        pxor %xmm10, %xmm10
        pxor %xmm11, %xmm11
#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $2, %rax
#else
        addq $4, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L66
        ALIGN_3
.L62:
        PREFETCH (PREFETCHSIZE + 0)(AO)
        mulps %xmm0, %xmm1
        mulps -28 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps -24 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movddup -30 * SIZE(AO), %xmm0
        mulps %xmm0, %xmm1
        mulps -20 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps -16 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movddup -28 * SIZE(AO), %xmm0
        mulps %xmm0, %xmm1
        mulps -12 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps -8 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movddup -26 * SIZE(AO), %xmm0
        mulps %xmm0, %xmm1
        mulps -4 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps 0 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movddup -24 * SIZE(AO), %xmm0
        subq $ -8 * SIZE, AO
        subq $-32 * SIZE, BO
        decq %rax
        BRANCH
        jg .L62
.L66:
        movaps ALPHA, %xmm7
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax
        je .L68
        ALIGN_4
.L67:
        mulps %xmm0, %xmm1
        mulps -28 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps -24 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movddup -30 * SIZE(AO), %xmm0
        addq $ 2 * SIZE, AO
        subq $-8 * SIZE, BO
        decq %rax
        jg .L67
        ALIGN_4
.L68:
        mulps %xmm7, %xmm8
        mulps %xmm7, %xmm9
#ifndef TRMMKERNEL
        movsd (CO1), %xmm0
        movhps (CO1, LDC), %xmm0
        movsd (CO2), %xmm1
        movhps (CO2, LDC), %xmm1
        addps %xmm0, %xmm8
        addps %xmm1, %xmm9
#endif
        movlps %xmm8, (CO1)
        movhps %xmm8, (CO1, LDC)
        movlps %xmm9, (CO2)
        movhps %xmm9, (CO2, LDC)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT + 1, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 4), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $2, KK
#endif
        addq $2 * SIZE, CO1
        addq $2 * SIZE, CO2
        ALIGN_4
.L70:
        testq $1, M
        je .L79
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq 32 * SIZE + BUFFER, BO
#else
        leaq 32 * SIZE + BUFFER, BO
        movq KK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 8), BO
#endif
        movss -32 * SIZE(AO), %xmm0
        pxor %xmm8, %xmm8
        movaps -32 * SIZE(BO), %xmm1
        pxor %xmm9, %xmm9
#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $1, %rax
#else
        addq $4, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L76
        ALIGN_3
.L72:
        PREFETCH (PREFETCHSIZE + 0)(AO)
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        mulps -28 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps -24 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movss -31 * SIZE(AO), %xmm0
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        mulps -20 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps -16 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movss -30 * SIZE(AO), %xmm0
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        mulps -12 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps -8 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movss -29 * SIZE(AO), %xmm0
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        mulps -4 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps 0 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movss -28 * SIZE(AO), %xmm0
        subq $ -4 * SIZE, AO
        subq $-32 * SIZE, BO
        decq %rax
        BRANCH
        jg .L72
.L76:
        movaps ALPHA, %xmm7
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax
        je .L78
        ALIGN_4
.L77:
        shufps $0, %xmm0, %xmm0
        mulps %xmm0, %xmm1
        mulps -28 * SIZE(BO), %xmm0
        addps %xmm1, %xmm8
        movaps -24 * SIZE(BO), %xmm1
        addps %xmm0, %xmm9
        movss -31 * SIZE(AO), %xmm0
        addq $ 1 * SIZE, AO
        subq $-8 * SIZE, BO
        decq %rax
        jg .L77
        ALIGN_4
.L78:
        mulps %xmm7, %xmm8
        mulps %xmm7, %xmm9
        movhlps %xmm8, %xmm10
        movhlps %xmm9, %xmm11
#ifndef TRMMKERNEL
        addss (CO1), %xmm8
        addss (CO1, LDC), %xmm10
        addss (CO2), %xmm9
        addss (CO2, LDC), %xmm11
#endif
        movss %xmm8, (CO1)
        movss %xmm10, (CO1, LDC)
        movss %xmm9, (CO2)
        movss %xmm11, (CO2, LDC)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 8), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $1, KK
#endif
        ALIGN_4
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl $4, KK
#endif
        leaq (C, LDC, 4), C
        ALIGN_4
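/* .L80: remaining 2-column panel (N & 2), again split into 4-, 2- and
   1-row cases (.L91/.L100/.L110). */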
  1303. .L80:
  1304. testq $2, N
  1305. jle .L120
  1306. #if defined(TRMMKERNEL) && defined(LEFT)
  1307. movq OFFSET, %rax
  1308. movq %rax, KK
  1309. #endif
  1310. leaq 32 * SIZE + BUFFER, BO
  1311. movaps 0 * SIZE(B), %xmm1
  1312. movaps 4 * SIZE(B), %xmm3
  1313. movq K, %rax
  1314. sarq $2, %rax
  1315. jle .L83
  1316. ALIGN_4
  1317. .L82:
  1318. pshufd $0x50, %xmm1, %xmm0
  1319. movaps %xmm0, -32 * SIZE(BO)
  1320. pshufd $0xfa, %xmm1, %xmm1
  1321. movaps %xmm1, -28 * SIZE(BO)
  1322. movaps 8 * SIZE(B), %xmm1
  1323. pshufd $0x50, %xmm3, %xmm2
  1324. movaps %xmm2, -24 * SIZE(BO)
  1325. pshufd $0xfa, %xmm3, %xmm3
  1326. movaps %xmm3, -20 * SIZE(BO)
  1327. movaps 12 * SIZE(B), %xmm3
  1328. addq $ 8 * SIZE, B
  1329. subq $-16 * SIZE, BO
  1330. decq %rax
  1331. jne .L82
  1332. ALIGN_4
  1333. .L83:
  1334. movq K, %rax
  1335. andq $3, %rax
  1336. BRANCH
  1337. jle .L90
  1338. ALIGN_4
  1339. .L85:
  1340. pshufd $0x50, %xmm1, %xmm0
  1341. movaps %xmm0, -32 * SIZE(BO)
  1342. movsd 2 * SIZE(B), %xmm1
  1343. addq $ 2 * SIZE, B
  1344. subq $-4 * SIZE, BO
  1345. decq %rax
  1346. jne .L85
  1347. ALIGN_4
  1348. .L90:
  1349. movq C, CO1
  1350. leaq (C, LDC), CO2
  1351. movq A, AO
  1352. movq M, I
  1353. sarq $2, I
  1354. jle .L100
  1355. ALIGN_4
  1356. .L91:
  1357. #if !defined(TRMMKERNEL) || \
  1358. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1359. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1360. leaq 32 * SIZE + BUFFER, BO
  1361. #else
  1362. leaq 32 * SIZE + BUFFER, BO
  1363. movq KK, %rax
  1364. salq $BASE_SHIFT, %rax
  1365. leaq (AO, %rax, 4), AO
  1366. leaq (BO, %rax, 4), BO
  1367. #endif
  1368. movaps -32 * SIZE(AO), %xmm0
  1369. movaps -32 * SIZE(BO), %xmm1
  1370. pxor %xmm8, %xmm8
  1371. PREFETCHW 3 * SIZE(CO1)
  1372. pxor %xmm9, %xmm9
  1373. PREFETCHW 3 * SIZE(CO2)
  1374. #ifndef TRMMKERNEL
  1375. movq K, %rax
  1376. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1377. movq K, %rax
  1378. subq KK, %rax
  1379. movq %rax, KKK
  1380. #else
  1381. movq KK, %rax
  1382. #ifdef LEFT
  1383. addq $4, %rax
  1384. #else
  1385. addq $2, %rax
  1386. #endif
  1387. movq %rax, KKK
  1388. #endif
  1389. sarq $2, %rax
  1390. NOBRANCH
  1391. jle .L96
  1392. ALIGN_3
  1393. .L92:
  1394. PREFETCH (PREFETCHSIZE + 0)(AO)
  1395. pshufd $0x4e, %xmm1, %xmm3
  1396. mulps %xmm0, %xmm1
  1397. addps %xmm1, %xmm8
  1398. movaps -28 * SIZE(BO), %xmm1
  1399. mulps %xmm0, %xmm3
  1400. addps %xmm3, %xmm9
  1401. movaps -28 * SIZE(AO), %xmm0
  1402. pshufd $0x4e, %xmm1, %xmm3
  1403. mulps %xmm0, %xmm1
  1404. addps %xmm1, %xmm8
  1405. movaps -24 * SIZE(BO), %xmm1
  1406. mulps %xmm0, %xmm3
  1407. addps %xmm3, %xmm9
  1408. movaps -24 * SIZE(AO), %xmm0
  1409. pshufd $0x4e, %xmm1, %xmm3
  1410. mulps %xmm0, %xmm1
  1411. addps %xmm1, %xmm8
  1412. movaps -20 * SIZE(BO), %xmm1
  1413. mulps %xmm0, %xmm3
  1414. addps %xmm3, %xmm9
  1415. movaps -20 * SIZE(AO), %xmm0
  1416. pshufd $0x4e, %xmm1, %xmm3
  1417. mulps %xmm0, %xmm1
  1418. addps %xmm1, %xmm8
  1419. movaps -16 * SIZE(BO), %xmm1
  1420. mulps %xmm0, %xmm3
  1421. addps %xmm3, %xmm9
  1422. movaps -16 * SIZE(AO), %xmm0
  1423. subq $-16 * SIZE, AO
  1424. subq $-16 * SIZE, BO
  1425. decq %rax
  1426. BRANCH
  1427. jg .L92
  1428. .L96:
  1429. movaps ALPHA, %xmm7
  1430. #ifndef TRMMKERNEL
  1431. movq K, %rax
  1432. #else
  1433. movq KKK, %rax
  1434. #endif
  1435. andq $3, %rax
  1436. je .L98
  1437. ALIGN_4
  1438. .L97:
  1439. pshufd $0x4e, %xmm1, %xmm3
  1440. mulps %xmm0, %xmm1
  1441. addps %xmm1, %xmm8
  1442. movaps -28 * SIZE(BO), %xmm1
  1443. mulps %xmm0, %xmm3
  1444. addps %xmm3, %xmm9
  1445. movaps -28 * SIZE(AO), %xmm0
  1446. addq $ 4 * SIZE, AO
  1447. subq $-4 * SIZE, BO
  1448. decq %rax
  1449. jg .L97
  1450. ALIGN_4
  1451. .L98:
  1452. mulps %xmm7, %xmm8
  1453. mulps %xmm7, %xmm9
  1454. #ifndef TRMMKERNEL
  1455. movsd 0 * SIZE(CO1), %xmm0
  1456. movhps 2 * SIZE(CO2), %xmm0
  1457. movsd 0 * SIZE(CO2), %xmm1
  1458. movhps 2 * SIZE(CO1), %xmm1
  1459. addps %xmm0, %xmm8
  1460. addps %xmm1, %xmm9
  1461. #endif
  1462. movlps %xmm8, 0 * SIZE(CO1)
  1463. movhps %xmm8, 2 * SIZE(CO2)
  1464. movlps %xmm9, 0 * SIZE(CO2)
  1465. movhps %xmm9, 2 * SIZE(CO1)
  1466. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1467. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1468. movq K, %rax
  1469. subq KKK, %rax
  1470. salq $BASE_SHIFT, %rax
  1471. leaq (AO, %rax, 4), AO
  1472. leaq (BO, %rax, 4), BO
  1473. #endif
  1474. #if defined(TRMMKERNEL) && defined(LEFT)
  1475. addq $4, KK
  1476. #endif
  1477. addq $4 * SIZE, CO1
  1478. addq $4 * SIZE, CO2
  1479. decq I
  1480. jg .L91
  1481. ALIGN_4
  1482. .L100:
  1483. testq $2, M
  1484. je .L110
  1485. #if !defined(TRMMKERNEL) || \
  1486. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1487. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1488. leaq 32 * SIZE + BUFFER, BO
  1489. #else
  1490. leaq 32 * SIZE + BUFFER, BO
  1491. movq KK, %rax
  1492. salq $BASE_SHIFT, %rax
  1493. leaq (AO, %rax, 2), AO
  1494. leaq (BO, %rax, 4), BO
  1495. #endif
  1496. movddup -32 * SIZE(AO), %xmm0
  1497. pxor %xmm8, %xmm8
  1498. movaps -32 * SIZE(BO), %xmm1
  1499. #ifndef TRMMKERNEL
  1500. movq K, %rax
  1501. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1502. movq K, %rax
  1503. subq KK, %rax
  1504. movq %rax, KKK
  1505. #else
  1506. movq KK, %rax
  1507. #ifdef LEFT
  1508. addq $2, %rax
  1509. #else
  1510. addq $2, %rax
  1511. #endif
  1512. movq %rax, KKK
  1513. #endif
  1514. sarq $2, %rax
  1515. NOBRANCH
  1516. jle .L106
  1517. ALIGN_3
  1518. .L102:
  1519. PREFETCH (PREFETCHSIZE + 0)(AO)
  1520. mulps %xmm0, %xmm1
  1521. movddup -30 * SIZE(AO), %xmm0
  1522. addps %xmm1, %xmm8
  1523. movaps -28 * SIZE(BO), %xmm1
  1524. mulps %xmm0, %xmm1
  1525. movddup -28 * SIZE(AO), %xmm0
  1526. addps %xmm1, %xmm8
  1527. movaps -24 * SIZE(BO), %xmm1
  1528. mulps %xmm0, %xmm1
  1529. movddup -26 * SIZE(AO), %xmm0
  1530. addps %xmm1, %xmm8
  1531. movaps -20 * SIZE(BO), %xmm1
  1532. mulps %xmm0, %xmm1
  1533. movddup -24 * SIZE(AO), %xmm0
  1534. addps %xmm1, %xmm8
  1535. movaps -16 * SIZE(BO), %xmm1
  1536. subq $ -8 * SIZE, AO
  1537. subq $-16 * SIZE, BO
  1538. decq %rax
  1539. BRANCH
  1540. jg .L102
  1541. .L106:
  1542. movaps ALPHA, %xmm7
  1543. #ifndef TRMMKERNEL
  1544. movq K, %rax
  1545. #else
  1546. movq KKK, %rax
  1547. #endif
  1548. andq $3, %rax
  1549. je .L108
  1550. ALIGN_4
  1551. .L107:
  1552. mulps %xmm0, %xmm1
  1553. movddup -30 * SIZE(AO), %xmm0
  1554. addps %xmm1, %xmm8
  1555. movaps -28 * SIZE(BO), %xmm1
  1556. addq $ 2 * SIZE, AO
  1557. subq $-4 * SIZE, BO
  1558. decq %rax
  1559. jg .L107
  1560. ALIGN_4
  1561. .L108:
  1562. mulps %xmm7, %xmm8
  1563. #ifndef TRMMKERNEL
  1564. movsd (CO1), %xmm0
  1565. movhps (CO2), %xmm0
  1566. addps %xmm0, %xmm8
  1567. #endif
  1568. movlps %xmm8, (CO1)
  1569. movhps %xmm8, (CO2)
  1570. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1571. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1572. movq K, %rax
  1573. subq KKK, %rax
  1574. salq $BASE_SHIFT, %rax
  1575. leaq (AO, %rax, 2), AO
  1576. leaq (BO, %rax, 4), BO
  1577. #endif
  1578. #if defined(TRMMKERNEL) && defined(LEFT)
  1579. addq $2, KK
  1580. #endif
  1581. addq $2 * SIZE, CO1
  1582. addq $2 * SIZE, CO2
  1583. ALIGN_4
  1584. .L110:
  1585. testq $1, M
  1586. je .L119
  1587. #if !defined(TRMMKERNEL) || \
  1588. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1589. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1590. leaq 32 * SIZE + BUFFER, BO
  1591. #else
  1592. leaq 32 * SIZE + BUFFER, BO
  1593. movq KK, %rax
  1594. salq $BASE_SHIFT, %rax
  1595. leaq (AO, %rax, 1), AO
  1596. leaq (BO, %rax, 4), BO
  1597. #endif
  1598. movss -32 * SIZE(AO), %xmm0
  1599. pxor %xmm8, %xmm8
  1600. movaps -32 * SIZE(BO), %xmm1
  1601. #ifndef TRMMKERNEL
  1602. movq K, %rax
  1603. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1604. movq K, %rax
  1605. subq KK, %rax
  1606. movq %rax, KKK
  1607. #else
  1608. movq KK, %rax
  1609. #ifdef LEFT
  1610. addq $1, %rax
  1611. #else
  1612. addq $2, %rax
  1613. #endif
  1614. movq %rax, KKK
  1615. #endif
  1616. sarq $2, %rax
  1617. NOBRANCH
  1618. jle .L116
  1619. ALIGN_3
  1620. .L112:
  1621. PREFETCH (PREFETCHSIZE + 0)(AO)
  1622. shufps $0, %xmm0, %xmm0
  1623. mulps %xmm0, %xmm1
  1624. movss -31 * SIZE(AO), %xmm0
  1625. addps %xmm1, %xmm8
  1626. movaps -28 * SIZE(BO), %xmm1
  1627. shufps $0, %xmm0, %xmm0
  1628. mulps %xmm0, %xmm1
  1629. movss -30 * SIZE(AO), %xmm0
  1630. addps %xmm1, %xmm8
  1631. movaps -24 * SIZE(BO), %xmm1
  1632. shufps $0, %xmm0, %xmm0
  1633. mulps %xmm0, %xmm1
  1634. movss -29 * SIZE(AO), %xmm0
  1635. addps %xmm1, %xmm8
  1636. movaps -20 * SIZE(BO), %xmm1
  1637. shufps $0, %xmm0, %xmm0
  1638. mulps %xmm0, %xmm1
  1639. movss -28 * SIZE(AO), %xmm0
  1640. addps %xmm1, %xmm8
  1641. movaps -16 * SIZE(BO), %xmm1
  1642. subq $ -4 * SIZE, AO
  1643. subq $-16 * SIZE, BO
  1644. decq %rax
  1645. BRANCH
  1646. jg .L112
  1647. .L116:
  1648. movaps ALPHA, %xmm7
  1649. #ifndef TRMMKERNEL
  1650. movq K, %rax
  1651. #else
  1652. movq KKK, %rax
  1653. #endif
  1654. andq $3, %rax
  1655. je .L118
  1656. ALIGN_4
  1657. .L117:
  1658. shufps $0, %xmm0, %xmm0
  1659. mulps %xmm0, %xmm1
  1660. movss -31 * SIZE(AO), %xmm0
  1661. addps %xmm1, %xmm8
  1662. movaps -28 * SIZE(BO), %xmm1
  1663. addq $ 1 * SIZE, AO
  1664. subq $-4 * SIZE, BO
  1665. decq %rax
  1666. jg .L117
  1667. ALIGN_4
.L118:
        mulps %xmm7, %xmm8
        movhlps %xmm8, %xmm9
#ifndef TRMMKERNEL
        addss (CO1), %xmm8
        addss (CO2), %xmm9
#endif
        movss %xmm8, (CO1)
        movss %xmm9, (CO2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 4), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $1, KK
#endif
        ALIGN_4
.L119:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addq $2, KK
#endif
        leaq (C, LDC, 2), C
        ALIGN_4
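/* .L120: final single column (N & 1). Pack the remaining column of
   B into BUFFER with every value duplicated into an adjacent pair,
   then sweep M with 4x1, 2x1 and 1x1 micro-kernels. */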
.L120:
        testq $1, N
        jle .L999
#if defined(TRMMKERNEL) && defined(LEFT)
        movq OFFSET, %rax
        movq %rax, KK
#endif
        leaq 32 * SIZE + BUFFER, BO
        movsd 0 * SIZE(B), %xmm1
        movhps 2 * SIZE(B), %xmm1
        movq K, %rax
        sarq $2, %rax
        jle .L123
        ALIGN_4
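/* .L122: pack four B values per pass, duplicating each into a pair
   (pshufd $0x50 -> b0 b0 b1 b1, pshufd $0xfa -> b2 b2 b3 b3) so the
   kernels below can movddup/movsd straight out of BUFFER. */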
.L122:
        pshufd $0x50, %xmm1, %xmm0
        movaps %xmm0, -32 * SIZE(BO)
        pshufd $0xfa, %xmm1, %xmm1
        movaps %xmm1, -28 * SIZE(BO)
        movsd 4 * SIZE(B), %xmm1
        movhps 6 * SIZE(B), %xmm1
        addq $ 4 * SIZE, B
        subq $-8 * SIZE, BO
        decq %rax
        jne .L122
        ALIGN_4
.L123:
        movq K, %rax
        andq $3, %rax
        BRANCH
        jle .L130
        ALIGN_4
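/* .L125: pack the K % 4 leftover B values, one duplicated pair at a
   time. */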
.L125:
        pshufd $0x50, %xmm1, %xmm0
        movlps %xmm0, -32 * SIZE(BO)
        movss 1 * SIZE(B), %xmm1
        addq $ 1 * SIZE, B
        subq $-2 * SIZE, BO
        decq %rax
        jne .L125
        ALIGN_4
.L130:
        movq C, CO1
        movq A, AO
        movq M, I
        sarq $2, I
        jle .L140
        ALIGN_4
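/* .L131: 4x1 micro-kernel. movddup broadcasts the duplicated B pair
   across all four lanes, which are multiplied by four A values and
   accumulated in xmm8; the destination in C is prefetched for
   writing. */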
.L131:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq 32 * SIZE + BUFFER, BO
#else
        leaq 32 * SIZE + BUFFER, BO
        movq KK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 4), AO
        leaq (BO, %rax, 2), BO
#endif
        movaps -32 * SIZE(AO), %xmm0
        movddup -32 * SIZE(BO), %xmm1
        pxor %xmm8, %xmm8
        PREFETCHW 3 * SIZE(CO1)
#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $4, %rax
#else
        addq $1, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L136
        ALIGN_3
.L132:
        PREFETCH (PREFETCHSIZE + 0)(AO)
        mulps %xmm0, %xmm1
        movaps -28 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movddup -30 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        movaps -24 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movddup -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        movaps -20 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movddup -26 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        movaps -16 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movddup -24 * SIZE(BO), %xmm1
        subq $-16 * SIZE, AO
        subq $ -8 * SIZE, BO
        decq %rax
        BRANCH
        jg .L132
.L136:
        movaps ALPHA, %xmm7
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax
        je .L138
        ALIGN_4
.L137:
        mulps %xmm0, %xmm1
        movaps -28 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movddup -30 * SIZE(BO), %xmm1
        addq $ 4 * SIZE, AO
        subq $-2 * SIZE, BO
        decq %rax
        jg .L137
        ALIGN_4
.L138:
        mulps %xmm7, %xmm8
#ifndef TRMMKERNEL
        movsd 0 * SIZE(CO1), %xmm0
        movhps 2 * SIZE(CO1), %xmm0
        addps %xmm0, %xmm8
#endif
        movlps %xmm8, 0 * SIZE(CO1)
        movhps %xmm8, 2 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 4), AO
        leaq (BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $4, KK
#endif
        addq $4 * SIZE, CO1
        decq I
        jg .L131
        ALIGN_4
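/* .L140: 2x1 micro-kernel. movddup repeats the two A values, movsd
   loads the duplicated B pair; only the low two lanes of xmm8 carry
   results and are stored with movlps. */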
.L140:
        testq $2, M
        je .L150
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq 32 * SIZE + BUFFER, BO
#else
        leaq 32 * SIZE + BUFFER, BO
        movq KK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 2), BO
#endif
        movddup -32 * SIZE(AO), %xmm0
        pxor %xmm8, %xmm8
        movaps -32 * SIZE(BO), %xmm1
#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $2, %rax
#else
        addq $1, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L146
        ALIGN_3
.L142:
        PREFETCH (PREFETCHSIZE + 0)(AO)
        mulps %xmm0, %xmm1
        movddup -30 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movsd -30 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        movddup -28 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movsd -28 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        movddup -26 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movsd -26 * SIZE(BO), %xmm1
        mulps %xmm0, %xmm1
        movddup -24 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movsd -24 * SIZE(BO), %xmm1
        subq $-8 * SIZE, AO
        subq $-8 * SIZE, BO
        decq %rax
        BRANCH
        jg .L142
.L146:
        movaps ALPHA, %xmm7
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax
        je .L148
        ALIGN_4
.L147:
        mulps %xmm0, %xmm1
        movddup -30 * SIZE(AO), %xmm0
        addps %xmm1, %xmm8
        movsd -30 * SIZE(BO), %xmm1
        addq $ 2 * SIZE, AO
        subq $-2 * SIZE, BO
        decq %rax
        jg .L147
        ALIGN_4
.L148:
        mulps %xmm7, %xmm8
#ifndef TRMMKERNEL
        movsd (CO1), %xmm0
        addps %xmm0, %xmm8
#endif
        movlps %xmm8, (CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $2, KK
#endif
        addq $2 * SIZE, CO1
        ALIGN_4
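/* .L150: 1x1 scalar kernel. BO advances 2 * SIZE per iteration
   because the packed B values sit in BUFFER as duplicated pairs;
   both TRMM unroll adjustments are 1 here, so the LEFT/else arms
   below are identical by construction. */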
.L150:
        testq $1, M
        je .L999
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq 32 * SIZE + BUFFER, BO
#else
        leaq 32 * SIZE + BUFFER, BO
        movq KK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 2), BO
#endif
        movss -32 * SIZE(AO), %xmm0
        pxor %xmm8, %xmm8
        movss -32 * SIZE(BO), %xmm1
#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $1, %rax
#else
        addq $1, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L156
        ALIGN_3
.L152:
        PREFETCH (PREFETCHSIZE + 0)(AO)
        mulss %xmm0, %xmm1
        movss -31 * SIZE(AO), %xmm0
        addss %xmm1, %xmm8
        movss -30 * SIZE(BO), %xmm1
        mulss %xmm0, %xmm1
        movss -30 * SIZE(AO), %xmm0
        addss %xmm1, %xmm8
        movss -28 * SIZE(BO), %xmm1
        mulss %xmm0, %xmm1
        movss -29 * SIZE(AO), %xmm0
        addss %xmm1, %xmm8
        movss -26 * SIZE(BO), %xmm1
        mulss %xmm0, %xmm1
        movss -28 * SIZE(AO), %xmm0
        addss %xmm1, %xmm8
        movss -24 * SIZE(BO), %xmm1
        subq $-4 * SIZE, AO
        subq $-8 * SIZE, BO
        decq %rax
        BRANCH
        jg .L152
.L156:
        movaps ALPHA, %xmm7
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax
        je .L158
        ALIGN_4
.L157:
        mulss %xmm0, %xmm1
        movss -31 * SIZE(AO), %xmm0
        addss %xmm1, %xmm8
        movss -30 * SIZE(BO), %xmm1
        addq $1 * SIZE, AO
        addq $2 * SIZE, BO
        decq %rax
        jg .L157
        ALIGN_4
.L158:
        mulss %xmm7, %xmm8
#ifndef TRMMKERNEL
        addss (CO1), %xmm8
#endif
        movss %xmm8, (CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $BASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
        addq $1, KK
#endif
        ALIGN_4
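/* .L999: epilogue. Restore %rsp from %rbx, reload the callee-saved
   GPRs from the frame (plus %rdi/%rsi and xmm6-xmm15 under
   WINDOWS_ABI), release the stack frame and return. */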
.L999:
        movq %rbx, %rsp
        movq 0(%rsp), %rbx
        movq 8(%rsp), %rbp
        movq 16(%rsp), %r12
        movq 24(%rsp), %r13
        movq 32(%rsp), %r14
        movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
        movq 48(%rsp), %rdi
        movq 56(%rsp), %rsi
        movups 64(%rsp), %xmm6
        movups 80(%rsp), %xmm7
        movups 96(%rsp), %xmm8
        movups 112(%rsp), %xmm9
        movups 128(%rsp), %xmm10
        movups 144(%rsp), %xmm11
        movups 160(%rsp), %xmm12
        movups 176(%rsp), %xmm13
        movups 192(%rsp), %xmm14
        movups 208(%rsp), %xmm15
#endif
        addq $STACKSIZE, %rsp
        ret
        EPILOGUE