zgemm3m_kernel_4x4_penryn.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACK 16
#define ARGS 16
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define ALPHA 16 + STACK + ARGS(%esp)
#define A 24 + STACK + ARGS(%esp)
#define ARG_B 28 + STACK + ARGS(%esp)
#define C 32 + STACK + ARGS(%esp)
#define ARG_LDC 36 + STACK + ARGS(%esp)
#define J 0 + STACK(%esp)
#define BX 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
#define KKK 12 + STACK(%esp)
#define PREFETCH_R (8 * 4)
#define PREFETCHSIZE (8 * 17 + 4)
#define PREFETCH prefetcht0
#define AA %edx
#define BB %ecx
#define LDC %ebp
#define B %edi
#define C1 %esi
#define I %ebx
PROLOGUE
subl $ARGS, %esp # Generate Stack Frame
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
PROFCODE
movl ARG_B, B
movl ARG_LDC, LDC
#ifdef TRMMKERNEL
movl OFFSET, %eax
#ifndef LEFT
negl %eax
#endif
movl %eax, KK
#endif
subl $-32 * SIZE, A
subl $-32 * SIZE, B
sall $ZBASE_SHIFT, LDC
movl N, %eax
sarl $2, %eax
movl %eax, J
jle .L40
ALIGN_4
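/* Outer loop over four-column panels of C (J = N >> 2). */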
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
#endif
movl K, %eax
sall $BASE_SHIFT + 2, %eax
leal (B, %eax), %eax
movl %eax, BX
movl C, C1
movl A, AA
movl M, I
sarl $2, I
jle .L20
ALIGN_4
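/* 4x4 micro-kernel: loop over M four rows at a time (I = M >> 2). */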
.L11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl B, BB
#else
movl B, BB
movl KK, %eax
leal (, %eax, SIZE), %eax
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB
#endif
movl BX, %eax
prefetcht2 -32 * SIZE(%eax)
subl $-16 * SIZE, BX
leal (C1, LDC, 2), %eax
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
movaps -32 * SIZE(BB), %xmm1
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
prefetcht0 3 * SIZE(C1)
pxor %xmm5, %xmm5
prefetcht0 3 * SIZE(C1, LDC)
pxor %xmm6, %xmm6
prefetcht0 3 * SIZE(%eax)
pxor %xmm7, %xmm7
prefetcht0 3 * SIZE(%eax, LDC)
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $4, %eax
#else
addl $4, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L15
ALIGN_4
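/* Main K loop, unrolled eight times; pshufd $0x93 rotates the four
   B values so a single register feeds all four accumulators. */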
.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
addps %xmm2, %xmm7
pshufd $0x93, %xmm1, %xmm2
mulps %xmm0, %xmm1
addps %xmm3, %xmm6
pshufd $0x93, %xmm2, %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
pshufd $0x93, %xmm3, %xmm2
mulps %xmm0, %xmm3
addps %xmm1, %xmm4
movaps -28 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -28 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
pshufd $0x93, %xmm1, %xmm2
mulps %xmm0, %xmm1
addps %xmm3, %xmm6
pshufd $0x93, %xmm2, %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
pshufd $0x93, %xmm3, %xmm2
mulps %xmm0, %xmm3
addps %xmm1, %xmm4
movaps -24 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -24 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
pshufd $0x93, %xmm1, %xmm2
mulps %xmm0, %xmm1
addps %xmm3, %xmm6
pshufd $0x93, %xmm2, %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
pshufd $0x93, %xmm3, %xmm2
mulps %xmm0, %xmm3
addps %xmm1, %xmm4
movaps -20 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -20 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
pshufd $0x93, %xmm1, %xmm2
mulps %xmm0, %xmm1
addps %xmm3, %xmm6
pshufd $0x93, %xmm2, %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
pshufd $0x93, %xmm3, %xmm2
mulps %xmm0, %xmm3
addps %xmm1, %xmm4
movaps -16 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -16 * SIZE(AA), %xmm0
PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
addps %xmm2, %xmm7
pshufd $0x93, %xmm1, %xmm2
mulps %xmm0, %xmm1
addps %xmm3, %xmm6
pshufd $0x93, %xmm2, %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
pshufd $0x93, %xmm3, %xmm2
mulps %xmm0, %xmm3
addps %xmm1, %xmm4
movaps -12 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -12 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
pshufd $0x93, %xmm1, %xmm2
mulps %xmm0, %xmm1
addps %xmm3, %xmm6
pshufd $0x93, %xmm2, %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
pshufd $0x93, %xmm3, %xmm2
mulps %xmm0, %xmm3
addps %xmm1, %xmm4
movaps -8 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -8 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
pshufd $0x93, %xmm1, %xmm2
mulps %xmm0, %xmm1
addps %xmm3, %xmm6
pshufd $0x93, %xmm2, %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
pshufd $0x93, %xmm3, %xmm2
mulps %xmm0, %xmm3
addps %xmm1, %xmm4
movaps -4 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -4 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
subl $-32 * SIZE, BB
pshufd $0x93, %xmm1, %xmm2
mulps %xmm0, %xmm1
addps %xmm3, %xmm6
pshufd $0x93, %xmm2, %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
subl $-32 * SIZE, AA
pshufd $0x93, %xmm3, %xmm2
mulps %xmm0, %xmm3
addps %xmm1, %xmm4
movaps -32 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -32 * SIZE(AA), %xmm0
subl $1, %eax
jne .L12
ALIGN_4
.L15:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
andl $7, %eax
BRANCH
je .L18
ALIGN_4
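/* Remaining K mod 8 iterations, one at a time. */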
.L16:
addps %xmm2, %xmm7
pshufd $0x93, %xmm1, %xmm2
mulps %xmm0, %xmm1
addps %xmm3, %xmm6
pshufd $0x93, %xmm2, %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
pshufd $0x93, %xmm3, %xmm2
mulps %xmm0, %xmm3
addps %xmm1, %xmm4
movaps -28 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -28 * SIZE(AA), %xmm0
addl $4 * SIZE, AA
addl $4 * SIZE, BB
decl %eax
jg .L16
ALIGN_4
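/* Write-back: regroup the accumulator lanes, scale each real result
   by the two-word ALPHA pair and add it into the complex C panel. */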
.L18:
addps %xmm3, %xmm6
addps %xmm2, %xmm7
movddup ALPHA, %xmm3
pshufd $0x39, %xmm5, %xmm2
pshufd $0x4e, %xmm6, %xmm0
pshufd $0x93, %xmm7, %xmm7
movaps %xmm4, %xmm6
unpcklps %xmm0, %xmm4
unpckhps %xmm0, %xmm6
movaps %xmm2, %xmm1
unpcklps %xmm7, %xmm2
unpckhps %xmm7, %xmm1
movaps %xmm4, %xmm5
unpcklps %xmm2, %xmm4
unpckhps %xmm2, %xmm5
movaps %xmm6, %xmm7
unpcklps %xmm1, %xmm6
unpckhps %xmm1, %xmm7
pshufd $0x93, %xmm5, %xmm5
pshufd $0x4e, %xmm6, %xmm6
pshufd $0x39, %xmm7, %xmm7
leal (C1, LDC, 2), %eax
movsd 0 * SIZE(C1), %xmm0
movhps 2 * SIZE(C1), %xmm0
movsd 4 * SIZE(C1), %xmm1
movhps 6 * SIZE(C1), %xmm1
pshufd $0x50, %xmm4, %xmm2
pshufd $0xfa, %xmm4, %xmm4
mulps %xmm3, %xmm2
mulps %xmm3, %xmm4
addps %xmm2, %xmm0
addps %xmm4, %xmm1
movlps %xmm0, 0 * SIZE(C1)
movhps %xmm0, 2 * SIZE(C1)
movlps %xmm1, 4 * SIZE(C1)
movhps %xmm1, 6 * SIZE(C1)
movsd 0 * SIZE(C1, LDC), %xmm0
movhps 2 * SIZE(C1, LDC), %xmm0
movsd 4 * SIZE(C1, LDC), %xmm1
movhps 6 * SIZE(C1, LDC), %xmm1
pshufd $0x50, %xmm5, %xmm2
pshufd $0xfa, %xmm5, %xmm5
mulps %xmm3, %xmm2
mulps %xmm3, %xmm5
addps %xmm2, %xmm0
addps %xmm5, %xmm1
movlps %xmm0, 0 * SIZE(C1, LDC)
movhps %xmm0, 2 * SIZE(C1, LDC)
movlps %xmm1, 4 * SIZE(C1, LDC)
movhps %xmm1, 6 * SIZE(C1, LDC)
movsd 0 * SIZE(%eax), %xmm0
movhps 2 * SIZE(%eax), %xmm0
movsd 4 * SIZE(%eax), %xmm1
movhps 6 * SIZE(%eax), %xmm1
pshufd $0x50, %xmm6, %xmm2
pshufd $0xfa, %xmm6, %xmm6
mulps %xmm3, %xmm2
mulps %xmm3, %xmm6
addps %xmm2, %xmm0
addps %xmm6, %xmm1
movlps %xmm0, 0 * SIZE(%eax)
movhps %xmm0, 2 * SIZE(%eax)
movlps %xmm1, 4 * SIZE(%eax)
movhps %xmm1, 6 * SIZE(%eax)
movsd 0 * SIZE(%eax, LDC), %xmm0
movhps 2 * SIZE(%eax, LDC), %xmm0
movsd 4 * SIZE(%eax, LDC), %xmm1
movhps 6 * SIZE(%eax, LDC), %xmm1
pshufd $0x50, %xmm7, %xmm2
pshufd $0xfa, %xmm7, %xmm7
mulps %xmm3, %xmm2
mulps %xmm3, %xmm7
addps %xmm2, %xmm0
addps %xmm7, %xmm1
movlps %xmm0, 0 * SIZE(%eax, LDC)
movhps %xmm0, 2 * SIZE(%eax, LDC)
movlps %xmm1, 4 * SIZE(%eax, LDC)
movhps %xmm1, 6 * SIZE(%eax, LDC)
addl $8 * SIZE, C1
decl I
jg .L11
ALIGN_4
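/* M remainder: two rows against the current four-column panel. */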
.L20:
movl M, I
testl $2, I
jle .L30
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl B, BB
#else
movl B, BB
movl KK, %eax
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
#endif
pxor %xmm4, %xmm4
movaps -32 * SIZE(AA), %xmm0
pxor %xmm5, %xmm5
movaps -32 * SIZE(BB), %xmm1
pxor %xmm6, %xmm6
pxor %xmm7, %xmm7
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $2, %eax
#else
addl $4, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L25
ALIGN_4
.L22:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
pshufd $0x44, %xmm0, %xmm2
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
addps %xmm3, %xmm4
pshufd $0xfa, %xmm1, %xmm3
movaps -28 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addps %xmm3, %xmm5
pshufd $0xee, %xmm0, %xmm2
movaps -28 * SIZE(AA), %xmm0
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
addps %xmm3, %xmm6
pshufd $0xfa, %xmm1, %xmm3
movaps -24 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addps %xmm3, %xmm7
pshufd $0x44, %xmm0, %xmm2
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
addps %xmm3, %xmm4
pshufd $0xfa, %xmm1, %xmm3
movaps -20 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addps %xmm3, %xmm5
pshufd $0xee, %xmm0, %xmm2
movaps -24 * SIZE(AA), %xmm0
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
addps %xmm3, %xmm6
pshufd $0xfa, %xmm1, %xmm3
movaps -16 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addps %xmm3, %xmm7
pshufd $0x44, %xmm0, %xmm2
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
addps %xmm3, %xmm4
pshufd $0xfa, %xmm1, %xmm3
movaps -12 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addps %xmm3, %xmm5
pshufd $0xee, %xmm0, %xmm2
movaps -20 * SIZE(AA), %xmm0
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
addps %xmm3, %xmm6
pshufd $0xfa, %xmm1, %xmm3
movaps -8 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addps %xmm3, %xmm7
pshufd $0x44, %xmm0, %xmm2
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
addps %xmm3, %xmm4
pshufd $0xfa, %xmm1, %xmm3
movaps -4 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addps %xmm3, %xmm5
pshufd $0xee, %xmm0, %xmm2
movaps -16 * SIZE(AA), %xmm0
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
addps %xmm3, %xmm6
pshufd $0xfa, %xmm1, %xmm3
movaps 0 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addps %xmm3, %xmm7
subl $-16 * SIZE, AA
subl $-32 * SIZE, BB
subl $1, %eax
jne .L22
ALIGN_4
.L25:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
andl $7, %eax
BRANCH
je .L28
ALIGN_4
.L26:
pshufd $0x44, %xmm0, %xmm2
movsd -30 * SIZE(AA), %xmm0
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
addps %xmm3, %xmm4
pshufd $0xfa, %xmm1, %xmm3
movaps -28 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addps %xmm3, %xmm5
addl $2 * SIZE, AA
addl $4 * SIZE, BB
decl %eax
jg .L26
ALIGN_4
.L28:
movddup ALPHA, %xmm3
addps %xmm6, %xmm4
addps %xmm7, %xmm5
leal (C1, LDC, 2), %eax
movsd 0 * SIZE(C1), %xmm0
movhps 2 * SIZE(C1), %xmm0
movsd 0 * SIZE(C1, LDC), %xmm1
movhps 2 * SIZE(C1, LDC), %xmm1
pshufd $0x50, %xmm4, %xmm2
pshufd $0xfa, %xmm4, %xmm4
mulps %xmm3, %xmm2
mulps %xmm3, %xmm4
addps %xmm2, %xmm0
addps %xmm4, %xmm1
movlps %xmm0, 0 * SIZE(C1)
movhps %xmm0, 2 * SIZE(C1)
movlps %xmm1, 0 * SIZE(C1, LDC)
movhps %xmm1, 2 * SIZE(C1, LDC)
movsd 0 * SIZE(%eax), %xmm0
movhps 2 * SIZE(%eax), %xmm0
movsd 0 * SIZE(%eax, LDC), %xmm1
movhps 2 * SIZE(%eax, LDC), %xmm1
pshufd $0x50, %xmm5, %xmm2
pshufd $0xfa, %xmm5, %xmm5
mulps %xmm3, %xmm2
mulps %xmm3, %xmm5
addps %xmm2, %xmm0
addps %xmm5, %xmm1
movlps %xmm0, 0 * SIZE(%eax)
movhps %xmm0, 2 * SIZE(%eax)
movlps %xmm1, 0 * SIZE(%eax, LDC)
movhps %xmm1, 2 * SIZE(%eax, LDC)
addl $4 * SIZE, C1
ALIGN_4
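/* M remainder: one row against the current four-column panel. */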
.L30:
movl M, I
testl $1, I
jle .L39
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl B, BB
#else
movl B, BB
movl KK, %eax
leal (, %eax, SIZE), %eax
addl %eax, AA
leal (BB, %eax, 4), BB
#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
pxor %xmm5, %xmm5
movaps -32 * SIZE(BB), %xmm1
pxor %xmm6, %xmm6
pxor %xmm7, %xmm7
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $1, %eax
#else
addl $4, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L35
ALIGN_4
.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
pshufd $0x00, %xmm0, %xmm2
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movaps -28 * SIZE(BB), %xmm1
pshufd $0x55, %xmm0, %xmm2
movsd -30 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movaps -24 * SIZE(BB), %xmm1
pshufd $0x00, %xmm0, %xmm2
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movaps -20 * SIZE(BB), %xmm1
pshufd $0x55, %xmm0, %xmm2
movsd -28 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movaps -16 * SIZE(BB), %xmm1
pshufd $0x00, %xmm0, %xmm2
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movaps -12 * SIZE(BB), %xmm1
pshufd $0x55, %xmm0, %xmm2
movsd -26 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movaps -8 * SIZE(BB), %xmm1
pshufd $0x00, %xmm0, %xmm2
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movaps -4 * SIZE(BB), %xmm1
pshufd $0x55, %xmm0, %xmm2
movsd -24 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movaps 0 * SIZE(BB), %xmm1
subl $ -8 * SIZE, AA
subl $-32 * SIZE, BB
subl $1, %eax
jne .L32
ALIGN_4
.L35:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
andl $7, %eax
BRANCH
je .L38
ALIGN_4
.L36:
pshufd $0x00, %xmm0, %xmm2
movss -31 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movaps -28 * SIZE(BB), %xmm1
addl $1 * SIZE, AA
addl $4 * SIZE, BB
decl %eax
jg .L36
ALIGN_4
.L38:
movddup ALPHA, %xmm3
leal (C1, LDC, 2), %eax
movsd (C1), %xmm0
movhps (C1, LDC), %xmm0
movsd (%eax), %xmm1
movhps (%eax, LDC), %xmm1
pshufd $0x50, %xmm4, %xmm2
pshufd $0xfa, %xmm4, %xmm4
mulps %xmm3, %xmm2
mulps %xmm3, %xmm4
addps %xmm2, %xmm0
addps %xmm4, %xmm1
movlps %xmm0, (C1)
movhps %xmm0, (C1, LDC)
movlps %xmm1, (%eax)
movhps %xmm1, (%eax, LDC)
ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
#endif
movl BB, B
leal (, LDC, 4), %eax
addl %eax, C
decl J
jg .L01
ALIGN_4
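/* N remainder: a two-column panel, again blocked over M
   (four rows, two rows, then one). */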
.L40:
movl N, %eax
testl $2, %eax
jle .L70
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
#endif
movl C, C1
movl A, AA
movl M, I
sarl $2, I
jle .L50
ALIGN_4
.L41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl B, BB
#else
movl B, BB
movl KK, %eax
leal (, %eax, SIZE), %eax
leal (AA, %eax, 4), AA
leal (BB, %eax, 2), BB
#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
movaps -32 * SIZE(BB), %xmm1
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
prefetcht0 3 * SIZE(C1)
pxor %xmm5, %xmm5
prefetcht0 3 * SIZE(C1, LDC)
pxor %xmm6, %xmm6
pxor %xmm7, %xmm7
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $4, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L45
ALIGN_4
.L42:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
addps %xmm3, %xmm5
pshufd $0x55, %xmm1, %xmm3
mulps %xmm0, %xmm3
movaps -28 * SIZE(AA), %xmm0
addps %xmm2, %xmm6
pshufd $0xaa, %xmm1, %xmm2
mulps %xmm0, %xmm2
addps %xmm3, %xmm7
pshufd $0xff, %xmm1, %xmm3
movaps -28 * SIZE(BB), %xmm1
mulps %xmm0, %xmm3
movaps -24 * SIZE(AA), %xmm0
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
addps %xmm3, %xmm5
pshufd $0x55, %xmm1, %xmm3
mulps %xmm0, %xmm3
movaps -20 * SIZE(AA), %xmm0
addps %xmm2, %xmm6
pshufd $0xaa, %xmm1, %xmm2
mulps %xmm0, %xmm2
addps %xmm3, %xmm7
pshufd $0xff, %xmm1, %xmm3
movaps -24 * SIZE(BB), %xmm1
mulps %xmm0, %xmm3
movaps -16 * SIZE(AA), %xmm0
PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
addps %xmm3, %xmm5
pshufd $0x55, %xmm1, %xmm3
mulps %xmm0, %xmm3
movaps -12 * SIZE(AA), %xmm0
addps %xmm2, %xmm6
pshufd $0xaa, %xmm1, %xmm2
mulps %xmm0, %xmm2
addps %xmm3, %xmm7
pshufd $0xff, %xmm1, %xmm3
movaps -20 * SIZE(BB), %xmm1
mulps %xmm0, %xmm3
movaps -8 * SIZE(AA), %xmm0
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
addps %xmm3, %xmm5
pshufd $0x55, %xmm1, %xmm3
mulps %xmm0, %xmm3
movaps -4 * SIZE(AA), %xmm0
addps %xmm2, %xmm6
pshufd $0xaa, %xmm1, %xmm2
mulps %xmm0, %xmm2
addps %xmm3, %xmm7
pshufd $0xff, %xmm1, %xmm3
movaps -16 * SIZE(BB), %xmm1
mulps %xmm0, %xmm3
movaps 0 * SIZE(AA), %xmm0
subl $-32 * SIZE, AA
subl $-16 * SIZE, BB
subl $1, %eax
jne .L42
ALIGN_4
.L45:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
andl $7, %eax
BRANCH
je .L48
ALIGN_4
.L46:
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
addps %xmm3, %xmm5
pshufd $0x55, %xmm1, %xmm3
movsd -30 * SIZE(BB), %xmm1
mulps %xmm0, %xmm3
movaps -28 * SIZE(AA), %xmm0
addl $4 * SIZE, AA
addl $2 * SIZE, BB
decl %eax
jg .L46
ALIGN_4
.L48:
addps %xmm2, %xmm4
addps %xmm3, %xmm5
movddup ALPHA, %xmm3
addps %xmm6, %xmm4
addps %xmm7, %xmm5
movsd 0 * SIZE(C1), %xmm0
movhps 2 * SIZE(C1), %xmm0
movsd 4 * SIZE(C1), %xmm1
movhps 6 * SIZE(C1), %xmm1
pshufd $0x50, %xmm4, %xmm2
pshufd $0xfa, %xmm4, %xmm4
mulps %xmm3, %xmm2
mulps %xmm3, %xmm4
addps %xmm2, %xmm0
addps %xmm4, %xmm1
movlps %xmm0, 0 * SIZE(C1)
movhps %xmm0, 2 * SIZE(C1)
movlps %xmm1, 4 * SIZE(C1)
movhps %xmm1, 6 * SIZE(C1)
movsd 0 * SIZE(C1, LDC), %xmm0
movhps 2 * SIZE(C1, LDC), %xmm0
movsd 4 * SIZE(C1, LDC), %xmm1
movhps 6 * SIZE(C1, LDC), %xmm1
pshufd $0x50, %xmm5, %xmm2
pshufd $0xfa, %xmm5, %xmm5
mulps %xmm3, %xmm2
mulps %xmm3, %xmm5
addps %xmm2, %xmm0
addps %xmm5, %xmm1
movlps %xmm0, 0 * SIZE(C1, LDC)
movhps %xmm0, 2 * SIZE(C1, LDC)
movlps %xmm1, 4 * SIZE(C1, LDC)
movhps %xmm1, 6 * SIZE(C1, LDC)
addl $8 * SIZE, C1
decl I
jg .L41
ALIGN_4
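/* Two rows by two columns. */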
.L50:
movl M, I
testl $2, I
jle .L60
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl B, BB
#else
movl B, BB
movl KK, %eax
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm3, %xmm3
movaps -32 * SIZE(BB), %xmm1
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $2, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L55
ALIGN_4
.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
pshufd $0x44, %xmm0, %xmm2
addps %xmm3, %xmm4
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
pshufd $0xee, %xmm0, %xmm2
movaps -28 * SIZE(AA), %xmm0
addps %xmm3, %xmm5
pshufd $0xfa, %xmm1, %xmm3
movaps -28 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
pshufd $0x44, %xmm0, %xmm2
addps %xmm3, %xmm4
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
pshufd $0xee, %xmm0, %xmm2
movaps -24 * SIZE(AA), %xmm0
addps %xmm3, %xmm5
pshufd $0xfa, %xmm1, %xmm3
movaps -24 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
pshufd $0x44, %xmm0, %xmm2
addps %xmm3, %xmm4
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
pshufd $0xee, %xmm0, %xmm2
movaps -20 * SIZE(AA), %xmm0
addps %xmm3, %xmm5
pshufd $0xfa, %xmm1, %xmm3
movaps -20 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
pshufd $0x44, %xmm0, %xmm2
addps %xmm3, %xmm4
pshufd $0x50, %xmm1, %xmm3
mulps %xmm2, %xmm3
pshufd $0xee, %xmm0, %xmm2
movaps -16 * SIZE(AA), %xmm0
addps %xmm3, %xmm5
pshufd $0xfa, %xmm1, %xmm3
movaps -16 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
subl $-16 * SIZE, AA
subl $-16 * SIZE, BB
subl $1, %eax
jne .L52
ALIGN_4
.L55:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
andl $7, %eax
BRANCH
je .L58
ALIGN_4
.L56:
pshufd $0x44, %xmm0, %xmm2
movsd -30 * SIZE(AA), %xmm0
addps %xmm3, %xmm4
pshufd $0x50, %xmm1, %xmm3
movsd -30 * SIZE(BB), %xmm1
mulps %xmm2, %xmm3
addl $2 * SIZE, AA
addl $2 * SIZE, BB
decl %eax
jg .L56
ALIGN_4
.L58:
addps %xmm3, %xmm4
addps %xmm5, %xmm4
movddup ALPHA, %xmm3
movsd 0 * SIZE(C1), %xmm0
movhps 2 * SIZE(C1), %xmm0
movsd 0 * SIZE(C1, LDC), %xmm1
movhps 2 * SIZE(C1, LDC), %xmm1
pshufd $0x50, %xmm4, %xmm2
pshufd $0xfa, %xmm4, %xmm4
mulps %xmm3, %xmm2
mulps %xmm3, %xmm4
addps %xmm2, %xmm0
addps %xmm4, %xmm1
movlps %xmm0, 0 * SIZE(C1)
movhps %xmm0, 2 * SIZE(C1)
movlps %xmm1, 0 * SIZE(C1, LDC)
movhps %xmm1, 2 * SIZE(C1, LDC)
addl $4 * SIZE, C1
ALIGN_4
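/* One row by two columns. */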
.L60:
movl M, I
testl $1, I
jle .L69
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl B, BB
#else
movl B, BB
movl KK, %eax
leal (, %eax, SIZE), %eax
addl %eax, AA
leal (BB, %eax, 2), BB
#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
pxor %xmm5, %xmm5
movsd -32 * SIZE(BB), %xmm1
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $1, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L65
ALIGN_4
.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
pshufd $0x00, %xmm0, %xmm2
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movsd -30 * SIZE(BB), %xmm1
pshufd $0x55, %xmm0, %xmm2
movsd -30 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm5
movsd -28 * SIZE(BB), %xmm1
pshufd $0x00, %xmm0, %xmm2
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movsd -26 * SIZE(BB), %xmm1
pshufd $0x55, %xmm0, %xmm2
movsd -28 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm5
movsd -24 * SIZE(BB), %xmm1
pshufd $0x00, %xmm0, %xmm2
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movsd -22 * SIZE(BB), %xmm1
pshufd $0x55, %xmm0, %xmm2
movsd -26 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm5
movsd -20 * SIZE(BB), %xmm1
pshufd $0x00, %xmm0, %xmm2
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movsd -18 * SIZE(BB), %xmm1
pshufd $0x55, %xmm0, %xmm2
movsd -24 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm5
movsd -16 * SIZE(BB), %xmm1
subl $ -8 * SIZE, AA
subl $-16 * SIZE, BB
subl $1, %eax
jne .L62
ALIGN_4
.L65:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
andl $7, %eax
BRANCH
je .L68
ALIGN_4
.L66:
pshufd $0x00, %xmm0, %xmm2
movss -31 * SIZE(AA), %xmm0
mulps %xmm2, %xmm1
addps %xmm1, %xmm4
movsd -30 * SIZE(BB), %xmm1
addl $1 * SIZE, AA
addl $2 * SIZE, BB
decl %eax
jg .L66
ALIGN_4
.L68:
movddup ALPHA, %xmm3
addps %xmm5, %xmm4
movsd (C1), %xmm0
movhps (C1, LDC), %xmm0
pshufd $0x50, %xmm4, %xmm2
mulps %xmm3, %xmm2
addps %xmm2, %xmm0
movlps %xmm0, (C1)
movhps %xmm0, (C1, LDC)
ALIGN_4
.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
#endif
movl BB, B
leal (, LDC, 2), %eax
addl %eax, C
ALIGN_4
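/* N remainder: the final single column, blocked over M the same way. */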
.L70:
movl N, %eax
testl $1, %eax
jle .L999
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
#endif
movl C, C1
movl A, AA
movl M, I
sarl $2, I
jle .L80
ALIGN_4
.L71:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl B, BB
#else
movl B, BB
movl KK, %eax
leal (, %eax, SIZE), %eax
leal (AA, %eax, 4), AA
addl %eax, BB
#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
movsd -32 * SIZE(BB), %xmm1
pxor %xmm4, %xmm4
prefetcht0 3 * SIZE(C1)
pxor %xmm5, %xmm5
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $4, %eax
#else
addl $1, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L75
ALIGN_4
.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
movaps -28 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
pshufd $0x55, %xmm1, %xmm2
movsd -30 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -24 * SIZE(AA), %xmm0
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
movaps -20 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
pshufd $0x55, %xmm1, %xmm2
movsd -28 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -16 * SIZE(AA), %xmm0
PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
movaps -12 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
pshufd $0x55, %xmm1, %xmm2
movsd -26 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -8 * SIZE(AA), %xmm0
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
movaps -4 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
pshufd $0x55, %xmm1, %xmm2
movsd -24 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps 0 * SIZE(AA), %xmm0
subl $-32 * SIZE, AA
subl $ -8 * SIZE, BB
subl $1, %eax
jne .L72
ALIGN_4
.L75:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
andl $7, %eax
BRANCH
je .L78
ALIGN_4
.L76:
addps %xmm2, %xmm4
pshufd $0x00, %xmm1, %xmm2
movss -31 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -28 * SIZE(AA), %xmm0
addl $4 * SIZE, AA
addl $1 * SIZE, BB
decl %eax
jg .L76
ALIGN_4
.L78:
movddup ALPHA, %xmm3
addps %xmm2, %xmm4
addps %xmm5, %xmm4
movsd 0 * SIZE(C1), %xmm0
movhps 2 * SIZE(C1), %xmm0
movsd 4 * SIZE(C1), %xmm1
movhps 6 * SIZE(C1), %xmm1
pshufd $0x50, %xmm4, %xmm2
pshufd $0xfa, %xmm4, %xmm4
mulps %xmm3, %xmm2
mulps %xmm3, %xmm4
addps %xmm2, %xmm0
addps %xmm4, %xmm1
movlps %xmm0, 0 * SIZE(C1)
movhps %xmm0, 2 * SIZE(C1)
movlps %xmm1, 4 * SIZE(C1)
movhps %xmm1, 6 * SIZE(C1)
addl $8 * SIZE, C1
decl I
jg .L71
ALIGN_4
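/* Two rows by one column. */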
.L80:
movl M, I
testl $2, I
jle .L90
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl B, BB
#else
movl B, BB
movl KK, %eax
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
addl %eax, BB
#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm3, %xmm3
movsd -32 * SIZE(BB), %xmm1
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $2, %eax
#else
addl $1, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L85
ALIGN_4
.L82:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
movsd -30 * SIZE(AA), %xmm0
addps %xmm2, %xmm4
pshufd $0x55, %xmm1, %xmm2
movsd -30 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movsd -28 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
movsd -26 * SIZE(AA), %xmm0
addps %xmm2, %xmm4
pshufd $0x55, %xmm1, %xmm2
movsd -28 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movsd -24 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
movsd -22 * SIZE(AA), %xmm0
addps %xmm2, %xmm4
pshufd $0x55, %xmm1, %xmm2
movsd -26 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movsd -20 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
pshufd $0x00, %xmm1, %xmm2
mulps %xmm0, %xmm2
movsd -18 * SIZE(AA), %xmm0
addps %xmm2, %xmm4
pshufd $0x55, %xmm1, %xmm2
movsd -24 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movsd -16 * SIZE(AA), %xmm0
addps %xmm2, %xmm5
subl $-16 * SIZE, AA
subl $ -8 * SIZE, BB
subl $1, %eax
jne .L82
ALIGN_4
.L85:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
andl $7, %eax
BRANCH
je .L88
ALIGN_4
.L86:
pshufd $0x00, %xmm1, %xmm2
movss -31 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movsd -30 * SIZE(AA), %xmm0
addps %xmm2, %xmm4
addl $2 * SIZE, AA
addl $1 * SIZE, BB
decl %eax
jg .L86
ALIGN_4
.L88:
movddup ALPHA, %xmm3
addps %xmm5, %xmm4
movsd 0 * SIZE(C1), %xmm0
movhps 2 * SIZE(C1), %xmm0
pshufd $0x50, %xmm4, %xmm2
mulps %xmm3, %xmm2
addps %xmm2, %xmm0
movlps %xmm0, 0 * SIZE(C1)
movhps %xmm0, 2 * SIZE(C1)
addl $4 * SIZE, C1
ALIGN_4
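/* One row by one column. */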
.L90:
movl M, I
testl $1, I
jle .L999
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl B, BB
#else
movl B, BB
movl KK, %eax
leal (, %eax, SIZE), %eax
addl %eax, AA
addl %eax, BB
#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
pxor %xmm5, %xmm5
movsd -32 * SIZE(BB), %xmm1
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $1, %eax
#else
addl $1, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L95
ALIGN_4
.L92:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
mulps %xmm0, %xmm1
movsd -30 * SIZE(AA), %xmm0
addps %xmm1, %xmm4
movsd -30 * SIZE(BB), %xmm1
mulps %xmm0, %xmm1
movsd -28 * SIZE(AA), %xmm0
addps %xmm1, %xmm4
movsd -28 * SIZE(BB), %xmm1
mulps %xmm0, %xmm1
movsd -26 * SIZE(AA), %xmm0
addps %xmm1, %xmm4
movsd -26 * SIZE(BB), %xmm1
mulps %xmm0, %xmm1
movsd -24 * SIZE(AA), %xmm0
addps %xmm1, %xmm4
movsd -24 * SIZE(BB), %xmm1
subl $-8 * SIZE, AA
subl $-8 * SIZE, BB
subl $1, %eax
jne .L92
ALIGN_4
.L95:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
andl $7, %eax
BRANCH
je .L98
ALIGN_4
.L96:
mulss %xmm0, %xmm1
movss -31 * SIZE(AA), %xmm0
addss %xmm1, %xmm4
movss -31 * SIZE(BB), %xmm1
addl $1 * SIZE, AA
addl $1 * SIZE, BB
decl %eax
jg .L96
ALIGN_4
.L98:
movddup ALPHA, %xmm3
haddps %xmm4, %xmm4
movsd 0 * SIZE(C1), %xmm0
pshufd $0x50, %xmm4, %xmm2
mulps %xmm3, %xmm2
addps %xmm2, %xmm0
movlps %xmm0, 0 * SIZE(C1)
ALIGN_4
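/* Restore callee-saved registers, release the stack frame and return. */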
.L999:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS, %esp
ret
EPILOGUE