
gemm_kernel_4x4_penryn.S
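SGEMM 4x4 micro-kernel for 32-bit x86 (Penryn-class SSE) from the GotoBLAS2/OpenBLAS kernel tree; the same source builds the TRMM kernel when TRMMKERNEL is defined. It accumulates C = alpha*A*B + C over panels of A and B that have already been packed by the level-3 driver (see the pointer arithmetic on AA/BB below).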

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 16

#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define ALPHA 16 + STACK + ARGS(%esp)
#define A 20 + STACK + ARGS(%esp)
#define ARG_B 24 + STACK + ARGS(%esp)
#define C 28 + STACK + ARGS(%esp)
#define ARG_LDC 32 + STACK + ARGS(%esp)
#define OFFSET 36 + STACK + ARGS(%esp)

#define J 0 + STACK(%esp)
#define BX 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
#define KKK 12 + STACK(%esp)
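/* Argument offsets: after the four register pushes (STACK = 16 bytes)
   and the local scratch area (ARGS = 16 bytes), the first stack
   argument lives at 4 + STACK + ARGS = 36(%esp); M..OFFSET above are
   those slots.  J, BX, KK and KKK are locals inside the scratch area
   (KK/KKK are used only by the TRMMKERNEL build). */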
#ifdef NANO
#define PREFETCHSIZE (16 * 3 + 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifdef NEHALEM
#define PREFETCHSIZE (16 * 1 - 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifdef SANDYBRIDGE
#define PREFETCHSIZE (16 * 1 - 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCH
#define PREFETCH prefetcht0
#endif

#ifndef PREFETCHW
#define PREFETCHW prefetcht0
#endif

#ifndef PREFETCHB
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCHSIZE
#define PREFETCHSIZE (16 * 13 + 8)
#endif

#define AA %edx
#define BB %ecx
#define LDC %ebp
#define B %edi
#define C1 %esi
#define I %ebx
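/* Register assignments: AA walks the packed A panel, BB the packed B
   panel, B marks the start of the current B panel, C1 points into C,
   LDC is the leading dimension of C (converted to bytes below), and
   I counts row blocks. */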
        PROLOGUE

        subl $ARGS, %esp        # Generate Stack Frame

        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl ARG_B, B
        movl ARG_LDC, LDC

#ifdef TRMMKERNEL
        movl OFFSET, %eax
#ifndef LEFT
        negl %eax
#endif
        movl %eax, KK
#endif

        subl $-32 * SIZE, A
        subl $-32 * SIZE, B

        leal (, LDC, SIZE), LDC
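/* subl $-32 * SIZE biases A and B up by 128 bytes: the inner loops
   then address both panels with displacements from -32 * SIZE
   upward, and -128 fits a signed 8-bit immediate where +128 would
   not.  leal scales the element-count ldc into a byte stride. */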
        movl N, %eax
        sarl $2, %eax
        movl %eax, J
        jle .L40
        ALIGN_4
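/* Loop over the N/4 four-column panels of B; J is the panel counter. */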
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl K, %eax
        sall $BASE_SHIFT + 2, %eax
        leal (B, %eax), %eax
        movl %eax, BX

        movl C, C1
        movl A, AA

        movl M, I
        sarl $2, I
        jle .L20
        ALIGN_4

.L11:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        leal (BB, %eax, 4), BB
#endif

        movl BX, %eax
        PREFETCHB -32 * SIZE(%eax)
        subl $-16 * SIZE, %eax
        movl %eax, BX

        leal (C1, LDC, 2), %eax

        movaps -32 * SIZE(AA), %xmm0
        pxor %xmm2, %xmm2
        movaps -32 * SIZE(BB), %xmm1
        pxor %xmm3, %xmm3

        xorps %xmm4, %xmm4
        PREFETCHW 3 * SIZE(C1)
        xorps %xmm5, %xmm5
        PREFETCHW 7 * SIZE(C1, LDC)
        xorps %xmm6, %xmm6
        PREFETCHW 3 * SIZE(%eax)
        xorps %xmm7, %xmm7
        PREFETCHW 7 * SIZE(%eax, LDC)

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $4, %eax
#else
        addl $4, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L15
        ALIGN_4
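/* 4x4 block, K loop unrolled 8x.  Each step multiplies the current
   4-element column of A (xmm0) by four one-lane rotations of the
   current 4-element row of B (pshufd $0x93), so xmm4..xmm7 collect
   the four diagonals of the 4x4 product; .L18 untangles them.  The
   KK/KKK arithmetic above trims K for the TRMM case. */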
.L12:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -24 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -24 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -20 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -20 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -16 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -16 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
#if !(defined(NEHALEM) || defined(SANDYBRIDGE))
        PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
#endif
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -12 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -8 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -4 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -4 * SIZE(AA), %xmm0

        addps %xmm2, %xmm7
        subl $-32 * SIZE, BB
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        subl $-32 * SIZE, AA
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -32 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -32 * SIZE(AA), %xmm0

        subl $1, %eax
        jne .L12
        ALIGN_4

.L15:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L18
        ALIGN_4

.L16:
        addps %xmm2, %xmm7
        pshufd $0x93, %xmm1, %xmm2
        mulps %xmm0, %xmm1
        addps %xmm3, %xmm6
        pshufd $0x93, %xmm2, %xmm3
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        pshufd $0x93, %xmm3, %xmm2
        mulps %xmm0, %xmm3
        addps %xmm1, %xmm4
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0

        addl $4 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L16
        ALIGN_4
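/* Fold the two in-flight terms, then deinterleave the diagonal
   accumulators with the unpack/shuffle cascade so xmm4..xmm7 become
   the four columns of the 4x4 block, scale by ALPHA, and (unless
   building the TRMM kernel, which overwrites C) add the existing C
   values before storing. */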
.L18:
        addps %xmm3, %xmm6
        addps %xmm2, %xmm7

        movss ALPHA, %xmm3

        pshufd $0x39, %xmm5, %xmm2
        pshufd $0x4e, %xmm6, %xmm0
        pshufd $0x93, %xmm7, %xmm7

        movaps %xmm4, %xmm6
        unpcklps %xmm0, %xmm4
        unpckhps %xmm0, %xmm6

        movaps %xmm2, %xmm1
        unpcklps %xmm7, %xmm2
        unpckhps %xmm7, %xmm1

        movaps %xmm4, %xmm5
        unpcklps %xmm2, %xmm4
        unpckhps %xmm2, %xmm5

        movaps %xmm6, %xmm7
        unpcklps %xmm1, %xmm6
        unpckhps %xmm1, %xmm7

        pshufd $0x93, %xmm5, %xmm5
        pshufd $0x4e, %xmm6, %xmm6
        pshufd $0x39, %xmm7, %xmm7

        shufps $0, %xmm3, %xmm3
        mulps %xmm3, %xmm4
        mulps %xmm3, %xmm5
        mulps %xmm3, %xmm6
        mulps %xmm3, %xmm7

        leal (C1, LDC, 2), %eax

#ifndef TRMMKERNEL
        movsd 0 * SIZE(C1), %xmm0
        movhps 2 * SIZE(C1), %xmm0
        movsd 0 * SIZE(C1, LDC), %xmm1
        movhps 2 * SIZE(C1, LDC), %xmm1
        movsd 0 * SIZE(%eax), %xmm2
        movhps 2 * SIZE(%eax), %xmm2
        movsd 0 * SIZE(%eax, LDC), %xmm3
        movhps 2 * SIZE(%eax, LDC), %xmm3

        addps %xmm0, %xmm4
        addps %xmm1, %xmm5
        addps %xmm2, %xmm6
        addps %xmm3, %xmm7
#endif

        movsd %xmm4, 0 * SIZE(C1)
        movhps %xmm4, 2 * SIZE(C1)
        movsd %xmm5, 0 * SIZE(C1, LDC)
        movhps %xmm5, 2 * SIZE(C1, LDC)
        movsd %xmm6, 0 * SIZE(%eax)
        movhps %xmm6, 2 * SIZE(%eax)
        movsd %xmm7, 0 * SIZE(%eax, LDC)
        movhps %xmm7, 2 * SIZE(%eax, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $4, KK
#endif

        addl $4 * SIZE, C1
        decl I
        jg .L11
        ALIGN_4
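/* M-remainder: a 2x4 block, using pairwise pshufd broadcasts of A
   against duplicated halves of B. */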
.L20:
        movl M, I
        testl $2, I
        jle .L30

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB
#endif

        pxor %xmm4, %xmm4
        movaps -32 * SIZE(AA), %xmm0
        pxor %xmm5, %xmm5
        movaps -32 * SIZE(BB), %xmm1
        pxor %xmm6, %xmm6
        pxor %xmm7, %xmm7

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $2, %eax
#else
        addl $4, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L25
        ALIGN_4

.L22:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x44, %xmm0, %xmm2
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm5

        pshufd $0xee, %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm6
        pshufd $0xfa, %xmm1, %xmm3
        movaps -24 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm7

        pshufd $0x44, %xmm0, %xmm2
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -20 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm5

        pshufd $0xee, %xmm0, %xmm2
        movaps -24 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm6
        pshufd $0xfa, %xmm1, %xmm3
        movaps -16 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm7

        pshufd $0x44, %xmm0, %xmm2
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -12 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm5

        pshufd $0xee, %xmm0, %xmm2
        movaps -20 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm6
        pshufd $0xfa, %xmm1, %xmm3
        movaps -8 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm7

        pshufd $0x44, %xmm0, %xmm2
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -4 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm5

        pshufd $0xee, %xmm0, %xmm2
        movaps -16 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm6
        pshufd $0xfa, %xmm1, %xmm3
        movaps 0 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm7

        subl $-16 * SIZE, AA
        subl $-32 * SIZE, BB
        subl $1, %eax
        jne .L22
        ALIGN_4

.L25:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L28
        ALIGN_4

.L26:
        pshufd $0x44, %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm4
        pshufd $0xfa, %xmm1, %xmm3
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3
        addps %xmm3, %xmm5

        addl $2 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L26
        ALIGN_4

.L28:
        movss ALPHA, %xmm1
        addps %xmm6, %xmm4
        addps %xmm7, %xmm5

        shufps $0, %xmm1, %xmm1
        mulps %xmm1, %xmm4
        mulps %xmm1, %xmm5

        leal (C1, LDC, 2), %eax

#ifndef TRMMKERNEL
        movsd 0 * SIZE(C1), %xmm0
        movhps 0 * SIZE(C1, LDC), %xmm0
        movsd 0 * SIZE(%eax), %xmm1
        movhps 0 * SIZE(%eax, LDC), %xmm1

        addps %xmm0, %xmm4
        addps %xmm1, %xmm5
#endif

        movsd %xmm4, 0 * SIZE(C1)
        movhps %xmm4, 0 * SIZE(C1, LDC)
        movsd %xmm5, 0 * SIZE(%eax)
        movhps %xmm5, 0 * SIZE(%eax, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $2, KK
#endif

        addl $2 * SIZE, C1
        ALIGN_4
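/* M-remainder: final 1x4 row; each A element is broadcast against a
   full 4-wide B row, accumulating one C element per column. */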
.L30:
        movl M, I
        testl $1, I
        jle .L39

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, AA
        leal (BB, %eax, 4), BB
#endif

        pxor %xmm4, %xmm4
        movsd -32 * SIZE(AA), %xmm0
        pxor %xmm5, %xmm5
        movaps -32 * SIZE(BB), %xmm1
        pxor %xmm6, %xmm6
        pxor %xmm7, %xmm7

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $4, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L35
        ALIGN_4

.L32:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -28 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -24 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -20 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -28 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -16 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -12 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -26 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -8 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -4 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -24 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps 0 * SIZE(BB), %xmm1

        subl $-8 * SIZE, AA
        subl $-32 * SIZE, BB
        subl $1, %eax
        jne .L32
        ALIGN_4

.L35:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L38
        ALIGN_4

.L36:
        pshufd $0x00, %xmm0, %xmm2
        movss -31 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movaps -28 * SIZE(BB), %xmm1

        addl $1 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L36
        ALIGN_4

.L38:
        movss ALPHA, %xmm1
        shufps $0, %xmm1, %xmm1
        mulps %xmm1, %xmm4

        pshufd $0xff, %xmm4, %xmm7
        pshufd $0xaa, %xmm4, %xmm6
        pshufd $0x55, %xmm4, %xmm5
        pshufd $0x00, %xmm4, %xmm4

        leal (C1, LDC, 2), %eax

#ifndef TRMMKERNEL
        movss 0 * SIZE(C1), %xmm0
        movss 0 * SIZE(C1, LDC), %xmm1
        movss 0 * SIZE(%eax), %xmm2
        movss 0 * SIZE(%eax, LDC), %xmm3

        addss %xmm0, %xmm4
        addss %xmm1, %xmm5
        addss %xmm2, %xmm6
        addss %xmm3, %xmm7
#endif

        movss %xmm4, 0 * SIZE(C1)
        movss %xmm5, 0 * SIZE(C1, LDC)
        movss %xmm6, 0 * SIZE(%eax)
        movss %xmm7, 0 * SIZE(%eax, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        addl %eax, AA
        leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $1, KK
#endif
        ALIGN_4

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl $4, KK
#endif

        movl BB, B

        leal (, LDC, 4), %eax
        addl %eax, C
        decl J
        jg .L01
        ALIGN_4
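/* N-remainder of 2 columns: same structure as the main case, with
   4x2, 2x2 and 1x2 blocks. */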
.L40:
        movl N, %eax
        testl $2, %eax
        jle .L70

#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl C, C1
        movl A, AA

        movl M, I
        sarl $2, I
        jle .L50
        ALIGN_4

.L41:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        leal (BB, %eax, 2), BB
#endif

        movaps -32 * SIZE(AA), %xmm0
        pxor %xmm2, %xmm2
        movaps -32 * SIZE(BB), %xmm1
        pxor %xmm3, %xmm3

        pxor %xmm4, %xmm4
        prefetcht0 3 * SIZE(C1)
        pxor %xmm5, %xmm5
        prefetcht0 3 * SIZE(C1, LDC)
        pxor %xmm6, %xmm6
        pxor %xmm7, %xmm7

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $4, %eax
#else
        addl $2, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L45
        ALIGN_4

.L42:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        addps %xmm3, %xmm5
        pshufd $0x55, %xmm1, %xmm3
        mulps %xmm0, %xmm3
        movaps -28 * SIZE(AA), %xmm0

        addps %xmm2, %xmm6
        pshufd $0xaa, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        addps %xmm3, %xmm7
        pshufd $0xff, %xmm1, %xmm3
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm3
        movaps -24 * SIZE(AA), %xmm0

        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        addps %xmm3, %xmm5
        pshufd $0x55, %xmm1, %xmm3
        mulps %xmm0, %xmm3
        movaps -20 * SIZE(AA), %xmm0

        addps %xmm2, %xmm6
        pshufd $0xaa, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        addps %xmm3, %xmm7
        pshufd $0xff, %xmm1, %xmm3
        movaps -24 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm3
        movaps -16 * SIZE(AA), %xmm0

        PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        addps %xmm3, %xmm5
        pshufd $0x55, %xmm1, %xmm3
        mulps %xmm0, %xmm3
        movaps -12 * SIZE(AA), %xmm0

        addps %xmm2, %xmm6
        pshufd $0xaa, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        addps %xmm3, %xmm7
        pshufd $0xff, %xmm1, %xmm3
        movaps -20 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm3
        movaps -8 * SIZE(AA), %xmm0

        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        addps %xmm3, %xmm5
        pshufd $0x55, %xmm1, %xmm3
        mulps %xmm0, %xmm3
        movaps -4 * SIZE(AA), %xmm0

        addps %xmm2, %xmm6
        pshufd $0xaa, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        addps %xmm3, %xmm7
        pshufd $0xff, %xmm1, %xmm3
        movaps -16 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm3
        movaps 0 * SIZE(AA), %xmm0

        subl $-32 * SIZE, AA
        subl $-16 * SIZE, BB
        subl $1, %eax
        jne .L42
        ALIGN_4

.L45:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L48
        ALIGN_4

.L46:
        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        addps %xmm3, %xmm5
        pshufd $0x55, %xmm1, %xmm3
        movsd -30 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm3
        movaps -28 * SIZE(AA), %xmm0

        addl $4 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L46
        ALIGN_4

.L48:
        movss ALPHA, %xmm1
        addps %xmm6, %xmm4
        addps %xmm7, %xmm5
        addps %xmm2, %xmm4
        addps %xmm3, %xmm5

        shufps $0, %xmm1, %xmm1
        mulps %xmm1, %xmm4
        mulps %xmm1, %xmm5

#ifndef TRMMKERNEL
        movsd 0 * SIZE(C1), %xmm0
        movhps 2 * SIZE(C1), %xmm0
        movsd 0 * SIZE(C1, LDC), %xmm1
        movhps 2 * SIZE(C1, LDC), %xmm1

        addps %xmm0, %xmm4
        addps %xmm1, %xmm5
#endif

        movsd %xmm4, 0 * SIZE(C1)
        movhps %xmm4, 2 * SIZE(C1)
        movsd %xmm5, 0 * SIZE(C1, LDC)
        movhps %xmm5, 2 * SIZE(C1, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $4, KK
#endif

        addl $4 * SIZE, C1
        decl I
        jg .L41
        ALIGN_4

.L50:
        movl M, I
        testl $2, I
        jle .L60

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 2), BB
#endif

        movaps -32 * SIZE(AA), %xmm0
        pxor %xmm3, %xmm3
        movaps -32 * SIZE(BB), %xmm1
        pxor %xmm4, %xmm4
        pxor %xmm5, %xmm5

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $2, %eax
#else
        addl $2, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L55
        ALIGN_4

.L52:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x44, %xmm0, %xmm2
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        pshufd $0xee, %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -28 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        pshufd $0x44, %xmm0, %xmm2
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        pshufd $0xee, %xmm0, %xmm2
        movaps -24 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -24 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        pshufd $0x44, %xmm0, %xmm2
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        pshufd $0xee, %xmm0, %xmm2
        movaps -20 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -20 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        pshufd $0x44, %xmm0, %xmm2
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        mulps %xmm2, %xmm3
        pshufd $0xee, %xmm0, %xmm2
        movaps -16 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        pshufd $0xfa, %xmm1, %xmm3
        movaps -16 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        subl $-16 * SIZE, AA
        subl $-16 * SIZE, BB
        subl $1, %eax
        jne .L52
        ALIGN_4

.L55:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L58
        ALIGN_4

.L56:
        pshufd $0x44, %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        addps %xmm3, %xmm4
        pshufd $0x50, %xmm1, %xmm3
        movsd -30 * SIZE(BB), %xmm1
        mulps %xmm2, %xmm3

        addl $2 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L56
        ALIGN_4

.L58:
        movss ALPHA, %xmm1
        addps %xmm3, %xmm4
        addps %xmm5, %xmm4

        shufps $0, %xmm1, %xmm1
        mulps %xmm1, %xmm4

#ifndef TRMMKERNEL
        movsd 0 * SIZE(C1), %xmm0
        movhps 0 * SIZE(C1, LDC), %xmm0
        addps %xmm0, %xmm4
#endif

        movsd %xmm4, 0 * SIZE(C1)
        movhps %xmm4, 0 * SIZE(C1, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $2, KK
#endif

        addl $2 * SIZE, C1
        ALIGN_4

.L60:
        movl M, I
        testl $1, I
        jle .L69

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, AA
        leal (BB, %eax, 2), BB
#endif

        pxor %xmm4, %xmm4
        movsd -32 * SIZE(AA), %xmm0
        pxor %xmm5, %xmm5
        movsd -32 * SIZE(BB), %xmm1

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $2, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L65
        ALIGN_4

.L62:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -30 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm5
        movsd -28 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -26 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -28 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm5
        movsd -24 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -22 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -26 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm5
        movsd -20 * SIZE(BB), %xmm1

        pshufd $0x00, %xmm0, %xmm2
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -18 * SIZE(BB), %xmm1

        pshufd $0x55, %xmm0, %xmm2
        movsd -24 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm5
        movsd -16 * SIZE(BB), %xmm1

        subl $-8 * SIZE, AA
        subl $-16 * SIZE, BB
        subl $1, %eax
        jne .L62
        ALIGN_4

.L65:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L68
        ALIGN_4

.L66:
        pshufd $0x00, %xmm0, %xmm2
        movss -31 * SIZE(AA), %xmm0
        mulps %xmm2, %xmm1
        addps %xmm1, %xmm4
        movsd -30 * SIZE(BB), %xmm1

        addl $1 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L66
        ALIGN_4

.L68:
        movss ALPHA, %xmm1
        addps %xmm5, %xmm4

        shufps $0, %xmm1, %xmm1
        mulps %xmm1, %xmm4

        pshufd $0x55, %xmm4, %xmm5
        pshufd $0x00, %xmm4, %xmm4

#ifndef TRMMKERNEL
        movss 0 * SIZE(C1), %xmm0
        movss 0 * SIZE(C1, LDC), %xmm1

        addss %xmm0, %xmm4
        addss %xmm1, %xmm5
#endif

        movss %xmm4, 0 * SIZE(C1)
        movss %xmm5, 0 * SIZE(C1, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        addl %eax, AA
        leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $1, KK
#endif
        ALIGN_4

.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl $2, KK
#endif

        movl BB, B

        leal (, LDC, 2), %eax
        addl %eax, C
        ALIGN_4
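/* N-remainder of 1 column: 4x1, 2x1 and 1x1 blocks; the 1x1 case
   finishes with a horizontal add (haddps). */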
.L70:
        movl N, %eax
        testl $1, %eax
        jle .L999

#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl C, C1
        movl A, AA

        movl M, I
        sarl $2, I
        jle .L80
        ALIGN_4

.L71:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        addl %eax, BB
#endif

        movaps -32 * SIZE(AA), %xmm0
        pxor %xmm2, %xmm2
        movsd -32 * SIZE(BB), %xmm1
        pxor %xmm4, %xmm4
        prefetcht0 3 * SIZE(C1)
        pxor %xmm5, %xmm5

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $4, %eax
#else
        addl $1, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L75
        ALIGN_4

.L72:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0

        addps %xmm2, %xmm5
        pshufd $0x55, %xmm1, %xmm2
        movsd -30 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -24 * SIZE(AA), %xmm0

        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        movaps -20 * SIZE(AA), %xmm0

        addps %xmm2, %xmm5
        pshufd $0x55, %xmm1, %xmm2
        movsd -28 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -16 * SIZE(AA), %xmm0

        PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0

        addps %xmm2, %xmm5
        pshufd $0x55, %xmm1, %xmm2
        movsd -26 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0

        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        movaps -4 * SIZE(AA), %xmm0

        addps %xmm2, %xmm5
        pshufd $0x55, %xmm1, %xmm2
        movsd -24 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps 0 * SIZE(AA), %xmm0

        subl $-32 * SIZE, AA
        subl $-8 * SIZE, BB
        subl $1, %eax
        jne .L72
        ALIGN_4

.L75:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L78
        ALIGN_4

.L76:
        addps %xmm2, %xmm4
        pshufd $0x00, %xmm1, %xmm2
        movss -31 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movaps -28 * SIZE(AA), %xmm0

        addl $4 * SIZE, AA
        addl $1 * SIZE, BB
        decl %eax
        jg .L76
        ALIGN_4

.L78:
        movss ALPHA, %xmm1
        addps %xmm2, %xmm4
        addps %xmm5, %xmm4

        shufps $0, %xmm1, %xmm1
        mulps %xmm1, %xmm4

#ifndef TRMMKERNEL
        movsd 0 * SIZE(C1), %xmm0
        movhps 2 * SIZE(C1), %xmm0
        addps %xmm0, %xmm4
#endif

        movsd %xmm4, 0 * SIZE(C1)
        movhps %xmm4, 2 * SIZE(C1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 4), AA
        addl %eax, BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $4, KK
#endif

        addl $4 * SIZE, C1
        decl I
        jg .L71
        ALIGN_4

.L80:
        movl M, I
        testl $2, I
        jle .L90

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        addl %eax, BB
#endif

        movsd -32 * SIZE(AA), %xmm0
        pxor %xmm3, %xmm3
        movsd -32 * SIZE(BB), %xmm1
        pxor %xmm4, %xmm4
        pxor %xmm5, %xmm5

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $2, %eax
#else
        addl $1, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L85
        ALIGN_4

.L82:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        addps %xmm2, %xmm4

        pshufd $0x55, %xmm1, %xmm2
        movsd -30 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movsd -28 * SIZE(AA), %xmm0
        addps %xmm2, %xmm5

        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        movsd -26 * SIZE(AA), %xmm0
        addps %xmm2, %xmm4

        pshufd $0x55, %xmm1, %xmm2
        movsd -28 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movsd -24 * SIZE(AA), %xmm0
        addps %xmm2, %xmm5

        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        movsd -22 * SIZE(AA), %xmm0
        addps %xmm2, %xmm4

        pshufd $0x55, %xmm1, %xmm2
        movsd -26 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movsd -20 * SIZE(AA), %xmm0
        addps %xmm2, %xmm5

        pshufd $0x00, %xmm1, %xmm2
        mulps %xmm0, %xmm2
        movsd -18 * SIZE(AA), %xmm0
        addps %xmm2, %xmm4

        pshufd $0x55, %xmm1, %xmm2
        movsd -24 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movsd -16 * SIZE(AA), %xmm0
        addps %xmm2, %xmm5

        subl $-16 * SIZE, AA
        subl $-8 * SIZE, BB
        subl $1, %eax
        jne .L82
        ALIGN_4

.L85:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L88
        ALIGN_4

.L86:
        pshufd $0x00, %xmm1, %xmm2
        movss -31 * SIZE(BB), %xmm1
        mulps %xmm0, %xmm2
        movsd -30 * SIZE(AA), %xmm0
        addps %xmm2, %xmm4

        addl $2 * SIZE, AA
        addl $1 * SIZE, BB
        decl %eax
        jg .L86
        ALIGN_4

.L88:
        movss ALPHA, %xmm1
        addps %xmm5, %xmm4

        shufps $0, %xmm1, %xmm1
        mulps %xmm1, %xmm4

#ifndef TRMMKERNEL
        movsd 0 * SIZE(C1), %xmm0
        addps %xmm0, %xmm4
#endif

        movsd %xmm4, 0 * SIZE(C1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        addl %eax, BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $2, KK
#endif

        addl $2 * SIZE, C1
        ALIGN_4

.L90:
        movl M, I
        testl $1, I
        jle .L999

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl B, BB
#else
        movl B, BB
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, AA
        addl %eax, BB
#endif

        pxor %xmm4, %xmm4
        movsd -32 * SIZE(AA), %xmm0
        pxor %xmm5, %xmm5
        movsd -32 * SIZE(BB), %xmm1

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $1, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L95
        ALIGN_4

.L92:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        mulps %xmm0, %xmm1
        movsd -30 * SIZE(AA), %xmm0
        addps %xmm1, %xmm4
        movsd -30 * SIZE(BB), %xmm1

        mulps %xmm0, %xmm1
        movsd -28 * SIZE(AA), %xmm0
        addps %xmm1, %xmm4
        movsd -28 * SIZE(BB), %xmm1

        mulps %xmm0, %xmm1
        movsd -26 * SIZE(AA), %xmm0
        addps %xmm1, %xmm4
        movsd -26 * SIZE(BB), %xmm1

        mulps %xmm0, %xmm1
        movsd -24 * SIZE(AA), %xmm0
        addps %xmm1, %xmm4
        movsd -24 * SIZE(BB), %xmm1

        subl $-8 * SIZE, AA
        subl $-8 * SIZE, BB
        subl $1, %eax
        jne .L92
        ALIGN_4

.L95:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L98
        ALIGN_4

.L96:
        mulss %xmm0, %xmm1
        movss -31 * SIZE(AA), %xmm0
        addss %xmm1, %xmm4
        movss -31 * SIZE(BB), %xmm1

        addl $1 * SIZE, AA
        addl $1 * SIZE, BB
        decl %eax
        jg .L96
        ALIGN_4

.L98:
        movss ALPHA, %xmm1
        haddps %xmm4, %xmm4
        mulss %xmm1, %xmm4

#ifndef TRMMKERNEL
        movss 0 * SIZE(C1), %xmm0
        addss %xmm0, %xmm4
#endif

        movss %xmm4, 0 * SIZE(C1)
        ALIGN_4
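/* Common exit: restore callee-saved registers, release the scratch
   area, and return. */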
.L999:
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp

        addl $ARGS, %esp
        ret
        EPILOGUE
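For orientation, here is a minimal C model of what the main path of this kernel computes. The function name and the packed-panel layout of A and B are assumptions inferred from the pointer arithmetic above (AA advances 4*K floats per row block, BB advances 4*K floats per column panel), not definitions taken from this file; only the case with m and n divisible by 4 is sketched, while the assembly also covers the 2- and 1-wide tails.

/* Reference model: C = alpha * A * B + C, with A and B in packed
 * panel form and C column-major with leading dimension ldc.
 * Hypothetical name; sketch assumes m % 4 == 0 and n % 4 == 0. */
static void sgemm_kernel_4x4_ref(long m, long n, long k, float alpha,
                                 const float *A, const float *B,
                                 float *C, long ldc)
{
    for (long j0 = 0; j0 < n; j0 += 4) {        /* 4-column panel of B */
        const float *bp = B + j0 * k;           /* panel = 4*k floats  */
        for (long i0 = 0; i0 < m; i0 += 4) {    /* 4-row panel of A    */
            const float *ap = A + i0 * k;
            float acc[4][4] = {{0.0f}};
            for (long l = 0; l < k; l++)        /* one rank-1 update per k */
                for (int j = 0; j < 4; j++)
                    for (int i = 0; i < 4; i++)
                        acc[j][i] += ap[4 * l + i] * bp[4 * l + j];
            for (int j = 0; j < 4; j++)         /* scale, then add into C */
                for (int i = 0; i < 4; i++)
                    C[(j0 + j) * ldc + (i0 + i)] += alpha * acc[j][i];
        }
    }
}

The assembly computes the same sums; its accumulators simply hold rotated diagonals of acc so that each k step needs only one pshufd and one mulps per addps, with no horizontal operations inside the loop.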