
gemm_kernel_2x4_3dnow.S (44 kB)

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 0
  42. #define OLD_M 4 + STACK + ARGS(%esi)
  43. #define OLD_N 8 + STACK + ARGS(%esi)
  44. #define OLD_K 12 + STACK + ARGS(%esi)
  45. #define OLD_ALPHA 16 + STACK + ARGS(%esi)
  46. #define OLD_A 20 + STACK + ARGS(%esi)
  47. #define OLD_B 24 + STACK + ARGS(%esi)
  48. #define OLD_C 28 + STACK + ARGS(%esi)
  49. #define OLD_LDC 32 + STACK + ARGS(%esi)
  50. #define OLD_OFFSET 36 + STACK + ARGS(%esi)
  51. #define ALPHA 0(%esp)
  52. #define K 8(%esp)
  53. #define N 12(%esp)
  54. #define M 16(%esp)
  55. #define A 20(%esp)
  56. #define C 24(%esp)
  57. #define J 28(%esp)
  58. #define OLD_STACK 32(%esp)
  59. #define OFFSET 36(%esp)
  60. #define KK 40(%esp)
  61. #define KKK 44(%esp)
  62. #define BUFFER 64(%esp)
  63. #define AA %edx
  64. #define BB %ecx
  65. #define PREFETCHSIZE (16 * 2 + 6)
  66. #define AOFFSET -32
  67. #define BOFFSET 128
  68. /*
  69. The instruction scheduling follows a hint from the following URL:
  70. https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11
  71. */
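/* Added note: this is the single-precision 2x4 GEMM micro-kernel for
   3DNow! processors.  It computes C += alpha * A * B (the TRMMKERNEL
   build skips the load of C), walking N in blocks of 4, 2 and 1 columns
   and M in blocks of 2 and 1 rows, with the packed pfmul/pfadd
   instructions operating on pairs of floats in the MMX registers. */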
  72. PROLOGUE
  73. pushl %ebp
  74. pushl %edi
  75. pushl %esi
  76. pushl %ebx
  77. PROFCODE
  78. EMMS
  79. movl %esp, %esi # save old stack
  80. subl $128 + LOCAL_BUFFER_SIZE, %esp
  81. movl OLD_M, %ebx
  82. andl $-1024, %esp # align stack
  83. STACK_TOUCHING
  84. movl OLD_N, %eax
  85. movl OLD_K, %ecx
  86. movl OLD_A, %edx
  87. movd OLD_ALPHA, %mm3
  88. movl %ebx, M
  89. movl %eax, N
  90. movl %ecx, K
  91. subl $AOFFSET * SIZE, %edx
  92. movl %edx, A
  93. movl %esi, OLD_STACK
  94. movl OLD_B, %edi
  95. movl OLD_C, %ebx
  96. punpckldq %mm3, %mm3
  97. movq %mm3, ALPHA
  98. movl %ebx, C
  99. movl OLD_LDC, %ebp
  100. leal (, %ebp, SIZE), %ebp
  101. #ifdef TRMMKERNEL
  102. movl OLD_OFFSET, %eax
  103. movl %eax, OFFSET
  104. #ifndef LEFT
  105. negl %eax
  106. movl %eax, KK
  107. #endif
  108. #endif
  109. movl N, %eax
  110. sarl $2, %eax
  111. movl %eax, J
  112. jle .L30
  113. ALIGN_3
  114. .L01:
  115. /* Copying to Sub Buffer */
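/* Each scalar of B is duplicated into both halves of a quadword with
   punpckldq before being written to BUFFER, so that a single movq later
   yields {b, b} and one pfmul applies it to two rows of A at once. */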
  116. leal BUFFER, %ecx
  117. #if defined(TRMMKERNEL) && defined(LEFT)
  118. movl OFFSET, %eax
  119. movl %eax, KK
  120. #endif
  121. movl K, %eax
  122. sarl $2, %eax
  123. jle .L03
  124. ALIGN_3
  125. .L02:
  126. movd 0 * SIZE(%edi), %mm0
  127. movd 1 * SIZE(%edi), %mm1
  128. movd 2 * SIZE(%edi), %mm2
  129. movd 3 * SIZE(%edi), %mm3
  130. movd 4 * SIZE(%edi), %mm4
  131. movd 5 * SIZE(%edi), %mm5
  132. movd 6 * SIZE(%edi), %mm6
  133. movd 7 * SIZE(%edi), %mm7
  134. prefetchnta 72 * SIZE(%edi)
  135. punpckldq %mm0, %mm0
  136. punpckldq %mm1, %mm1
  137. punpckldq %mm2, %mm2
  138. punpckldq %mm3, %mm3
  139. punpckldq %mm4, %mm4
  140. punpckldq %mm5, %mm5
  141. punpckldq %mm6, %mm6
  142. punpckldq %mm7, %mm7
  143. movq %mm0, 0 * SIZE(%ecx)
  144. movq %mm1, 2 * SIZE(%ecx)
  145. movq %mm2, 4 * SIZE(%ecx)
  146. movq %mm3, 6 * SIZE(%ecx)
  147. movq %mm4, 8 * SIZE(%ecx)
  148. movq %mm5, 10 * SIZE(%ecx)
  149. movq %mm6, 12 * SIZE(%ecx)
  150. movq %mm7, 14 * SIZE(%ecx)
  151. movd 8 * SIZE(%edi), %mm0
  152. movd 9 * SIZE(%edi), %mm1
  153. movd 10 * SIZE(%edi), %mm2
  154. movd 11 * SIZE(%edi), %mm3
  155. movd 12 * SIZE(%edi), %mm4
  156. movd 13 * SIZE(%edi), %mm5
  157. movd 14 * SIZE(%edi), %mm6
  158. movd 15 * SIZE(%edi), %mm7
  159. punpckldq %mm0, %mm0
  160. punpckldq %mm1, %mm1
  161. punpckldq %mm2, %mm2
  162. punpckldq %mm3, %mm3
  163. punpckldq %mm4, %mm4
  164. punpckldq %mm5, %mm5
  165. punpckldq %mm6, %mm6
  166. punpckldq %mm7, %mm7
  167. movq %mm0, 16 * SIZE(%ecx)
  168. movq %mm1, 18 * SIZE(%ecx)
  169. movq %mm2, 20 * SIZE(%ecx)
  170. movq %mm3, 22 * SIZE(%ecx)
  171. movq %mm4, 24 * SIZE(%ecx)
  172. movq %mm5, 26 * SIZE(%ecx)
  173. movq %mm6, 28 * SIZE(%ecx)
  174. movq %mm7, 30 * SIZE(%ecx)
  175. addl $16 * SIZE, %edi
  176. addl $32 * SIZE, %ecx
  177. decl %eax
  178. jne .L02
  179. .L03:
  180. movl K, %eax
  181. andl $3, %eax
  182. BRANCH
  183. jle .L10
  184. ALIGN_2
  185. .L04:
  186. movd 0 * SIZE(%edi), %mm0
  187. movd 1 * SIZE(%edi), %mm1
  188. movd 2 * SIZE(%edi), %mm2
  189. movd 3 * SIZE(%edi), %mm3
  190. punpckldq %mm0, %mm0
  191. punpckldq %mm1, %mm1
  192. punpckldq %mm2, %mm2
  193. punpckldq %mm3, %mm3
  194. movq %mm0, 0 * SIZE(%ecx)
  195. movq %mm1, 2 * SIZE(%ecx)
  196. movq %mm2, 4 * SIZE(%ecx)
  197. movq %mm3, 6 * SIZE(%ecx)
  198. addl $4 * SIZE, %edi
  199. addl $8 * SIZE, %ecx
  200. decl %eax
  201. jne .L04
  202. ALIGN_4
  203. .L10:
  204. movl C, %esi # coffset = c
  205. movl A, %edx # aoffset = a
  206. movl M, %ebx
  207. sarl $1, %ebx # i = (m >> 1)
  208. jle .L20
  209. ALIGN_4
  210. .L11:
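/* 2x4 micro-kernel: %mm4..%mm7 each accumulate a 2x1 slice of C for one
   of the four current columns; the .L12 loop below is unrolled 16 deep
   in the K direction, and the .L16 loop handles the K % 16 tail. */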
  211. leal - BOFFSET * SIZE + BUFFER, BB
  212. #if !defined(TRMMKERNEL) || \
  213. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  214. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  215. #else
  216. movl KK, %eax
  217. leal (, %eax, SIZE), %eax
  218. leal (AA, %eax, 2), AA
  219. leal (BB, %eax, 8), BB
  220. #endif
  221. movq ( 0 + AOFFSET) * SIZE(AA), %mm0
  222. pxor %mm4, %mm4
  223. movq ( 16 + AOFFSET) * SIZE(AA), %mm1
  224. pxor %mm5, %mm5
  225. PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2
  226. pxor %mm6, %mm6
  227. PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3
  228. pxor %mm7, %mm7
  229. leal (%ebp, %ebp, 2), %eax
  230. prefetchw 2 * SIZE(%esi)
  231. prefetchw 2 * SIZE(%esi, %ebp)
  232. prefetchw 2 * SIZE(%esi, %ebp, 2)
  233. prefetchw 2 * SIZE(%esi, %eax)
  234. #ifndef TRMMKERNEL
  235. movl K, %eax
  236. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  237. movl K, %eax
  238. subl KK, %eax
  239. movl %eax, KKK
  240. #else
  241. movl KK, %eax
  242. #ifdef LEFT
  243. addl $2, %eax
  244. #else
  245. addl $4, %eax
  246. #endif
  247. movl %eax, KKK
  248. #endif
  249. sarl $4, %eax
  250. je .L15
  251. ALIGN_4
  252. .L12:
  253. pfmul %mm0, %mm2
  254. pfadd %mm2, %mm4
  255. PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2
  256. pfmul %mm0, %mm2
  257. pfadd %mm2, %mm5
  258. PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2
  259. pfmul %mm0, %mm2
  260. pfadd %mm2, %mm6
  261. PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA)
  262. PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2
  263. pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
  264. pfadd %mm0, %mm7
  265. movq ( 2 + AOFFSET) * SIZE(AA), %mm0
  266. pfmul %mm0, %mm2
  267. pfadd %mm2, %mm4
  268. PADDING movq ( 10 + BOFFSET) * SIZE(BB), %mm2
  269. pfmul %mm0, %mm2
  270. pfadd %mm2, %mm5
  271. PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2
  272. pfmul %mm0, %mm2
  273. pfadd %mm2, %mm6
  274. PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2
  275. pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0
  276. pfadd %mm0, %mm7
  277. movq ( 4 + AOFFSET) * SIZE(AA), %mm0
  278. pfmul %mm0, %mm3
  279. pfadd %mm3, %mm4
  280. PADDING movq ( 18 + BOFFSET) * SIZE(BB), %mm3
  281. pfmul %mm0, %mm3
  282. pfadd %mm3, %mm5
  283. PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3
  284. pfmul %mm0, %mm3
  285. pfadd %mm3, %mm6
  286. PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3
  287. pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0
  288. pfadd %mm0, %mm7
  289. movq ( 6 + AOFFSET) * SIZE(AA), %mm0
  290. pfmul %mm0, %mm3
  291. pfadd %mm3, %mm4
  292. PADDING movq ( 26 + BOFFSET) * SIZE(BB), %mm3
  293. pfmul %mm0, %mm3
  294. pfadd %mm3, %mm5
  295. PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3
  296. pfmul %mm0, %mm3
  297. pfadd %mm3, %mm6
  298. PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3
  299. pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0
  300. pfadd %mm0, %mm7
  301. movq ( 8 + AOFFSET) * SIZE(AA), %mm0
  302. pfmul %mm0, %mm2
  303. pfadd %mm2, %mm4
  304. PADDING movq ( 34 + BOFFSET) * SIZE(BB), %mm2
  305. pfmul %mm0, %mm2
  306. pfadd %mm2, %mm5
  307. PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2
  308. pfmul %mm0, %mm2
  309. pfadd %mm2, %mm6
  310. PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2
  311. pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0
  312. pfadd %mm0, %mm7
  313. movq ( 10 + AOFFSET) * SIZE(AA), %mm0
  314. pfmul %mm0, %mm2
  315. pfadd %mm2, %mm4
  316. PADDING movq ( 42 + BOFFSET) * SIZE(BB), %mm2
  317. pfmul %mm0, %mm2
  318. pfadd %mm2, %mm5
  319. PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2
  320. pfmul %mm0, %mm2
  321. pfadd %mm2, %mm6
  322. PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2
  323. pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0
  324. pfadd %mm0, %mm7
  325. movq ( 12 + AOFFSET) * SIZE(AA), %mm0
  326. pfmul %mm0, %mm3
  327. pfadd %mm3, %mm4
  328. PADDING movq ( 50 + BOFFSET) * SIZE(BB), %mm3
  329. pfmul %mm0, %mm3
  330. pfadd %mm3, %mm5
  331. PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3
  332. pfmul %mm0, %mm3
  333. pfadd %mm3, %mm6
  334. PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3
  335. pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0
  336. pfadd %mm0, %mm7
  337. movq ( 14 + AOFFSET) * SIZE(AA), %mm0
  338. pfmul %mm0, %mm3
  339. pfadd %mm3, %mm4
  340. PADDING movq ( 58 + BOFFSET) * SIZE(BB), %mm3
  341. pfmul %mm0, %mm3
  342. pfadd %mm3, %mm5
  343. PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3
  344. pfmul %mm0, %mm3
  345. pfadd %mm3, %mm6
  346. PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3
  347. pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0
  348. pfadd %mm0, %mm7
  349. movq ( 32 + AOFFSET) * SIZE(AA), %mm0
  350. pfmul %mm1, %mm2
  351. pfadd %mm2, %mm4
  352. PADDING movq ( 66 + BOFFSET) * SIZE(BB), %mm2
  353. pfmul %mm1, %mm2
  354. pfadd %mm2, %mm5
  355. PADDING movq ( 68 + BOFFSET) * SIZE(BB), %mm2
  356. pfmul %mm1, %mm2
  357. pfadd %mm2, %mm6
  358. PADDING movq ( 72 + BOFFSET) * SIZE(BB), %mm2
  359. pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1
  360. pfadd %mm1, %mm7
  361. movq ( 18 + AOFFSET) * SIZE(AA), %mm1
  362. pfmul %mm1, %mm2
  363. pfadd %mm2, %mm4
  364. PADDING movq ( 74 + BOFFSET) * SIZE(BB), %mm2
  365. pfmul %mm1, %mm2
  366. pfadd %mm2, %mm5
  367. PADDING movq ( 76 + BOFFSET) * SIZE(BB), %mm2
  368. pfmul %mm1, %mm2
  369. pfadd %mm2, %mm6
  370. PADDING movq ( 96 + BOFFSET) * SIZE(BB), %mm2
  371. pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1
  372. pfadd %mm1, %mm7
  373. movq ( 20 + AOFFSET) * SIZE(AA), %mm1
  374. pfmul %mm1, %mm3
  375. pfadd %mm3, %mm4
  376. PADDING movq ( 82 + BOFFSET) * SIZE(BB), %mm3
  377. pfmul %mm1, %mm3
  378. pfadd %mm3, %mm5
  379. PADDING movq ( 84 + BOFFSET) * SIZE(BB), %mm3
  380. pfmul %mm1, %mm3
  381. pfadd %mm3, %mm6
  382. PADDING movq ( 88 + BOFFSET) * SIZE(BB), %mm3
  383. pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1
  384. pfadd %mm1, %mm7
  385. movq ( 22 + AOFFSET) * SIZE(AA), %mm1
  386. pfmul %mm1, %mm3
  387. pfadd %mm3, %mm4
  388. PADDING movq ( 90 + BOFFSET) * SIZE(BB), %mm3
  389. pfmul %mm1, %mm3
  390. pfadd %mm3, %mm5
  391. PADDING movq ( 92 + BOFFSET) * SIZE(BB), %mm3
  392. pfmul %mm1, %mm3
  393. pfadd %mm3, %mm6
  394. PADDING movq (112 + BOFFSET) * SIZE(BB), %mm3
  395. pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1
  396. pfadd %mm1, %mm7
  397. movq ( 24 + AOFFSET) * SIZE(AA), %mm1
  398. pfmul %mm1, %mm2
  399. pfadd %mm2, %mm4
  400. PADDING movq ( 98 + BOFFSET) * SIZE(BB), %mm2
  401. pfmul %mm1, %mm2
  402. pfadd %mm2, %mm5
  403. PADDING movq (100 + BOFFSET) * SIZE(BB), %mm2
  404. pfmul %mm1, %mm2
  405. pfadd %mm2, %mm6
  406. PADDING movq (104 + BOFFSET) * SIZE(BB), %mm2
  407. pfmul (102 + BOFFSET) * SIZE(BB), %mm1
  408. pfadd %mm1, %mm7
  409. movq ( 26 + AOFFSET) * SIZE(AA), %mm1
  410. pfmul %mm1, %mm2
  411. pfadd %mm2, %mm4
  412. PADDING movq (106 + BOFFSET) * SIZE(BB), %mm2
  413. pfmul %mm1, %mm2
  414. pfadd %mm2, %mm5
  415. PADDING movq (108 + BOFFSET) * SIZE(BB), %mm2
  416. pfmul %mm1, %mm2
  417. pfadd %mm2, %mm6
  418. PADDING movq (128 + BOFFSET) * SIZE(BB), %mm2
  419. pfmul (110 + BOFFSET) * SIZE(BB), %mm1
  420. pfadd %mm1, %mm7
  421. movq ( 28 + AOFFSET) * SIZE(AA), %mm1
  422. pfmul %mm1, %mm3
  423. pfadd %mm3, %mm4
  424. PADDING movq (114 + BOFFSET) * SIZE(BB), %mm3
  425. pfmul %mm1, %mm3
  426. pfadd %mm3, %mm5
  427. PADDING movq (116 + BOFFSET) * SIZE(BB), %mm3
  428. pfmul %mm1, %mm3
  429. pfadd %mm3, %mm6
  430. PADDING movq (120 + BOFFSET) * SIZE(BB), %mm3
  431. pfmul (118 + BOFFSET) * SIZE(BB), %mm1
  432. pfadd %mm1, %mm7
  433. movq ( 30 + AOFFSET) * SIZE(AA), %mm1
  434. pfmul %mm1, %mm3
  435. pfadd %mm3, %mm4
  436. PADDING movq (122 + BOFFSET) * SIZE(BB), %mm3
  437. pfmul %mm1, %mm3
  438. pfadd %mm3, %mm5
  439. PADDING movq (124 + BOFFSET) * SIZE(BB), %mm3
  440. pfmul %mm1, %mm3
  441. pfadd %mm3, %mm6
  442. PADDING movq (144 + BOFFSET) * SIZE(BB), %mm3
  443. pfmul (126 + BOFFSET) * SIZE(BB), %mm1
  444. pfadd %mm1, %mm7
  445. movq ( 48 + AOFFSET) * SIZE(AA), %mm1
  446. subl $-32 * SIZE, AA
  447. addl $128 * SIZE, BB
  448. decl %eax
  449. jne .L12
  450. ALIGN_3
  451. .L15:
  452. movq ALPHA, %mm3
  453. #ifndef TRMMKERNEL
  454. movl K, %eax
  455. #else
  456. movl KKK, %eax
  457. #endif
  458. andl $15, %eax
  459. BRANCH
  460. je .L18
  461. ALIGN_3
  462. .L16:
  463. pfmul %mm0, %mm2
  464. pfadd %mm2, %mm4
  465. PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2
  466. pfmul %mm0, %mm2
  467. pfadd %mm2, %mm5
  468. PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2
  469. pfmul %mm0, %mm2
  470. pfadd %mm2, %mm6
  471. PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2
  472. pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
  473. pfadd %mm0, %mm7
  474. movq ( 2 + AOFFSET) * SIZE(AA), %mm0
  475. addl $2 * SIZE, AA
  476. addl $8 * SIZE, BB
  477. decl %eax
  478. jg .L16
  479. ALIGN_3
  480. .L18:
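/* Scale the four accumulators by alpha and, in the plain GEMM case,
   add the existing contents of C before storing each 2-element slice. */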
  481. leal (%ebp, %ebp, 2), %eax
  482. #ifndef TRMMKERNEL
  483. pfmul %mm3, %mm4
  484. pfadd 0 * SIZE(%esi), %mm4
  485. pfmul %mm3, %mm5
  486. pfadd 0 * SIZE(%esi, %ebp, 1), %mm5
  487. pfmul %mm3, %mm6
  488. pfadd 0 * SIZE(%esi, %ebp, 2), %mm6
  489. pfmul %mm3, %mm7
  490. pfadd 0 * SIZE(%esi, %eax, 1), %mm7
  491. #else
  492. pfmul %mm3, %mm4
  493. pfmul %mm3, %mm5
  494. pfmul %mm3, %mm6
  495. pfmul %mm3, %mm7
  496. #endif
  497. movq %mm4, 0 * SIZE(%esi)
  498. movq %mm5, 0 * SIZE(%esi, %ebp, 1)
  499. movq %mm6, 0 * SIZE(%esi, %ebp, 2)
  500. movq %mm7, 0 * SIZE(%esi, %eax, 1)
  501. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  502. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  503. movl K, %eax
  504. subl KKK, %eax
  505. leal (,%eax, SIZE), %eax
  506. leal (AA, %eax, 2), AA
  507. leal (BB, %eax, 8), BB
  508. #endif
  509. #if defined(TRMMKERNEL) && defined(LEFT)
  510. addl $2, KK
  511. #endif
  512. addl $2 * SIZE, %esi # coffset += 2
  513. decl %ebx # i --
  514. jg .L11
  515. ALIGN_4
  516. .L20:
  517. movl M, %ebx
  518. testl $1, %ebx # test m & 1
  519. jle .L29
  520. ALIGN_4
  521. .L21:
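/* M remainder: one leftover row (M odd) against the current four
   columns; the results are stored with scalar movd. */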
  522. leal - BOFFSET * SIZE + BUFFER, BB
  523. #if !defined(TRMMKERNEL) || \
  524. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  525. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  526. #else
  527. movl KK, %eax
  528. leal (, %eax, SIZE), %eax
  529. leal (AA, %eax, 1), AA
  530. leal (BB, %eax, 8), BB
  531. #endif
  532. movq ( 0 + AOFFSET) * SIZE(AA), %mm0
  533. pxor %mm4, %mm4
  534. movq ( 8 + AOFFSET) * SIZE(AA), %mm1
  535. pxor %mm5, %mm5
  536. PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2
  537. pxor %mm6, %mm6
  538. PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3
  539. pxor %mm7, %mm7
  540. #ifndef TRMMKERNEL
  541. movl K, %eax
  542. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  543. movl K, %eax
  544. subl KK, %eax
  545. movl %eax, KKK
  546. #else
  547. movl KK, %eax
  548. #ifdef LEFT
  549. addl $1, %eax
  550. #else
  551. addl $4, %eax
  552. #endif
  553. movl %eax, KKK
  554. #endif
  555. sarl $4, %eax
  556. je .L25
  557. ALIGN_4
  558. .L22:
  559. pfmul %mm0, %mm2
  560. pfadd %mm2, %mm4
  561. PADDING movd ( 2 + BOFFSET) * SIZE(BB), %mm2
  562. pfmul %mm0, %mm2
  563. pfadd %mm2, %mm5
  564. PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2
  565. pfmul %mm0, %mm2
  566. pfadd %mm2, %mm6
  567. PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA)
  568. PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2
  569. pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
  570. pfadd %mm0, %mm7
  571. movd ( 1 + AOFFSET) * SIZE(AA), %mm0
  572. pfmul %mm0, %mm2
  573. pfadd %mm2, %mm4
  574. PADDING movd ( 10 + BOFFSET) * SIZE(BB), %mm2
  575. pfmul %mm0, %mm2
  576. pfadd %mm2, %mm5
  577. PADDING movd ( 12 + BOFFSET) * SIZE(BB), %mm2
  578. pfmul %mm0, %mm2
  579. pfadd %mm2, %mm6
  580. PADDING movd ( 32 + BOFFSET) * SIZE(BB), %mm2
  581. pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0
  582. pfadd %mm0, %mm7
  583. movd ( 2 + AOFFSET) * SIZE(AA), %mm0
  584. pfmul %mm0, %mm3
  585. pfadd %mm3, %mm4
  586. PADDING movd ( 18 + BOFFSET) * SIZE(BB), %mm3
  587. pfmul %mm0, %mm3
  588. pfadd %mm3, %mm5
  589. PADDING movd ( 20 + BOFFSET) * SIZE(BB), %mm3
  590. pfmul %mm0, %mm3
  591. pfadd %mm3, %mm6
  592. PADDING movd ( 24 + BOFFSET) * SIZE(BB), %mm3
  593. pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0
  594. pfadd %mm0, %mm7
  595. movd ( 3 + AOFFSET) * SIZE(AA), %mm0
  596. pfmul %mm0, %mm3
  597. pfadd %mm3, %mm4
  598. PADDING movd ( 26 + BOFFSET) * SIZE(BB), %mm3
  599. pfmul %mm0, %mm3
  600. pfadd %mm3, %mm5
  601. PADDING movd ( 28 + BOFFSET) * SIZE(BB), %mm3
  602. pfmul %mm0, %mm3
  603. pfadd %mm3, %mm6
  604. PADDING movd ( 48 + BOFFSET) * SIZE(BB), %mm3
  605. pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0
  606. pfadd %mm0, %mm7
  607. movd ( 4 + AOFFSET) * SIZE(AA), %mm0
  608. pfmul %mm0, %mm2
  609. pfadd %mm2, %mm4
  610. PADDING movd ( 34 + BOFFSET) * SIZE(BB), %mm2
  611. pfmul %mm0, %mm2
  612. pfadd %mm2, %mm5
  613. PADDING movd ( 36 + BOFFSET) * SIZE(BB), %mm2
  614. pfmul %mm0, %mm2
  615. pfadd %mm2, %mm6
  616. PADDING movd ( 40 + BOFFSET) * SIZE(BB), %mm2
  617. pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0
  618. pfadd %mm0, %mm7
  619. movd ( 5 + AOFFSET) * SIZE(AA), %mm0
  620. pfmul %mm0, %mm2
  621. pfadd %mm2, %mm4
  622. PADDING movd ( 42 + BOFFSET) * SIZE(BB), %mm2
  623. pfmul %mm0, %mm2
  624. pfadd %mm2, %mm5
  625. PADDING movd ( 44 + BOFFSET) * SIZE(BB), %mm2
  626. pfmul %mm0, %mm2
  627. pfadd %mm2, %mm6
  628. PADDING movd ( 64 + BOFFSET) * SIZE(BB), %mm2
  629. pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0
  630. pfadd %mm0, %mm7
  631. movd ( 6 + AOFFSET) * SIZE(AA), %mm0
  632. pfmul %mm0, %mm3
  633. pfadd %mm3, %mm4
  634. PADDING movd ( 50 + BOFFSET) * SIZE(BB), %mm3
  635. pfmul %mm0, %mm3
  636. pfadd %mm3, %mm5
  637. PADDING movd ( 52 + BOFFSET) * SIZE(BB), %mm3
  638. pfmul %mm0, %mm3
  639. pfadd %mm3, %mm6
  640. PADDING movd ( 56 + BOFFSET) * SIZE(BB), %mm3
  641. pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0
  642. pfadd %mm0, %mm7
  643. movd ( 7 + AOFFSET) * SIZE(AA), %mm0
  644. pfmul %mm0, %mm3
  645. pfadd %mm3, %mm4
  646. PADDING movd ( 58 + BOFFSET) * SIZE(BB), %mm3
  647. pfmul %mm0, %mm3
  648. pfadd %mm3, %mm5
  649. PADDING movd ( 60 + BOFFSET) * SIZE(BB), %mm3
  650. pfmul %mm0, %mm3
  651. pfadd %mm3, %mm6
  652. PADDING movd ( 80 + BOFFSET) * SIZE(BB), %mm3
  653. pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0
  654. pfadd %mm0, %mm7
  655. movd ( 16 + AOFFSET) * SIZE(AA), %mm0
  656. pfmul %mm1, %mm2
  657. pfadd %mm2, %mm4
  658. PADDING movd ( 66 + BOFFSET) * SIZE(BB), %mm2
  659. pfmul %mm1, %mm2
  660. pfadd %mm2, %mm5
  661. PADDING movd ( 68 + BOFFSET) * SIZE(BB), %mm2
  662. pfmul %mm1, %mm2
  663. pfadd %mm2, %mm6
  664. PADDING movd ( 72 + BOFFSET) * SIZE(BB), %mm2
  665. pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1
  666. pfadd %mm1, %mm7
  667. movd ( 9 + AOFFSET) * SIZE(AA), %mm1
  668. pfmul %mm1, %mm2
  669. pfadd %mm2, %mm4
  670. PADDING movd ( 74 + BOFFSET) * SIZE(BB), %mm2
  671. pfmul %mm1, %mm2
  672. pfadd %mm2, %mm5
  673. PADDING movd ( 76 + BOFFSET) * SIZE(BB), %mm2
  674. pfmul %mm1, %mm2
  675. pfadd %mm2, %mm6
  676. PADDING movd ( 96 + BOFFSET) * SIZE(BB), %mm2
  677. pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1
  678. pfadd %mm1, %mm7
  679. movd ( 10 + AOFFSET) * SIZE(AA), %mm1
  680. pfmul %mm1, %mm3
  681. pfadd %mm3, %mm4
  682. PADDING movd ( 82 + BOFFSET) * SIZE(BB), %mm3
  683. pfmul %mm1, %mm3
  684. pfadd %mm3, %mm5
  685. PADDING movd ( 84 + BOFFSET) * SIZE(BB), %mm3
  686. pfmul %mm1, %mm3
  687. pfadd %mm3, %mm6
  688. PADDING movd ( 88 + BOFFSET) * SIZE(BB), %mm3
  689. pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1
  690. pfadd %mm1, %mm7
  691. movd ( 11 + AOFFSET) * SIZE(AA), %mm1
  692. pfmul %mm1, %mm3
  693. pfadd %mm3, %mm4
  694. PADDING movd ( 90 + BOFFSET) * SIZE(BB), %mm3
  695. pfmul %mm1, %mm3
  696. pfadd %mm3, %mm5
  697. PADDING movd ( 92 + BOFFSET) * SIZE(BB), %mm3
  698. pfmul %mm1, %mm3
  699. pfadd %mm3, %mm6
  700. PADDING movd (112 + BOFFSET) * SIZE(BB), %mm3
  701. pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1
  702. pfadd %mm1, %mm7
  703. movd ( 12 + AOFFSET) * SIZE(AA), %mm1
  704. pfmul %mm1, %mm2
  705. pfadd %mm2, %mm4
  706. PADDING movd ( 98 + BOFFSET) * SIZE(BB), %mm2
  707. pfmul %mm1, %mm2
  708. pfadd %mm2, %mm5
  709. PADDING movd (100 + BOFFSET) * SIZE(BB), %mm2
  710. pfmul %mm1, %mm2
  711. pfadd %mm2, %mm6
  712. PADDING movd (104 + BOFFSET) * SIZE(BB), %mm2
  713. pfmul (102 + BOFFSET) * SIZE(BB), %mm1
  714. pfadd %mm1, %mm7
  715. movd ( 13 + AOFFSET) * SIZE(AA), %mm1
  716. pfmul %mm1, %mm2
  717. pfadd %mm2, %mm4
  718. PADDING movd (106 + BOFFSET) * SIZE(BB), %mm2
  719. pfmul %mm1, %mm2
  720. pfadd %mm2, %mm5
  721. PADDING movd (108 + BOFFSET) * SIZE(BB), %mm2
  722. pfmul %mm1, %mm2
  723. pfadd %mm2, %mm6
  724. PADDING movd (128 + BOFFSET) * SIZE(BB), %mm2
  725. pfmul (110 + BOFFSET) * SIZE(BB), %mm1
  726. pfadd %mm1, %mm7
  727. movd ( 14 + AOFFSET) * SIZE(AA), %mm1
  728. pfmul %mm1, %mm3
  729. pfadd %mm3, %mm4
  730. PADDING movd (114 + BOFFSET) * SIZE(BB), %mm3
  731. pfmul %mm1, %mm3
  732. pfadd %mm3, %mm5
  733. PADDING movd (116 + BOFFSET) * SIZE(BB), %mm3
  734. pfmul %mm1, %mm3
  735. pfadd %mm3, %mm6
  736. PADDING movd (120 + BOFFSET) * SIZE(BB), %mm3
  737. pfmul (118 + BOFFSET) * SIZE(BB), %mm1
  738. pfadd %mm1, %mm7
  739. movd ( 15 + AOFFSET) * SIZE(AA), %mm1
  740. pfmul %mm1, %mm3
  741. pfadd %mm3, %mm4
  742. PADDING movd (122 + BOFFSET) * SIZE(BB), %mm3
  743. pfmul %mm1, %mm3
  744. pfadd %mm3, %mm5
  745. PADDING movd (124 + BOFFSET) * SIZE(BB), %mm3
  746. pfmul %mm1, %mm3
  747. pfadd %mm3, %mm6
  748. PADDING movd (144 + BOFFSET) * SIZE(BB), %mm3
  749. pfmul (126 + BOFFSET) * SIZE(BB), %mm1
  750. pfadd %mm1, %mm7
  751. movd ( 24 + AOFFSET) * SIZE(AA), %mm1
  752. subl $-16 * SIZE, AA
  753. addl $128 * SIZE, BB
  754. decl %eax
  755. jne .L22
  756. ALIGN_3
  757. .L25:
  758. movd ALPHA, %mm3
  759. #ifndef TRMMKERNEL
  760. movl K, %eax
  761. #else
  762. movl KKK, %eax
  763. #endif
  764. andl $15, %eax
  765. BRANCH
  766. je .L28
  767. ALIGN_3
  768. .L26:
  769. pfmul %mm0, %mm2
  770. pfadd %mm2, %mm4
  771. PADDING movd ( 2 + BOFFSET) * SIZE(BB), %mm2
  772. pfmul %mm0, %mm2
  773. pfadd %mm2, %mm5
  774. PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2
  775. pfmul %mm0, %mm2
  776. pfadd %mm2, %mm6
  777. PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2
  778. pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
  779. pfadd %mm0, %mm7
  780. movd ( 1 + AOFFSET) * SIZE(AA), %mm0
  781. addl $1 * SIZE, AA
  782. addl $8 * SIZE, BB
  783. decl %eax
  784. jg .L26
  785. ALIGN_3
  786. .L28:
  787. leal (%ebp, %ebp, 2), %eax
  788. pfmul %mm3, %mm4
  789. pfmul %mm3, %mm5
  790. pfmul %mm3, %mm6
  791. pfmul %mm3, %mm7
  792. #ifndef TRMMKERNEL
  793. movd 0 * SIZE(%esi) , %mm0
  794. movd 0 * SIZE(%esi, %ebp, 1), %mm1
  795. movd 0 * SIZE(%esi, %ebp, 2), %mm2
  796. movd 0 * SIZE(%esi, %eax, 1), %mm3
  797. pfadd %mm0, %mm4
  798. pfadd %mm1, %mm5
  799. pfadd %mm2, %mm6
  800. pfadd %mm3, %mm7
  801. #endif
  802. movd %mm4, 0 * SIZE(%esi)
  803. movd %mm5, 0 * SIZE(%esi, %ebp, 1)
  804. movd %mm6, 0 * SIZE(%esi, %ebp, 2)
  805. movd %mm7, 0 * SIZE(%esi, %eax, 1)
  806. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  807. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  808. movl K, %eax
  809. subl KKK, %eax
  810. leal (,%eax, SIZE), %eax
  811. leal (AA, %eax, 1), AA
  812. leal (BB, %eax, 8), BB
  813. #endif
  814. #if defined(TRMMKERNEL) && defined(LEFT)
  815. addl $1, KK
  816. #endif
  817. ALIGN_4
  818. .L29:
  819. #if defined(TRMMKERNEL) && !defined(LEFT)
  820. addl $4, KK
  821. #endif
  822. leal (, %ebp, 4), %eax
  823. addl %eax, C # c += 4 * ldc
  824. decl J # j --
  825. jg .L01
  826. ALIGN_4
  827. .L30:
  828. movl N, %eax
  829. testl $2, %eax
  830. jle .L60
  831. ALIGN_3
  832. .L31:
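/* N remainder: the next two columns (N & 2), handled with the same
   pack-then-multiply scheme as the four-column case. */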
  833. /* Copying to Sub Buffer */
  834. leal BUFFER, %ecx
  835. #if defined(TRMMKERNEL) && defined(LEFT)
  836. movl OFFSET, %eax
  837. movl %eax, KK
  838. #endif
  839. movl K, %eax
  840. sarl $2, %eax
  841. jle .L33
  842. ALIGN_3
  843. .L32:
  844. movd 0 * SIZE(%edi), %mm0
  845. movd 1 * SIZE(%edi), %mm1
  846. movd 2 * SIZE(%edi), %mm2
  847. movd 3 * SIZE(%edi), %mm3
  848. movd 4 * SIZE(%edi), %mm4
  849. movd 5 * SIZE(%edi), %mm5
  850. movd 6 * SIZE(%edi), %mm6
  851. movd 7 * SIZE(%edi), %mm7
  852. prefetchnta 72 * SIZE(%edi)
  853. punpckldq %mm0, %mm0
  854. punpckldq %mm1, %mm1
  855. punpckldq %mm2, %mm2
  856. punpckldq %mm3, %mm3
  857. punpckldq %mm4, %mm4
  858. punpckldq %mm5, %mm5
  859. punpckldq %mm6, %mm6
  860. punpckldq %mm7, %mm7
  861. movq %mm0, 0 * SIZE(%ecx)
  862. movq %mm1, 2 * SIZE(%ecx)
  863. movq %mm2, 4 * SIZE(%ecx)
  864. movq %mm3, 6 * SIZE(%ecx)
  865. movq %mm4, 8 * SIZE(%ecx)
  866. movq %mm5, 10 * SIZE(%ecx)
  867. movq %mm6, 12 * SIZE(%ecx)
  868. movq %mm7, 14 * SIZE(%ecx)
  869. addl $ 8 * SIZE, %edi
  870. addl $16 * SIZE, %ecx
  871. decl %eax
  872. jne .L32
  873. .L33:
  874. movl K, %eax
  875. andl $3, %eax
  876. BRANCH
  877. jle .L40
  878. ALIGN_2
  879. .L34:
  880. movd 0 * SIZE(%edi), %mm0
  881. movd 1 * SIZE(%edi), %mm1
  882. punpckldq %mm0, %mm0
  883. punpckldq %mm1, %mm1
  884. movq %mm0, 0 * SIZE(%ecx)
  885. movq %mm1, 2 * SIZE(%ecx)
  886. addl $2 * SIZE, %edi
  887. addl $4 * SIZE, %ecx
  888. decl %eax
  889. jne .L34
  890. ALIGN_4
  891. .L40:
  892. movl C, %esi # coffset = c
  893. movl A, %edx # aoffset = a
  894. movl M, %ebx
  895. sarl $1, %ebx # i = (m >> 1)
  896. jle .L50
  897. ALIGN_4
  898. .L41:
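/* 2x2 micro-kernel: two accumulators per column (%mm4/%mm6 for the
   first column, %mm5/%mm7 for the second) hide the pfadd latency;
   they are folded together at .L48. */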
  899. leal - BOFFSET * SIZE + BUFFER, BB
  900. #if !defined(TRMMKERNEL) || \
  901. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  902. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  903. #else
  904. movl KK, %eax
  905. leal (, %eax, SIZE), %eax
  906. leal (AA, %eax, 2), AA
  907. leal (BB, %eax, 4), BB
  908. #endif
  909. movq ( 0 + AOFFSET) * SIZE(AA), %mm0
  910. pxor %mm4, %mm4
  911. movq ( 16 + AOFFSET) * SIZE(AA), %mm1
  912. pxor %mm5, %mm5
  913. PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2
  914. pxor %mm6, %mm6
  915. PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3
  916. pxor %mm7, %mm7
  917. prefetchw 2 * SIZE(%esi)
  918. prefetchw 2 * SIZE(%esi, %ebp)
  919. #ifndef TRMMKERNEL
  920. movl K, %eax
  921. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  922. movl K, %eax
  923. subl KK, %eax
  924. movl %eax, KKK
  925. #else
  926. movl KK, %eax
  927. #ifdef LEFT
  928. addl $2, %eax
  929. #else
  930. addl $2, %eax
  931. #endif
  932. movl %eax, KKK
  933. #endif
  934. sarl $4, %eax
  935. je .L45
  936. ALIGN_4
  937. .L42:
  938. pfmul %mm0, %mm2
  939. pfadd %mm2, %mm4
  940. PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2
  941. pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0
  942. pfadd %mm0, %mm5
  943. movq ( 2 + AOFFSET) * SIZE(AA), %mm0
  944. PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA)
  945. pfmul %mm0, %mm2
  946. pfadd %mm2, %mm6
  947. PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2
  948. pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
  949. pfadd %mm0, %mm7
  950. movq ( 4 + AOFFSET) * SIZE(AA), %mm0
  951. pfmul %mm0, %mm2
  952. pfadd %mm2, %mm4
  953. PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2
  954. pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0
  955. pfadd %mm0, %mm5
  956. movq ( 6 + AOFFSET) * SIZE(AA), %mm0
  957. pfmul %mm0, %mm2
  958. pfadd %mm2, %mm6
  959. PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2
  960. pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0
  961. pfadd %mm0, %mm7
  962. movq ( 8 + AOFFSET) * SIZE(AA), %mm0
  963. pfmul %mm0, %mm3
  964. pfadd %mm3, %mm4
  965. PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3
  966. pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0
  967. pfadd %mm0, %mm5
  968. movq ( 10 + AOFFSET) * SIZE(AA), %mm0
  969. pfmul %mm0, %mm3
  970. pfadd %mm3, %mm6
  971. PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3
  972. pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0
  973. pfadd %mm0, %mm7
  974. movq ( 12 + AOFFSET) * SIZE(AA), %mm0
  975. pfmul %mm0, %mm3
  976. pfadd %mm3, %mm4
  977. PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3
  978. pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0
  979. pfadd %mm0, %mm5
  980. movq ( 14 + AOFFSET) * SIZE(AA), %mm0
  981. pfmul %mm0, %mm3
  982. pfadd %mm3, %mm6
  983. PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3
  984. pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0
  985. pfadd %mm0, %mm7
  986. movq ( 32 + AOFFSET) * SIZE(AA), %mm0
  987. pfmul %mm1, %mm2
  988. pfadd %mm2, %mm4
  989. PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2
  990. pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1
  991. pfadd %mm1, %mm5
  992. movq ( 18 + AOFFSET) * SIZE(AA), %mm1
  993. pfmul %mm1, %mm2
  994. pfadd %mm2, %mm6
  995. PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2
  996. pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1
  997. pfadd %mm1, %mm7
  998. movq ( 20 + AOFFSET) * SIZE(AA), %mm1
  999. pfmul %mm1, %mm2
  1000. pfadd %mm2, %mm4
  1001. PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2
  1002. pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1
  1003. pfadd %mm1, %mm5
  1004. movq ( 22 + AOFFSET) * SIZE(AA), %mm1
  1005. pfmul %mm1, %mm2
  1006. pfadd %mm2, %mm6
  1007. PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2
  1008. pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1
  1009. pfadd %mm1, %mm7
  1010. movq ( 24 + AOFFSET) * SIZE(AA), %mm1
  1011. pfmul %mm1, %mm3
  1012. pfadd %mm3, %mm4
  1013. PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3
  1014. pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1
  1015. pfadd %mm1, %mm5
  1016. movq ( 26 + AOFFSET) * SIZE(AA), %mm1
  1017. pfmul %mm1, %mm3
  1018. pfadd %mm3, %mm6
  1019. PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3
  1020. pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1
  1021. pfadd %mm1, %mm7
  1022. movq ( 28 + AOFFSET) * SIZE(AA), %mm1
  1023. pfmul %mm1, %mm3
  1024. pfadd %mm3, %mm4
  1025. PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3
  1026. pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1
  1027. pfadd %mm1, %mm5
  1028. movq ( 30 + AOFFSET) * SIZE(AA), %mm1
  1029. pfmul %mm1, %mm3
  1030. pfadd %mm3, %mm6
  1031. PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3
  1032. pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1
  1033. pfadd %mm1, %mm7
  1034. movq ( 48 + AOFFSET) * SIZE(AA), %mm1
  1035. subl $-32 * SIZE, AA
  1036. addl $ 64 * SIZE, BB
  1037. decl %eax
  1038. jne .L42
  1039. ALIGN_3
  1040. .L45:
  1041. movq ALPHA, %mm3
  1042. #ifndef TRMMKERNEL
  1043. movl K, %eax
  1044. #else
  1045. movl KKK, %eax
  1046. #endif
  1047. andl $15, %eax
  1048. BRANCH
  1049. je .L48
  1050. ALIGN_3
  1051. .L46:
  1052. pfmul %mm0, %mm2
  1053. pfadd %mm2, %mm4
  1054. PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2
  1055. pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0
  1056. pfadd %mm0, %mm5
  1057. movq ( 2 + AOFFSET) * SIZE(AA), %mm0
  1058. addl $2 * SIZE, AA
  1059. addl $4 * SIZE, BB
  1060. decl %eax
  1061. jg .L46
  1062. ALIGN_3
  1063. .L48:
  1064. pfadd %mm6, %mm4
  1065. pfadd %mm7, %mm5
  1066. pfmul %mm3, %mm4
  1067. pfmul %mm3, %mm5
  1068. #ifndef TRMMKERNEL
  1069. pfadd 0 * SIZE(%esi), %mm4
  1070. pfadd 0 * SIZE(%esi, %ebp, 1), %mm5
  1071. #endif
  1072. movq %mm4, 0 * SIZE(%esi)
  1073. movq %mm5, 0 * SIZE(%esi, %ebp, 1)
  1074. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1075. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1076. movl K, %eax
  1077. subl KKK, %eax
  1078. leal (,%eax, SIZE), %eax
  1079. leal (AA, %eax, 2), AA
  1080. leal (BB, %eax, 4), BB
  1081. #endif
  1082. #if defined(TRMMKERNEL) && defined(LEFT)
  1083. addl $2, KK
  1084. #endif
  1085. addl $2 * SIZE, %esi # coffset += 2
  1086. decl %ebx # i --
  1087. jg .L41
  1088. ALIGN_4
  1089. .L50:
  1090. movl M, %ebx
  1091. testl $1, %ebx # test m & 1
  1092. jle .L59
  1093. ALIGN_4
  1094. .L51:
  1095. leal - BOFFSET * SIZE + BUFFER, BB
  1096. #if !defined(TRMMKERNEL) || \
  1097. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1098. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1099. #else
  1100. movl KK, %eax
  1101. leal (, %eax, SIZE), %eax
  1102. leal (AA, %eax, 1), AA
  1103. leal (BB, %eax, 4), BB
  1104. #endif
  1105. movq ( 0 + AOFFSET) * SIZE(AA), %mm0
  1106. pxor %mm4, %mm4
  1107. movq ( 8 + AOFFSET) * SIZE(AA), %mm1
  1108. pxor %mm5, %mm5
  1109. PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2
  1110. pxor %mm6, %mm6
  1111. PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3
  1112. pxor %mm7, %mm7
  1113. #ifndef TRMMKERNEL
  1114. movl K, %eax
  1115. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1116. movl K, %eax
  1117. subl KK, %eax
  1118. movl %eax, KKK
  1119. #else
  1120. movl KK, %eax
  1121. #ifdef LEFT
  1122. addl $1, %eax
  1123. #else
  1124. addl $2, %eax
  1125. #endif
  1126. movl %eax, KKK
  1127. #endif
  1128. sarl $4, %eax
  1129. je .L55
  1130. ALIGN_4
  1131. .L52:
  1132. pfmul %mm0, %mm2
  1133. pfadd %mm2, %mm4
  1134. PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2
  1135. pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0
  1136. pfadd %mm0, %mm5
  1137. movd ( 1 + AOFFSET) * SIZE(AA), %mm0
  1138. PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA)
  1139. pfmul %mm0, %mm2
  1140. pfadd %mm2, %mm6
  1141. PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2
  1142. pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
  1143. pfadd %mm0, %mm7
  1144. movd ( 2 + AOFFSET) * SIZE(AA), %mm0
  1145. pfmul %mm0, %mm2
  1146. pfadd %mm2, %mm4
  1147. PADDING movd ( 12 + BOFFSET) * SIZE(BB), %mm2
  1148. pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0
  1149. pfadd %mm0, %mm5
  1150. movd ( 3 + AOFFSET) * SIZE(AA), %mm0
  1151. pfmul %mm0, %mm2
  1152. pfadd %mm2, %mm6
  1153. PADDING movd ( 32 + BOFFSET) * SIZE(BB), %mm2
  1154. pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0
  1155. pfadd %mm0, %mm7
  1156. movd ( 4 + AOFFSET) * SIZE(AA), %mm0
  1157. pfmul %mm0, %mm3
  1158. pfadd %mm3, %mm4
  1159. PADDING movd ( 20 + BOFFSET) * SIZE(BB), %mm3
  1160. pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0
  1161. pfadd %mm0, %mm5
  1162. movd ( 5 + AOFFSET) * SIZE(AA), %mm0
  1163. pfmul %mm0, %mm3
  1164. pfadd %mm3, %mm6
  1165. PADDING movd ( 24 + BOFFSET) * SIZE(BB), %mm3
  1166. pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0
  1167. pfadd %mm0, %mm7
  1168. movd ( 6 + AOFFSET) * SIZE(AA), %mm0
  1169. pfmul %mm0, %mm3
  1170. pfadd %mm3, %mm4
  1171. PADDING movd ( 28 + BOFFSET) * SIZE(BB), %mm3
  1172. pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0
  1173. pfadd %mm0, %mm5
  1174. movd ( 7 + AOFFSET) * SIZE(AA), %mm0
  1175. pfmul %mm0, %mm3
  1176. pfadd %mm3, %mm6
  1177. PADDING movd ( 48 + BOFFSET) * SIZE(BB), %mm3
  1178. pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0
  1179. pfadd %mm0, %mm7
  1180. movd ( 16 + AOFFSET) * SIZE(AA), %mm0
  1181. pfmul %mm1, %mm2
  1182. pfadd %mm2, %mm4
  1183. PADDING movd ( 36 + BOFFSET) * SIZE(BB), %mm2
  1184. pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1
  1185. pfadd %mm1, %mm5
  1186. movd ( 9 + AOFFSET) * SIZE(AA), %mm1
  1187. pfmul %mm1, %mm2
  1188. pfadd %mm2, %mm6
  1189. PADDING movd ( 40 + BOFFSET) * SIZE(BB), %mm2
  1190. pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1
  1191. pfadd %mm1, %mm7
  1192. movd ( 10 + AOFFSET) * SIZE(AA), %mm1
  1193. pfmul %mm1, %mm2
  1194. pfadd %mm2, %mm4
  1195. PADDING movd ( 44 + BOFFSET) * SIZE(BB), %mm2
  1196. pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1
  1197. pfadd %mm1, %mm5
  1198. movd ( 11 + AOFFSET) * SIZE(AA), %mm1
  1199. pfmul %mm1, %mm2
  1200. pfadd %mm2, %mm6
  1201. PADDING movd ( 64 + BOFFSET) * SIZE(BB), %mm2
  1202. pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1
  1203. pfadd %mm1, %mm7
  1204. movd ( 12 + AOFFSET) * SIZE(AA), %mm1
  1205. pfmul %mm1, %mm3
  1206. pfadd %mm3, %mm4
  1207. PADDING movd ( 52 + BOFFSET) * SIZE(BB), %mm3
  1208. pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1
  1209. pfadd %mm1, %mm5
  1210. movd ( 13 + AOFFSET) * SIZE(AA), %mm1
  1211. pfmul %mm1, %mm3
  1212. pfadd %mm3, %mm6
  1213. PADDING movd ( 56 + BOFFSET) * SIZE(BB), %mm3
  1214. pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1
  1215. pfadd %mm1, %mm7
  1216. movd ( 14 + AOFFSET) * SIZE(AA), %mm1
  1217. pfmul %mm1, %mm3
  1218. pfadd %mm3, %mm4
  1219. PADDING movd ( 60 + BOFFSET) * SIZE(BB), %mm3
  1220. pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1
  1221. pfadd %mm1, %mm5
  1222. movd ( 15 + AOFFSET) * SIZE(AA), %mm1
  1223. pfmul %mm1, %mm3
  1224. pfadd %mm3, %mm6
  1225. PADDING movd ( 80 + BOFFSET) * SIZE(BB), %mm3
  1226. pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1
  1227. pfadd %mm1, %mm7
  1228. movd ( 24 + AOFFSET) * SIZE(AA), %mm1
  1229. subl $-16 * SIZE, AA
  1230. addl $ 64 * SIZE, BB
  1231. decl %eax
  1232. jne .L52
  1233. ALIGN_3
  1234. .L55:
  1235. movd ALPHA, %mm3
  1236. #ifndef TRMMKERNEL
  1237. movl K, %eax
  1238. #else
  1239. movl KKK, %eax
  1240. #endif
  1241. andl $15, %eax
  1242. BRANCH
  1243. je .L58
  1244. ALIGN_3
  1245. .L56:
  1246. pfmul %mm0, %mm2
  1247. pfadd %mm2, %mm4
  1248. PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2
  1249. pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0
  1250. pfadd %mm0, %mm5
  1251. movd ( 1 + AOFFSET) * SIZE(AA), %mm0
  1252. addl $1 * SIZE, AA
  1253. addl $4 * SIZE, BB
  1254. decl %eax
  1255. jg .L56
  1256. ALIGN_3
  1257. .L58:
  1258. pfadd %mm6, %mm4
  1259. pfadd %mm7, %mm5
  1260. pfmul %mm3, %mm4
  1261. pfmul %mm3, %mm5
  1262. #ifndef TRMMKERNEL
  1263. movd 0 * SIZE(%esi) , %mm0
  1264. movd 0 * SIZE(%esi, %ebp, 1), %mm1
  1265. pfadd %mm0, %mm4
  1266. pfadd %mm1, %mm5
  1267. #endif
  1268. movd %mm4, 0 * SIZE(%esi)
  1269. movd %mm5, 0 * SIZE(%esi, %ebp, 1)
  1270. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1271. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1272. movl K, %eax
  1273. subl KKK, %eax
  1274. leal (,%eax, SIZE), %eax
  1275. leal (AA, %eax, 1), AA
  1276. leal (BB, %eax, 4), BB
  1277. #endif
  1278. #if defined(TRMMKERNEL) && defined(LEFT)
  1279. addl $1, KK
  1280. #endif
  1281. ALIGN_4
  1282. .L59:
  1283. #if defined(TRMMKERNEL) && !defined(LEFT)
  1284. addl $2, KK
  1285. #endif
  1286. leal (, %ebp, 2), %eax
  1287. addl %eax, C # c += 2 * ldc
  1288. ALIGN_4
  1289. .L60:
  1290. movl N, %eax
  1291. testl $1, %eax
  1292. jle .L999
  1293. ALIGN_3
  1294. .L61:
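/* N remainder: the final single column (N & 1). */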
  1295. /* Copying to Sub Buffer */
  1296. leal BUFFER, %ecx
  1297. #if defined(TRMMKERNEL) && defined(LEFT)
  1298. movl OFFSET, %eax
  1299. movl %eax, KK
  1300. #endif
  1301. movl K, %eax
  1302. sarl $3, %eax
  1303. jle .L63
  1304. ALIGN_3
  1305. .L62:
  1306. movd 0 * SIZE(%edi), %mm0
  1307. movd 1 * SIZE(%edi), %mm1
  1308. movd 2 * SIZE(%edi), %mm2
  1309. movd 3 * SIZE(%edi), %mm3
  1310. movd 4 * SIZE(%edi), %mm4
  1311. movd 5 * SIZE(%edi), %mm5
  1312. movd 6 * SIZE(%edi), %mm6
  1313. movd 7 * SIZE(%edi), %mm7
  1314. prefetchnta 72 * SIZE(%edi)
  1315. punpckldq %mm0, %mm0
  1316. punpckldq %mm1, %mm1
  1317. punpckldq %mm2, %mm2
  1318. punpckldq %mm3, %mm3
  1319. punpckldq %mm4, %mm4
  1320. punpckldq %mm5, %mm5
  1321. punpckldq %mm6, %mm6
  1322. punpckldq %mm7, %mm7
  1323. movq %mm0, 0 * SIZE(%ecx)
  1324. movq %mm1, 2 * SIZE(%ecx)
  1325. movq %mm2, 4 * SIZE(%ecx)
  1326. movq %mm3, 6 * SIZE(%ecx)
  1327. movq %mm4, 8 * SIZE(%ecx)
  1328. movq %mm5, 10 * SIZE(%ecx)
  1329. movq %mm6, 12 * SIZE(%ecx)
  1330. movq %mm7, 14 * SIZE(%ecx)
  1331. addl $ 8 * SIZE, %edi
  1332. addl $16 * SIZE, %ecx
  1333. decl %eax
  1334. jne .L62
  1335. .L63:
  1336. movl K, %eax
  1337. andl $7, %eax
  1338. BRANCH
  1339. jle .L70
  1340. ALIGN_2
  1341. .L64:
  1342. movd 0 * SIZE(%edi), %mm0
  1343. punpckldq %mm0, %mm0
  1344. movq %mm0, 0 * SIZE(%ecx)
  1345. addl $1 * SIZE, %edi
  1346. addl $2 * SIZE, %ecx
  1347. decl %eax
  1348. jne .L64
  1349. ALIGN_4
  1350. .L70:
  1351. movl C, %esi # coffset = c
  1352. movl A, %edx # aoffset = a
  1353. movl M, %ebx
  1354. sarl $1, %ebx # i = (m >> 1)
  1355. jle .L90
  1356. ALIGN_4
  1357. .L71:
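/* 2x1 micro-kernel: all four accumulators collect contributions for the
   single column and are folded together at .L78. */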
  1358. leal - BOFFSET * SIZE + BUFFER, BB
  1359. #if !defined(TRMMKERNEL) || \
  1360. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1361. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1362. #else
  1363. movl KK, %eax
  1364. leal (, %eax, SIZE), %eax
  1365. leal (AA, %eax, 2), AA
  1366. leal (BB, %eax, 2), BB
  1367. #endif
  1368. movq ( 0 + AOFFSET) * SIZE(AA), %mm0
  1369. pxor %mm4, %mm4
  1370. movq ( 16 + AOFFSET) * SIZE(AA), %mm1
  1371. pxor %mm5, %mm5
  1372. PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2
  1373. pxor %mm6, %mm6
  1374. pxor %mm7, %mm7
  1375. prefetchw 2 * SIZE(%esi)
  1376. prefetchw 2 * SIZE(%esi, %ebp)
  1377. #ifndef TRMMKERNEL
  1378. movl K, %eax
  1379. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1380. movl K, %eax
  1381. subl KK, %eax
  1382. movl %eax, KKK
  1383. #else
  1384. movl KK, %eax
  1385. #ifdef LEFT
  1386. addl $2, %eax
  1387. #else
  1388. addl $1, %eax
  1389. #endif
  1390. movl %eax, KKK
  1391. #endif
  1392. sarl $4, %eax
  1393. je .L75
  1394. ALIGN_4
  1395. .L72:
  1396. pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0
  1397. pfadd %mm0, %mm4
  1398. movq ( 2 + AOFFSET) * SIZE(AA), %mm0
  1399. PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA)
  1400. pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0
  1401. pfadd %mm0, %mm5
  1402. movq ( 4 + AOFFSET) * SIZE(AA), %mm0
  1403. pfmul ( 4 + BOFFSET) * SIZE(BB), %mm0
  1404. pfadd %mm0, %mm6
  1405. movq ( 6 + AOFFSET) * SIZE(AA), %mm0
  1406. pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
  1407. pfadd %mm0, %mm7
  1408. movq ( 8 + AOFFSET) * SIZE(AA), %mm0
  1409. pfmul ( 8 + BOFFSET) * SIZE(BB), %mm0
  1410. pfadd %mm0, %mm4
  1411. movq ( 10 + AOFFSET) * SIZE(AA), %mm0
  1412. pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0
  1413. pfadd %mm0, %mm5
  1414. movq ( 12 + AOFFSET) * SIZE(AA), %mm0
  1415. pfmul ( 12 + BOFFSET) * SIZE(BB), %mm0
  1416. pfadd %mm0, %mm6
  1417. movq ( 14 + AOFFSET) * SIZE(AA), %mm0
  1418. pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0
  1419. pfadd %mm0, %mm7
  1420. movq ( 32 + AOFFSET) * SIZE(AA), %mm0
  1421. pfmul ( 16 + BOFFSET) * SIZE(BB), %mm1
  1422. pfadd %mm1, %mm4
  1423. movq ( 18 + AOFFSET) * SIZE(AA), %mm1
  1424. pfmul ( 18 + BOFFSET) * SIZE(BB), %mm1
  1425. pfadd %mm1, %mm5
  1426. movq ( 20 + AOFFSET) * SIZE(AA), %mm1
  1427. pfmul ( 20 + BOFFSET) * SIZE(BB), %mm1
  1428. pfadd %mm1, %mm6
  1429. movq ( 22 + AOFFSET) * SIZE(AA), %mm1
  1430. pfmul ( 22 + BOFFSET) * SIZE(BB), %mm1
  1431. pfadd %mm1, %mm7
  1432. movq ( 24 + AOFFSET) * SIZE(AA), %mm1
  1433. pfmul ( 24 + BOFFSET) * SIZE(BB), %mm1
  1434. pfadd %mm1, %mm4
  1435. movq ( 26 + AOFFSET) * SIZE(AA), %mm1
  1436. pfmul ( 26 + BOFFSET) * SIZE(BB), %mm1
  1437. pfadd %mm1, %mm5
  1438. movq ( 28 + AOFFSET) * SIZE(AA), %mm1
  1439. pfmul ( 28 + BOFFSET) * SIZE(BB), %mm1
  1440. pfadd %mm1, %mm6
  1441. movq ( 30 + AOFFSET) * SIZE(AA), %mm1
  1442. pfmul ( 30 + BOFFSET) * SIZE(BB), %mm1
  1443. pfadd %mm1, %mm7
  1444. movq ( 48 + AOFFSET) * SIZE(AA), %mm1
  1445. subl $-32 * SIZE, AA
  1446. addl $ 32 * SIZE, BB
  1447. decl %eax
  1448. jne .L72
  1449. ALIGN_3
  1450. .L75:
  1451. movq ALPHA, %mm3
  1452. #ifndef TRMMKERNEL
  1453. movl K, %eax
  1454. #else
  1455. movl KKK, %eax
  1456. #endif
  1457. andl $15, %eax
  1458. BRANCH
  1459. je .L78
  1460. ALIGN_3
  1461. .L76:
  1462. pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0
  1463. pfadd %mm0, %mm4
  1464. movq ( 2 + AOFFSET) * SIZE(AA), %mm0
  1465. addl $2 * SIZE, AA
  1466. addl $2 * SIZE, BB
  1467. decl %eax
  1468. jg .L76
  1469. ALIGN_3
  1470. .L78:
  1471. pfadd %mm5, %mm4
  1472. pfadd %mm7, %mm6
  1473. pfadd %mm6, %mm4
  1474. pfmul %mm3, %mm4
  1475. #ifndef TRMMKERNEL
  1476. pfadd 0 * SIZE(%esi), %mm4
  1477. #endif
  1478. movq %mm4, 0 * SIZE(%esi)
  1479. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1480. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1481. movl K, %eax
  1482. subl KKK, %eax
  1483. leal (,%eax, SIZE), %eax
  1484. leal (AA, %eax, 2), AA
  1485. leal (BB, %eax, 2), BB
  1486. #endif
  1487. #if defined(TRMMKERNEL) && defined(LEFT)
  1488. addl $2, KK
  1489. #endif
  1490. addl $2 * SIZE, %esi # coffset += 2
  1491. decl %ebx # i --
  1492. jg .L71
  1493. ALIGN_4
  1494. .L90:
  1495. movl M, %ebx
  1496. testl $1, %ebx # test m & 1
  1497. jle .L999
  1498. ALIGN_4
  1499. .L91:
  1500. leal - BOFFSET * SIZE + BUFFER, BB
  1501. #if !defined(TRMMKERNEL) || \
  1502. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1503. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1504. #else
  1505. movl KK, %eax
  1506. leal (, %eax, SIZE), %eax
  1507. leal (AA, %eax, 1), AA
  1508. leal (BB, %eax, 2), BB
  1509. #endif
  1510. movq ( 0 + AOFFSET) * SIZE(AA), %mm0
  1511. pxor %mm4, %mm4
  1512. movq ( 8 + AOFFSET) * SIZE(AA), %mm1
  1513. pxor %mm5, %mm5
  1514. PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2
  1515. pxor %mm6, %mm6
  1516. pxor %mm7, %mm7
  1517. #ifndef TRMMKERNEL
  1518. movl K, %eax
  1519. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1520. movl K, %eax
  1521. subl KK, %eax
  1522. movl %eax, KKK
  1523. #else
  1524. movl KK, %eax
  1525. #ifdef LEFT
  1526. addl $1, %eax
  1527. #else
  1528. addl $1, %eax
  1529. #endif
  1530. movl %eax, KKK
  1531. #endif
  1532. sarl $4, %eax
  1533. je .L95
  1534. ALIGN_4
  1535. .L92:
  1536. PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA)
  1537. pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0
  1538. pfadd %mm0, %mm4
  1539. movd ( 1 + AOFFSET) * SIZE(AA), %mm0
  1540. pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0
  1541. pfadd %mm0, %mm5
  1542. movd ( 2 + AOFFSET) * SIZE(AA), %mm0
  1543. pfmul ( 4 + BOFFSET) * SIZE(BB), %mm0
  1544. pfadd %mm0, %mm6
  1545. movd ( 3 + AOFFSET) * SIZE(AA), %mm0
  1546. pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
  1547. pfadd %mm0, %mm7
  1548. movd ( 4 + AOFFSET) * SIZE(AA), %mm0
  1549. pfmul ( 8 + BOFFSET) * SIZE(BB), %mm0
  1550. pfadd %mm0, %mm4
  1551. movd ( 5 + AOFFSET) * SIZE(AA), %mm0
  1552. pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0
  1553. pfadd %mm0, %mm5
  1554. movd ( 6 + AOFFSET) * SIZE(AA), %mm0
  1555. pfmul ( 12 + BOFFSET) * SIZE(BB), %mm0
  1556. pfadd %mm0, %mm6
  1557. movd ( 7 + AOFFSET) * SIZE(AA), %mm0
  1558. pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0
  1559. pfadd %mm0, %mm7
  1560. movd ( 16 + AOFFSET) * SIZE(AA), %mm0
  1561. pfmul ( 16 + BOFFSET) * SIZE(BB), %mm1
  1562. pfadd %mm1, %mm4
  1563. movd ( 9 + AOFFSET) * SIZE(AA), %mm1
  1564. pfmul ( 18 + BOFFSET) * SIZE(BB), %mm1
  1565. pfadd %mm1, %mm5
  1566. movd ( 10 + AOFFSET) * SIZE(AA), %mm1
  1567. pfmul ( 20 + BOFFSET) * SIZE(BB), %mm1
  1568. pfadd %mm1, %mm6
  1569. movd ( 11 + AOFFSET) * SIZE(AA), %mm1
  1570. pfmul ( 22 + BOFFSET) * SIZE(BB), %mm1
  1571. pfadd %mm1, %mm7
  1572. movd ( 12 + AOFFSET) * SIZE(AA), %mm1
  1573. pfmul ( 24 + BOFFSET) * SIZE(BB), %mm1
  1574. pfadd %mm1, %mm4
  1575. movd ( 13 + AOFFSET) * SIZE(AA), %mm1
  1576. pfmul ( 26 + BOFFSET) * SIZE(BB), %mm1
  1577. pfadd %mm1, %mm5
  1578. movd ( 14 + AOFFSET) * SIZE(AA), %mm1
  1579. pfmul ( 28 + BOFFSET) * SIZE(BB), %mm1
  1580. pfadd %mm1, %mm6
  1581. movd ( 15 + AOFFSET) * SIZE(AA), %mm1
  1582. pfmul ( 30 + BOFFSET) * SIZE(BB), %mm1
  1583. pfadd %mm1, %mm7
  1584. movd ( 24 + AOFFSET) * SIZE(AA), %mm1
  1585. subl $-16 * SIZE, AA
  1586. addl $ 32 * SIZE, BB
  1587. decl %eax
  1588. jne .L92
  1589. ALIGN_3
  1590. .L95:
  1591. movd ALPHA, %mm3
  1592. #ifndef TRMMKERNEL
  1593. movl K, %eax
  1594. #else
  1595. movl KKK, %eax
  1596. #endif
  1597. andl $15, %eax
  1598. BRANCH
  1599. je .L98
  1600. ALIGN_3
  1601. .L96:
  1602. pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0
  1603. pfadd %mm0, %mm4
  1604. movd ( 1 + AOFFSET) * SIZE(AA), %mm0
  1605. addl $1 * SIZE, AA
  1606. addl $2 * SIZE, BB
  1607. decl %eax
  1608. jg .L96
  1609. ALIGN_3
  1610. .L98:
  1611. #ifndef TRMMKERNEL
  1612. movd 0 * SIZE(%esi), %mm0
  1613. #endif
  1614. pfadd %mm5, %mm4
  1615. pfadd %mm7, %mm6
  1616. pfadd %mm6, %mm4
  1617. pfmul %mm3, %mm4
  1618. pfmul %mm3, %mm5
  1619. #ifndef TRMMKERNEL
  1620. pfadd %mm0, %mm4
  1621. #endif
  1622. movd %mm4, 0 * SIZE(%esi)
  1623. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1624. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1625. movl K, %eax
  1626. subl KKK, %eax
  1627. leal (,%eax, SIZE), %eax
  1628. leal (AA, %eax, 1), AA
  1629. leal (BB, %eax, 2), BB
  1630. #endif
  1631. #if defined(TRMMKERNEL) && defined(LEFT)
  1632. addl $1, KK
  1633. #endif
  1634. ALIGN_4
  1635. .L999:
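/* EMMS clears the MMX/3DNow! register state so the caller can use the
   x87 FPU again. */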
  1636. EMMS
  1637. movl OLD_STACK, %esp
  1638. popl %ebx
  1639. popl %esi
  1640. popl %edi
  1641. popl %ebp
  1642. ret
  1643. EPILOGUE