
gemm_kernel_2x4_sse2.S 38 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
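/*********************************************************************/
/* Overview (editorial note, not part of the original header): this  */
/* is the 2x4 SSE2 kernel for double-precision GEMM on 32-bit x86    */
/* (and, when built with TRMMKERNEL, the matching triangular-        */
/* multiply kernel).  A 2x4 block of C is held in %xmm4..%xmm7 for   */
/* the whole k loop.  As a hedged C sketch of what one block         */
/* computes (names are illustrative only):                           */
/*                                                                   */
/*   for (k = 0; k < K; k++)                                         */
/*     for (j = 0; j < 4; j++)                                       */
/*       for (i = 0; i < 2; i++)                                     */
/*         acc[i][j] += a[k][i] * b[k][j];                           */
/*   C[i][j] = alpha * acc[i][j] + C[i][j];  /- no C term for TRMM -/ */
/*                                                                   */
/* A is assumed to arrive in the packed layout produced by the       */
/* library's copy routines; B is repacked locally into BUFFER below. */
/*********************************************************************/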
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 0
  42. #define OLD_M 4 + STACK + ARGS(%esi)
  43. #define OLD_N 8 + STACK + ARGS(%esi)
  44. #define OLD_K 12 + STACK + ARGS(%esi)
  45. #define OLD_ALPHA 16 + STACK + ARGS(%esi)
  46. #define OLD_A 24 + STACK + ARGS(%esi)
  47. #define OLD_B 28 + STACK + ARGS(%esi)
  48. #define OLD_C 32 + STACK + ARGS(%esi)
  49. #define OLD_LDC 36 + STACK + ARGS(%esi)
  50. #define OLD_OFFT 40 + STACK + ARGS(%esi)
  51. #define ALPHA 0(%esp)
  52. #define K 16(%esp)
  53. #define N 20(%esp)
  54. #define M 24(%esp)
  55. #define A 28(%esp)
  56. #define C 32(%esp)
  57. #define J 36(%esp)
  58. #define BX 40(%esp)
  59. #define OLD_STACK 44(%esp)
  60. #define OFFSET 48(%esp)
  61. #define KK 52(%esp)
  62. #define KKK 56(%esp)
  63. #define BUFFER 128(%esp)
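/* Layout of the aligned local frame built in the prologue: ALPHA is
   kept at 0(%esp) as a duplicated pair (16 bytes, movapd-aligned
   because the prologue aligns %esp to 1024), scalar state (K, N, M,
   A, C, loop counters, the saved caller stack pointer, TRMM offsets)
   sits at 16..56(%esp), and BUFFER at 128(%esp) receives the packed
   copy of B (LOCAL_BUFFER_SIZE bytes reserved below). */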
  64. #if defined(OPTERON) || defined(BARCELONA)
  65. #define movsd movlpd
  66. #endif
  67. #if defined(OPTERON) || defined(BARCELONA)
  68. #define PREFETCH prefetch
  69. #define PREFETCHSIZE (8 * 10 + 4)
  70. #endif
  71. #define AA %edx
  72. #define BB %ecx
  73. #define LDC %ebp
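/* KERNEL1..KERNEL8 below each perform one k iteration of the 2x4
   block and are expanded back to back to unroll the k loop by 8.
   %xmm0/%xmm1 hold a pair of A values; each movapd from BB loads one
   B value duplicated into both lanes (the packing loops store every
   B element twice), so the mulpd/addpd pairs accumulate one column's
   two products into %xmm4, %xmm5, %xmm6 or %xmm7. */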
  74. #define KERNEL1(address) \
  75. mulpd %xmm0, %xmm2; \
  76. addpd %xmm2, %xmm4; \
  77. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
  78. movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  79. mulpd %xmm0, %xmm2; \
  80. addpd %xmm2, %xmm5; \
  81. movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  82. mulpd %xmm0, %xmm2; \
  83. mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  84. addpd %xmm2, %xmm6; \
  85. movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  86. addpd %xmm0, %xmm7; \
  87. movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  88. #define KERNEL2(address) \
  89. mulpd %xmm0, %xmm3; \
  90. addpd %xmm3, %xmm4; \
  91. movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  92. mulpd %xmm0, %xmm3; \
  93. addpd %xmm3, %xmm5; \
  94. movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  95. mulpd %xmm0, %xmm3; \
  96. mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  97. addpd %xmm3, %xmm6; \
  98. movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  99. addpd %xmm0, %xmm7; \
  100. movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  101. #define KERNEL3(address) \
  102. mulpd %xmm0, %xmm2; \
  103. addpd %xmm2, %xmm4; \
  104. movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  105. mulpd %xmm0, %xmm2; \
  106. addpd %xmm2, %xmm5; \
  107. movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  108. mulpd %xmm0, %xmm2; \
  109. mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  110. addpd %xmm2, %xmm6; \
  111. movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  112. addpd %xmm0, %xmm7; \
  113. movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  114. #define KERNEL4(address) \
  115. mulpd %xmm0, %xmm3; \
  116. addpd %xmm3, %xmm4; \
  117. movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  118. mulpd %xmm0, %xmm3; \
  119. addpd %xmm3, %xmm5; \
  120. movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  121. mulpd %xmm0, %xmm3; \
  122. mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  123. addpd %xmm3, %xmm6; \
  124. movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  125. addpd %xmm0, %xmm7; \
  126. movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  127. #define KERNEL5(address) \
  128. PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
  129. mulpd %xmm1, %xmm2; \
  130. addpd %xmm2, %xmm4; \
  131. movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  132. mulpd %xmm1, %xmm2; \
  133. addpd %xmm2, %xmm5; \
  134. movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  135. mulpd %xmm1, %xmm2; \
  136. mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  137. addpd %xmm2, %xmm6; \
  138. movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  139. addpd %xmm1, %xmm7; \
  140. movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  141. #define KERNEL6(address) \
  142. mulpd %xmm1, %xmm3; \
  143. addpd %xmm3, %xmm4; \
  144. movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  145. mulpd %xmm1, %xmm3; \
  146. addpd %xmm3, %xmm5; \
  147. movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  148. mulpd %xmm1, %xmm3; \
  149. mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  150. addpd %xmm3, %xmm6; \
  151. movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  152. addpd %xmm1, %xmm7; \
  153. movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  154. #define KERNEL7(address) \
  155. mulpd %xmm1, %xmm2; \
  156. addpd %xmm2, %xmm4; \
  157. movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  158. mulpd %xmm1, %xmm2; \
  159. addpd %xmm2, %xmm5; \
  160. movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  161. mulpd %xmm1, %xmm2; \
  162. mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  163. addpd %xmm2, %xmm6; \
  164. movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  165. addpd %xmm1, %xmm7; \
  166. movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  167. #define KERNEL8(address) \
  168. mulpd %xmm1, %xmm3; \
  169. addpd %xmm3, %xmm4; \
  170. movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  171. mulpd %xmm1, %xmm3; \
  172. addpd %xmm3, %xmm5; \
  173. movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  174. mulpd %xmm1, %xmm3; \
  175. mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  176. addpd %xmm3, %xmm6; \
  177. movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  178. addpd %xmm1, %xmm7; \
  179. movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  180. PROLOGUE
  181. pushl %ebp
  182. pushl %edi
  183. pushl %esi
  184. pushl %ebx
  185. PROFCODE
  186. EMMS
  187. movl %esp, %esi # save old stack
  188. subl $128 + LOCAL_BUFFER_SIZE, %esp
  189. andl $-1024, %esp # align stack
  190. STACK_TOUCHING
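/* Copy the arguments into the local frame (the caller's stack is
   still reachable through %esi) and duplicate alpha into both lanes
   of %xmm3 so it can later be applied with mulpd when C is written
   back. */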
  191. movl OLD_M, %ebx
  192. movl OLD_N, %eax
  193. movl OLD_K, %ecx
  194. movl OLD_A, %edx
  195. movsd OLD_ALPHA, %xmm3
  196. movl %ebx, M
  197. movl %eax, N
  198. movl %ecx, K
  199. movl %edx, A
  200. movl %esi, OLD_STACK
  201. #ifdef TRMMKERNEL
  202. movss OLD_OFFT, %xmm4
  203. #endif
  204. unpcklpd %xmm3, %xmm3
  205. movl OLD_B, %edi
  206. movl OLD_C, %ebx
  207. movapd %xmm3, ALPHA
  208. movl %ebx, C
  209. movl OLD_LDC, LDC
  210. #ifdef TRMMKERNEL
  211. movss %xmm4, OFFSET
  212. movss %xmm4, KK
  213. #ifndef LEFT
  214. negl KK
  215. #endif
  216. #endif
  217. leal (, LDC, SIZE), LDC
  218. sarl $2, %eax
  219. movl %eax, J
  220. jle .L30
  221. ALIGN_2
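/* .L01: outer loop over groups of four B columns (J = N >> 2).  Each
   pass first packs 4*K values of B into BUFFER, storing every value
   twice so the kernel can load it pre-broadcast with movapd, then
   walks down the rows of A (.L11 handles two rows at a time, .L20 a
   possible leftover row). */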
  222. .L01:
  223. #if defined(TRMMKERNEL) && defined(LEFT)
  224. movl OFFSET, %eax
  225. movl %eax, KK
  226. #endif
  227. /* Copying to Sub Buffer */
  228. movl K, %eax
  229. leal BUFFER, %ecx
  230. sarl $1, %eax
  231. jle .L05
  232. ALIGN_4
  233. .L02:
  234. #define COPYPREFETCH 40
  235. prefetchnta (COPYPREFETCH) * SIZE(%edi)
  236. movq 0 * SIZE(%edi), %mm0
  237. movq 1 * SIZE(%edi), %mm1
  238. movq 2 * SIZE(%edi), %mm2
  239. movq 3 * SIZE(%edi), %mm3
  240. movq 4 * SIZE(%edi), %mm4
  241. movq 5 * SIZE(%edi), %mm5
  242. movq 6 * SIZE(%edi), %mm6
  243. movq 7 * SIZE(%edi), %mm7
  244. movq %mm0, 0 * SIZE(%ecx)
  245. movq %mm0, 1 * SIZE(%ecx)
  246. movq %mm1, 2 * SIZE(%ecx)
  247. movq %mm1, 3 * SIZE(%ecx)
  248. movq %mm2, 4 * SIZE(%ecx)
  249. movq %mm2, 5 * SIZE(%ecx)
  250. movq %mm3, 6 * SIZE(%ecx)
  251. movq %mm3, 7 * SIZE(%ecx)
  252. movq %mm4, 8 * SIZE(%ecx)
  253. movq %mm4, 9 * SIZE(%ecx)
  254. movq %mm5, 10 * SIZE(%ecx)
  255. movq %mm5, 11 * SIZE(%ecx)
  256. movq %mm6, 12 * SIZE(%ecx)
  257. movq %mm6, 13 * SIZE(%ecx)
  258. movq %mm7, 14 * SIZE(%ecx)
  259. movq %mm7, 15 * SIZE(%ecx)
  260. addl $ 8 * SIZE, %edi
  261. addl $16 * SIZE, %ecx
  262. decl %eax
  263. jne .L02
  264. ALIGN_2
  265. .L05:
  266. movl K, %eax
  267. andl $1, %eax
  268. BRANCH
  269. jle .L10
  270. movq 0 * SIZE(%edi), %mm0
  271. movq 1 * SIZE(%edi), %mm1
  272. movq 2 * SIZE(%edi), %mm2
  273. movq 3 * SIZE(%edi), %mm3
  274. movq %mm0, 0 * SIZE(%ecx)
  275. movq %mm0, 1 * SIZE(%ecx)
  276. movq %mm1, 2 * SIZE(%ecx)
  277. movq %mm1, 3 * SIZE(%ecx)
  278. movq %mm2, 4 * SIZE(%ecx)
  279. movq %mm2, 5 * SIZE(%ecx)
  280. movq %mm3, 6 * SIZE(%ecx)
  281. movq %mm3, 7 * SIZE(%ecx)
  282. addl $4 * SIZE, %edi
  283. ALIGN_4
  284. .L10:
  285. movl %edi, BX
  286. movl C, %esi # coffset = c
  287. movl A, AA # aoffset = a
  288. movl M, %ebx
  289. sarl $1, %ebx # i = (m >> 1)
  290. jle .L20
  291. ALIGN_4
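/* .L11: loop over pairs of rows (i = m >> 1).  %xmm4..%xmm7 are
   cleared, the unrolled KERNEL macros run the bulk of the k loop,
   .L16 finishes the k & 7 remainder, and .L18 scales by alpha and
   stores the 2x4 result to the four C columns at %esi, %esi+LDC,
   %esi+2*LDC and %esi+3*LDC (adding the old C values unless
   TRMMKERNEL is defined). */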
  292. .L11:
  293. #if !defined(TRMMKERNEL) || \
  294. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  295. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  296. leal BUFFER, BB
  297. #else
  298. leal BUFFER, BB
  299. movl KK, %eax
  300. leal (, %eax, SIZE), %eax
  301. leal (AA, %eax, 2), AA
  302. leal (BB, %eax, 8), BB
  303. #endif
  304. movl BX, %eax
  305. prefetchnta 0 * SIZE(%eax)
  306. prefetchnta 8 * SIZE(%eax)
  307. subl $-8 * SIZE, BX
  308. pxor %xmm4, %xmm4
  309. pxor %xmm5, %xmm5
  310. pxor %xmm6, %xmm6
  311. pxor %xmm7, %xmm7
  312. movapd 0 * SIZE(AA), %xmm0
  313. movapd 8 * SIZE(AA), %xmm1
  314. movapd 0 * SIZE(BB), %xmm2
  315. movapd 8 * SIZE(BB), %xmm3
  316. leal (LDC, LDC, 2), %eax
  317. prefetchw 1 * SIZE(%esi)
  318. prefetchw 1 * SIZE(%esi, LDC)
  319. prefetchw 1 * SIZE(%esi, LDC, 2)
  320. prefetchw 1 * SIZE(%esi, %eax)
  321. #ifndef TRMMKERNEL
  322. movl K, %eax
  323. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  324. movl K, %eax
  325. subl KK, %eax
  326. movl %eax, KKK
  327. #else
  328. movl KK, %eax
  329. #ifdef LEFT
  330. addl $2, %eax
  331. #else
  332. addl $4, %eax
  333. #endif
  334. movl %eax, KKK
  335. #endif
  336. #if 1
  337. andl $-8, %eax
  338. sall $4, %eax
  339. je .L15
  340. .L1X:
  341. KERNEL1(16 * 0)
  342. KERNEL2(16 * 0)
  343. KERNEL3(16 * 0)
  344. KERNEL4(16 * 0)
  345. KERNEL5(16 * 0)
  346. KERNEL6(16 * 0)
  347. KERNEL7(16 * 0)
  348. KERNEL8(16 * 0)
  349. cmpl $128 * 1, %eax
  350. jle .L12
  351. KERNEL1(16 * 1)
  352. KERNEL2(16 * 1)
  353. KERNEL3(16 * 1)
  354. KERNEL4(16 * 1)
  355. KERNEL5(16 * 1)
  356. KERNEL6(16 * 1)
  357. KERNEL7(16 * 1)
  358. KERNEL8(16 * 1)
  359. cmpl $128 * 2, %eax
  360. jle .L12
  361. KERNEL1(16 * 2)
  362. KERNEL2(16 * 2)
  363. KERNEL3(16 * 2)
  364. KERNEL4(16 * 2)
  365. KERNEL5(16 * 2)
  366. KERNEL6(16 * 2)
  367. KERNEL7(16 * 2)
  368. KERNEL8(16 * 2)
  369. cmpl $128 * 3, %eax
  370. jle .L12
  371. KERNEL1(16 * 3)
  372. KERNEL2(16 * 3)
  373. KERNEL3(16 * 3)
  374. KERNEL4(16 * 3)
  375. KERNEL5(16 * 3)
  376. KERNEL6(16 * 3)
  377. KERNEL7(16 * 3)
  378. KERNEL8(16 * 3)
  379. cmpl $128 * 4, %eax
  380. jle .L12
  381. KERNEL1(16 * 4)
  382. KERNEL2(16 * 4)
  383. KERNEL3(16 * 4)
  384. KERNEL4(16 * 4)
  385. KERNEL5(16 * 4)
  386. KERNEL6(16 * 4)
  387. KERNEL7(16 * 4)
  388. KERNEL8(16 * 4)
  389. cmpl $128 * 5, %eax
  390. jle .L12
  391. KERNEL1(16 * 5)
  392. KERNEL2(16 * 5)
  393. KERNEL3(16 * 5)
  394. KERNEL4(16 * 5)
  395. KERNEL5(16 * 5)
  396. KERNEL6(16 * 5)
  397. KERNEL7(16 * 5)
  398. KERNEL8(16 * 5)
  399. cmpl $128 * 6, %eax
  400. jle .L12
  401. KERNEL1(16 * 6)
  402. KERNEL2(16 * 6)
  403. KERNEL3(16 * 6)
  404. KERNEL4(16 * 6)
  405. KERNEL5(16 * 6)
  406. KERNEL6(16 * 6)
  407. KERNEL7(16 * 6)
  408. KERNEL8(16 * 6)
  409. cmpl $128 * 7, %eax
  410. jle .L12
  411. KERNEL1(16 * 7)
  412. KERNEL2(16 * 7)
  413. KERNEL3(16 * 7)
  414. KERNEL4(16 * 7)
  415. KERNEL5(16 * 7)
  416. KERNEL6(16 * 7)
  417. KERNEL7(16 * 7)
  418. KERNEL8(16 * 7)
  419. addl $128 * 4 * SIZE, BB
  420. addl $128 * 1 * SIZE, AA
  421. subl $128 * 8, %eax
  422. jg .L1X
  423. jmp .L15
  424. .L12:
  425. leal (AA, %eax, 1), AA
  426. leal (BB, %eax, 4), BB
  427. ALIGN_4
  428. #else
  429. sarl $3, %eax
  430. je .L15
  431. ALIGN_4
  432. .L12:
  433. KERNEL1(16 * 0)
  434. KERNEL2(16 * 0)
  435. KERNEL3(16 * 0)
  436. KERNEL4(16 * 0)
  437. KERNEL5(16 * 0)
  438. KERNEL6(16 * 0)
  439. KERNEL7(16 * 0)
  440. KERNEL8(16 * 0)
  441. addl $64 * SIZE, BB
  442. addl $16 * SIZE, AA
  443. decl %eax
  444. jne .L12
  445. ALIGN_4
  446. #endif
  447. .L15:
  448. #ifndef TRMMKERNEL
  449. movl K, %eax
  450. #else
  451. movl KKK, %eax
  452. #endif
  453. movapd ALPHA, %xmm3
  454. andl $7, %eax # k remainder (k & 7)
  455. BRANCH
  456. je .L18
  457. ALIGN_3
  458. .L16:
  459. mulpd %xmm0, %xmm2
  460. addpd %xmm2, %xmm4
  461. movapd 2 * SIZE(BB), %xmm2
  462. mulpd %xmm0, %xmm2
  463. addpd %xmm2, %xmm5
  464. movapd 4 * SIZE(BB), %xmm2
  465. mulpd %xmm0, %xmm2
  466. mulpd 6 * SIZE(BB), %xmm0
  467. addpd %xmm2, %xmm6
  468. movapd 8 * SIZE(BB), %xmm2
  469. addpd %xmm0, %xmm7
  470. movapd 2 * SIZE(AA), %xmm0
  471. addl $2 * SIZE, AA
  472. addl $8 * SIZE, BB
  473. decl %eax
  474. jg .L16
  475. ALIGN_4
  476. .L18:
  477. leal (LDC, LDC, 2), %eax
  478. #ifndef TRMMKERNEL
  479. mulpd %xmm3, %xmm4
  480. movsd 0 * SIZE(%esi), %xmm0
  481. movhpd 1 * SIZE(%esi), %xmm0
  482. mulpd %xmm3, %xmm5
  483. movsd 0 * SIZE(%esi, LDC, 1), %xmm1
  484. movhpd 1 * SIZE(%esi, LDC, 1), %xmm1
  485. mulpd %xmm3, %xmm6
  486. movsd 0 * SIZE(%esi, LDC, 2), %xmm2
  487. movhpd 1 * SIZE(%esi, LDC, 2), %xmm2
  488. mulpd %xmm3, %xmm7
  489. movsd 0 * SIZE(%esi, %eax, 1), %xmm3
  490. movhpd 1 * SIZE(%esi, %eax, 1), %xmm3
  491. addpd %xmm0, %xmm4
  492. addpd %xmm1, %xmm5
  493. addpd %xmm2, %xmm6
  494. addpd %xmm3, %xmm7
  495. #else
  496. mulpd %xmm3, %xmm4
  497. mulpd %xmm3, %xmm5
  498. mulpd %xmm3, %xmm6
  499. mulpd %xmm3, %xmm7
  500. #endif
  501. movsd %xmm4, 0 * SIZE(%esi)
  502. movhpd %xmm4, 1 * SIZE(%esi)
  503. movsd %xmm5, 0 * SIZE(%esi, LDC, 1)
  504. movhpd %xmm5, 1 * SIZE(%esi, LDC, 1)
  505. movsd %xmm6, 0 * SIZE(%esi, LDC, 2)
  506. movhpd %xmm6, 1 * SIZE(%esi, LDC, 2)
  507. movsd %xmm7, 0 * SIZE(%esi, %eax, 1)
  508. movhpd %xmm7, 1 * SIZE(%esi, %eax, 1)
  509. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  510. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  511. movl K, %eax
  512. subl KKK, %eax
  513. leal (,%eax, SIZE), %eax
  514. leal (AA, %eax, 2), AA
  515. leal (BB, %eax, 8), BB
  516. #endif
  517. #if defined(TRMMKERNEL) && defined(LEFT)
  518. addl $2, KK
  519. #endif
  520. addl $2 * SIZE, %esi # coffset += 2
  521. decl %ebx # i --
  522. jg .L11
  523. ALIGN_4
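/* .L20: leftover row when M is odd; the same 1x4 update is done with
   scalar movsd/mulsd/addsd. */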
  524. .L20:
  525. movl M, %ebx
  526. testl $1, %ebx # if (m & 1)
  527. jle .L29
  528. #if !defined(TRMMKERNEL) || \
  529. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  530. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  531. leal BUFFER, BB
  532. #else
  533. leal BUFFER, BB
  534. movl KK, %eax
  535. leal (, %eax, SIZE), %eax
  536. leal (AA, %eax, 1), AA
  537. leal (BB, %eax, 8), BB
  538. #endif
  539. pxor %xmm4, %xmm4
  540. pxor %xmm5, %xmm5
  541. pxor %xmm6, %xmm6
  542. pxor %xmm7, %xmm7
  543. leal (LDC, LDC, 2), %eax
  544. movsd 0 * SIZE(AA), %xmm0
  545. movsd 4 * SIZE(AA), %xmm1
  546. movsd 0 * SIZE(BB), %xmm2
  547. movsd 8 * SIZE(BB), %xmm3
  548. #ifndef TRMMKERNEL
  549. movl K, %eax
  550. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  551. movl K, %eax
  552. subl KK, %eax
  553. movl %eax, KKK
  554. #else
  555. movl KK, %eax
  556. #ifdef LEFT
  557. addl $1, %eax
  558. #else
  559. addl $4, %eax
  560. #endif
  561. movl %eax, KKK
  562. #endif
  563. sarl $3, %eax
  564. je .L25
  565. ALIGN_4
  566. .L22:
  567. mulsd %xmm0, %xmm2
  568. addsd %xmm2, %xmm4
  569. #if defined(OPTERON) || defined(BARCELONA)
  570. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  571. #endif
  572. movsd 2 * SIZE(BB), %xmm2
  573. mulsd %xmm0, %xmm2
  574. addsd %xmm2, %xmm5
  575. movsd 4 * SIZE(BB), %xmm2
  576. mulsd %xmm0, %xmm2
  577. mulsd 6 * SIZE(BB), %xmm0
  578. addsd %xmm2, %xmm6
  579. movsd 16 * SIZE(BB), %xmm2
  580. addsd %xmm0, %xmm7
  581. movsd 1 * SIZE(AA), %xmm0
  582. mulsd %xmm0, %xmm3
  583. addsd %xmm3, %xmm4
  584. movsd 10 * SIZE(BB), %xmm3
  585. mulsd %xmm0, %xmm3
  586. addsd %xmm3, %xmm5
  587. movsd 12 * SIZE(BB), %xmm3
  588. mulsd %xmm0, %xmm3
  589. mulsd 14 * SIZE(BB), %xmm0
  590. addsd %xmm3, %xmm6
  591. movsd 24 * SIZE(BB), %xmm3
  592. addsd %xmm0, %xmm7
  593. movsd 2 * SIZE(AA), %xmm0
  594. mulsd %xmm0, %xmm2
  595. addsd %xmm2, %xmm4
  596. movsd 18 * SIZE(BB), %xmm2
  597. mulsd %xmm0, %xmm2
  598. addsd %xmm2, %xmm5
  599. movsd 20 * SIZE(BB), %xmm2
  600. mulsd %xmm0, %xmm2
  601. mulsd 22 * SIZE(BB), %xmm0
  602. addsd %xmm2, %xmm6
  603. movsd 32 * SIZE(BB), %xmm2
  604. addsd %xmm0, %xmm7
  605. movsd 3 * SIZE(AA), %xmm0
  606. mulsd %xmm0, %xmm3
  607. addsd %xmm3, %xmm4
  608. movsd 26 * SIZE(BB), %xmm3
  609. mulsd %xmm0, %xmm3
  610. addsd %xmm3, %xmm5
  611. movsd 28 * SIZE(BB), %xmm3
  612. mulsd %xmm0, %xmm3
  613. mulsd 30 * SIZE(BB), %xmm0
  614. addsd %xmm3, %xmm6
  615. movsd 40 * SIZE(BB), %xmm3
  616. addsd %xmm0, %xmm7
  617. movsd 8 * SIZE(AA), %xmm0
  618. #if defined(OPTERON) || defined(BARCELONA)
  619. PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
  620. #endif
  621. mulsd %xmm1, %xmm2
  622. addsd %xmm2, %xmm4
  623. movsd 34 * SIZE(BB), %xmm2
  624. mulsd %xmm1, %xmm2
  625. addsd %xmm2, %xmm5
  626. movsd 36 * SIZE(BB), %xmm2
  627. mulsd %xmm1, %xmm2
  628. mulsd 38 * SIZE(BB), %xmm1
  629. addsd %xmm2, %xmm6
  630. movsd 48 * SIZE(BB), %xmm2
  631. addsd %xmm1, %xmm7
  632. movsd 5 * SIZE(AA), %xmm1
  633. mulsd %xmm1, %xmm3
  634. addsd %xmm3, %xmm4
  635. movsd 42 * SIZE(BB), %xmm3
  636. mulsd %xmm1, %xmm3
  637. addsd %xmm3, %xmm5
  638. movsd 44 * SIZE(BB), %xmm3
  639. mulsd %xmm1, %xmm3
  640. mulsd 46 * SIZE(BB), %xmm1
  641. addsd %xmm3, %xmm6
  642. movsd 56 * SIZE(BB), %xmm3
  643. addsd %xmm1, %xmm7
  644. movsd 6 * SIZE(AA), %xmm1
  645. mulsd %xmm1, %xmm2
  646. addsd %xmm2, %xmm4
  647. movsd 50 * SIZE(BB), %xmm2
  648. mulsd %xmm1, %xmm2
  649. addsd %xmm2, %xmm5
  650. movsd 52 * SIZE(BB), %xmm2
  651. mulsd %xmm1, %xmm2
  652. mulsd 54 * SIZE(BB), %xmm1
  653. addsd %xmm2, %xmm6
  654. movsd 64 * SIZE(BB), %xmm2
  655. addsd %xmm1, %xmm7
  656. movsd 7 * SIZE(AA), %xmm1
  657. mulsd %xmm1, %xmm3
  658. addsd %xmm3, %xmm4
  659. movsd 58 * SIZE(BB), %xmm3
  660. mulsd %xmm1, %xmm3
  661. addsd %xmm3, %xmm5
  662. movsd 60 * SIZE(BB), %xmm3
  663. mulsd %xmm1, %xmm3
  664. mulsd 62 * SIZE(BB), %xmm1
  665. addsd %xmm3, %xmm6
  666. movsd 72 * SIZE(BB), %xmm3
  667. addl $64 * SIZE, BB
  668. addsd %xmm1, %xmm7
  669. movsd 12 * SIZE(AA), %xmm1
  670. addl $8 * SIZE, AA
  671. decl %eax
  672. jne .L22
  673. ALIGN_4
  674. .L25:
  675. #ifndef TRMMKERNEL
  676. movl K, %eax
  677. #else
  678. movl KKK, %eax
  679. #endif
  680. movsd ALPHA, %xmm3
  681. andl $7, %eax # k remainder (k & 7)
  682. BRANCH
  683. je .L28
  684. .L26:
  685. mulsd %xmm0, %xmm2
  686. addsd %xmm2, %xmm4
  687. movsd 2 * SIZE(BB), %xmm2
  688. mulsd %xmm0, %xmm2
  689. addsd %xmm2, %xmm5
  690. movsd 4 * SIZE(BB), %xmm2
  691. mulsd %xmm0, %xmm2
  692. mulsd 6 * SIZE(BB), %xmm0
  693. addsd %xmm2, %xmm6
  694. movsd 8 * SIZE(BB), %xmm2
  695. addsd %xmm0, %xmm7
  696. movsd 1 * SIZE(AA), %xmm0
  697. addl $1 * SIZE, AA
  698. addl $8 * SIZE, BB
  699. decl %eax
  700. jg .L26
  701. ALIGN_4
  702. .L28:
  703. leal (LDC, LDC, 2), %eax
  704. #ifndef TRMMKERNEL
  705. mulsd %xmm3, %xmm4
  706. movsd 0 * SIZE(%esi), %xmm0
  707. mulsd %xmm3, %xmm5
  708. movsd 0 * SIZE(%esi, LDC, 1), %xmm1
  709. mulsd %xmm3, %xmm6
  710. movsd 0 * SIZE(%esi, LDC, 2), %xmm2
  711. mulsd %xmm3, %xmm7
  712. movsd 0 * SIZE(%esi, %eax, 1), %xmm3
  713. addsd %xmm0, %xmm4
  714. addsd %xmm1, %xmm5
  715. addsd %xmm2, %xmm6
  716. addsd %xmm3, %xmm7
  717. #else
  718. mulsd %xmm3, %xmm4
  719. mulsd %xmm3, %xmm5
  720. mulsd %xmm3, %xmm6
  721. mulsd %xmm3, %xmm7
  722. #endif
  723. movsd %xmm4, 0 * SIZE(%esi)
  724. movsd %xmm5, 0 * SIZE(%esi, LDC, 1)
  725. movsd %xmm6, 0 * SIZE(%esi, LDC, 2)
  726. movsd %xmm7, 0 * SIZE(%esi, %eax, 1)
  727. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  728. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  729. movl K, %eax
  730. subl KKK, %eax
  731. leal (,%eax, SIZE), %eax
  732. leal (AA, %eax, 1), AA
  733. leal (BB, %eax, 8), BB
  734. #endif
  735. #if defined(TRMMKERNEL) && defined(LEFT)
  736. addl $1, KK
  737. #endif
  738. ALIGN_4
  739. .L29:
  740. #if defined(TRMMKERNEL) && !defined(LEFT)
  741. addl $4, KK
  742. #endif
  743. leal (, LDC, 4), %eax
  744. addl %eax, C # c += 4 * ldc
  745. decl J # j --
  746. jg .L01
  747. ALIGN_4
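/* .L30/.L31: remaining two columns when N & 2.  B is repacked into
   BUFFER (duplicated as before), then .L41 computes 2x2 blocks and
   .L50 a final 1x2 block. */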
  748. .L30:
  749. testl $2, N
  750. je .L60
  751. ALIGN_2
  752. .L31:
  753. #if defined(TRMMKERNEL) && defined(LEFT)
  754. movl OFFSET, %eax
  755. movl %eax, KK
  756. #endif
  757. /* Copying to Sub Buffer */
  758. movl K, %eax
  759. leal BUFFER, %ecx
  760. sarl $2, %eax
  761. jle .L35
  762. ALIGN_4
  763. .L32:
  764. #ifdef PENTIUM4
  765. #ifdef HAVE_SSE3
  766. movddup 0 * SIZE(%edi), %xmm0
  767. movddup 1 * SIZE(%edi), %xmm1
  768. movddup 2 * SIZE(%edi), %xmm2
  769. movddup 3 * SIZE(%edi), %xmm3
  770. movddup 4 * SIZE(%edi), %xmm4
  771. movddup 5 * SIZE(%edi), %xmm5
  772. movddup 6 * SIZE(%edi), %xmm6
  773. movddup 7 * SIZE(%edi), %xmm7
  774. movapd %xmm0, 0 * SIZE(%ecx)
  775. movapd %xmm1, 2 * SIZE(%ecx)
  776. movapd %xmm2, 4 * SIZE(%ecx)
  777. movapd %xmm3, 6 * SIZE(%ecx)
  778. movapd %xmm4, 8 * SIZE(%ecx)
  779. movapd %xmm5, 10 * SIZE(%ecx)
  780. movapd %xmm6, 12 * SIZE(%ecx)
  781. movapd %xmm7, 14 * SIZE(%ecx)
  782. #else
  783. movsd 0 * SIZE(%edi), %xmm0
  784. movsd 1 * SIZE(%edi), %xmm1
  785. movsd 2 * SIZE(%edi), %xmm2
  786. movsd 3 * SIZE(%edi), %xmm3
  787. movsd 4 * SIZE(%edi), %xmm4
  788. movsd 5 * SIZE(%edi), %xmm5
  789. movsd 6 * SIZE(%edi), %xmm6
  790. movsd 7 * SIZE(%edi), %xmm7
  791. unpcklpd %xmm0, %xmm0
  792. unpckhpd %xmm1, %xmm1
  793. unpcklpd %xmm2, %xmm2
  794. unpckhpd %xmm3, %xmm3
  795. unpcklpd %xmm4, %xmm4
  796. unpckhpd %xmm5, %xmm5
  797. unpcklpd %xmm6, %xmm6
  798. unpckhpd %xmm7, %xmm7
  799. movapd %xmm0, 0 * SIZE(%ecx)
  800. movapd %xmm1, 2 * SIZE(%ecx)
  801. movapd %xmm2, 4 * SIZE(%ecx)
  802. movapd %xmm3, 6 * SIZE(%ecx)
  803. movapd %xmm4, 8 * SIZE(%ecx)
  804. movapd %xmm5, 10 * SIZE(%ecx)
  805. movapd %xmm6, 12 * SIZE(%ecx)
  806. movapd %xmm7, 14 * SIZE(%ecx)
  807. #endif
  808. prefetcht0 80 * SIZE(%edi)
  809. prefetcht1 112 * SIZE(%ecx)
  810. #endif
  811. #if defined(OPTERON) || defined(BARCELONA)
  812. #define COPYPREFETCH 40
  813. prefetchnta (COPYPREFETCH) * SIZE(%edi)
  814. movq 0 * SIZE(%edi), %mm0
  815. movq 1 * SIZE(%edi), %mm1
  816. movq 2 * SIZE(%edi), %mm2
  817. movq 3 * SIZE(%edi), %mm3
  818. movq 4 * SIZE(%edi), %mm4
  819. movq 5 * SIZE(%edi), %mm5
  820. movq 6 * SIZE(%edi), %mm6
  821. movq 7 * SIZE(%edi), %mm7
  822. movq %mm0, 0 * SIZE(%ecx)
  823. movq %mm0, 1 * SIZE(%ecx)
  824. movq %mm1, 2 * SIZE(%ecx)
  825. movq %mm1, 3 * SIZE(%ecx)
  826. movq %mm2, 4 * SIZE(%ecx)
  827. movq %mm2, 5 * SIZE(%ecx)
  828. movq %mm3, 6 * SIZE(%ecx)
  829. movq %mm3, 7 * SIZE(%ecx)
  830. movq %mm4, 8 * SIZE(%ecx)
  831. movq %mm4, 9 * SIZE(%ecx)
  832. movq %mm5, 10 * SIZE(%ecx)
  833. movq %mm5, 11 * SIZE(%ecx)
  834. movq %mm6, 12 * SIZE(%ecx)
  835. movq %mm6, 13 * SIZE(%ecx)
  836. movq %mm7, 14 * SIZE(%ecx)
  837. movq %mm7, 15 * SIZE(%ecx)
  838. #endif
  839. addl $ 8 * SIZE, %edi
  840. addl $16 * SIZE, %ecx
  841. decl %eax
  842. jne .L32
  843. ALIGN_2
  844. .L35:
  845. movl K, %eax
  846. andl $3, %eax
  847. BRANCH
  848. jle .L40
  849. ALIGN_2
  850. .L36:
  851. #ifdef PENTIUM4
  852. #ifdef HAVE_SSE3
  853. movddup 0 * SIZE(%edi), %xmm0
  854. movddup 1 * SIZE(%edi), %xmm1
  855. movapd %xmm0, 0 * SIZE(%ecx)
  856. movapd %xmm1, 2 * SIZE(%ecx)
  857. #else
  858. movsd 0 * SIZE(%edi), %xmm0
  859. movsd 1 * SIZE(%edi), %xmm1
  860. unpcklpd %xmm0, %xmm0
  861. unpckhpd %xmm1, %xmm1
  862. movapd %xmm0, 0 * SIZE(%ecx)
  863. movapd %xmm1, 2 * SIZE(%ecx)
  864. #endif
  865. #endif
  866. #if defined(OPTERON) || defined(BARCELONA)
  867. movq 0 * SIZE(%edi), %mm0
  868. movq 1 * SIZE(%edi), %mm1
  869. movq %mm0, 0 * SIZE(%ecx)
  870. movq %mm0, 1 * SIZE(%ecx)
  871. movq %mm1, 2 * SIZE(%ecx)
  872. movq %mm1, 3 * SIZE(%ecx)
  873. #endif
  874. addl $2 * SIZE, %edi
  875. addl $4 * SIZE, %ecx
  876. decl %eax
  877. jne .L36
  878. ALIGN_4
  879. .L40:
  880. movl C, %esi # coffset = c
  881. movl A, AA # aoffset = a
  882. movl M, %ebx
  883. sarl $1, %ebx # i = (m >> 1)
  884. jle .L50
  885. ALIGN_4
  886. .L41:
  887. #if !defined(TRMMKERNEL) || \
  888. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  889. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  890. leal BUFFER, BB
  891. #else
  892. leal BUFFER, BB
  893. movl KK, %eax
  894. leal (, %eax, SIZE), %eax
  895. leal (AA, %eax, 2), AA
  896. leal (BB, %eax, 4), BB
  897. #endif
  898. pxor %xmm4, %xmm4
  899. pxor %xmm5, %xmm5
  900. pxor %xmm6, %xmm6
  901. pxor %xmm7, %xmm7
  902. movapd 0 * SIZE(AA), %xmm0
  903. movapd 8 * SIZE(AA), %xmm1
  904. movapd 0 * SIZE(BB), %xmm2
  905. movapd 8 * SIZE(BB), %xmm3
  906. #ifdef HAVE_3DNOW
  907. prefetchw 2 * SIZE(%esi)
  908. prefetchw 2 * SIZE(%esi, LDC)
  909. #endif
  910. #ifdef PENTIUM4
  911. prefetchnta 4 * SIZE(%esi)
  912. prefetchnta 4 * SIZE(%esi, LDC)
  913. #endif
  914. #ifndef TRMMKERNEL
  915. movl K, %eax
  916. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  917. movl K, %eax
  918. subl KK, %eax
  919. movl %eax, KKK
  920. #else
  921. movl KK, %eax
  922. #ifdef LEFT
  923. addl $2, %eax
  924. #else
  925. addl $2, %eax
  926. #endif
  927. movl %eax, KKK
  928. #endif
  929. sarl $3, %eax
  930. je .L45
  931. ALIGN_4
  932. .L42:
  933. mulpd %xmm0, %xmm2
  934. #if defined(OPTERON) || defined(BARCELONA)
  935. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  936. #endif
  937. mulpd 2 * SIZE(BB), %xmm0
  938. addpd %xmm2, %xmm4
  939. movapd 4 * SIZE(BB), %xmm2
  940. addpd %xmm0, %xmm5
  941. movapd 2 * SIZE(AA), %xmm0
  942. mulpd %xmm0, %xmm2
  943. mulpd 6 * SIZE(BB), %xmm0
  944. addpd %xmm2, %xmm6
  945. movapd 16 * SIZE(BB), %xmm2
  946. addpd %xmm0, %xmm7
  947. movapd 4 * SIZE(AA), %xmm0
  948. mulpd %xmm0, %xmm3
  949. mulpd 10 * SIZE(BB), %xmm0
  950. addpd %xmm3, %xmm4
  951. movapd 12 * SIZE(BB), %xmm3
  952. addpd %xmm0, %xmm5
  953. movapd 6 * SIZE(AA), %xmm0
  954. mulpd %xmm0, %xmm3
  955. mulpd 14 * SIZE(BB), %xmm0
  956. addpd %xmm3, %xmm6
  957. movapd 24 * SIZE(BB), %xmm3
  958. addpd %xmm0, %xmm7
  959. movapd 16 * SIZE(AA), %xmm0
  960. #if defined(OPTERON) || defined(BARCELONA)
  961. prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
  962. #endif
  963. mulpd %xmm1, %xmm2
  964. mulpd 18 * SIZE(BB), %xmm1
  965. addpd %xmm2, %xmm4
  966. movapd 20 * SIZE(BB), %xmm2
  967. addpd %xmm1, %xmm5
  968. movapd 10 * SIZE(AA), %xmm1
  969. mulpd %xmm1, %xmm2
  970. mulpd 22 * SIZE(BB), %xmm1
  971. addpd %xmm2, %xmm6
  972. movapd 32 * SIZE(BB), %xmm2
  973. addpd %xmm1, %xmm7
  974. movapd 12 * SIZE(AA), %xmm1
  975. mulpd %xmm1, %xmm3
  976. mulpd 26 * SIZE(BB), %xmm1
  977. addpd %xmm3, %xmm4
  978. movapd 28 * SIZE(BB), %xmm3
  979. addpd %xmm1, %xmm5
  980. movapd 14 * SIZE(AA), %xmm1
  981. mulpd %xmm1, %xmm3
  982. mulpd 30 * SIZE(BB), %xmm1
  983. addpd %xmm3, %xmm6
  984. movapd 40 * SIZE(BB), %xmm3
  985. addpd %xmm1, %xmm7
  986. movapd 24 * SIZE(AA), %xmm1
  987. addl $16 * SIZE, AA
  988. addl $32 * SIZE, BB
  989. decl %eax
  990. jne .L42
  991. ALIGN_4
  992. .L45:
  993. #ifndef TRMMKERNEL
  994. movl K, %eax
  995. #else
  996. movl KKK, %eax
  997. #endif
  998. movapd ALPHA, %xmm3
  999. andl $7, %eax # k remainder (k & 7)
  1000. BRANCH
  1001. je .L48
  1002. ALIGN_3
  1003. .L46:
  1004. mulpd %xmm0, %xmm2
  1005. mulpd 2 * SIZE(BB), %xmm0
  1006. addpd %xmm2, %xmm4
  1007. movapd 4 * SIZE(BB), %xmm2
  1008. addpd %xmm0, %xmm5
  1009. movapd 2 * SIZE(AA), %xmm0
  1010. addl $2 * SIZE, AA
  1011. addl $4 * SIZE, BB
  1012. decl %eax
  1013. jg .L46
  1014. ALIGN_4
  1015. .L48:
  1016. #ifndef TRMMKERNEL
  1017. movsd 0 * SIZE(%esi), %xmm0
  1018. movhpd 1 * SIZE(%esi), %xmm0
  1019. movsd 0 * SIZE(%esi, LDC, 1), %xmm1
  1020. movhpd 1 * SIZE(%esi, LDC, 1), %xmm1
  1021. #endif
  1022. addpd %xmm6, %xmm4
  1023. addpd %xmm7, %xmm5
  1024. mulpd %xmm3, %xmm4
  1025. mulpd %xmm3, %xmm5
  1026. #ifndef TRMMKERNEL
  1027. addpd %xmm0, %xmm4
  1028. addpd %xmm1, %xmm5
  1029. #endif
  1030. movsd %xmm4, 0 * SIZE(%esi)
  1031. movhpd %xmm4, 1 * SIZE(%esi)
  1032. movsd %xmm5, 0 * SIZE(%esi, LDC, 1)
  1033. movhpd %xmm5, 1 * SIZE(%esi, LDC, 1)
  1034. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1035. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1036. movl K, %eax
  1037. subl KKK, %eax
  1038. leal (,%eax, SIZE), %eax
  1039. leal (AA, %eax, 2), AA
  1040. leal (BB, %eax, 4), BB
  1041. #endif
  1042. #if defined(TRMMKERNEL) && defined(LEFT)
  1043. addl $2, KK
  1044. #endif
  1045. addl $2 * SIZE, %esi # coffset += 2
  1046. decl %ebx # i --
  1047. jg .L41
  1048. ALIGN_4
  1049. .L50:
  1050. movl M, %ebx
  1051. testl $1, %ebx # if (m & 1)
  1052. jle .L59
  1053. #if !defined(TRMMKERNEL) || \
  1054. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1055. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1056. leal BUFFER, BB
  1057. #else
  1058. leal BUFFER, BB
  1059. movl KK, %eax
  1060. leal (, %eax, SIZE), %eax
  1061. leal (AA, %eax, 1), AA
  1062. leal (BB, %eax, 4), BB
  1063. #endif
  1064. pxor %xmm4, %xmm4
  1065. pxor %xmm5, %xmm5
  1066. pxor %xmm6, %xmm6
  1067. pxor %xmm7, %xmm7
  1068. leal (LDC, LDC, 2), %eax
  1069. movsd 0 * SIZE(AA), %xmm0
  1070. movsd 4 * SIZE(AA), %xmm1
  1071. movsd 0 * SIZE(BB), %xmm2
  1072. movsd 8 * SIZE(BB), %xmm3
  1073. #ifndef TRMMKERNEL
  1074. movl K, %eax
  1075. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1076. movl K, %eax
  1077. subl KK, %eax
  1078. movl %eax, KKK
  1079. #else
  1080. movl KK, %eax
  1081. #ifdef LEFT
  1082. addl $1, %eax
  1083. #else
  1084. addl $2, %eax
  1085. #endif
  1086. movl %eax, KKK
  1087. #endif
  1088. sarl $3, %eax
  1089. je .L55
  1090. ALIGN_4
  1091. .L52:
  1092. mulsd %xmm0, %xmm2
  1093. #if defined(OPTERON) || defined(BARCELONA)
  1094. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1095. #endif
  1096. mulsd 2 * SIZE(BB), %xmm0
  1097. addsd %xmm2, %xmm4
  1098. movsd 4 * SIZE(BB), %xmm2
  1099. addsd %xmm0, %xmm5
  1100. movsd 1 * SIZE(AA), %xmm0
  1101. mulsd %xmm0, %xmm2
  1102. mulsd 6 * SIZE(BB), %xmm0
  1103. addsd %xmm2, %xmm6
  1104. movsd 16 * SIZE(BB), %xmm2
  1105. addsd %xmm0, %xmm7
  1106. movsd 2 * SIZE(AA), %xmm0
  1107. mulsd %xmm0, %xmm3
  1108. mulsd 10 * SIZE(BB), %xmm0
  1109. addsd %xmm3, %xmm4
  1110. movsd 12 * SIZE(BB), %xmm3
  1111. addsd %xmm0, %xmm5
  1112. movsd 3 * SIZE(AA), %xmm0
  1113. mulsd %xmm0, %xmm3
  1114. mulsd 14 * SIZE(BB), %xmm0
  1115. addsd %xmm3, %xmm6
  1116. movsd 24 * SIZE(BB), %xmm3
  1117. addsd %xmm0, %xmm7
  1118. movsd 8 * SIZE(AA), %xmm0
  1119. mulsd %xmm1, %xmm2
  1120. mulsd 18 * SIZE(BB), %xmm1
  1121. addsd %xmm2, %xmm4
  1122. movsd 20 * SIZE(BB), %xmm2
  1123. addsd %xmm1, %xmm5
  1124. movsd 5 * SIZE(AA), %xmm1
  1125. mulsd %xmm1, %xmm2
  1126. mulsd 22 * SIZE(BB), %xmm1
  1127. addsd %xmm2, %xmm6
  1128. movsd 32 * SIZE(BB), %xmm2
  1129. addsd %xmm1, %xmm7
  1130. movsd 6 * SIZE(AA), %xmm1
  1131. mulsd %xmm1, %xmm3
  1132. mulsd 26 * SIZE(BB), %xmm1
  1133. addsd %xmm3, %xmm4
  1134. movsd 28 * SIZE(BB), %xmm3
  1135. addsd %xmm1, %xmm5
  1136. movsd 7 * SIZE(AA), %xmm1
  1137. mulsd %xmm1, %xmm3
  1138. mulsd 30 * SIZE(BB), %xmm1
  1139. addsd %xmm3, %xmm6
  1140. movsd 40 * SIZE(BB), %xmm3
  1141. addsd %xmm1, %xmm7
  1142. movsd 12 * SIZE(AA), %xmm1
  1143. addl $ 8 * SIZE, AA
  1144. addl $32 * SIZE, BB
  1145. decl %eax
  1146. jne .L52
  1147. ALIGN_4
  1148. .L55:
  1149. #ifndef TRMMKERNEL
  1150. movl K, %eax
  1151. #else
  1152. movl KKK, %eax
  1153. #endif
  1154. movsd ALPHA, %xmm3
  1155. andl $7, %eax # k remainder (k & 7)
  1156. BRANCH
  1157. je .L58
  1158. .L56:
  1159. mulsd %xmm0, %xmm2
  1160. mulsd 2 * SIZE(BB), %xmm0
  1161. addsd %xmm2, %xmm4
  1162. movsd 4 * SIZE(BB), %xmm2
  1163. addsd %xmm0, %xmm5
  1164. movsd 1 * SIZE(AA), %xmm0
  1165. addl $1 * SIZE, AA
  1166. addl $4 * SIZE, BB
  1167. decl %eax
  1168. jg .L56
  1169. ALIGN_4
  1170. .L58:
  1171. addsd %xmm6, %xmm4
  1172. addsd %xmm7, %xmm5
  1173. mulpd %xmm3, %xmm4
  1174. mulpd %xmm3, %xmm5
  1175. #ifndef TRMMKERNEL
  1176. movsd 0 * SIZE(%esi), %xmm0
  1177. movsd 0 * SIZE(%esi, LDC, 1), %xmm1
  1178. addsd %xmm0, %xmm4
  1179. addsd %xmm1, %xmm5
  1180. #endif
  1181. movsd %xmm4, 0 * SIZE(%esi)
  1182. movsd %xmm5, 0 * SIZE(%esi, LDC, 1)
  1183. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1184. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1185. movl K, %eax
  1186. subl KKK, %eax
  1187. leal (,%eax, SIZE), %eax
  1188. leal (AA, %eax, 1), AA
  1189. leal (BB, %eax, 4), BB
  1190. #endif
  1191. #if defined(TRMMKERNEL) && defined(LEFT)
  1192. addl $1, KK
  1193. #endif
  1194. ALIGN_4
  1195. .L59:
  1196. #if defined(TRMMKERNEL) && !defined(LEFT)
  1197. addl $2, KK
  1198. #endif
  1199. leal (, LDC, 2), %eax
  1200. addl %eax, C # c += 2 * ldc
  1201. ALIGN_4
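/* .L60: last column when N is odd.  One column of B is packed
   (duplicated), then .L71 computes 2x1 blocks and .L80 a final 1x1
   element. */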
  1202. .L60:
  1203. testl $1, N
  1204. je .L999
  1205. #if defined(TRMMKERNEL) && defined(LEFT)
  1206. movl OFFSET, %eax
  1207. movl %eax, KK
  1208. #endif
  1209. movl K, %eax
  1210. leal BUFFER, %ecx
  1211. sarl $3, %eax
  1212. jle .L65
  1213. ALIGN_4
  1214. .L62:
  1215. #ifdef PENTIUM4
  1216. #ifdef HAVE_SSE3
  1217. movddup 0 * SIZE(%edi), %xmm0
  1218. movddup 1 * SIZE(%edi), %xmm1
  1219. movddup 2 * SIZE(%edi), %xmm2
  1220. movddup 3 * SIZE(%edi), %xmm3
  1221. movddup 4 * SIZE(%edi), %xmm4
  1222. movddup 5 * SIZE(%edi), %xmm5
  1223. movddup 6 * SIZE(%edi), %xmm6
  1224. movddup 7 * SIZE(%edi), %xmm7
  1225. movapd %xmm0, 0 * SIZE(%ecx)
  1226. movapd %xmm1, 2 * SIZE(%ecx)
  1227. movapd %xmm2, 4 * SIZE(%ecx)
  1228. movapd %xmm3, 6 * SIZE(%ecx)
  1229. movapd %xmm4, 8 * SIZE(%ecx)
  1230. movapd %xmm5, 10 * SIZE(%ecx)
  1231. movapd %xmm6, 12 * SIZE(%ecx)
  1232. movapd %xmm7, 14 * SIZE(%ecx)
  1233. #else
  1234. movsd 0 * SIZE(%edi), %xmm0
  1235. movsd 1 * SIZE(%edi), %xmm1
  1236. movsd 2 * SIZE(%edi), %xmm2
  1237. movsd 3 * SIZE(%edi), %xmm3
  1238. movsd 4 * SIZE(%edi), %xmm4
  1239. movsd 5 * SIZE(%edi), %xmm5
  1240. movsd 6 * SIZE(%edi), %xmm6
  1241. movsd 7 * SIZE(%edi), %xmm7
  1242. unpcklpd %xmm0, %xmm0
  1243. unpckhpd %xmm1, %xmm1
  1244. unpcklpd %xmm2, %xmm2
  1245. unpckhpd %xmm3, %xmm3
  1246. unpcklpd %xmm4, %xmm4
  1247. unpckhpd %xmm5, %xmm5
  1248. unpcklpd %xmm6, %xmm6
  1249. unpckhpd %xmm7, %xmm7
  1250. movapd %xmm0, 0 * SIZE(%ecx)
  1251. movapd %xmm1, 2 * SIZE(%ecx)
  1252. movapd %xmm2, 4 * SIZE(%ecx)
  1253. movapd %xmm3, 6 * SIZE(%ecx)
  1254. movapd %xmm4, 8 * SIZE(%ecx)
  1255. movapd %xmm5, 10 * SIZE(%ecx)
  1256. movapd %xmm6, 12 * SIZE(%ecx)
  1257. movapd %xmm7, 14 * SIZE(%ecx)
  1258. #endif
  1259. prefetcht1 80 * SIZE(%edi)
  1260. prefetcht0 112 * SIZE(%ecx)
  1261. #endif
  1262. #if defined(OPTERON) || defined(BARCELONA)
  1263. #define COPYPREFETCH 40
  1264. prefetchnta (COPYPREFETCH) * SIZE(%edi)
  1265. movq 0 * SIZE(%edi), %mm0
  1266. movq 1 * SIZE(%edi), %mm1
  1267. movq 2 * SIZE(%edi), %mm2
  1268. movq 3 * SIZE(%edi), %mm3
  1269. movq 4 * SIZE(%edi), %mm4
  1270. movq 5 * SIZE(%edi), %mm5
  1271. movq 6 * SIZE(%edi), %mm6
  1272. movq 7 * SIZE(%edi), %mm7
  1273. movq %mm0, 0 * SIZE(%ecx)
  1274. movq %mm0, 1 * SIZE(%ecx)
  1275. movq %mm1, 2 * SIZE(%ecx)
  1276. movq %mm1, 3 * SIZE(%ecx)
  1277. movq %mm2, 4 * SIZE(%ecx)
  1278. movq %mm2, 5 * SIZE(%ecx)
  1279. movq %mm3, 6 * SIZE(%ecx)
  1280. movq %mm3, 7 * SIZE(%ecx)
  1281. movq %mm4, 8 * SIZE(%ecx)
  1282. movq %mm4, 9 * SIZE(%ecx)
  1283. movq %mm5, 10 * SIZE(%ecx)
  1284. movq %mm5, 11 * SIZE(%ecx)
  1285. movq %mm6, 12 * SIZE(%ecx)
  1286. movq %mm6, 13 * SIZE(%ecx)
  1287. movq %mm7, 14 * SIZE(%ecx)
  1288. movq %mm7, 15 * SIZE(%ecx)
  1289. #endif
  1290. addl $ 8 * SIZE, %edi
  1291. addl $16 * SIZE, %ecx
  1292. decl %eax
  1293. jne .L62
  1294. ALIGN_2
  1295. .L65:
  1296. movl K, %eax
  1297. andl $7, %eax
  1298. BRANCH
  1299. jle .L70
  1300. ALIGN_2
  1301. .L66:
  1302. #ifdef PENTIUM4
  1303. #ifdef HAVE_SSE3
  1304. movddup 0 * SIZE(%edi), %xmm0
  1305. movapd %xmm0, 0 * SIZE(%ecx)
  1306. #else
  1307. movsd 0 * SIZE(%edi), %xmm0
  1308. unpcklpd %xmm0, %xmm0
  1309. movapd %xmm0, 0 * SIZE(%ecx)
  1310. #endif
  1311. #endif
  1312. #if defined(OPTERON) || defined(BARCELONA)
  1313. movq 0 * SIZE(%edi), %mm0
  1314. movq %mm0, 0 * SIZE(%ecx)
  1315. movq %mm0, 1 * SIZE(%ecx)
  1316. #endif
  1317. addl $1 * SIZE, %edi
  1318. addl $2 * SIZE, %ecx
  1319. decl %eax
  1320. jne .L66
  1321. ALIGN_4
  1322. .L70:
  1323. movl C, %esi # coffset = c
  1324. movl A, AA # aoffset = a
  1325. movl M, %ebx
  1326. sarl $1, %ebx # i = (m >> 1)
  1327. jle .L80
  1328. ALIGN_4
  1329. .L71:
  1330. #if !defined(TRMMKERNEL) || \
  1331. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1332. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1333. leal BUFFER, BB
  1334. #else
  1335. leal BUFFER, BB
  1336. movl KK, %eax
  1337. leal (, %eax, SIZE), %eax
  1338. leal (AA, %eax, 2), AA
  1339. leal (BB, %eax, 2), BB
  1340. #endif
  1341. pxor %xmm4, %xmm4
  1342. pxor %xmm5, %xmm5
  1343. pxor %xmm6, %xmm6
  1344. pxor %xmm7, %xmm7
  1345. movapd 0 * SIZE(AA), %xmm0
  1346. movapd 8 * SIZE(AA), %xmm1
  1347. movapd 0 * SIZE(BB), %xmm2
  1348. movapd 8 * SIZE(BB), %xmm3
  1349. #ifdef HAVE_3DNOW
  1350. prefetchw 2 * SIZE(%esi)
  1351. #endif
  1352. #ifdef PENTIUM4
  1353. prefetchnta 2 * SIZE(%esi)
  1354. #endif
  1355. #ifndef TRMMKERNEL
  1356. movl K, %eax
  1357. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1358. movl K, %eax
  1359. subl KK, %eax
  1360. movl %eax, KKK
  1361. #else
  1362. movl KK, %eax
  1363. #ifdef LEFT
  1364. addl $2, %eax
  1365. #else
  1366. addl $1, %eax
  1367. #endif
  1368. movl %eax, KKK
  1369. #endif
  1370. sarl $3, %eax
  1371. je .L75
  1372. ALIGN_4
  1373. .L72:
  1374. mulpd %xmm0, %xmm2
  1375. addpd %xmm2, %xmm4
  1376. #if defined(OPTERON) || defined(BARCELONA)
  1377. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  1378. #endif
  1379. movapd 16 * SIZE(BB), %xmm2
  1380. movapd 2 * SIZE(AA), %xmm0
  1381. mulpd 2 * SIZE(BB), %xmm0
  1382. addpd %xmm0, %xmm4
  1383. movapd 4 * SIZE(AA), %xmm0
  1384. mulpd 4 * SIZE(BB), %xmm0
  1385. addpd %xmm0, %xmm4
  1386. movapd 6 * SIZE(AA), %xmm0
  1387. mulpd 6 * SIZE(BB), %xmm0
  1388. addpd %xmm0, %xmm4
  1389. movapd 16 * SIZE(AA), %xmm0
  1390. #if defined(OPTERON) || defined(BARCELONA)
  1391. prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
  1392. #endif
  1393. mulpd %xmm1, %xmm3
  1394. addpd %xmm3, %xmm4
  1395. movapd 24 * SIZE(BB), %xmm3
  1396. movapd 10 * SIZE(AA), %xmm1
  1397. mulpd 10 * SIZE(BB), %xmm1
  1398. addpd %xmm1, %xmm4
  1399. movapd 12 * SIZE(AA), %xmm1
  1400. mulpd 12 * SIZE(BB), %xmm1
  1401. addpd %xmm1, %xmm4
  1402. movapd 14 * SIZE(AA), %xmm1
  1403. mulpd 14 * SIZE(BB), %xmm1
  1404. addpd %xmm1, %xmm4
  1405. movapd 24 * SIZE(AA), %xmm1
  1406. addl $16 * SIZE, AA
  1407. addl $16 * SIZE, BB
  1408. decl %eax
  1409. jne .L72
  1410. ALIGN_4
  1411. .L75:
  1412. #ifndef TRMMKERNEL
  1413. movl K, %eax
  1414. #else
  1415. movl KKK, %eax
  1416. #endif
  1417. movapd ALPHA, %xmm3
  1418. andl $7, %eax # k remainder (k & 7)
  1419. BRANCH
  1420. je .L78
  1421. ALIGN_3
  1422. .L76:
  1423. mulpd %xmm0, %xmm2
  1424. addpd %xmm2, %xmm4
  1425. movapd 2 * SIZE(AA), %xmm0
  1426. movapd 2 * SIZE(BB), %xmm2
  1427. addl $2 * SIZE, AA
  1428. addl $2 * SIZE, BB
  1429. decl %eax
  1430. jg .L76
  1431. ALIGN_4
  1432. .L78:
  1433. mulpd %xmm3, %xmm4
  1434. #ifndef TRMMKERNEL
  1435. movsd 0 * SIZE(%esi), %xmm0
  1436. movhpd 1 * SIZE(%esi), %xmm0
  1437. addpd %xmm0, %xmm4
  1438. #endif
  1439. movsd %xmm4, 0 * SIZE(%esi)
  1440. movhpd %xmm4, 1 * SIZE(%esi)
  1441. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1442. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1443. movl K, %eax
  1444. subl KKK, %eax
  1445. leal (,%eax, SIZE), %eax
  1446. leal (AA, %eax, 2), AA
  1447. leal (BB, %eax, 2), BB
  1448. #endif
  1449. #if defined(TRMMKERNEL) && defined(LEFT)
  1450. addl $2, KK
  1451. #endif
  1452. addl $2 * SIZE, %esi # coffset += 2
  1453. decl %ebx # i --
  1454. jg .L71
  1455. ALIGN_4
  1456. .L80:
  1457. movl M, %ebx
  1458. testl $1, %ebx # if (m & 1)
  1459. jle .L999
  1460. #if !defined(TRMMKERNEL) || \
  1461. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1462. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1463. leal BUFFER, BB
  1464. #else
  1465. leal BUFFER, BB
  1466. movl KK, %eax
  1467. leal (, %eax, SIZE), %eax
  1468. leal (AA, %eax, 1), AA
  1469. leal (BB, %eax, 2), BB
  1470. #endif
  1471. pxor %xmm4, %xmm4
  1472. pxor %xmm5, %xmm5
  1473. pxor %xmm6, %xmm6
  1474. pxor %xmm7, %xmm7
  1475. leal (LDC, LDC, 2), %eax
  1476. movsd 0 * SIZE(AA), %xmm0
  1477. movsd 4 * SIZE(AA), %xmm1
  1478. movsd 0 * SIZE(BB), %xmm2
  1479. movsd 8 * SIZE(BB), %xmm3
  1480. #ifndef TRMMKERNEL
  1481. movl K, %eax
  1482. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1483. movl K, %eax
  1484. subl KK, %eax
  1485. movl %eax, KKK
  1486. #else
  1487. movl KK, %eax
  1488. #ifdef LEFT
  1489. addl $1, %eax
  1490. #else
  1491. addl $1, %eax
  1492. #endif
  1493. movl %eax, KKK
  1494. #endif
  1495. sarl $3, %eax
  1496. je .L85
  1497. ALIGN_4
  1498. .L82:
  1499. mulsd %xmm0, %xmm2
  1500. #if defined(OPTERON) || defined(BARCELONA)
  1501. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  1502. #endif
  1503. movsd 1 * SIZE(AA), %xmm0
  1504. mulsd 2 * SIZE(BB), %xmm0
  1505. addsd %xmm2, %xmm4
  1506. movsd 16 * SIZE(BB), %xmm2
  1507. addsd %xmm0, %xmm5
  1508. movsd 2 * SIZE(AA), %xmm0
  1509. mulsd 4 * SIZE(BB), %xmm0
  1510. addsd %xmm0, %xmm6
  1511. movsd 3 * SIZE(AA), %xmm0
  1512. mulsd 6 * SIZE(BB), %xmm0
  1513. addsd %xmm0, %xmm7
  1514. movsd 8 * SIZE(AA), %xmm0
  1515. mulsd %xmm1, %xmm3
  1516. movsd 5 * SIZE(AA), %xmm1
  1517. mulsd 10 * SIZE(BB), %xmm1
  1518. addsd %xmm3, %xmm4
  1519. movsd 24 * SIZE(BB), %xmm3
  1520. addsd %xmm1, %xmm5
  1521. movsd 6 * SIZE(AA), %xmm1
  1522. mulsd 12 * SIZE(BB), %xmm1
  1523. addsd %xmm1, %xmm6
  1524. movsd 7 * SIZE(AA), %xmm1
  1525. mulsd 14 * SIZE(BB), %xmm1
  1526. addsd %xmm1, %xmm7
  1527. movsd 12 * SIZE(AA), %xmm1
  1528. addl $ 8 * SIZE, AA
  1529. addl $16 * SIZE, BB
  1530. decl %eax
  1531. jne .L82
  1532. ALIGN_4
  1533. .L85:
  1534. #ifndef TRMMKERNEL
  1535. movl K, %eax
  1536. #else
  1537. movl KKK, %eax
  1538. #endif
  1539. movsd ALPHA, %xmm3
  1540. andl $7, %eax # k remainder (k & 7)
  1541. BRANCH
  1542. je .L88
  1543. .L86:
  1544. mulsd %xmm0, %xmm2
  1545. addsd %xmm2, %xmm4
  1546. movsd 2 * SIZE(BB), %xmm2
  1547. movsd 1 * SIZE(AA), %xmm0
  1548. addl $1 * SIZE, AA
  1549. addl $2 * SIZE, BB
  1550. decl %eax
  1551. jg .L86
  1552. ALIGN_4
  1553. .L88:
  1554. addsd %xmm5, %xmm4
  1555. addsd %xmm7, %xmm6
  1556. addsd %xmm6, %xmm4
  1557. mulsd %xmm3, %xmm4
  1558. #ifndef TRMMKERNEL
  1559. movsd 0 * SIZE(%esi), %xmm0
  1560. addsd %xmm0, %xmm4
  1561. #endif
  1562. movsd %xmm4, 0 * SIZE(%esi)
  1563. ALIGN_4
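/* .L999: restore the caller's stack pointer saved in OLD_STACK,
   clear the MMX state used by the packing loops, and pop the
   callee-saved registers. */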
  1564. .L999:
  1565. movl OLD_STACK, %esp
  1566. EMMS
  1567. popl %ebx
  1568. popl %esi
  1569. popl %edi
  1570. popl %ebp
  1571. ret
  1572. EPILOGUE