
gemm_kernel_2x4_penryn.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
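
/* Double-precision GEMM micro kernel with 2x4 register blocking for        */
/* Penryn-class 32-bit x86 (SSE2/SSE3 only).  It computes C += alpha * A*B  */
/* on packed panels of A (two rows per step) and B (four columns per step); */
/* when built with -DTRMMKERNEL the same loops perform the triangular TRMM  */
/* update, using OFFSET/KK to restrict the K range.  Arguments are read     */
/* from the 32-bit stack at the offsets defined below.                      */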
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 16

#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define ALPHA 16 + STACK + ARGS(%esp)
#define A 24 + STACK + ARGS(%esp)
#define ARG_B 28 + STACK + ARGS(%esp)
#define C 32 + STACK + ARGS(%esp)
#define ARG_LDC 36 + STACK + ARGS(%esp)
#define OFFSET 40 + STACK + ARGS(%esp)

#define J 0 + STACK(%esp)
#define BX 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
#define KKK 12 + STACK(%esp)

#define AA %edx
#define BB %ecx
#define LDC %ebp
#define B %edi
#define C1 %esi
#define I %ebx

#ifdef NANO
#define PREFETCHSIZE (8 * 3 + 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifdef NEHALEM
#define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifdef SANDYBRIDGE
#define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCH
#define PREFETCH prefetcht0
#endif

#ifndef PREFETCHW
#define PREFETCHW prefetcht0
#endif

#ifndef PREFETCHB
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCHSIZE
#define PREFETCHSIZE (8 * 13 + 4)
#endif
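
/* PREFETCH, PREFETCHW and PREFETCHB are the prefetch instructions used for */
/* the A panel, the C tile and the B panel respectively; the blocks above   */
/* override them per microarchitecture.  PREFETCHSIZE is the lookahead      */
/* distance in elements and is scaled by SIZE at each use site below.       */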
	PROLOGUE

	subl $ARGS, %esp # Generate Stack Frame

	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx

	PROFCODE

	movl ARG_B, B
	movl ARG_LDC, LDC

#ifdef TRMMKERNEL
	movl OFFSET, %eax
#ifndef LEFT
	negl %eax
#endif
	movl %eax, KK
#endif

	subl $-16 * SIZE, A
	subl $-16 * SIZE, B

	leal (, LDC, SIZE), LDC

	movl N, %eax
	sarl $2, %eax
	movl %eax, J
	jle .L30
	ALIGN_4
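
/* Outer loop over blocks of four columns of C/B; J counts the remaining    */
/* blocks and BX holds a pointer used to prefetch ahead in the B panel.     */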
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl K, %eax
	sall $BASE_SHIFT + 2, %eax
	leal (B, %eax), %eax
	movl %eax, BX

	movl C, C1
	movl A, AA

	movl M, I
	sarl $1, I
	jle .L20
	ALIGN_4
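
/* Loop over the rows of C two at a time: xmm4..xmm7 accumulate the 2x4     */
/* tile while AA walks the packed A panel and BB the packed B panel.        */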
.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 4), BB
#endif

	movl BX, %eax
	PREFETCHB -16 * SIZE(%eax)
	subl $-8 * SIZE, %eax
	movl %eax, BX

	leal (C1, LDC, 2), %eax

	movaps -16 * SIZE(AA), %xmm0
	xorps %xmm2, %xmm2
	movaps -16 * SIZE(BB), %xmm1
	xorps %xmm3, %xmm3

	xorps %xmm4, %xmm4
	PREFETCHW 1 * SIZE(C1)
	xorps %xmm5, %xmm5
	PREFETCHW 3 * SIZE(C1, LDC)
	xorps %xmm6, %xmm6
	PREFETCHW 1 * SIZE(%eax)
	xorps %xmm7, %xmm7
	PREFETCHW 3 * SIZE(%eax, LDC)

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $4, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L15
	ALIGN_4
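
/* Main K loop, unrolled eight times.  Each step multiplies one 2x1 sliver  */
/* of A against a pair of B values; pshufd $0x4e swaps the two doubles of   */
/* the B register so both the direct and the crossed products are formed.   */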
.L12:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	addpd %xmm3, %xmm7
	movaps -14 * SIZE(BB), %xmm3
	addpd %xmm2, %xmm6
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2

	addpd %xmm1, %xmm5
	movaps -12 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	pshufd $0x4e, %xmm3, %xmm2
	mulpd %xmm0, %xmm3
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0

	addpd %xmm3, %xmm7
	movaps -10 * SIZE(BB), %xmm3
	addpd %xmm2, %xmm6
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2

	addpd %xmm1, %xmm5
	movaps -8 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	pshufd $0x4e, %xmm3, %xmm2
	mulpd %xmm0, %xmm3
	mulpd %xmm0, %xmm2
	movaps -12 * SIZE(AA), %xmm0

	addpd %xmm3, %xmm7
	movaps -6 * SIZE(BB), %xmm3
	addpd %xmm2, %xmm6
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2

	addpd %xmm1, %xmm5
	movaps -4 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	pshufd $0x4e, %xmm3, %xmm2
	mulpd %xmm0, %xmm3
	mulpd %xmm0, %xmm2
	movaps -10 * SIZE(AA), %xmm0

	addpd %xmm3, %xmm7
	movaps -2 * SIZE(BB), %xmm3
	addpd %xmm2, %xmm6
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2

	addpd %xmm1, %xmm5
	movaps 0 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	pshufd $0x4e, %xmm3, %xmm2
	mulpd %xmm0, %xmm3
	mulpd %xmm0, %xmm2
	movaps -8 * SIZE(AA), %xmm0

	addpd %xmm3, %xmm7
	PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
	movaps 2 * SIZE(BB), %xmm3
	addpd %xmm2, %xmm6
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2

	addpd %xmm1, %xmm5
	movaps 4 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	pshufd $0x4e, %xmm3, %xmm2
	mulpd %xmm0, %xmm3
	mulpd %xmm0, %xmm2
	movaps -6 * SIZE(AA), %xmm0

	addpd %xmm3, %xmm7
	movaps 6 * SIZE(BB), %xmm3
	addpd %xmm2, %xmm6
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2

	addpd %xmm1, %xmm5
	movaps 8 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	pshufd $0x4e, %xmm3, %xmm2
	mulpd %xmm0, %xmm3
	mulpd %xmm0, %xmm2
	movaps -4 * SIZE(AA), %xmm0

	addpd %xmm3, %xmm7
	movaps 10 * SIZE(BB), %xmm3
	addpd %xmm2, %xmm6
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2

	addpd %xmm1, %xmm5
	movaps 12 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	pshufd $0x4e, %xmm3, %xmm2
	mulpd %xmm0, %xmm3
	mulpd %xmm0, %xmm2
	movaps -2 * SIZE(AA), %xmm0

	addpd %xmm3, %xmm7
	movaps 14 * SIZE(BB), %xmm3
	addpd %xmm2, %xmm6
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2

	addpd %xmm1, %xmm5
	movaps 16 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	subl $-32 * SIZE, BB
	pshufd $0x4e, %xmm3, %xmm2
	mulpd %xmm0, %xmm3
	mulpd %xmm0, %xmm2
	movaps 0 * SIZE(AA), %xmm0

	subl $-16 * SIZE, AA
	subl $1, %eax
	BRANCH
	jne .L12
	ALIGN_4

.L15:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L18
	ALIGN_4
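
/* Tail of the K loop: handle the remaining K mod 8 iterations one at a time. */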
.L16:
	addpd %xmm3, %xmm7
	movaps -14 * SIZE(BB), %xmm3
	addpd %xmm2, %xmm6
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2

	addpd %xmm1, %xmm5
	movaps -12 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	pshufd $0x4e, %xmm3, %xmm2
	mulpd %xmm0, %xmm3
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0

	addl $2 * SIZE, AA
	addl $4 * SIZE, BB

	decl %eax
	jg .L16
	ALIGN_4
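
/* Scale the accumulators by alpha and write the 2x4 tile back to C.  The   */
/* movsd pair swaps the low halves of the accumulators so each register     */
/* holds one column of the tile again.  If C and LDC are both 16-byte       */
/* aligned the movaps path below is taken, otherwise control branches to    */
/* the unaligned path at .L18x.                                             */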
.L18:
	addpd %xmm2, %xmm6
	addpd %xmm3, %xmm7

	movddup ALPHA, %xmm3

	movaps %xmm4, %xmm0
	movsd %xmm5, %xmm4
	mulpd %xmm3, %xmm4
	movsd %xmm0, %xmm5
	mulpd %xmm3, %xmm5

	movaps %xmm6, %xmm0
	movsd %xmm7, %xmm6
	mulpd %xmm3, %xmm6
	movsd %xmm0, %xmm7
	mulpd %xmm3, %xmm7

	movl C1, %eax
	orl LDC, %eax
	testl $15, %eax
	NOBRANCH
	jne .L18x

	leal (C1, LDC, 2), %eax

#ifndef TRMMKERNEL
	movaps (C1), %xmm0
	movaps (C1, LDC), %xmm1
	movaps (%eax), %xmm2
	movaps (%eax, LDC), %xmm3

	addpd %xmm0, %xmm4
	addpd %xmm1, %xmm5
	addpd %xmm2, %xmm6
	addpd %xmm3, %xmm7
#endif

	movaps %xmm4, (C1)
	movaps %xmm5, (C1, LDC)
	movaps %xmm6, (%eax)
	movaps %xmm7, (%eax, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, C1
	decl I
	jg .L11
	jmp .L20
	ALIGN_4
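
/* Same store sequence as above, but with unaligned loads/stores (movups)   */
/* for C tiles that are not 16-byte aligned.                                */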
.L18x:
	leal (C1, LDC, 2), %eax

#ifndef TRMMKERNEL
	movups (C1), %xmm0
	movups (C1, LDC), %xmm1
	movups (%eax), %xmm2
	movups (%eax, LDC), %xmm3

	addpd %xmm0, %xmm4
	addpd %xmm1, %xmm5
	addpd %xmm2, %xmm6
	addpd %xmm3, %xmm7
#endif

	movups %xmm4, (C1)
	movups %xmm5, (C1, LDC)
	movups %xmm6, (%eax)
	movups %xmm7, (%eax, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, C1
	decl I
	jg .L11
	ALIGN_4
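
/* Leftover single row (M odd) against the current four columns of B.       */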
.L20:
	movl M, I
	testl $1, I
	jle .L29

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	addl %eax, AA
	leal (BB, %eax, 4), BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm2
	xorps %xmm5, %xmm5
	movaps -14 * SIZE(BB), %xmm3
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $4, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L25
	ALIGN_4

.L22:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps -12 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps -10 * SIZE(BB), %xmm3

	pshufd $0xee, %xmm0, %xmm1
	movaps -14 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm6
	movaps -8 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm7
	movaps -6 * SIZE(BB), %xmm3

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps -4 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps -2 * SIZE(BB), %xmm3

	pshufd $0xee, %xmm0, %xmm1
	movaps -12 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm6
	movaps 0 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm7
	movaps 2 * SIZE(BB), %xmm3

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps 6 * SIZE(BB), %xmm3

	pshufd $0xee, %xmm0, %xmm1
	movaps -10 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm6
	movaps 8 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm7
	movaps 10 * SIZE(BB), %xmm3

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps 12 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps 14 * SIZE(BB), %xmm3

	pshufd $0xee, %xmm0, %xmm1
	movaps -8 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm6
	movaps 16 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm7
	movaps 18 * SIZE(BB), %xmm3

	subl $ -8 * SIZE, AA
	subl $-32 * SIZE, BB
	subl $1, %eax
	jne .L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L28
	ALIGN_4

.L26:
	pshufd $0x44, %xmm0, %xmm1
	movsd -15 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd %xmm1, %xmm3
	addpd %xmm2, %xmm4
	movaps -12 * SIZE(BB), %xmm2
	addpd %xmm3, %xmm5
	movaps -10 * SIZE(BB), %xmm3

	addl $1 * SIZE, AA
	addl $4 * SIZE, BB

	decl %eax
	jg .L26
	ALIGN_4

.L28:
	movddup ALPHA, %xmm3

	addpd %xmm6, %xmm4
	addpd %xmm7, %xmm5

	leal (C1, LDC, 2), %eax

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
	movhpd 0 * SIZE(C1, LDC), %xmm0
	movsd 0 * SIZE(%eax), %xmm1
	movhpd 0 * SIZE(%eax, LDC), %xmm1
#endif

	mulpd %xmm3, %xmm4
	mulpd %xmm3, %xmm5

#ifndef TRMMKERNEL
	addpd %xmm0, %xmm4
	addpd %xmm1, %xmm5
#endif

	movsd %xmm4, 0 * SIZE(C1)
	movhpd %xmm4, 0 * SIZE(C1, LDC)
	movsd %xmm5, 0 * SIZE(%eax)
	movhpd %xmm5, 0 * SIZE(%eax, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	addl %eax, AA
	leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $1, KK
#endif
	ALIGN_4

.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl $4, KK
#endif

	movl BB, B

	leal (, LDC, 4), %eax
	addl %eax, C
	decl J
	jg .L01
	ALIGN_4
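
/* N mod 4: if two columns remain, run a 2x2 version of the kernel.         */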
.L30:
	movl N, %eax
	testl $2, %eax
	jle .L50

#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl C, C1
	movl A, AA

	movl M, I
	sarl $1, I
	jle .L40
	ALIGN_4
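
/* 2x2 block: xmm4..xmm7 accumulate two rows by two columns, using the same */
/* pshufd lane swap as the 2x4 loop above.                                  */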
.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 2), BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm1
	xorps %xmm5, %xmm5
	PREFETCHW 1 * SIZE(C1)
	xorps %xmm6, %xmm6
	PREFETCHW 1 * SIZE(C1, LDC)
	xorps %xmm7, %xmm7

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L35
	ALIGN_4

.L32:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -14 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -12 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm7
	movaps -12 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm6

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -10 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -10 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -8 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm7
	movaps -8 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm6

	PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -6 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -6 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -4 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm7
	movaps -4 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm6

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -2 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -2 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps 0 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm7
	movaps 0 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm6

	subl $-16 * SIZE, AA
	subl $-16 * SIZE, BB
	subl $1, %eax
	jne .L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L38
	ALIGN_4

.L36:
	pshufd $0x4e, %xmm1, %xmm2
	mulpd %xmm0, %xmm1
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm1, %xmm5
	movaps -14 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4

	addl $2 * SIZE, AA
	addl $2 * SIZE, BB

	decl %eax
	jg .L36
	ALIGN_4

.L38:
	movddup ALPHA, %xmm3

	addpd %xmm6, %xmm4
	addpd %xmm7, %xmm5

	movaps %xmm4, %xmm0
	movsd %xmm5, %xmm4
	mulpd %xmm3, %xmm4
	movsd %xmm0, %xmm5
	mulpd %xmm3, %xmm5

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
	movhpd 1 * SIZE(C1), %xmm0
	movsd 0 * SIZE(C1, LDC), %xmm1
	movhpd 1 * SIZE(C1, LDC), %xmm1

	addpd %xmm0, %xmm4
	addpd %xmm1, %xmm5
#endif

	movsd %xmm4, 0 * SIZE(C1)
	movhpd %xmm4, 1 * SIZE(C1)
	movsd %xmm5, 0 * SIZE(C1, LDC)
	movhpd %xmm5, 1 * SIZE(C1, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, C1
	decl I
	jg .L31
	ALIGN_4
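
/* Leftover row (M odd) against the two-column panel.                       */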
.L40:
	movl M, I
	testl $1, I
	jle .L49

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	addl %eax, AA
	leal (BB, %eax, 2), BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm2
	xorps %xmm5, %xmm5

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L45
	ALIGN_4

.L42:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -14 * SIZE(BB), %xmm2

	pshufd $0xee, %xmm0, %xmm1
	movaps -14 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm5
	movaps -12 * SIZE(BB), %xmm2

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -10 * SIZE(BB), %xmm2

	pshufd $0xee, %xmm0, %xmm1
	movaps -12 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm5
	movaps -8 * SIZE(BB), %xmm2

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -6 * SIZE(BB), %xmm2

	pshufd $0xee, %xmm0, %xmm1
	movaps -10 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm5
	movaps -4 * SIZE(BB), %xmm2

	pshufd $0x44, %xmm0, %xmm1
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -2 * SIZE(BB), %xmm2

	pshufd $0xee, %xmm0, %xmm1
	movaps -8 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm5
	movaps 0 * SIZE(BB), %xmm2

	subl $ -8 * SIZE, AA
	subl $-16 * SIZE, BB
	subl $1, %eax
	jne .L42
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L48
	ALIGN_4

.L46:
	pshufd $0x44, %xmm0, %xmm1
	movsd -15 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	addpd %xmm2, %xmm4
	movaps -14 * SIZE(BB), %xmm2

	addl $1 * SIZE, AA
	addl $2 * SIZE, BB

	decl %eax
	jg .L46
	ALIGN_4

.L48:
	movddup ALPHA, %xmm3

	addpd %xmm5, %xmm4

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
	movhpd 0 * SIZE(C1, LDC), %xmm0
#endif

	mulpd %xmm3, %xmm4

#ifndef TRMMKERNEL
	addpd %xmm0, %xmm4
#endif

	movsd %xmm4, 0 * SIZE(C1)
	movhpd %xmm4, 0 * SIZE(C1, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	addl %eax, AA
	leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $1, KK
#endif
	ALIGN_4

.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl $2, KK
#endif

	movl BB, B

	leal (, LDC, 2), %eax
	addl %eax, C
	ALIGN_4
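
/* Final column when N is odd.                                              */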
.L50:
	movl N, %eax
	testl $1, %eax
	jle .L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl C, C1
	movl A, AA

	movl M, I
	sarl $1, I
	jle .L60
	ALIGN_4
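
/* Two rows at a time against the single remaining column.                  */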
.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	addl %eax, BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm1
	xorps %xmm5, %xmm5
	PREFETCHW 1 * SIZE(C1)

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L55
	ALIGN_4

.L52:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	pshufd $0x44, %xmm1, %xmm2
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4

	pshufd $0xee, %xmm1, %xmm2
	movaps -14 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps -12 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5

	pshufd $0x44, %xmm1, %xmm2
	mulpd %xmm0, %xmm2
	movaps -10 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4

	pshufd $0xee, %xmm1, %xmm2
	movaps -12 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps -8 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5

	PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)

	pshufd $0x44, %xmm1, %xmm2
	mulpd %xmm0, %xmm2
	movaps -6 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4

	pshufd $0xee, %xmm1, %xmm2
	movaps -10 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps -4 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5

	pshufd $0x44, %xmm1, %xmm2
	mulpd %xmm0, %xmm2
	movaps -2 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4

	pshufd $0xee, %xmm1, %xmm2
	movaps -8 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps 0 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5

	subl $-16 * SIZE, AA
	subl $ -8 * SIZE, BB
	subl $1, %eax
	jne .L52
	ALIGN_4

.L55:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L58
	ALIGN_4

.L56:
	pshufd $0x44, %xmm1, %xmm2
	movsd -15 * SIZE(BB), %xmm1
	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4

	addl $2 * SIZE, AA
	addl $1 * SIZE, BB

	decl %eax
	jg .L56
	ALIGN_4

.L58:
	movddup ALPHA, %xmm3

	addpd %xmm5, %xmm4
	mulpd %xmm3, %xmm4

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
	movhpd 1 * SIZE(C1), %xmm0

	addpd %xmm0, %xmm4
#endif

	movsd %xmm4, 0 * SIZE(C1)
	movhpd %xmm4, 1 * SIZE(C1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	addl %eax, BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, C1
	decl I
	jg .L51
	ALIGN_4
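
/* Scalar 1x1 tail: last row times last column; haddpd folds the two        */
/* partial sums before the alpha multiply.                                  */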
.L60:
	movl M, I
	testl $1, I
	jle .L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl B, BB
#else
	movl B, BB
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	addl %eax, AA
	addl %eax, BB
#endif

	movaps -16 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movaps -16 * SIZE(BB), %xmm2
	xorps %xmm5, %xmm5

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L65
	ALIGN_4

.L62:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	mulpd %xmm0, %xmm2
	movaps -14 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4
	movaps -14 * SIZE(BB), %xmm2

	mulpd %xmm0, %xmm2
	movaps -12 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5
	movaps -12 * SIZE(BB), %xmm2

	mulpd %xmm0, %xmm2
	movaps -10 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4
	movaps -10 * SIZE(BB), %xmm2

	mulpd %xmm0, %xmm2
	movaps -8 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm5
	movaps -8 * SIZE(BB), %xmm2

	subl $-8 * SIZE, AA
	subl $-8 * SIZE, BB
	subl $1, %eax
	jne .L62
	ALIGN_4

.L65:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $7, %eax
	BRANCH
	je .L68
	ALIGN_4

.L66:
	mulsd %xmm0, %xmm2
	movsd -15 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm4
	movsd -15 * SIZE(BB), %xmm2

	addl $1 * SIZE, AA
	addl $1 * SIZE, BB

	decl %eax
	jg .L66
	ALIGN_4

.L68:
	movddup ALPHA, %xmm3

	addpd %xmm5, %xmm4
	haddpd %xmm4, %xmm4

#ifndef TRMMKERNEL
	movsd 0 * SIZE(C1), %xmm0
#endif

	mulsd %xmm3, %xmm4

#ifndef TRMMKERNEL
	addsd %xmm0, %xmm4
#endif

	movsd %xmm4, 0 * SIZE(C1)
	ALIGN_4
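
/* Restore the callee-saved registers and the stack frame, then return.     */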
.L999:
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp

	addl $ARGS, %esp
	ret

	EPILOGUE