
zgemm3m_kernel_4x2_core2.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
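
/*
 * ZGEMM3M 4x2 kernel for Core2 (32-bit x86, SSE2/SSE3, double precision).
 * The inner loops do real multiply-accumulate on the packed A and B panels;
 * the store phase scales each accumulated value by the complex scalar
 * ALPHA = (alpha_r, alpha_i) and adds it to the interleaved complex C
 * (LDC is scaled by ZBASE_SHIFT, and each C entry occupies 2 * SIZE).
 */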
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	0

#define OLD_M		 4 + STACK + ARGS(%esi)
#define OLD_N		 8 + STACK + ARGS(%esi)
#define OLD_K		12 + STACK + ARGS(%esi)
#define OLD_ALPHA_R	16 + STACK + ARGS(%esi)
#define OLD_ALPHA_I	24 + STACK + ARGS(%esi)
#define OLD_A		32 + STACK + ARGS(%esi)
#define OLD_B		36 + STACK + ARGS(%esi)
#define OLD_C		40 + STACK + ARGS(%esi)
#define OLD_LDC		44 + STACK + ARGS(%esi)

#define ALPHA		 0(%esp)
#define K		16(%esp)
#define N		20(%esp)
#define M		24(%esp)
#define A		28(%esp)
#define C		32(%esp)
#define J		36(%esp)
#define BX		40(%esp)
#define OLD_STACK	44(%esp)
#define OFFSET		48(%esp)
#define KK		52(%esp)
#define KKK		56(%esp)
#define BUFFER		256(%esp)

#define PREFETCH_R  (8 * 16 + 0)
#define PREFETCH_W  (PREFETCH_R * 2)

#define PREFETCHSIZE (8 * 7 + 4)
#define PREFETCH     prefetcht0

#define AA  %edx
#define BB  %ecx
#define LDC %ebp
#define B   %edi
#define C1  %esi
#define I   %ebx

	PROLOGUE
	PROFCODE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	%esp, %esi	# save old stack
	subl	$512 + LOCAL_BUFFER_SIZE, %esp
	andl	$-4096, %esp	# align stack

	STACK_TOUCHING

	movl	OLD_M, %ebx
	movl	OLD_N, %eax
	movl	OLD_K, %ecx
	movl	OLD_A, %edx
	movsd	OLD_ALPHA_R, %xmm0
	movhps	OLD_ALPHA_I, %xmm0

	movl	%ebx, M
	movl	%eax, N
	movl	%ecx, K
	movl	%edx, A
	movl	%esi, OLD_STACK
	movl	OLD_B, B
	movl	OLD_C, %ebx
	movaps	%xmm0, ALPHA

	movl	%ebx, C
	movl	OLD_LDC, LDC

	subl	$-16 * SIZE, A
	subl	$-16 * SIZE, B

	sall	$ZBASE_SHIFT, LDC

	sarl	$1, %eax
	movl	%eax, J
	jle	.L40
	ALIGN_4
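
/*
 * Outer loop over pairs of B columns (J = N / 2).  Each pass first copies
 * the 2 * K values of the current B panel into BUFFER, broadcasting every
 * value into both halves of a 16-byte vector (movddup + movapd), so the
 * blocks below can use aligned two-wide loads from BB.
 */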
.L01:
	leal	16 * SIZE + BUFFER, BB

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	K, %eax
	sarl	$2, %eax
	jle	.L05
	ALIGN_4

.L02:
	movddup	-16 * SIZE(B), %xmm0
	movddup	-15 * SIZE(B), %xmm1
	movddup	-14 * SIZE(B), %xmm2
	movddup	-13 * SIZE(B), %xmm3
	movddup	-12 * SIZE(B), %xmm4
	movddup	-11 * SIZE(B), %xmm5
	movddup	-10 * SIZE(B), %xmm6
	movddup	 -9 * SIZE(B), %xmm7

	prefetcht0	(PREFETCH_R + 0) * SIZE(B)

	movapd	%xmm0, -16 * SIZE(BB)
	movapd	%xmm1, -14 * SIZE(BB)
	movapd	%xmm2, -12 * SIZE(BB)
	movapd	%xmm3, -10 * SIZE(BB)
	movapd	%xmm4,  -8 * SIZE(BB)
	movapd	%xmm5,  -6 * SIZE(BB)
	movapd	%xmm6,  -4 * SIZE(BB)
	movapd	%xmm7,  -2 * SIZE(BB)

	addl	$ 8 * SIZE, B
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L02
	ALIGN_4

.L05:
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L10
	ALIGN_4

.L06:
	movddup	-16 * SIZE(B), %xmm0
	movddup	-15 * SIZE(B), %xmm1

	movapd	%xmm0, -16 * SIZE(BB)
	movapd	%xmm1, -14 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$4 * SIZE, BB
	decl	%eax
	jne	.L06
	ALIGN_4

.L10:
	movl	B, BX

	movl	C, C1
	movl	A, AA
	movl	M, I
	sarl	$2, I
	jle	.L20
	ALIGN_4
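
/*
 * 4x2 block: xmm4/xmm6 accumulate rows 0-1 / 2-3 against the first B
 * column, xmm5/xmm7 against the second.  The k loop at .L12 is unrolled
 * eight times; .L16 handles the remaining k iterations.
 */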
.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	16 * SIZE + BUFFER, BB
#else
	leal	16 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */
#endif

	movapd	-16 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	-16 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 -8 * SIZE(AA), %xmm3
	pxor	%xmm6, %xmm6
	prefetcht0	3 * SIZE(C1)
	pxor	%xmm7, %xmm7
	prefetcht0	3 * SIZE(C1, LDC)
	movapd	%xmm1, %xmm2

	movl	BX, %eax
	prefetcht0	(%eax)
	subl	$-8 * SIZE, %eax
	movl	%eax, BX

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L15
	ALIGN_4

.L12:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	-14 * SIZE(BB), %xmm1
	mulpd	%xmm1, %xmm0
	addpd	%xmm0, %xmm5
	movapd	-14 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm6
	movapd	-12 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm1
	movapd	-12 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm7
	PADDING;
	movapd	%xmm2, %xmm1
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm4
	movapd	-10 * SIZE(BB), %xmm2
	mulpd	%xmm2, %xmm0
	addpd	%xmm0, %xmm5
	movapd	-10 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm6
	movapd	-8 * SIZE(BB), %xmm1
	mulpd	%xmm0, %xmm2
	PADDING;
	movapd	0 * SIZE(AA), %xmm0
	addpd	%xmm2, %xmm7
	PADDING;
	movapd	%xmm1, %xmm2
	mulpd	%xmm3, %xmm1
	addpd	%xmm1, %xmm4
	movapd	-6 * SIZE(BB), %xmm1
	mulpd	%xmm1, %xmm3
	addpd	%xmm3, %xmm5
	movapd	-6 * SIZE(AA), %xmm3
	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm6
	movapd	-4 * SIZE(BB), %xmm2
	mulpd	%xmm3, %xmm1
	movapd	-4 * SIZE(AA), %xmm3
	addpd	%xmm1, %xmm7
	PADDING;
	movapd	%xmm2, %xmm1
	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm4
	movapd	-2 * SIZE(BB), %xmm2
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm5
	movapd	-2 * SIZE(AA), %xmm3
	mulpd	%xmm3, %xmm1
	addpd	%xmm1, %xmm6
	PADDING;
	movapd	0 * SIZE(BB), %xmm1
	mulpd	%xmm3, %xmm2
	movapd	8 * SIZE(AA), %xmm3
	addpd	%xmm2, %xmm7
	PADDING;
	movapd	%xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	2 * SIZE(BB), %xmm1
	mulpd	%xmm1, %xmm0
	addpd	%xmm0, %xmm5
	movapd	2 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm6
	movapd	4 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm1
	movapd	4 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm7
	PADDING;
	movapd	%xmm2, %xmm1
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm4
	movapd	6 * SIZE(BB), %xmm2
	mulpd	%xmm2, %xmm0
	addpd	%xmm0, %xmm5
	movapd	6 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm6
	movapd	8 * SIZE(BB), %xmm1
	mulpd	%xmm0, %xmm2
	movapd	16 * SIZE(AA), %xmm0
	addpd	%xmm2, %xmm7
	PADDING;
	movapd	%xmm1, %xmm2
	mulpd	%xmm3, %xmm1
	addpd	%xmm1, %xmm4
	movapd	10 * SIZE(BB), %xmm1
	mulpd	%xmm1, %xmm3
	addpd	%xmm3, %xmm5
	movapd	10 * SIZE(AA), %xmm3
	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm6
	movapd	12 * SIZE(BB), %xmm2
	mulpd	%xmm3, %xmm1
	movapd	12 * SIZE(AA), %xmm3
	addpd	%xmm1, %xmm7
	PADDING;
	movapd	%xmm2, %xmm1
	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm4
	movapd	14 * SIZE(BB), %xmm2
	mulpd	%xmm2, %xmm3
	subl	$-32 * SIZE, BB
	addpd	%xmm3, %xmm5
	movapd	14 * SIZE(AA), %xmm3
	mulpd	%xmm3, %xmm1
	addpd	%xmm1, %xmm6
	movapd	-16 * SIZE(BB), %xmm1
	mulpd	%xmm3, %xmm2
	movapd	24 * SIZE(AA), %xmm3
	addpd	%xmm2, %xmm7
	PADDING;
	movapd	%xmm1, %xmm2

	subl	$-32 * SIZE, AA
	decl	%eax
	BRANCH
	jne	.L12
	ALIGN_4

.L15:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA, %xmm3
	andl	$7, %eax
	BRANCH
	je	.L18
	ALIGN_4

.L16:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	-14 * SIZE(BB), %xmm1
	mulpd	%xmm1, %xmm0
	addpd	%xmm0, %xmm5
	movapd	-14 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm1
	movapd	-12 * SIZE(AA), %xmm0
	addpd	%xmm2, %xmm6
	addpd	%xmm1, %xmm7
	movapd	-12 * SIZE(BB), %xmm1
	movapd	%xmm1, %xmm2

	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L16
	ALIGN_4
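
/*
 * Store phase: ALPHA holds (alpha_r, alpha_i).  Each accumulated real value
 * is duplicated into both lanes (pshufd $0x44 for the low element, unpckhpd
 * for the high one), multiplied by ALPHA, and added to the corresponding
 * (re, im) pair of C in each of the two columns.
 */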
.L18:
	movsd	0 * SIZE(%esi), %xmm0
	movhps	1 * SIZE(%esi), %xmm0
	movsd	2 * SIZE(%esi), %xmm1
	movhps	3 * SIZE(%esi), %xmm1

	pshufd	$0x44, %xmm4, %xmm2
	unpckhpd %xmm4, %xmm4

	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm1

	movlps	%xmm0, 0 * SIZE(%esi)
	movhps	%xmm0, 1 * SIZE(%esi)
	movlps	%xmm1, 2 * SIZE(%esi)
	movhps	%xmm1, 3 * SIZE(%esi)

	movsd	4 * SIZE(%esi), %xmm0
	movhps	5 * SIZE(%esi), %xmm0
	movsd	6 * SIZE(%esi), %xmm1
	movhps	7 * SIZE(%esi), %xmm1

	pshufd	$0x44, %xmm6, %xmm2
	unpckhpd %xmm6, %xmm6

	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm3, %xmm6
	addpd	%xmm6, %xmm1

	movlps	%xmm0, 4 * SIZE(%esi)
	movhps	%xmm0, 5 * SIZE(%esi)
	movlps	%xmm1, 6 * SIZE(%esi)
	movhps	%xmm1, 7 * SIZE(%esi)

	movsd	0 * SIZE(%esi, LDC), %xmm0
	movhps	1 * SIZE(%esi, LDC), %xmm0
	movsd	2 * SIZE(%esi, LDC), %xmm1
	movhps	3 * SIZE(%esi, LDC), %xmm1

	pshufd	$0x44, %xmm5, %xmm2
	unpckhpd %xmm5, %xmm5

	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	movlps	%xmm0, 0 * SIZE(%esi, LDC)
	movhps	%xmm0, 1 * SIZE(%esi, LDC)
	movlps	%xmm1, 2 * SIZE(%esi, LDC)
	movhps	%xmm1, 3 * SIZE(%esi, LDC)

	movsd	4 * SIZE(%esi, LDC), %xmm0
	movhps	5 * SIZE(%esi, LDC), %xmm0
	movsd	6 * SIZE(%esi, LDC), %xmm1
	movhps	7 * SIZE(%esi, LDC), %xmm1

	pshufd	$0x44, %xmm7, %xmm2
	unpckhpd %xmm7, %xmm7

	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm3, %xmm7
	addpd	%xmm7, %xmm1

	movlps	%xmm0, 4 * SIZE(%esi, LDC)
	movhps	%xmm0, 5 * SIZE(%esi, LDC)
	movlps	%xmm1, 6 * SIZE(%esi, LDC)
	movhps	%xmm1, 7 * SIZE(%esi, LDC)

	addl	$8 * SIZE, C1
	decl	I
	jg	.L11
	ALIGN_4
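
/*
 * Remaining two rows of the column pair (M & 2): same pattern with a 2x2
 * block; the alternating partial sums in xmm4-xmm7 are folded at .L28.
 */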
.L20:
	movl	M, I
	testl	$2, I
	jle	.L30

.L21:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	16 * SIZE + BUFFER, BB
#else
	leal	16 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */
#endif

	movapd	-16 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	-16 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 -8 * SIZE(AA), %xmm2
	pxor	%xmm6, %xmm6
	movapd	 -8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$2, %eax
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L25
	ALIGN_4

.L22:
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BB), %xmm0
	addpd	%xmm1, %xmm4
	movapd	-12 * SIZE(BB), %xmm1
	addpd	%xmm0, %xmm5
	movapd	-14 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BB), %xmm0
	addpd	%xmm1, %xmm6
	movapd	0 * SIZE(BB), %xmm1
	addpd	%xmm0, %xmm7
	movapd	-12 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	-6 * SIZE(BB), %xmm0
	addpd	%xmm3, %xmm4
	movapd	-4 * SIZE(BB), %xmm3
	addpd	%xmm0, %xmm5
	movapd	-10 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	-2 * SIZE(BB), %xmm0
	addpd	%xmm3, %xmm6
	movapd	8 * SIZE(BB), %xmm3
	addpd	%xmm0, %xmm7
	movapd	0 * SIZE(AA), %xmm0
	mulpd	%xmm2, %xmm1
	mulpd	2 * SIZE(BB), %xmm2
	addpd	%xmm1, %xmm4
	movapd	4 * SIZE(BB), %xmm1
	addpd	%xmm2, %xmm5
	movapd	-6 * SIZE(AA), %xmm2
	mulpd	%xmm2, %xmm1
	mulpd	6 * SIZE(BB), %xmm2
	addpd	%xmm1, %xmm6
	movapd	16 * SIZE(BB), %xmm1
	addpd	%xmm2, %xmm7
	movapd	-4 * SIZE(AA), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	10 * SIZE(BB), %xmm2
	addpd	%xmm3, %xmm4
	movapd	12 * SIZE(BB), %xmm3
	addpd	%xmm2, %xmm5
	movapd	-2 * SIZE(AA), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	14 * SIZE(BB), %xmm2
	addpd	%xmm3, %xmm6
	movapd	24 * SIZE(BB), %xmm3
	addpd	%xmm2, %xmm7
	movapd	8 * SIZE(AA), %xmm2

	subl	$-16 * SIZE, AA
	addl	$ 32 * SIZE, BB
	decl	%eax
	jne	.L22
	ALIGN_4

.L25:
	movaps	ALPHA, %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je	.L28
	ALIGN_4

.L26:
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BB), %xmm0
	addpd	%xmm1, %xmm4
	movapd	-12 * SIZE(BB), %xmm1
	addpd	%xmm0, %xmm5
	movapd	-14 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L26
	ALIGN_4

.L28:
	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

	movsd	0 * SIZE(%esi), %xmm0
	movhps	1 * SIZE(%esi), %xmm0
	movsd	2 * SIZE(%esi), %xmm1
	movhps	3 * SIZE(%esi), %xmm1

	pshufd	$0x44, %xmm4, %xmm2
	unpckhpd %xmm4, %xmm4

	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm1

	movlps	%xmm0, 0 * SIZE(%esi)
	movhps	%xmm0, 1 * SIZE(%esi)
	movlps	%xmm1, 2 * SIZE(%esi)
	movhps	%xmm1, 3 * SIZE(%esi)

	movsd	0 * SIZE(%esi, LDC), %xmm0
	movhps	1 * SIZE(%esi, LDC), %xmm0
	movsd	2 * SIZE(%esi, LDC), %xmm1
	movhps	3 * SIZE(%esi, LDC), %xmm1

	pshufd	$0x44, %xmm5, %xmm2
	unpckhpd %xmm5, %xmm5

	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	movlps	%xmm0, 0 * SIZE(%esi, LDC)
	movhps	%xmm0, 1 * SIZE(%esi, LDC)
	movlps	%xmm1, 2 * SIZE(%esi, LDC)
	movhps	%xmm1, 3 * SIZE(%esi, LDC)

	addl	$4 * SIZE, C1
	ALIGN_4
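
/*
 * Last row of the column pair (M & 1): scalar 1x2 block using mulsd/addsd;
 * partial sums are folded at .L38 before the complex update.
 */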
.L30:
	movl	M, I
	testl	$1, I
	jle	.L39

.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	16 * SIZE + BUFFER, BB
#else
	leal	16 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */
#endif

	movsd	-16 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movsd	-16 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movsd	-12 * SIZE(AA), %xmm2
	pxor	%xmm6, %xmm6
	movsd	 -8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L35
	ALIGN_4

.L32:
	mulsd	%xmm0, %xmm1
	mulsd	-14 * SIZE(BB), %xmm0
	addsd	%xmm1, %xmm4
	movsd	-12 * SIZE(BB), %xmm1
	addsd	%xmm0, %xmm5
	movsd	-15 * SIZE(AA), %xmm0
	mulsd	%xmm0, %xmm1
	mulsd	-10 * SIZE(BB), %xmm0
	addsd	%xmm1, %xmm6
	movsd	0 * SIZE(BB), %xmm1
	addsd	%xmm0, %xmm7
	movsd	-14 * SIZE(AA), %xmm0
	mulsd	%xmm0, %xmm3
	mulsd	-6 * SIZE(BB), %xmm0
	addsd	%xmm3, %xmm4
	movsd	-4 * SIZE(BB), %xmm3
	addsd	%xmm0, %xmm5
	movsd	-13 * SIZE(AA), %xmm0
	mulsd	%xmm0, %xmm3
	mulsd	-2 * SIZE(BB), %xmm0
	addsd	%xmm3, %xmm6
	movsd	8 * SIZE(BB), %xmm3
	addsd	%xmm0, %xmm7
	movsd	-8 * SIZE(AA), %xmm0
	mulsd	%xmm2, %xmm1
	mulsd	2 * SIZE(BB), %xmm2
	addsd	%xmm1, %xmm4
	movsd	4 * SIZE(BB), %xmm1
	addsd	%xmm2, %xmm5
	movsd	-11 * SIZE(AA), %xmm2
	mulsd	%xmm2, %xmm1
	mulsd	6 * SIZE(BB), %xmm2
	addsd	%xmm1, %xmm6
	movsd	16 * SIZE(BB), %xmm1
	addsd	%xmm2, %xmm7
	movsd	-10 * SIZE(AA), %xmm2
	mulsd	%xmm2, %xmm3
	mulsd	10 * SIZE(BB), %xmm2
	addsd	%xmm3, %xmm4
	movsd	12 * SIZE(BB), %xmm3
	addsd	%xmm2, %xmm5
	movsd	-9 * SIZE(AA), %xmm2
	mulsd	%xmm2, %xmm3
	mulsd	14 * SIZE(BB), %xmm2
	addsd	%xmm3, %xmm6
	movsd	24 * SIZE(BB), %xmm3
	addsd	%xmm2, %xmm7
	movsd	-4 * SIZE(AA), %xmm2

	subl	$-8 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L32
	ALIGN_4

.L35:
	movaps	ALPHA, %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je	.L38
	ALIGN_4

.L36:
	mulsd	%xmm0, %xmm1
	mulsd	-14 * SIZE(BB), %xmm0
	addsd	%xmm1, %xmm4
	movsd	-12 * SIZE(BB), %xmm1
	addsd	%xmm0, %xmm5
	movsd	-15 * SIZE(AA), %xmm0

	addl	$1 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L36
	ALIGN_4

.L38:
	addsd	%xmm6, %xmm4
	addsd	%xmm7, %xmm5

	movsd	0 * SIZE(%esi), %xmm0
	movhps	1 * SIZE(%esi), %xmm0
	movsd	0 * SIZE(%esi, LDC), %xmm1
	movhps	1 * SIZE(%esi, LDC), %xmm1

	unpcklpd %xmm4, %xmm4
	unpcklpd %xmm5, %xmm5

	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	movlps	%xmm0, 0 * SIZE(%esi)
	movhps	%xmm0, 1 * SIZE(%esi)
	movlps	%xmm1, 0 * SIZE(%esi, LDC)
	movhps	%xmm1, 1 * SIZE(%esi, LDC)
	ALIGN_4

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	leal	(, LDC, 2), %eax
	addl	%eax, C
	decl	J
	jg	.L01
	ALIGN_4
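
/*
 * Tail column when N is odd: repack B (one value per k, again broadcast
 * into a two-wide vector) and run 4x1, 2x1 and 1x1 blocks over M.
 */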
.L40:
	movl	N, %eax
	testl	$1, %eax
	jle	.L999
	ALIGN_4

.L41:
	leal	16 * SIZE + BUFFER, BB

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	K, %eax
	sarl	$3, %eax
	jle	.L45
	ALIGN_4

.L42:
	movddup	-16 * SIZE(B), %xmm0
	movddup	-15 * SIZE(B), %xmm1
	movddup	-14 * SIZE(B), %xmm2
	movddup	-13 * SIZE(B), %xmm3
	movddup	-12 * SIZE(B), %xmm4
	movddup	-11 * SIZE(B), %xmm5
	movddup	-10 * SIZE(B), %xmm6
	movddup	 -9 * SIZE(B), %xmm7

	movapd	%xmm0, -16 * SIZE(BB)
	movapd	%xmm1, -14 * SIZE(BB)
	movapd	%xmm2, -12 * SIZE(BB)
	movapd	%xmm3, -10 * SIZE(BB)
	movapd	%xmm4,  -8 * SIZE(BB)
	movapd	%xmm5,  -6 * SIZE(BB)
	movapd	%xmm6,  -4 * SIZE(BB)
	movapd	%xmm7,  -2 * SIZE(BB)

	addl	$ 8 * SIZE, B
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L42
	ALIGN_4

.L45:
	movl	K, %eax
	andl	$7, %eax
	BRANCH
	jle	.L50
	ALIGN_4

.L46:
	movddup	-16 * SIZE(B), %xmm0

	movapd	%xmm0, -16 * SIZE(BB)

	addl	$1 * SIZE, B
	addl	$2 * SIZE, BB
	decl	%eax
	jne	.L46
	ALIGN_4

.L50:
	movl	C, C1
	movl	A, AA
	movl	M, I
	sarl	$2, I
	jle	.L60
	ALIGN_4
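
/*
 * 4x1 block: xmm4/xmm5 accumulate rows 0-1 and xmm6/xmm7 rows 2-3 as
 * alternating partial sums; they are folded together at .L58.
 */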
.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	16 * SIZE + BUFFER, BB
#else
	leal	16 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 2), BB
#endif

	movapd	-16 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	-16 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 -8 * SIZE(AA), %xmm2
	pxor	%xmm6, %xmm6
	movapd	 -8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

	prefetcht0	3 * SIZE(C1)

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L55
	ALIGN_4

.L52:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AA), %xmm1
	addpd	%xmm0, %xmm4
	movapd	-12 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm6
	movapd	-14 * SIZE(BB), %xmm1
	mulpd	%xmm1, %xmm0
	mulpd	-10 * SIZE(AA), %xmm1
	addpd	%xmm0, %xmm5
	movapd	0 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm7
	movapd	-12 * SIZE(BB), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	-6 * SIZE(AA), %xmm1
	addpd	%xmm2, %xmm4
	movapd	-4 * SIZE(AA), %xmm2
	addpd	%xmm1, %xmm6
	movapd	-10 * SIZE(BB), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	-2 * SIZE(AA), %xmm1
	addpd	%xmm2, %xmm5
	movapd	8 * SIZE(AA), %xmm2
	addpd	%xmm1, %xmm7
	movapd	0 * SIZE(BB), %xmm1
	mulpd	%xmm3, %xmm0
	mulpd	2 * SIZE(AA), %xmm3
	addpd	%xmm0, %xmm4
	movapd	4 * SIZE(AA), %xmm0
	addpd	%xmm3, %xmm6
	movapd	-6 * SIZE(BB), %xmm3
	mulpd	%xmm3, %xmm0
	mulpd	6 * SIZE(AA), %xmm3
	addpd	%xmm0, %xmm5
	movapd	16 * SIZE(AA), %xmm0
	addpd	%xmm3, %xmm7
	movapd	-4 * SIZE(BB), %xmm3
	mulpd	%xmm3, %xmm2
	mulpd	10 * SIZE(AA), %xmm3
	addpd	%xmm2, %xmm4
	movapd	12 * SIZE(AA), %xmm2
	addpd	%xmm3, %xmm6
	movapd	-2 * SIZE(BB), %xmm3
	mulpd	%xmm3, %xmm2
	mulpd	14 * SIZE(AA), %xmm3
	addpd	%xmm2, %xmm5
	movapd	24 * SIZE(AA), %xmm2
	addpd	%xmm3, %xmm7
	movapd	8 * SIZE(BB), %xmm3

	addl	$ 32 * SIZE, AA
	subl	$-16 * SIZE, BB
	decl	%eax
	jne	.L52
	ALIGN_4

.L55:
	movaps	ALPHA, %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je	.L58
	ALIGN_4

.L56:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AA), %xmm1
	addpd	%xmm0, %xmm4
	movapd	-12 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm6
	movapd	-14 * SIZE(BB), %xmm1

	addl	$4 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L56
	ALIGN_4

.L58:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6

	movsd	0 * SIZE(%esi), %xmm0
	movhps	1 * SIZE(%esi), %xmm0
	movsd	2 * SIZE(%esi), %xmm1
	movhps	3 * SIZE(%esi), %xmm1

	pshufd	$0x44, %xmm4, %xmm2
	unpckhpd %xmm4, %xmm4

	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm1

	movlps	%xmm0, 0 * SIZE(%esi)
	movhps	%xmm0, 1 * SIZE(%esi)
	movlps	%xmm1, 2 * SIZE(%esi)
	movhps	%xmm1, 3 * SIZE(%esi)

	movsd	4 * SIZE(%esi), %xmm0
	movhps	5 * SIZE(%esi), %xmm0
	movsd	6 * SIZE(%esi), %xmm1
	movhps	7 * SIZE(%esi), %xmm1

	pshufd	$0x44, %xmm6, %xmm2
	unpckhpd %xmm6, %xmm6

	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm3, %xmm6
	addpd	%xmm6, %xmm1

	movlps	%xmm0, 4 * SIZE(%esi)
	movhps	%xmm0, 5 * SIZE(%esi)
	movlps	%xmm1, 6 * SIZE(%esi)
	movhps	%xmm1, 7 * SIZE(%esi)

	addl	$8 * SIZE, C1
	decl	I
	jg	.L51
	ALIGN_4
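
/*
 * 2x1 block (M & 2): two alternating partial sums in xmm4/xmm5, folded
 * together at .L68.
 */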
.L60:
	movl	M, I
	testl	$2, I
	jle	.L70

.L61:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	16 * SIZE + BUFFER, BB
#else
	leal	16 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif

	movapd	-16 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	-16 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 -8 * SIZE(AA), %xmm2
	movapd	 -8 * SIZE(BB), %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L65
	ALIGN_4

.L62:
	mulpd	%xmm0, %xmm1
	movapd	-14 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm4
	movapd	-14 * SIZE(BB), %xmm1
	mulpd	%xmm0, %xmm1
	movapd	-12 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm5
	movapd	-12 * SIZE(BB), %xmm1
	mulpd	%xmm0, %xmm1
	movapd	-10 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm4
	movapd	-10 * SIZE(BB), %xmm1
	mulpd	%xmm0, %xmm1
	movapd	0 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm5
	movapd	0 * SIZE(BB), %xmm1
	mulpd	%xmm2, %xmm3
	movapd	-6 * SIZE(AA), %xmm2
	addpd	%xmm3, %xmm4
	movapd	-6 * SIZE(BB), %xmm3
	mulpd	%xmm2, %xmm3
	movapd	-4 * SIZE(AA), %xmm2
	addpd	%xmm3, %xmm5
	movapd	-4 * SIZE(BB), %xmm3
	mulpd	%xmm2, %xmm3
	movapd	-2 * SIZE(AA), %xmm2
	addpd	%xmm3, %xmm4
	movapd	-2 * SIZE(BB), %xmm3
	mulpd	%xmm2, %xmm3
	movapd	8 * SIZE(AA), %xmm2
	addpd	%xmm3, %xmm5
	movapd	8 * SIZE(BB), %xmm3

	subl	$-16 * SIZE, AA
	subl	$-16 * SIZE, BB
	decl	%eax
	jne	.L62
	ALIGN_4

.L65:
	movaps	ALPHA, %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je	.L68
	ALIGN_4

.L66:
	mulpd	%xmm0, %xmm1
	movapd	-14 * SIZE(AA), %xmm0
	addpd	%xmm1, %xmm4
	movapd	-14 * SIZE(BB), %xmm1

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L66
	ALIGN_4

.L68:
	addpd	%xmm5, %xmm4

	movsd	0 * SIZE(%esi), %xmm0
	movhps	1 * SIZE(%esi), %xmm0
	movsd	2 * SIZE(%esi), %xmm1
	movhps	3 * SIZE(%esi), %xmm1

	pshufd	$0x44, %xmm4, %xmm2
	unpckhpd %xmm4, %xmm4

	mulpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm1

	movlps	%xmm0, 0 * SIZE(%esi)
	movhps	%xmm0, 1 * SIZE(%esi)
	movlps	%xmm1, 2 * SIZE(%esi)
	movhps	%xmm1, 3 * SIZE(%esi)

	addl	$4 * SIZE, C1
	ALIGN_4
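
/*
 * 1x1 block (M & 1): scalar dot product accumulated in xmm4/xmm5, folded
 * at .L78 before the complex update of a single C entry.
 */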
.L70:
	movl	M, I
	testl	$1, I
	jle	.L79

.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	16 * SIZE + BUFFER, BB
#else
	leal	16 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif

	movsd	-16 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movsd	-16 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movsd	 -8 * SIZE(BB), %xmm3
	movsd	-12 * SIZE(AA), %xmm2

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$1, %eax
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L75
	ALIGN_4

.L72:
	mulsd	%xmm0, %xmm1
	movsd	-15 * SIZE(AA), %xmm0
	addsd	%xmm1, %xmm4
	movsd	-14 * SIZE(BB), %xmm1
	mulsd	%xmm0, %xmm1
	movsd	-14 * SIZE(AA), %xmm0
	addsd	%xmm1, %xmm5
	movsd	-12 * SIZE(BB), %xmm1
	mulsd	%xmm0, %xmm1
	movsd	-13 * SIZE(AA), %xmm0
	addsd	%xmm1, %xmm4
	movsd	-10 * SIZE(BB), %xmm1
	mulsd	%xmm0, %xmm1
	movsd	-8 * SIZE(AA), %xmm0
	addsd	%xmm1, %xmm5
	movsd	-0 * SIZE(BB), %xmm1
	mulsd	%xmm2, %xmm3
	movsd	-11 * SIZE(AA), %xmm2
	addsd	%xmm3, %xmm4
	movsd	-6 * SIZE(BB), %xmm3
	mulsd	%xmm2, %xmm3
	movsd	-10 * SIZE(AA), %xmm2
	addsd	%xmm3, %xmm5
	movsd	-4 * SIZE(BB), %xmm3
	mulsd	%xmm2, %xmm3
	movsd	-9 * SIZE(AA), %xmm2
	addsd	%xmm3, %xmm4
	movsd	-2 * SIZE(BB), %xmm3
	mulsd	%xmm2, %xmm3
	movsd	-4 * SIZE(AA), %xmm2
	addsd	%xmm3, %xmm5
	movsd	8 * SIZE(BB), %xmm3

	subl	$ -8 * SIZE, AA
	subl	$-16 * SIZE, BB
	decl	%eax
	jne	.L72
	ALIGN_4

.L75:
	movaps	ALPHA, %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je	.L78
	ALIGN_4

.L76:
	mulsd	%xmm0, %xmm1
	movsd	-15 * SIZE(AA), %xmm0
	addsd	%xmm1, %xmm4
	movsd	-14 * SIZE(BB), %xmm1

	addl	$1 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4

.L78:
	addsd	%xmm5, %xmm4

	movsd	0 * SIZE(%esi), %xmm0
	movhps	1 * SIZE(%esi), %xmm0

	unpcklpd %xmm4, %xmm4

	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm0

	movlps	%xmm0, 0 * SIZE(%esi)
	movhps	%xmm0, 1 * SIZE(%esi)
	ALIGN_4

.L79:
	addl	LDC, C
	ALIGN_4
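
/*
 * Epilogue: restore the caller's stack pointer and the callee-saved
 * registers pushed in the prologue.
 */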
.L999:
	movl	OLD_STACK, %esp

	EMMS

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
	EPILOGUE