You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

gemm_kernel_4x2_atom.S 26 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* -------------------------------------------------------------------
   * Scalar-SSE2 GEMM/TRMM kernel, AT&T syntax, x86-64 (4x2 blocking;
   * filename suggests Intel Atom tuning -- TODO confirm).
   * Computes C += alpha * A * B a 4x2 tile at a time using only scalar
   * movsd/mulsd/addsd.  SysV and Windows ABIs both supported via
   * WINDOWS_ABI.  SIZE/BASE_SHIFT come from "common.h" (presumably 8 and
   * 3 for double precision -- verify against build).
   * ------------------------------------------------------------------- */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register role map (symbolic names used throughout):
   *   M, N, K      -- matrix dimensions (incoming args on SysV)
   *   A, B, C, LDC -- operand pointers and C leading dimension
   *   I, J         -- row / column loop counters
   *   AO, BO       -- current packed-A / packed-B panel pointers
   *   CO1, CO2     -- current C column pointers (two columns at a time)
   *   BB           -- B prefetch pointer                                  */
  40. #define M %rdi
  41. #define N %rsi
  42. #define K %rdx
  43. #define A %rcx
  44. #define B %r8
  45. #define C %r9
  46. #define LDC %r10
  47. #define I %r11
  48. #define J %r12
  49. #define AO %r13
  50. #define BO %r14
  51. #define CO1 %r15
  52. #define CO2 %rbx
  53. #define BB %rbp
  /* Stack frame layout.  OLD_* are caller-passed values above the frame;
   * ALPHA/OFFSET/KK/KKK are locals inside the frame.  The Windows frame
   * is larger because xmm6-xmm15 must also be preserved.                  */
  54. #ifndef WINDOWS_ABI
  55. #define STACKSIZE 128
  56. #define OLD_LDC 8 + STACKSIZE(%rsp)
  57. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  58. #define ALPHA 48(%rsp)
  59. #define OFFSET 56(%rsp)
  60. #define KKK 64(%rsp)
  61. #define KK 72(%rsp)
  62. #else
  63. #define STACKSIZE 256
  64. #define OLD_A 40 + STACKSIZE(%rsp)
  65. #define OLD_B 48 + STACKSIZE(%rsp)
  66. #define OLD_C 56 + STACKSIZE(%rsp)
  67. #define OLD_LDC 64 + STACKSIZE(%rsp)
  68. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  69. #define ALPHA 224(%rsp)
  70. #define OFFSET 232(%rsp)
  71. #define KK 240(%rsp)
  72. #define KKK 248(%rsp)
  73. #endif
  /* Software prefetch: distance is in elements of A ahead of the cursor.  */
  74. #define PREFETCH prefetcht0
  75. #define PREFETCHSIZE (8 * 8 + 3)
  /* Function entry: allocate frame, save callee-saved GPRs (rbx, rbp,
   * r12-r15 per SysV; Windows additionally rdi, rsi, xmm6-xmm15),
   * then load the arguments into their symbolic registers.             */
  76. PROLOGUE
  77. PROFCODE
  78. subq $STACKSIZE, %rsp
  79. movq %rbx, 0(%rsp)
  80. movq %rbp, 8(%rsp)
  81. movq %r12, 16(%rsp)
  82. movq %r13, 24(%rsp)
  83. movq %r14, 32(%rsp)
  84. movq %r15, 40(%rsp)
  85. #ifdef WINDOWS_ABI
  /* Windows: rdi/rsi and xmm6-xmm15 are callee-saved; spill them too.  */
  86. movq %rdi, 48(%rsp)
  87. movq %rsi, 56(%rsp)
  88. movups %xmm6, 64(%rsp)
  89. movups %xmm7, 80(%rsp)
  90. movups %xmm8, 96(%rsp)
  91. movups %xmm9, 112(%rsp)
  92. movups %xmm10, 128(%rsp)
  93. movups %xmm11, 144(%rsp)
  94. movups %xmm12, 160(%rsp)
  95. movups %xmm13, 176(%rsp)
  96. movups %xmm14, 192(%rsp)
  97. movups %xmm15, 208(%rsp)
  /* Windows calling convention: first args in ARG1..ARG3 registers,
   * remaining args (A, B, C, LDC, OFFSET) on the caller's stack.       */
  98. movq ARG1, M
  99. movq ARG2, N
  100. movq ARG3, K
  101. movq OLD_A, A
  102. movq OLD_B, B
  103. movq OLD_C, C
  104. movq OLD_LDC, LDC
  105. #ifdef TRMMKERNEL
  106. movsd OLD_OFFSET, %xmm4
  107. #endif
  /* On Windows alpha arrives in xmm3; normalize it into xmm0.          */
  108. movaps %xmm3, %xmm0
  109. #else
  /* SysV: register args already match the M/N/K/A/B/C macros; only LDC
   * (7th arg) and OFFSET live on the stack.                            */
  110. movq OLD_LDC, LDC
  111. #ifdef TRMMKERNEL
  112. movsd OLD_OFFSET, %xmm4
  113. #endif
  114. #endif
  /* Stash alpha in the frame; it is reloaded at each write-back.       */
  115. movsd %xmm0, ALPHA
  116. #ifdef TRMMKERNEL
  117. movsd %xmm4, OFFSET
  118. movsd %xmm4, KK
  119. #ifndef LEFT
  /* Right-side TRMM starts with KK = -OFFSET (64-bit negate).          */
  120. negq KK
  121. #endif
  122. #endif
  /* Convert LDC from elements to bytes.                                */
  123. leaq (, LDC, SIZE), LDC
  /* J = N / 2: outer loop handles two C columns per iteration.         */
  124. movq N, J
  125. sarq $1, J
  126. jle .L40
  127. ALIGN_4
  /* .L10: top of the two-column (N) loop.  Sets CO1/CO2 to the two
   * current C columns, advances C, resets AO, and points BB one 2-wide
   * B panel ahead for prefetching.                                      */
  128. .L10:
  129. #if defined(TRMMKERNEL) && defined(LEFT)
  /* Left-side TRMM: restart KK at OFFSET for each column block.         */
  130. movq OFFSET, %rax
  131. movq %rax, KK
  132. #endif
  133. movq C, CO1
  134. leaq (C, LDC, 1), CO2
  135. leaq (C, LDC, 2), C
  136. movq A, AO
  /* BB = B + K * 2 elements: start of the next B panel, for prefetch.   */
  137. movq K, %rax
  138. salq $BASE_SHIFT + 1, %rax
  139. leaq (B, %rax), BB
  /* I = M / 4: handle four rows of C per iteration of .L11.             */
  140. movq M, I
  141. sarq $2, I
  142. jle .L20
  143. ALIGN_4
  /* .L11: start one 4x2 tile.  For TRMM the else-branch skips AO/BO to
   * the diagonal offset KK; otherwise BO restarts at B.                 */
  144. .L11:
  145. #if !defined(TRMMKERNEL) || \
  146. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  147. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  148. movq B, BO
  149. #else
  150. movq KK, %rax
  151. leaq (, %rax, SIZE), %rax
  152. leaq (AO, %rax, 4), AO
  153. leaq (B, %rax, 2), BO
  154. #endif
  155. prefetcht0 0 * SIZE(BB)
  156. subq $-8 * SIZE, BB
  /* Preload the first A/B operands and zero all sixteen accumulator /
   * scratch registers.  xmm8..xmm15 accumulate the 4x2 tile; xmm2, xmm6,
   * xmm7 carry in-flight products across loop iterations.               */
  157. movsd 0 * SIZE(AO), %xmm0
  158. xorps %xmm2, %xmm2
  159. movsd 1 * SIZE(AO), %xmm4
  160. xorps %xmm5, %xmm5
  161. movsd 2 * SIZE(AO), %xmm5
  162. xorps %xmm6, %xmm6
  163. xorps %xmm7, %xmm7
  164. movsd 0 * SIZE(BO), %xmm1
  165. xorps %xmm8, %xmm8
  166. xorps %xmm9, %xmm9
  167. movsd 1 * SIZE(BO), %xmm3
  168. xorps %xmm10, %xmm10
  169. xorps %xmm11, %xmm11
  170. prefetcht0 3 * SIZE(CO1)
  171. xorps %xmm12, %xmm12
  172. xorps %xmm13, %xmm13
  173. prefetcht0 3 * SIZE(CO2)
  174. xorps %xmm14, %xmm14
  175. xorps %xmm15, %xmm15
  /* Select the inner-loop trip count: K for plain GEMM, or the TRMM
   * partial length (K-KK, or KK plus the tile edge), saved in KKK.      */
  176. #ifndef TRMMKERNEL
  177. movq K, %rax
  178. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  179. movq K, %rax
  180. subq KK, %rax
  181. movq %rax, KKK
  182. #else
  183. movq KK, %rax
  184. #ifdef LEFT
  185. addq $4, %rax
  186. #else
  187. addq $2, %rax
  188. #endif
  189. movq %rax, KKK
  190. #endif
  /* rax = trip count / 4 (loop is unrolled by four k-steps).            */
  191. sarq $2, %rax
  192. je .L15
  193. ALIGN_4
  /* .L12: 4x2 inner product loop, unrolled by 4 k-iterations (16 A
   * elements, 8 B elements per pass).  Software-pipelined for in-order
   * scalar execution: each addsd consumes a product started earlier,
   * and loads for the next step are interleaved with the multiplies.
   * Register roles here: xmm0/xmm4/xmm5/xmm7 = A values, xmm1/xmm3 =
   * the two B values, xmm2/xmm6 = product staging, xmm8..xmm15 = the
   * 4x2 accumulator tile.  The pending products in xmm2/xmm6/xmm7 are
   * folded in by .L16/.L19 after the loop.  Do not reorder.             */
  194. .L12:
  195. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  196. addsd %xmm2, %xmm13
  197. movaps %xmm0, %xmm2
  198. mulsd %xmm1, %xmm0
  199. addsd %xmm7, %xmm14
  200. movsd 3 * SIZE(AO), %xmm7
  201. mulsd %xmm3, %xmm2
  202. addsd %xmm6, %xmm15
  203. movaps %xmm4, %xmm6
  204. mulsd %xmm1, %xmm4
  205. addsd %xmm0, %xmm8
  206. movsd 4 * SIZE(AO), %xmm0
  207. mulsd %xmm3, %xmm6
  208. addsd %xmm2, %xmm9
  209. movaps %xmm5, %xmm2
  210. mulsd %xmm1, %xmm5
  211. addsd %xmm4, %xmm10
  212. movsd 5 * SIZE(AO), %xmm4
  213. mulsd %xmm3, %xmm2
  214. addsd %xmm6, %xmm11
  215. movaps %xmm7, %xmm6
  216. mulsd %xmm1, %xmm7
  217. movsd 2 * SIZE(BO), %xmm1
  218. addsd %xmm5, %xmm12
  219. movsd 6 * SIZE(AO), %xmm5
  220. mulsd %xmm3, %xmm6
  221. movsd 3 * SIZE(BO), %xmm3
  /* --- k-step 2 of 4 --- */
  222. addsd %xmm2, %xmm13
  223. movaps %xmm0, %xmm2
  224. mulsd %xmm1, %xmm0
  225. addsd %xmm7, %xmm14
  226. movsd 7 * SIZE(AO), %xmm7
  227. mulsd %xmm3, %xmm2
  228. addsd %xmm6, %xmm15
  229. movaps %xmm4, %xmm6
  230. mulsd %xmm1, %xmm4
  231. addsd %xmm0, %xmm8
  232. movsd 8 * SIZE(AO), %xmm0
  233. mulsd %xmm3, %xmm6
  234. addsd %xmm2, %xmm9
  235. movaps %xmm5, %xmm2
  236. mulsd %xmm1, %xmm5
  237. addsd %xmm4, %xmm10
  238. movsd 9 * SIZE(AO), %xmm4
  239. mulsd %xmm3, %xmm2
  240. addsd %xmm6, %xmm11
  241. movaps %xmm7, %xmm6
  242. mulsd %xmm1, %xmm7
  243. movsd 4 * SIZE(BO), %xmm1
  244. addsd %xmm5, %xmm12
  245. movsd 10 * SIZE(AO), %xmm5
  246. mulsd %xmm3, %xmm6
  247. movsd 5 * SIZE(BO), %xmm3
  /* --- k-step 3 of 4 --- */
  248. addsd %xmm2, %xmm13
  249. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  250. movaps %xmm0, %xmm2
  251. mulsd %xmm1, %xmm0
  252. addsd %xmm7, %xmm14
  253. movsd 11 * SIZE(AO), %xmm7
  254. mulsd %xmm3, %xmm2
  255. addsd %xmm6, %xmm15
  256. movaps %xmm4, %xmm6
  257. mulsd %xmm1, %xmm4
  258. addsd %xmm0, %xmm8
  259. movsd 12 * SIZE(AO), %xmm0
  260. mulsd %xmm3, %xmm6
  261. addsd %xmm2, %xmm9
  262. movaps %xmm5, %xmm2
  263. mulsd %xmm1, %xmm5
  264. addsd %xmm4, %xmm10
  265. movsd 13 * SIZE(AO), %xmm4
  266. mulsd %xmm3, %xmm2
  267. addsd %xmm6, %xmm11
  268. movaps %xmm7, %xmm6
  269. mulsd %xmm1, %xmm7
  270. movsd 6 * SIZE(BO), %xmm1
  271. addsd %xmm5, %xmm12
  272. movsd 14 * SIZE(AO), %xmm5
  273. mulsd %xmm3, %xmm6
  274. movsd 7 * SIZE(BO), %xmm3
  /* --- k-step 4 of 4; pointer bumps and the loop test are folded in --- */
  275. addsd %xmm2, %xmm13
  276. movaps %xmm0, %xmm2
  277. mulsd %xmm1, %xmm0
  278. addsd %xmm7, %xmm14
  279. movsd 15 * SIZE(AO), %xmm7
  280. mulsd %xmm3, %xmm2
  281. subq $-16 * SIZE, AO
  282. addsd %xmm6, %xmm15
  283. movaps %xmm4, %xmm6
  284. mulsd %xmm1, %xmm4
  285. addsd %xmm0, %xmm8
  286. movsd 0 * SIZE(AO), %xmm0
  287. mulsd %xmm3, %xmm6
  288. addsd %xmm2, %xmm9
  289. movaps %xmm5, %xmm2
  290. mulsd %xmm1, %xmm5
  291. addq $ 8 * SIZE, BO
  292. addsd %xmm4, %xmm10
  293. movsd 1 * SIZE(AO), %xmm4
  294. mulsd %xmm3, %xmm2
  295. decq %rax
  296. addsd %xmm6, %xmm11
  297. movaps %xmm7, %xmm6
  298. mulsd %xmm1, %xmm7
  299. movsd 0 * SIZE(BO), %xmm1
  300. addsd %xmm5, %xmm12
  301. movsd 2 * SIZE(AO), %xmm5
  302. mulsd %xmm3, %xmm6
  303. movsd 1 * SIZE(BO), %xmm3
  304. jne .L12
  305. ALIGN_4
  /* .L15: handle the K mod 4 leftover iterations one k-step at a time,
   * then write the 4x2 tile back to C.                                  */
  306. .L15:
  307. #ifndef TRMMKERNEL
  308. movq K, %rax
  309. #else
  310. movq KKK, %rax
  311. #endif
  312. andq $3, %rax
  313. BRANCH
  314. BRANCH
  315. je .L19
  316. ALIGN_4
  /* .L16: single k-step, same pipelined register pattern as .L12.       */
  317. .L16:
  318. addsd %xmm2, %xmm13
  319. movaps %xmm0, %xmm2
  320. mulsd %xmm1, %xmm0
  321. addsd %xmm7, %xmm14
  322. movsd 3 * SIZE(AO), %xmm7
  323. mulsd %xmm3, %xmm2
  324. addsd %xmm6, %xmm15
  325. movaps %xmm4, %xmm6
  326. mulsd %xmm1, %xmm4
  327. addsd %xmm0, %xmm8
  328. movsd 4 * SIZE(AO), %xmm0
  329. mulsd %xmm3, %xmm6
  330. addsd %xmm2, %xmm9
  331. movaps %xmm5, %xmm2
  332. mulsd %xmm1, %xmm5
  333. addsd %xmm4, %xmm10
  334. movsd 5 * SIZE(AO), %xmm4
  335. mulsd %xmm3, %xmm2
  336. addsd %xmm6, %xmm11
  337. movaps %xmm7, %xmm6
  338. mulsd %xmm1, %xmm7
  339. movsd 2 * SIZE(BO), %xmm1
  340. addsd %xmm5, %xmm12
  341. movsd 6 * SIZE(AO), %xmm5
  342. mulsd %xmm3, %xmm6
  343. movsd 3 * SIZE(BO), %xmm3
  344. addq $4 * SIZE, AO
  345. addq $2 * SIZE, BO
  346. decq %rax
  347. BRANCH
  348. jg .L16
  349. ALIGN_4
  /* .L19: drain the still-pending products (xmm2/xmm7/xmm6), scale the
   * whole tile by alpha, optionally add the existing C values, store,
   * then advance the TRMM bookkeeping and the C column pointers.        */
  350. .L19:
  351. movsd ALPHA, %xmm5
  352. addsd %xmm2, %xmm13
  353. mulsd %xmm5, %xmm8
  354. addsd %xmm7, %xmm14
  355. mulsd %xmm5, %xmm10
  356. addsd %xmm6, %xmm15
  357. mulsd %xmm5, %xmm12
  358. mulsd %xmm5, %xmm14
  359. mulsd %xmm5, %xmm9
  360. mulsd %xmm5, %xmm11
  361. mulsd %xmm5, %xmm13
  362. mulsd %xmm5, %xmm15
  /* NOTE(review): `#if! defined(...)` parses as `#if !defined(...)`;
   * a space after `#if` would be clearer.                               */
  363. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  364. addsd 0 * SIZE(CO1), %xmm8
  365. addsd 1 * SIZE(CO1), %xmm10
  366. addsd 2 * SIZE(CO1), %xmm12
  367. addsd 3 * SIZE(CO1), %xmm14
  368. addsd 0 * SIZE(CO2), %xmm9
  369. addsd 1 * SIZE(CO2), %xmm11
  370. addsd 2 * SIZE(CO2), %xmm13
  371. addsd 3 * SIZE(CO2), %xmm15
  372. #endif
  373. movsd %xmm8, 0 * SIZE(CO1)
  374. movsd %xmm10, 1 * SIZE(CO1)
  375. movsd %xmm12, 2 * SIZE(CO1)
  376. movsd %xmm14, 3 * SIZE(CO1)
  377. movsd %xmm9, 0 * SIZE(CO2)
  378. movsd %xmm11, 1 * SIZE(CO2)
  379. movsd %xmm13, 2 * SIZE(CO2)
  380. movsd %xmm15, 3 * SIZE(CO2)
  /* TRMM: skip AO/BO over the part of the panels this tile did not use. */
  381. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  382. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  383. movq K, %rax
  384. subq KKK, %rax
  385. leaq (,%rax, SIZE), %rax
  386. leaq (AO, %rax, 4), AO
  387. leaq (BO, %rax, 2), BO
  388. #endif
  389. #if defined(TRMMKERNEL) && defined(LEFT)
  390. addq $4, KK
  391. #endif
  392. addq $4 * SIZE, CO1
  393. addq $4 * SIZE, CO2
  394. decq I # i --
  395. jg .L11
  396. ALIGN_4
  /* .L20: M mod 4 handling -- 2x2 tile if two rows remain.  Same scheme
   * as the 4x2 path with half the accumulators (xmm8..xmm11).            */
  397. .L20:
  398. testq $2, M
  399. jle .L30
  400. #if !defined(TRMMKERNEL) || \
  401. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  402. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  403. movq B, BO
  404. #else
  /* TRMM: offset AO by 2*KK and BO by 2*KK elements.                     */
  405. movq KK, %rax
  406. leaq (, %rax, SIZE), %rax
  407. leaq (AO, %rax, 2), AO
  408. leaq (B, %rax, 2), BO
  409. #endif
  /* Preload two k-steps of A (xmm0,xmm4,xmm5,xmm7) and one of B; zero
   * the accumulators and staging registers.                              */
  410. movsd 0 * SIZE(AO), %xmm0
  411. xorps %xmm2, %xmm2
  412. movsd 1 * SIZE(AO), %xmm4
  413. xorps %xmm5, %xmm5
  414. movsd 2 * SIZE(AO), %xmm5
  415. xorps %xmm6, %xmm6
  416. movsd 3 * SIZE(AO), %xmm7
  417. movsd 0 * SIZE(BO), %xmm1
  418. xorps %xmm8, %xmm8
  419. xorps %xmm9, %xmm9
  420. movsd 1 * SIZE(BO), %xmm3
  421. xorps %xmm10, %xmm10
  422. xorps %xmm11, %xmm11
  /* Trip count selection, as in the 4x2 path (tile edge is 2 here).      */
  423. #ifndef TRMMKERNEL
  424. movq K, %rax
  425. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  426. movq K, %rax
  427. subq KK, %rax
  428. movq %rax, KKK
  429. #else
  430. movq KK, %rax
  431. #ifdef LEFT
  432. addq $2, %rax
  433. #else
  434. addq $2, %rax
  435. #endif
  436. movq %rax, KKK
  437. #endif
  438. sarq $2, %rax
  439. je .L25
  440. ALIGN_4
  /* .L22: 2x2 inner loop, unrolled by 4 k-steps.                         */
  441. .L22:
  442. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  443. addsd %xmm2, %xmm9
  444. movaps %xmm0, %xmm2
  445. mulsd %xmm1, %xmm0
  446. addsd %xmm6, %xmm11
  447. movaps %xmm4, %xmm6
  448. mulsd %xmm1, %xmm4
  449. movsd 2 * SIZE(BO), %xmm1
  450. addsd %xmm0, %xmm8
  451. movsd 4 * SIZE(AO), %xmm0
  452. mulsd %xmm3, %xmm2
  453. addsd %xmm4, %xmm10
  454. movsd 5 * SIZE(AO), %xmm4
  455. mulsd %xmm3, %xmm6
  456. movsd 3 * SIZE(BO), %xmm3
  457. addsd %xmm2, %xmm9
  458. movaps %xmm5, %xmm2
  459. mulsd %xmm1, %xmm5
  460. addsd %xmm6, %xmm11
  461. movaps %xmm7, %xmm6
  462. mulsd %xmm1, %xmm7
  463. movsd 4 * SIZE(BO), %xmm1
  464. addsd %xmm5, %xmm8
  465. movsd 6 * SIZE(AO), %xmm5
  466. mulsd %xmm3, %xmm2
  467. addsd %xmm7, %xmm10
  468. movsd 7 * SIZE(AO), %xmm7
  469. mulsd %xmm3, %xmm6
  470. movsd 5 * SIZE(BO), %xmm3
  471. addsd %xmm2, %xmm9
  472. movaps %xmm0, %xmm2
  473. mulsd %xmm1, %xmm0
  474. addsd %xmm6, %xmm11
  475. movaps %xmm4, %xmm6
  476. mulsd %xmm1, %xmm4
  477. movsd 6 * SIZE(BO), %xmm1
  478. addsd %xmm0, %xmm8
  479. movsd 8 * SIZE(AO), %xmm0
  480. mulsd %xmm3, %xmm2
  481. addsd %xmm4, %xmm10
  482. movsd 9 * SIZE(AO), %xmm4
  483. mulsd %xmm3, %xmm6
  484. movsd 7 * SIZE(BO), %xmm3
  485. addsd %xmm2, %xmm9
  486. movaps %xmm5, %xmm2
  487. mulsd %xmm1, %xmm5
  488. addsd %xmm6, %xmm11
  489. movaps %xmm7, %xmm6
  490. mulsd %xmm1, %xmm7
  491. movsd 8 * SIZE(BO), %xmm1
  492. addsd %xmm5, %xmm8
  493. movsd 10 * SIZE(AO), %xmm5
  494. mulsd %xmm3, %xmm2
  495. addsd %xmm7, %xmm10
  496. movsd 11 * SIZE(AO), %xmm7
  497. mulsd %xmm3, %xmm6
  498. movsd 9 * SIZE(BO), %xmm3
  499. addq $8 * SIZE, AO
  500. addq $8 * SIZE, BO
  501. decq %rax
  502. jne .L22
  503. ALIGN_4
  /* .L25: 2x2 remainder (K mod 4) and write-back.                        */
  504. .L25:
  505. #ifndef TRMMKERNEL
  506. movq K, %rax
  507. #else
  508. movq KKK, %rax
  509. #endif
  /* xmm7 is free after the unrolled loop; reuse it for alpha.            */
  510. movsd ALPHA, %xmm7
  511. andq $3, %rax
  512. BRANCH
  513. BRANCH
  514. je .L29
  515. ALIGN_4
  /* .L26: one k-step of the 2x2 product.                                 */
  516. .L26:
  517. addsd %xmm2, %xmm9
  518. movaps %xmm0, %xmm2
  519. mulsd %xmm1, %xmm0
  520. addsd %xmm6, %xmm11
  521. movaps %xmm4, %xmm6
  522. mulsd %xmm1, %xmm4
  523. movsd 2 * SIZE(BO), %xmm1
  524. mulsd %xmm3, %xmm2
  525. addsd %xmm0, %xmm8
  526. movsd 2 * SIZE(AO), %xmm0
  527. mulsd %xmm3, %xmm6
  528. movsd 3 * SIZE(BO), %xmm3
  529. addsd %xmm4, %xmm10
  530. movsd 3 * SIZE(AO), %xmm4
  531. addq $2 * SIZE, AO
  532. addq $2 * SIZE, BO
  533. decq %rax
  534. BRANCH
  535. jg .L26
  536. ALIGN_4
  /* .L29: fold pending products, scale by alpha, add C (GEMM path),
   * store the 2x2 tile, update TRMM offsets and column pointers.         */
  537. .L29:
  538. addsd %xmm2, %xmm9
  539. mulsd %xmm7, %xmm8
  540. addsd %xmm6, %xmm11
  541. mulsd %xmm7, %xmm10
  542. mulsd %xmm7, %xmm9
  543. mulsd %xmm7, %xmm11
  544. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  545. addsd 0 * SIZE(CO1), %xmm8
  546. addsd 1 * SIZE(CO1), %xmm10
  547. addsd 0 * SIZE(CO2), %xmm9
  548. addsd 1 * SIZE(CO2), %xmm11
  549. #endif
  550. movsd %xmm8, 0 * SIZE(CO1)
  551. movsd %xmm10, 1 * SIZE(CO1)
  552. movsd %xmm9, 0 * SIZE(CO2)
  553. movsd %xmm11, 1 * SIZE(CO2)
  554. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  555. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  556. movq K, %rax
  557. subq KKK, %rax
  558. leaq (,%rax, SIZE), %rax
  559. leaq (AO, %rax, 2), AO
  560. leaq (BO, %rax, 2), BO
  561. #endif
  562. #if defined(TRMMKERNEL) && defined(LEFT)
  563. addq $2, KK
  564. #endif
  565. addq $2 * SIZE, CO1
  566. addq $2 * SIZE, CO2
  567. ALIGN_4
  /* .L30: last single row (M odd) against the current two B columns:
   * a 1x2 dot-product pair accumulated in xmm8 (col 1) and xmm9 (col 2),
   * with xmm5/xmm7 as secondary accumulators for pipelining.             */
  568. .L30:
  569. testq $1, M
  570. je .L39
  571. ALIGN_4
  572. #if !defined(TRMMKERNEL) || \
  573. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  574. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  575. movq B, BO
  576. #else
  /* TRMM: skip 1*KK elements of A and 2*KK elements of B.                */
  577. movq KK, %rax
  578. leaq (, %rax, SIZE), %rax
  579. leaq (AO, %rax, 1), AO
  580. leaq (B, %rax, 2), BO
  581. #endif
  582. movsd 0 * SIZE(AO), %xmm0
  583. xorps %xmm7, %xmm7
  584. movsd 1 * SIZE(AO), %xmm2
  585. xorps %xmm5, %xmm5
  586. movsd 0 * SIZE(BO), %xmm1
  587. xorps %xmm8, %xmm8
  588. xorps %xmm9, %xmm9
  589. movsd 1 * SIZE(BO), %xmm3
  /* Trip count: tile edge is 1 row (LEFT) / 2 columns (!LEFT).           */
  590. #ifndef TRMMKERNEL
  591. movq K, %rax
  592. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  593. movq K, %rax
  594. subq KK, %rax
  595. movq %rax, KKK
  596. #else
  597. movq KK, %rax
  598. #ifdef LEFT
  599. addq $1, %rax
  600. #else
  601. addq $2, %rax
  602. #endif
  603. movq %rax, KKK
  604. #endif
  605. sarq $2, %rax
  606. je .L35
  607. ALIGN_4
  /* .L32: 1x2 loop, unrolled by 4 k-steps (two pipelined pairs).         */
  608. .L32:
  609. addsd %xmm5, %xmm8
  610. movsd 2 * SIZE(BO), %xmm5
  611. mulsd %xmm0, %xmm1
  612. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  613. addsd %xmm7, %xmm9
  614. movsd 3 * SIZE(BO), %xmm7
  615. mulsd %xmm0, %xmm3
  616. movsd 2 * SIZE(AO), %xmm0
  617. addsd %xmm1, %xmm8
  618. movsd 4 * SIZE(BO), %xmm1
  619. mulsd %xmm2, %xmm5
  620. addsd %xmm3, %xmm9
  621. movsd 5 * SIZE(BO), %xmm3
  622. mulsd %xmm2, %xmm7
  623. movsd 3 * SIZE(AO), %xmm2
  624. addsd %xmm5, %xmm8
  625. movsd 6 * SIZE(BO), %xmm5
  626. mulsd %xmm0, %xmm1
  627. addsd %xmm7, %xmm9
  628. movsd 7 * SIZE(BO), %xmm7
  629. mulsd %xmm0, %xmm3
  630. movsd 4 * SIZE(AO), %xmm0
  631. addsd %xmm1, %xmm8
  632. movsd 8 * SIZE(BO), %xmm1
  633. mulsd %xmm2, %xmm5
  634. addsd %xmm3, %xmm9
  635. movsd 9 * SIZE(BO), %xmm3
  636. mulsd %xmm2, %xmm7
  637. movsd 5 * SIZE(AO), %xmm2
  638. addq $4 * SIZE, AO
  639. addq $8 * SIZE, BO
  640. decq %rax
  641. jne .L32
  642. ALIGN_4
  /* .L35: fold the secondary 1x2 accumulators, then run the K mod 4
   * leftovers and write the two scalars back.                            */
  643. .L35:
  644. #ifndef TRMMKERNEL
  645. movq K, %rax
  646. #else
  647. movq KKK, %rax
  648. #endif
  649. addsd %xmm5, %xmm8
  650. addsd %xmm7, %xmm9
  /* xmm7 no longer holds a product here; reuse it for alpha.             */
  651. movsd ALPHA, %xmm7
  652. andq $3, %rax
  653. BRANCH
  654. BRANCH
  655. je .L38
  656. ALIGN_4
  /* .L36: one k-step: xmm8 += a*b0, xmm9 += a*b1.                        */
  657. .L36:
  658. mulsd %xmm0, %xmm1
  659. addq $2 * SIZE, BO
  660. mulsd %xmm0, %xmm3
  661. movsd 1 * SIZE(AO), %xmm0
  662. addsd %xmm1, %xmm8
  663. movsd 0 * SIZE(BO), %xmm1
  664. addsd %xmm3, %xmm9
  665. movsd 1 * SIZE(BO), %xmm3
  666. addq $1 * SIZE, AO
  667. decq %rax
  668. BRANCH
  669. jg .L36
  670. ALIGN_4
  /* .L38: scale by alpha, optionally add C, store one element per column. */
  671. .L38:
  672. mulsd %xmm7, %xmm8
  673. mulsd %xmm7, %xmm9
  674. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  675. addsd 0 * SIZE(CO1), %xmm8
  676. addsd 0 * SIZE(CO2), %xmm9
  677. #endif
  678. movsd %xmm8, 0 * SIZE(CO1)
  679. movsd %xmm9, 0 * SIZE(CO2)
  680. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  681. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  682. movq K, %rax
  683. subq KKK, %rax
  684. leaq (,%rax, SIZE), %rax
  685. leaq (AO, %rax, 1), AO
  686. leaq (BO, %rax, 2), BO
  687. #endif
  688. #if defined(TRMMKERNEL) && defined(LEFT)
  689. addq $1, KK
  690. #endif
  691. ALIGN_4
  /* .L39: bottom of the two-column loop -- advance B past the consumed
   * panel and loop while columns remain.                                 */
  692. .L39:
  693. #if defined(TRMMKERNEL) && !defined(LEFT)
  /* Right-side TRMM: KK advances by the 2 columns just processed.
   * FIX: this was `addl $2, KK`, a 32-bit add on a 64-bit stack slot.
   * KK is written with movsd (64-bit), negated with negq when !LEFT,
   * and read back with movq, so a negative KK would have its upper
   * dword corrupted by addl.  Use the 64-bit add like every other KK
   * update in this file (cf. `addq $4, KK`, `addq $2, KK` above).        */
  694. addq $2, KK
  695. #endif
  696. movq BO, B
  697. decq J # j --
  698. jg .L10
  699. ALIGN_4
  /* .L40: N odd -- one final C column.  Same M-loop structure as the
   * two-column half (4-row tiles first), against a single B column.
   * Accumulators: xmm8/xmm10/xmm12/xmm14 primary, xmm9/xmm11/xmm13/xmm15
   * secondary (merged at .L45).                                          */
  700. .L40:
  701. testq $1, N
  702. je .L999
  703. #if defined(TRMMKERNEL) && defined(LEFT)
  704. movq OFFSET, %rax
  705. movq %rax, KK
  706. #endif
  707. movq C, CO1
  708. addq LDC, C
  709. movq A, AO
  710. movq M, I
  711. sarq $2, I
  712. jle .L50
  713. ALIGN_4
  /* .L41: start one 4x1 tile.                                            */
  714. .L41:
  715. #if !defined(TRMMKERNEL) || \
  716. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  717. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  718. movq B, BO
  719. #else
  720. movq KK, %rax
  721. leaq (, %rax, SIZE), %rax
  722. leaq (AO, %rax, 4), AO
  723. leaq (B, %rax, 1), BO
  724. #endif
  /* Preload four A values and two B values; zero both accumulator sets.  */
  725. movsd 0 * SIZE(AO), %xmm0
  726. xorps %xmm9, %xmm9
  727. movsd 1 * SIZE(AO), %xmm1
  728. xorps %xmm11, %xmm11
  729. movsd 2 * SIZE(AO), %xmm2
  730. xorps %xmm13, %xmm13
  731. movsd 3 * SIZE(AO), %xmm3
  732. xorps %xmm15, %xmm15
  733. movsd 0 * SIZE(BO), %xmm4
  734. xorps %xmm8, %xmm8
  735. movsd 1 * SIZE(BO), %xmm5
  736. xorps %xmm10, %xmm10
  737. prefetcht0 3 * SIZE(CO1)
  738. xorps %xmm12, %xmm12
  739. xorps %xmm14, %xmm14
  740. #ifndef TRMMKERNEL
  741. movq K, %rax
  742. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  743. movq K, %rax
  744. subq KK, %rax
  745. movq %rax, KKK
  746. #else
  747. movq KK, %rax
  748. #ifdef LEFT
  749. addq $4, %rax
  750. #else
  751. addq $1, %rax
  752. #endif
  753. movq %rax, KKK
  754. #endif
  755. sarq $2, %rax
  756. je .L45
  757. ALIGN_4
  /* .L42: 4x1 inner loop, unrolled by 4 k-steps.  Even k-steps multiply
   * into xmm0..xmm3 (B value in xmm4), odd k-steps into xmm9/11/13/15
   * (B value in xmm5); each addsd retires the product started two steps
   * earlier.  Do not reorder.                                            */
  758. .L42:
  759. addsd %xmm9, %xmm8
  760. movsd 4 * SIZE(AO), %xmm9
  761. mulsd %xmm4, %xmm0
  762. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  763. addsd %xmm11, %xmm10
  764. movsd 5 * SIZE(AO), %xmm11
  765. mulsd %xmm4, %xmm1
  766. addsd %xmm13, %xmm12
  767. movsd 6 * SIZE(AO), %xmm13
  768. mulsd %xmm4, %xmm2
  769. addsd %xmm15, %xmm14
  770. movsd 7 * SIZE(AO), %xmm15
  771. mulsd %xmm4, %xmm3
  772. movsd 2 * SIZE(BO), %xmm4
  773. addsd %xmm0, %xmm8
  774. movsd 8 * SIZE(AO), %xmm0
  775. mulsd %xmm5, %xmm9
  776. addsd %xmm1, %xmm10
  777. movsd 9 * SIZE(AO), %xmm1
  778. mulsd %xmm5, %xmm11
  779. addsd %xmm2, %xmm12
  780. movsd 10 * SIZE(AO), %xmm2
  781. mulsd %xmm5, %xmm13
  782. addsd %xmm3, %xmm14
  783. movsd 11 * SIZE(AO), %xmm3
  784. mulsd %xmm5, %xmm15
  785. movsd 3 * SIZE(BO), %xmm5
  786. addsd %xmm9, %xmm8
  787. movsd 12 * SIZE(AO), %xmm9
  788. mulsd %xmm4, %xmm0
  789. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  790. addsd %xmm11, %xmm10
  791. movsd 13 * SIZE(AO), %xmm11
  792. mulsd %xmm4, %xmm1
  793. addsd %xmm13, %xmm12
  794. movsd 14 * SIZE(AO), %xmm13
  795. mulsd %xmm4, %xmm2
  796. addsd %xmm15, %xmm14
  797. movsd 15 * SIZE(AO), %xmm15
  798. mulsd %xmm4, %xmm3
  799. movsd 4 * SIZE(BO), %xmm4
  800. subq $-16 * SIZE, AO
  801. addsd %xmm0, %xmm8
  802. movsd 0 * SIZE(AO), %xmm0
  803. mulsd %xmm5, %xmm9
  804. addsd %xmm1, %xmm10
  805. movsd 1 * SIZE(AO), %xmm1
  806. mulsd %xmm5, %xmm11
  807. addq $ 4 * SIZE, BO
  808. addsd %xmm2, %xmm12
  809. movsd 2 * SIZE(AO), %xmm2
  810. mulsd %xmm5, %xmm13
  811. decq %rax
  812. addsd %xmm3, %xmm14
  813. movsd 3 * SIZE(AO), %xmm3
  814. mulsd %xmm5, %xmm15
  815. movsd 1 * SIZE(BO), %xmm5
  816. jne .L42
  817. ALIGN_4
  /* .L45: merge secondary accumulators, run K mod 4 leftovers, then
   * scale/store the 4x1 tile.                                            */
  818. .L45:
  819. #ifndef TRMMKERNEL
  820. movq K, %rax
  821. #else
  822. movq KKK, %rax
  823. #endif
  824. movsd ALPHA, %xmm7
  825. addsd %xmm9, %xmm8
  826. addsd %xmm11, %xmm10
  827. addsd %xmm13, %xmm12
  828. addsd %xmm15, %xmm14
  829. andq $3, %rax
  830. BRANCH
  831. BRANCH
  832. je .L49
  833. ALIGN_4
  /* .L46: one k-step: four A values times one B value.                   */
  834. .L46:
  835. mulsd %xmm4, %xmm0
  836. mulsd %xmm4, %xmm1
  837. mulsd %xmm4, %xmm2
  838. mulsd %xmm4, %xmm3
  839. movsd 1 * SIZE(BO), %xmm4
  840. addsd %xmm0, %xmm8
  841. movsd 4 * SIZE(AO), %xmm0
  842. addsd %xmm1, %xmm10
  843. movsd 5 * SIZE(AO), %xmm1
  844. addsd %xmm2, %xmm12
  845. movsd 6 * SIZE(AO), %xmm2
  846. addsd %xmm3, %xmm14
  847. movsd 7 * SIZE(AO), %xmm3
  848. addq $4 * SIZE, AO
  849. addq $1 * SIZE, BO
  850. decq %rax
  851. BRANCH
  852. jg .L46
  853. ALIGN_4
  /* .L49: alpha-scale, optional C add, store, TRMM bookkeeping.          */
  854. .L49:
  855. mulsd %xmm7, %xmm8
  856. mulsd %xmm7, %xmm10
  857. mulsd %xmm7, %xmm12
  858. mulsd %xmm7, %xmm14
  859. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  860. addsd 0 * SIZE(CO1), %xmm8
  861. addsd 1 * SIZE(CO1), %xmm10
  862. addsd 2 * SIZE(CO1), %xmm12
  863. addsd 3 * SIZE(CO1), %xmm14
  864. #endif
  865. movsd %xmm8, 0 * SIZE(CO1)
  866. movsd %xmm10, 1 * SIZE(CO1)
  867. movsd %xmm12, 2 * SIZE(CO1)
  868. movsd %xmm14, 3 * SIZE(CO1)
  869. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  870. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  871. movq K, %rax
  872. subq KKK, %rax
  873. leaq (,%rax, SIZE), %rax
  874. leaq (AO, %rax, 4), AO
  875. leaq (BO, %rax, 1), BO
  876. #endif
  877. #if defined(TRMMKERNEL) && defined(LEFT)
  878. addq $4, KK
  879. #endif
  880. addq $4 * SIZE, CO1
  881. decq I # i --
  882. jg .L41
  883. ALIGN_4
  /* .L50: 2x1 tile for the last column if two rows remain.  xmm8/xmm10
   * are the primary accumulators, xmm2/xmm3 double as product staging.   */
  884. .L50:
  885. testq $2, M
  886. jle .L60
  887. #if !defined(TRMMKERNEL) || \
  888. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  889. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  890. movq B, BO
  891. #else
  892. movq KK, %rax
  893. leaq (, %rax, SIZE), %rax
  894. leaq (AO, %rax, 2), AO
  895. leaq (B, %rax, 1), BO
  896. #endif
  897. movsd 0 * SIZE(AO), %xmm0
  898. xorps %xmm2, %xmm2
  899. movsd 1 * SIZE(AO), %xmm1
  900. xorps %xmm3, %xmm3
  901. movsd 0 * SIZE(BO), %xmm4
  902. xorps %xmm8, %xmm8
  903. movsd 1 * SIZE(BO), %xmm5
  904. xorps %xmm10, %xmm10
  905. #ifndef TRMMKERNEL
  906. movq K, %rax
  907. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  908. movq K, %rax
  909. subq KK, %rax
  910. movq %rax, KKK
  911. #else
  912. movq KK, %rax
  913. #ifdef LEFT
  914. addq $2, %rax
  915. #else
  916. addq $1, %rax
  917. #endif
  918. movq %rax, KKK
  919. #endif
  920. sarq $2, %rax
  921. je .L55
  922. ALIGN_4
  /* .L52: 2x1 loop, unrolled by 4 k-steps.  Note AO is advanced mid-body
   * (addq $8*SIZE), so later loads use negative displacements.           */
  923. .L52:
  924. addsd %xmm2, %xmm8
  925. movsd 2 * SIZE(AO), %xmm2
  926. mulsd %xmm4, %xmm0
  927. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  928. addsd %xmm3, %xmm10
  929. movsd 3 * SIZE(AO), %xmm3
  930. mulsd %xmm4, %xmm1
  931. movsd 2 * SIZE(BO), %xmm4
  932. addsd %xmm0, %xmm8
  933. movsd 4 * SIZE(AO), %xmm0
  934. mulsd %xmm5, %xmm2
  935. addq $8 * SIZE, AO
  936. addsd %xmm1, %xmm10
  937. movsd -3 * SIZE(AO), %xmm1
  938. mulsd %xmm5, %xmm3
  939. movsd 3 * SIZE(BO), %xmm5
  940. addsd %xmm2, %xmm8
  941. movsd -2 * SIZE(AO), %xmm2
  942. mulsd %xmm4, %xmm0
  943. addq $4 * SIZE, BO
  944. addsd %xmm3, %xmm10
  945. movsd -1 * SIZE(AO), %xmm3
  946. mulsd %xmm4, %xmm1
  947. movsd 0 * SIZE(BO), %xmm4
  948. addsd %xmm0, %xmm8
  949. movsd 0 * SIZE(AO), %xmm0
  950. mulsd %xmm5, %xmm2
  951. decq %rax
  952. addsd %xmm1, %xmm10
  953. movsd 1 * SIZE(AO), %xmm1
  954. mulsd %xmm5, %xmm3
  955. movsd 1 * SIZE(BO), %xmm5
  956. jne .L52
  957. ALIGN_4
  /* .L55: 2x1 remainder and write-back.                                  */
  958. .L55:
  959. #ifndef TRMMKERNEL
  960. movq K, %rax
  961. #else
  962. movq KKK, %rax
  963. #endif
  964. movsd ALPHA, %xmm7
  /* Fold the in-flight products before the scalar tail.                  */
  965. addsd %xmm2, %xmm8
  966. addsd %xmm3, %xmm10
  967. andq $3, %rax
  968. BRANCH
  969. BRANCH
  970. je .L59
  971. ALIGN_4
  /* .L56: one k-step: two A values times one B value.                    */
  972. .L56:
  973. mulsd %xmm4, %xmm0
  974. mulsd %xmm4, %xmm1
  975. movsd 1 * SIZE(BO), %xmm4
  976. addsd %xmm0, %xmm8
  977. movsd 2 * SIZE(AO), %xmm0
  978. addsd %xmm1, %xmm10
  979. movsd 3 * SIZE(AO), %xmm1
  980. addq $2 * SIZE, AO
  981. addq $1 * SIZE, BO
  982. decq %rax
  983. BRANCH
  984. jg .L56
  985. ALIGN_4
  /* .L59: alpha-scale, optional C add, store, TRMM bookkeeping.          */
  986. .L59:
  987. mulsd %xmm7, %xmm8
  988. mulsd %xmm7, %xmm10
  989. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  990. addsd 0 * SIZE(CO1), %xmm8
  991. addsd 1 * SIZE(CO1), %xmm10
  992. #endif
  993. movsd %xmm8, 0 * SIZE(CO1)
  994. movsd %xmm10, 1 * SIZE(CO1)
  995. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  996. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  997. movq K, %rax
  998. subq KKK, %rax
  999. leaq (,%rax, SIZE), %rax
  1000. leaq (AO, %rax, 2), AO
  1001. leaq (BO, %rax, 1), BO
  1002. #endif
  1003. #if defined(TRMMKERNEL) && defined(LEFT)
  1004. addq $2, KK
  1005. #endif
  1006. addq $2 * SIZE, CO1
  1007. ALIGN_4
  /* .L60: final 1x1 element (M and N both odd): a plain dot product of
   * one A row and one B column, split across xmm8 and xmm9 for
   * pipelining (merged at .L68).                                         */
  1008. .L60:
  1009. testq $1, M
  1010. je .L999
  1011. ALIGN_4
  1012. #if !defined(TRMMKERNEL) || \
  1013. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1014. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1015. movq B, BO
  1016. #else
  1017. movq KK, %rax
  1018. leaq (, %rax, SIZE), %rax
  1019. leaq (AO, %rax, 1), AO
  1020. leaq (B, %rax, 1), BO
  1021. #endif
  /* Preload four A and two B elements for the unrolled dot product.      */
  1022. movsd 0 * SIZE(AO), %xmm0
  1023. xorps %xmm5, %xmm5
  1024. movsd 1 * SIZE(AO), %xmm2
  1025. xorps %xmm7, %xmm7
  1026. movsd 0 * SIZE(BO), %xmm1
  1027. xorps %xmm8, %xmm8
  1028. movsd 1 * SIZE(BO), %xmm3
  1029. xorps %xmm9, %xmm9
  1030. movsd 2 * SIZE(AO), %xmm4
  1031. movsd 3 * SIZE(AO), %xmm6
  1032. #ifndef TRMMKERNEL
  1033. movq K, %rax
  1034. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1035. movq K, %rax
  1036. subq KK, %rax
  1037. movq %rax, KKK
  1038. #else
  /* 1x1 tile: both branches add 1; kept for structural symmetry.         */
  1039. movq KK, %rax
  1040. #ifdef LEFT
  1041. addq $1, %rax
  1042. #else
  1043. addq $1, %rax
  1044. #endif
  1045. movq %rax, KKK
  1046. #endif
  1047. sarq $2, %rax
  1048. je .L65
  1049. ALIGN_4
  /* .L62: dot-product loop, 4 k-steps per pass; partial sums drain into
   * xmm8/xmm9 right after the loop exits.                                */
  1050. .L62:
  1051. addsd %xmm5, %xmm8
  1052. movsd 2 * SIZE(BO), %xmm5
  1053. mulsd %xmm0, %xmm1
  1054. movsd 4 * SIZE(AO), %xmm0
  1055. addsd %xmm7, %xmm9
  1056. movsd 3 * SIZE(BO), %xmm7
  1057. mulsd %xmm2, %xmm3
  1058. movsd 5 * SIZE(AO), %xmm2
  1059. addsd %xmm1, %xmm8
  1060. movsd 4 * SIZE(BO), %xmm1
  1061. mulsd %xmm4, %xmm5
  1062. movsd 6 * SIZE(AO), %xmm4
  1063. addsd %xmm3, %xmm9
  1064. movsd 5 * SIZE(BO), %xmm3
  1065. mulsd %xmm6, %xmm7
  1066. movsd 7 * SIZE(AO), %xmm6
  1067. addq $4 * SIZE, AO
  1068. addq $4 * SIZE, BO
  1069. decq %rax
  1070. jne .L62
  1071. addsd %xmm5, %xmm8
  1072. addsd %xmm7, %xmm9
  1073. ALIGN_4
  /* .L65: K mod 4 scalar tail of the 1x1 dot product, final store, and
   * the shared function epilogue.                                        */
  1074. .L65:
  1075. #ifndef TRMMKERNEL
  1076. movq K, %rax
  1077. #else
  1078. movq KKK, %rax
  1079. #endif
  1080. movsd ALPHA, %xmm7
  1081. andq $3, %rax
  1082. BRANCH
  1083. BRANCH
  1084. je .L68
  1085. ALIGN_4
  /* .L66: simple one-element multiply-accumulate.                        */
  1086. .L66:
  1087. movsd 0 * SIZE(AO), %xmm0
  1088. movsd 0 * SIZE(BO), %xmm1
  1089. mulsd %xmm0, %xmm1
  1090. addsd %xmm1, %xmm8
  1091. addq $1 * SIZE, AO
  1092. addq $1 * SIZE, BO
  1093. decq %rax
  1094. BRANCH
  1095. jg .L66
  1096. ALIGN_4
  /* .L68: merge the split accumulator, scale by alpha, optional C add.   */
  1097. .L68:
  1098. addsd %xmm9, %xmm8
  1099. mulsd %xmm7, %xmm8
  1100. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1101. addsd 0 * SIZE(CO1), %xmm8
  1102. #endif
  1103. movsd %xmm8, 0 * SIZE(CO1)
  1104. ALIGN_4
  /* .L999: restore callee-saved registers (mirror of the prologue) and
   * return.                                                              */
  1105. .L999:
  1106. movq 0(%rsp), %rbx
  1107. movq 8(%rsp), %rbp
  1108. movq 16(%rsp), %r12
  1109. movq 24(%rsp), %r13
  1110. movq 32(%rsp), %r14
  1111. movq 40(%rsp), %r15
  1112. #ifdef WINDOWS_ABI
  1113. movq 48(%rsp), %rdi
  1114. movq 56(%rsp), %rsi
  1115. movups 64(%rsp), %xmm6
  1116. movups 80(%rsp), %xmm7
  1117. movups 96(%rsp), %xmm8
  1118. movups 112(%rsp), %xmm9
  1119. movups 128(%rsp), %xmm10
  1120. movups 144(%rsp), %xmm11
  1121. movups 160(%rsp), %xmm12
  1122. movups 176(%rsp), %xmm13
  1123. movups 192(%rsp), %xmm14
  1124. movups 208(%rsp), %xmm15
  1125. #endif
  1126. addq $STACKSIZE, %rsp
  1127. ret
  1128. EPILOGUE