You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

gemm_kernel_4x4_penryn.S 38 kB

[Extraction residue: the code viewer's line-number gutter (lines 1–2072) was flattened into the text here; it carries no source content.]
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
/* NOTE(review): every line in this file carries a "NN." gutter number
   inherited from the code-viewer extraction; the underlying file is
   AT&T-syntax x86-64 assembly (a double-precision 4x4 GEMM kernel, judging
   by the movapd/mulpd/addpd ops and the filename -- confirm). */
/* Incoming scalar arguments (System V AMD64: first integer args arrive in
   rdi/rsi/rdx).  Under WINDOWS_ABI the prologue instead re-reads them via
   ARG1..ARG3 and the stack slots below. */
  40. #define OLD_M %rdi
  41. #define OLD_N %rsi
  42. #define OLD_K %rdx
/* Loop bounds copied into callee-saved registers so they survive the whole
   kernel (r13-r15 are saved/restored by the prologue). */
  43. #define M %r13
  44. #define N %r14
  45. #define K %r15
/* Matrix operands: A and B are the packed source panels, C the destination,
   LDC the leading dimension of C (scaled to bytes with leaq (,LDC,SIZE)
   right after the arguments are captured). */
  46. #define A %rcx
  47. #define B %r8
  48. #define C %r9
  49. #define LDC %r10
/* Working registers: I = row-block counter; AO/BO = cursors into the A and
   B panels (they deliberately reuse rdi/rsi once OLD_M/OLD_N have been
   copied into M/N); CO1/CO2 = current pair of C columns; BB = look-ahead
   pointer used only to prefetch the B panel; PREA aliases rdx (OLD_K) after
   K has been saved. */
  50. #define I %r11
  51. #define AO %rdi
  52. #define BO %rsi
  53. #define CO1 %rbx
  54. #define CO2 %rbp
  55. #define BB %r12
  56. #define PREA %rdx
/* Stack frame layout.  SysV: a 128-byte frame holding the six callee-saved
   GPRs plus spill slots for ALPHA, the column counter J, and the TRMM
   bookkeeping values OFFSET/KK/KKK.  Windows x64: a 512-byte frame that
   additionally saves rdi/rsi and xmm6-xmm15 (callee-saved in that ABI);
   A/B/C/LDC/OFFSET arrive as stack arguments above the frame. */
  57. #ifndef WINDOWS_ABI
  58. #define STACKSIZE 128
  59. #define OLD_LDC 8 + STACKSIZE(%rsp)
  60. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  61. #define ALPHA 48(%rsp)
  62. #define J 56(%rsp)
  63. #define OFFSET 64(%rsp)
  64. #define KK 72(%rsp)
  65. #define KKK 80(%rsp)
  66. #else
  67. #define STACKSIZE 512
  68. #define OLD_A 40 + STACKSIZE(%rsp)
  69. #define OLD_B 48 + STACKSIZE(%rsp)
  70. #define OLD_C 56 + STACKSIZE(%rsp)
  71. #define OLD_LDC 64 + STACKSIZE(%rsp)
  72. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  73. #define ALPHA 224(%rsp)
  74. #define J 232(%rsp)
  75. #define OFFSET 240(%rsp)
  76. #define KK 248(%rsp)
  77. #define KKK 256(%rsp)
  78. #endif
/* Per-microarchitecture prefetch tuning; the #ifndef blocks supply defaults
   when no CPU-specific override applies.
   PREFETCH     - streams the A panel, PREFETCHSIZE elements ahead of AO
   PREFETCHW    - warms the C output tiles before the store phase
   PREFETCHB    - walks ahead through the B panel via the BB cursor
   (distances are in elements; SIZE converts to bytes at the use sites) */
  79. #ifdef NANO
  80. #define PREFETCHSIZE (8 * 2 + 4)
  81. #define PREFETCHW prefetcht0
  82. #define PREFETCHB prefetcht0
  83. #endif
  84. #ifdef DUNNINGTON
  85. #define PREFETCHSIZE (8 * 97 + 4)
  86. #define PREFETCHB prefetcht2
  87. #endif
  88. #ifndef PREFETCH
  89. #define PREFETCH prefetcht0
  90. #endif
  91. #ifndef PREFETCHW
  92. #define PREFETCHW prefetcht2
  93. #endif
  94. #ifndef PREFETCHB
  95. #define PREFETCHB prefetcht0
  96. #endif
  97. #ifndef PREFETCHSIZE
  98. #define PREFETCHSIZE (8 * 17 + 4)
  99. #endif
  100. PROLOGUE
  101. PROFCODE
  102. subq $STACKSIZE, %rsp
  103. movq %rbx, 0(%rsp)
  104. movq %rbp, 8(%rsp)
  105. movq %r12, 16(%rsp)
  106. movq %r13, 24(%rsp)
  107. movq %r14, 32(%rsp)
  108. movq %r15, 40(%rsp)
  109. #ifdef WINDOWS_ABI
  110. movq %rdi, 48(%rsp)
  111. movq %rsi, 56(%rsp)
  112. movups %xmm6, 64(%rsp)
  113. movups %xmm7, 80(%rsp)
  114. movups %xmm8, 96(%rsp)
  115. movups %xmm9, 112(%rsp)
  116. movups %xmm10, 128(%rsp)
  117. movups %xmm11, 144(%rsp)
  118. movups %xmm12, 160(%rsp)
  119. movups %xmm13, 176(%rsp)
  120. movups %xmm14, 192(%rsp)
  121. movups %xmm15, 208(%rsp)
  122. movq ARG1, OLD_M
  123. movq ARG2, OLD_N
  124. movq ARG3, OLD_K
  125. movq OLD_A, A
  126. movq OLD_B, B
  127. movq OLD_C, C
  128. movq OLD_LDC, LDC
  129. #ifdef TRMMKERNEL
  130. movq OLD_OFFSET, %r11
  131. #endif
  132. movaps %xmm3, %xmm0
  133. #else
  134. movq OLD_LDC, LDC
  135. #ifdef TRMMKERNEL
  136. movq OLD_OFFSET, %r11
  137. #endif
  138. #endif
  139. movlps %xmm0, ALPHA
  140. subq $-16 * SIZE, A
  141. subq $-17 * SIZE, B
  142. movq OLD_M, M
  143. movq OLD_N, N
  144. movq OLD_K, K
  145. leaq (, LDC, SIZE), LDC
  146. #ifdef TRMMKERNEL
  147. movq %r11, OFFSET
  148. #ifndef LEFT
  149. negq %r11
  150. #endif
  151. movq %r11, KK
  152. #endif
  153. movq N, J
  154. sarq $2, J
  155. NOBRANCH
  156. jle .L40
  157. ALIGN_4
  158. .L01:
  159. #if defined(TRMMKERNEL) && defined(LEFT)
  160. movq OFFSET, %rax
  161. movq %rax, KK
  162. #endif
  163. movq C, CO1
  164. leaq (C, LDC, 1), CO2
  165. movq A, AO
  166. movq K, %rax
  167. salq $BASE_SHIFT + 2, %rax
  168. leaq (B, %rax), BB
  169. movq M, I
  170. sarq $2, I # i = (m >> 2)
  171. NOBRANCH
  172. jle .L20
  173. ALIGN_4
  174. .L11:
  175. #if !defined(TRMMKERNEL) || \
  176. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  177. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  178. movq B, BO
  179. #else
  180. movq B, BO
  181. movq KK, %rax
  182. leaq (, %rax, SIZE), %rax
  183. leaq (AO, %rax, 4), AO
  184. leaq (BO, %rax, 4), BO
  185. #endif
  186. movaps -16 * SIZE(AO), %xmm0
  187. xorpd %xmm3, %xmm3
  188. movaps -14 * SIZE(AO), %xmm1
  189. xorpd %xmm4, %xmm4
  190. movaps -17 * SIZE(BO), %xmm2
  191. PREFETCHB -16 * SIZE(BB)
  192. xorpd %xmm5, %xmm5
  193. xorpd %xmm6, %xmm6
  194. PREFETCHW 3 * SIZE(CO1)
  195. movaps %xmm4, %xmm8
  196. movaps %xmm4, %xmm9
  197. PREFETCHW 7 * SIZE(CO2)
  198. movaps %xmm4, %xmm10
  199. movaps %xmm4, %xmm11
  200. PREFETCHW 3 * SIZE(CO1, LDC, 2)
  201. movaps %xmm4, %xmm12
  202. movaps %xmm4, %xmm13
  203. PREFETCHW 7 * SIZE(CO2, LDC, 2)
  204. movapd %xmm4, %xmm14
  205. movapd %xmm4, %xmm15
  206. #ifndef TRMMKERNEL
  207. movq K, %rax
  208. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  209. movq K, %rax
  210. subq KK, %rax
  211. movq %rax, KKK
  212. #else
  213. movq KK, %rax
  214. #ifdef LEFT
  215. addq $4, %rax
  216. #else
  217. addq $4, %rax
  218. #endif
  219. movq %rax, KKK
  220. #endif
  221. sarq $2, %rax
  222. NOBRANCH
  223. jle .L15
  224. ALIGN_3
  225. .L12:
  226. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  227. addpd %xmm3, %xmm11
  228. movaps -15 * SIZE(BO), %xmm3
  229. addpd %xmm4, %xmm15
  230. movaps %xmm2, %xmm4
  231. pshufd $0x4e, %xmm2, %xmm7
  232. mulpd %xmm0, %xmm2
  233. mulpd %xmm1, %xmm4
  234. addpd %xmm5, %xmm10
  235. addpd %xmm6, %xmm14
  236. movaps %xmm7, %xmm6
  237. mulpd %xmm0, %xmm7
  238. mulpd %xmm1, %xmm6
  239. addpd %xmm2, %xmm9
  240. movaps -13 * SIZE(BO), %xmm2
  241. addpd %xmm4, %xmm13
  242. movaps %xmm3, %xmm4
  243. pshufd $0x4e, %xmm3, %xmm5
  244. mulpd %xmm0, %xmm3
  245. mulpd %xmm1, %xmm4
  246. addpd %xmm7, %xmm8
  247. addpd %xmm6, %xmm12
  248. movaps %xmm5, %xmm6
  249. mulpd %xmm0, %xmm5
  250. movaps -12 * SIZE(AO), %xmm0
  251. mulpd %xmm1, %xmm6
  252. movaps -10 * SIZE(AO), %xmm1
  253. addpd %xmm3, %xmm11
  254. movaps -11 * SIZE(BO), %xmm3
  255. addpd %xmm4, %xmm15
  256. movaps %xmm2, %xmm4
  257. pshufd $0x4e, %xmm2, %xmm7
  258. mulpd %xmm0, %xmm2
  259. mulpd %xmm1, %xmm4
  260. addpd %xmm5, %xmm10
  261. addpd %xmm6, %xmm14
  262. movaps %xmm7, %xmm6
  263. mulpd %xmm0, %xmm7
  264. mulpd %xmm1, %xmm6
  265. addpd %xmm2, %xmm9
  266. movaps -9 * SIZE(BO), %xmm2
  267. addpd %xmm4, %xmm13
  268. movaps %xmm3, %xmm4
  269. pshufd $0x4e, %xmm3, %xmm5
  270. mulpd %xmm0, %xmm3
  271. mulpd %xmm1, %xmm4
  272. addpd %xmm7, %xmm8
  273. addpd %xmm6, %xmm12
  274. movaps %xmm5, %xmm6
  275. mulpd %xmm0, %xmm5
  276. movaps -8 * SIZE(AO), %xmm0
  277. mulpd %xmm1, %xmm6
  278. movaps -6 * SIZE(AO), %xmm1
  279. addpd %xmm3, %xmm11
  280. movaps -7 * SIZE(BO), %xmm3
  281. addpd %xmm4, %xmm15
  282. movapd %xmm2, %xmm4
  283. pshufd $0x4e, %xmm2, %xmm7
  284. mulpd %xmm0, %xmm2
  285. mulpd %xmm1, %xmm4
  286. addpd %xmm5, %xmm10
  287. addpd %xmm6, %xmm14
  288. movapd %xmm7, %xmm6
  289. mulpd %xmm0, %xmm7
  290. mulpd %xmm1, %xmm6
  291. PADDING
  292. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  293. addpd %xmm2, %xmm9
  294. movaps -5 * SIZE(BO), %xmm2
  295. addpd %xmm4, %xmm13
  296. movaps %xmm3, %xmm4
  297. pshufd $0x4e, %xmm3, %xmm5
  298. mulpd %xmm0, %xmm3
  299. mulpd %xmm1, %xmm4
  300. addpd %xmm7, %xmm8
  301. addpd %xmm6, %xmm12
  302. movaps %xmm5, %xmm6
  303. mulpd %xmm0, %xmm5
  304. movaps -4 * SIZE(AO), %xmm0
  305. mulpd %xmm1, %xmm6
  306. movaps -2 * SIZE(AO), %xmm1
  307. addpd %xmm3, %xmm11
  308. subq $-16 * SIZE, AO
  309. movaps -3 * SIZE(BO), %xmm3
  310. addpd %xmm4, %xmm15
  311. movaps %xmm2, %xmm4
  312. pshufd $0x4e, %xmm2, %xmm7
  313. mulpd %xmm0, %xmm2
  314. mulpd %xmm1, %xmm4
  315. addpd %xmm5, %xmm10
  316. addpd %xmm6, %xmm14
  317. movaps %xmm7, %xmm6
  318. mulpd %xmm0, %xmm7
  319. mulpd %xmm1, %xmm6
  320. addpd %xmm2, %xmm9
  321. movaps -1 * SIZE(BO), %xmm2
  322. addpd %xmm4, %xmm13
  323. movaps %xmm3, %xmm4
  324. pshufd $0x4e, %xmm3, %xmm5
  325. subq $-16 * SIZE, BO
  326. mulpd %xmm0, %xmm3
  327. mulpd %xmm1, %xmm4
  328. addpd %xmm7, %xmm8
  329. addpd %xmm6, %xmm12
  330. movaps %xmm5, %xmm6
  331. mulpd %xmm0, %xmm5
  332. movaps -16 * SIZE(AO), %xmm0
  333. mulpd %xmm1, %xmm6
  334. movaps -14 * SIZE(AO), %xmm1
  335. subq $1, %rax
  336. BRANCH
  337. jg .L12
  338. ALIGN_3
  339. .L15:
  340. PREFETCHB -8 * SIZE(BB)
  341. #ifdef DUNNINGTON
  342. PREFETCHB 0 * SIZE(BB)
  343. PREFETCHB 8 * SIZE(BB)
  344. #endif
  345. #ifndef TRMMKERNEL
  346. movq K, %rax
  347. #else
  348. movq KKK, %rax
  349. #endif
  350. andq $3, %rax # if (k & 1)
  351. BRANCH
  352. je .L18
  353. ALIGN_3
  354. .L16:
  355. addpd %xmm3, %xmm11
  356. movaps -15 * SIZE(BO), %xmm3
  357. addpd %xmm4, %xmm15
  358. movaps %xmm2, %xmm4
  359. pshufd $0x4e, %xmm2, %xmm7
  360. mulpd %xmm0, %xmm2
  361. mulpd %xmm1, %xmm4
  362. addpd %xmm5, %xmm10
  363. addpd %xmm6, %xmm14
  364. movaps %xmm7, %xmm6
  365. mulpd %xmm0, %xmm7
  366. mulpd %xmm1, %xmm6
  367. addpd %xmm2, %xmm9
  368. movaps -13 * SIZE(BO), %xmm2
  369. addpd %xmm4, %xmm13
  370. movaps %xmm3, %xmm4
  371. pshufd $0x4e, %xmm3, %xmm5
  372. mulpd %xmm0, %xmm3
  373. mulpd %xmm1, %xmm4
  374. addpd %xmm7, %xmm8
  375. addpd %xmm6, %xmm12
  376. movaps %xmm5, %xmm6
  377. mulpd %xmm0, %xmm5
  378. movaps -12 * SIZE(AO), %xmm0
  379. mulpd %xmm1, %xmm6
  380. movaps -10 * SIZE(AO), %xmm1
  381. addq $4 * SIZE, AO
  382. addq $4 * SIZE, BO
  383. subq $1, %rax
  384. BRANCH
  385. jg .L16
  386. ALIGN_4
  387. .L18:
  388. movddup ALPHA, %xmm1
  389. #ifndef DUNNINGTON
  390. subq $-16 * SIZE, BB
  391. #else
  392. subq $-32 * SIZE, BB
  393. #endif
  394. addpd %xmm3, %xmm11
  395. addpd %xmm4, %xmm15
  396. addpd %xmm5, %xmm10
  397. addpd %xmm6, %xmm14
  398. movaps %xmm8, %xmm0
  399. movsd %xmm9, %xmm8
  400. mulpd %xmm1, %xmm8
  401. movsd %xmm0, %xmm9
  402. mulpd %xmm1, %xmm9
  403. movaps %xmm10, %xmm0
  404. movsd %xmm11, %xmm10
  405. mulpd %xmm1, %xmm10
  406. movsd %xmm0, %xmm11
  407. mulpd %xmm1, %xmm11
  408. movaps %xmm12, %xmm0
  409. movsd %xmm13, %xmm12
  410. mulpd %xmm1, %xmm12
  411. movsd %xmm0, %xmm13
  412. mulpd %xmm1, %xmm13
  413. movaps %xmm14, %xmm0
  414. movsd %xmm15, %xmm14
  415. mulpd %xmm1, %xmm14
  416. movsd %xmm0, %xmm15
  417. mulpd %xmm1, %xmm15
  418. movq CO1, %rax
  419. orq LDC, %rax
  420. testq $15, %rax
  421. NOBRANCH
  422. jne .L18x
  423. #ifndef TRMMKERNEL
  424. addpd 0 * SIZE(CO1), %xmm8
  425. addpd 2 * SIZE(CO1), %xmm12
  426. addpd 0 * SIZE(CO2), %xmm9
  427. addpd 2 * SIZE(CO2), %xmm13
  428. addpd 0 * SIZE(CO1, LDC, 2), %xmm10
  429. addpd 2 * SIZE(CO1, LDC, 2), %xmm14
  430. addpd 0 * SIZE(CO2, LDC, 2), %xmm11
  431. addpd 2 * SIZE(CO2, LDC, 2), %xmm15
  432. #endif
  433. movaps %xmm8, 0 * SIZE(CO1)
  434. movaps %xmm12, 2 * SIZE(CO1)
  435. movaps %xmm9, 0 * SIZE(CO2)
  436. movaps %xmm13, 2 * SIZE(CO2)
  437. movaps %xmm10, 0 * SIZE(CO1, LDC, 2)
  438. movaps %xmm14, 2 * SIZE(CO1, LDC, 2)
  439. movaps %xmm11, 0 * SIZE(CO2, LDC, 2)
  440. movaps %xmm15, 2 * SIZE(CO2, LDC, 2)
  441. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  442. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  443. movq K, %rax
  444. subq KKK, %rax
  445. leaq (,%rax, SIZE), %rax
  446. leaq (AO, %rax, 4), AO
  447. leaq (BO, %rax, 4), BO
  448. #endif
  449. #if defined(TRMMKERNEL) && defined(LEFT)
  450. addq $4, KK
  451. #endif
  452. addq $4 * SIZE, CO1 # coffset += 4
  453. addq $4 * SIZE, CO2 # coffset += 4
  454. decq I # i --
  455. BRANCH
  456. jg .L11
  457. jmp .L20
  458. ALIGN_4
  459. .L18x:
  460. #ifndef TRMMKERNEL
  461. movsd 0 * SIZE(CO1), %xmm0
  462. movhpd 1 * SIZE(CO1), %xmm0
  463. movsd 2 * SIZE(CO1), %xmm1
  464. movhpd 3 * SIZE(CO1), %xmm1
  465. movsd 0 * SIZE(CO2), %xmm2
  466. movhpd 1 * SIZE(CO2), %xmm2
  467. movsd 2 * SIZE(CO2), %xmm3
  468. movhpd 3 * SIZE(CO2), %xmm3
  469. movsd 0 * SIZE(CO1, LDC, 2), %xmm4
  470. movhpd 1 * SIZE(CO1, LDC, 2), %xmm4
  471. movsd 2 * SIZE(CO1, LDC, 2), %xmm5
  472. movhpd 3 * SIZE(CO1, LDC, 2), %xmm5
  473. movsd 0 * SIZE(CO2, LDC, 2), %xmm6
  474. movhpd 1 * SIZE(CO2, LDC, 2), %xmm6
  475. movsd 2 * SIZE(CO2, LDC, 2), %xmm7
  476. movhpd 3 * SIZE(CO2, LDC, 2), %xmm7
  477. addpd %xmm0, %xmm8
  478. addpd %xmm1, %xmm12
  479. addpd %xmm2, %xmm9
  480. addpd %xmm3, %xmm13
  481. addpd %xmm4, %xmm10
  482. addpd %xmm5, %xmm14
  483. addpd %xmm6, %xmm11
  484. addpd %xmm7, %xmm15
  485. #endif
  486. movsd %xmm8, 0 * SIZE(CO1)
  487. movhpd %xmm8, 1 * SIZE(CO1)
  488. movsd %xmm12, 2 * SIZE(CO1)
  489. movhpd %xmm12, 3 * SIZE(CO1)
  490. movsd %xmm9, 0 * SIZE(CO2)
  491. movhpd %xmm9, 1 * SIZE(CO2)
  492. movsd %xmm13, 2 * SIZE(CO2)
  493. movhpd %xmm13, 3 * SIZE(CO2)
  494. movsd %xmm10, 0 * SIZE(CO1, LDC, 2)
  495. movhpd %xmm10, 1 * SIZE(CO1, LDC, 2)
  496. movsd %xmm14, 2 * SIZE(CO1, LDC, 2)
  497. movhpd %xmm14, 3 * SIZE(CO1, LDC, 2)
  498. movsd %xmm11, 0 * SIZE(CO2, LDC, 2)
  499. movhpd %xmm11, 1 * SIZE(CO2, LDC, 2)
  500. movsd %xmm15, 2 * SIZE(CO2, LDC, 2)
  501. movhpd %xmm15, 3 * SIZE(CO2, LDC, 2)
  502. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  503. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  504. movq K, %rax
  505. subq KKK, %rax
  506. leaq (,%rax, SIZE), %rax
  507. leaq (AO, %rax, 4), AO
  508. leaq (BO, %rax, 4), BO
  509. #endif
  510. #if defined(TRMMKERNEL) && defined(LEFT)
  511. addq $4, KK
  512. #endif
  513. addq $4 * SIZE, CO1 # coffset += 4
  514. addq $4 * SIZE, CO2 # coffset += 4
  515. decq I # i --
  516. BRANCH
  517. jg .L11
  518. ALIGN_4
  519. .L20:
  520. testq $2, M
  521. BRANCH
  522. jle .L30
  523. ALIGN_4
  524. #if !defined(TRMMKERNEL) || \
  525. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  526. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  527. movq B, BO
  528. #else
  529. movq B, BO
  530. movq KK, %rax
  531. leaq (, %rax, SIZE), %rax
  532. leaq (AO, %rax, 2), AO
  533. leaq (BO, %rax, 4), BO
  534. #endif
  535. movaps -16 * SIZE(AO), %xmm0
  536. movaps -17 * SIZE(BO), %xmm2
  537. movaps -15 * SIZE(BO), %xmm3
  538. xorps %xmm3, %xmm3
  539. xorps %xmm4, %xmm4
  540. xorps %xmm5, %xmm5
  541. xorps %xmm6, %xmm6
  542. movaps %xmm3, %xmm8
  543. movaps %xmm3, %xmm9
  544. movaps %xmm3, %xmm10
  545. movaps %xmm3, %xmm11
  546. #ifndef TRMMKERNEL
  547. movq K, %rax
  548. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  549. movq K, %rax
  550. subq KK, %rax
  551. movq %rax, KKK
  552. #else
  553. movq KK, %rax
  554. #ifdef LEFT
  555. addq $2, %rax
  556. #else
  557. addq $4, %rax
  558. #endif
  559. movq %rax, KKK
  560. #endif
  561. sarq $2, %rax
  562. NOBRANCH
  563. jle .L25
  564. ALIGN_4
  565. .L22:
  566. addpd %xmm3, %xmm11
  567. movaps -15 * SIZE(BO), %xmm3
  568. pshufd $0x4e, %xmm2, %xmm7
  569. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  570. mulpd %xmm0, %xmm2
  571. addpd %xmm5, %xmm10
  572. mulpd %xmm0, %xmm7
  573. addpd %xmm2, %xmm9
  574. movaps -13 * SIZE(BO), %xmm2
  575. pshufd $0x4e, %xmm3, %xmm5
  576. mulpd %xmm0, %xmm3
  577. addpd %xmm7, %xmm8
  578. mulpd %xmm0, %xmm5
  579. movaps -14 * SIZE(AO), %xmm0
  580. addpd %xmm3, %xmm11
  581. movaps -11 * SIZE(BO), %xmm3
  582. pshufd $0x4e, %xmm2, %xmm7
  583. mulpd %xmm0, %xmm2
  584. addpd %xmm5, %xmm10
  585. mulpd %xmm0, %xmm7
  586. addpd %xmm2, %xmm9
  587. movaps -9 * SIZE(BO), %xmm2
  588. pshufd $0x4e, %xmm3, %xmm5
  589. mulpd %xmm0, %xmm3
  590. addpd %xmm7, %xmm8
  591. mulpd %xmm0, %xmm5
  592. movaps -12 * SIZE(AO), %xmm0
  593. addpd %xmm3, %xmm11
  594. movaps -7 * SIZE(BO), %xmm3
  595. pshufd $0x4e, %xmm2, %xmm7
  596. mulpd %xmm0, %xmm2
  597. addpd %xmm5, %xmm10
  598. mulpd %xmm0, %xmm7
  599. addpd %xmm2, %xmm9
  600. movaps -5 * SIZE(BO), %xmm2
  601. pshufd $0x4e, %xmm3, %xmm5
  602. mulpd %xmm0, %xmm3
  603. addpd %xmm7, %xmm8
  604. mulpd %xmm0, %xmm5
  605. movaps -10 * SIZE(AO), %xmm0
  606. addpd %xmm3, %xmm11
  607. movaps -3 * SIZE(BO), %xmm3
  608. pshufd $0x4e, %xmm2, %xmm7
  609. mulpd %xmm0, %xmm2
  610. addpd %xmm5, %xmm10
  611. mulpd %xmm0, %xmm7
  612. subq $ -8 * SIZE, AO
  613. addpd %xmm2, %xmm9
  614. movaps -1 * SIZE(BO), %xmm2
  615. pshufd $0x4e, %xmm3, %xmm5
  616. mulpd %xmm0, %xmm3
  617. addpd %xmm7, %xmm8
  618. mulpd %xmm0, %xmm5
  619. movaps -16 * SIZE(AO), %xmm0
  620. subq $-16 * SIZE, BO
  621. subq $1, %rax
  622. BRANCH
  623. jg .L22
  624. ALIGN_4
  625. .L25:
  626. #ifndef TRMMKERNEL
  627. movq K, %rax
  628. #else
  629. movq KKK, %rax
  630. #endif
  631. andq $3, %rax # if (k & 1)
  632. BRANCH
  633. je .L28
  634. ALIGN_4
  635. .L26:
  636. addpd %xmm3, %xmm11
  637. movaps -15 * SIZE(BO), %xmm3
  638. pshufd $0x4e, %xmm2, %xmm7
  639. mulpd %xmm0, %xmm2
  640. addpd %xmm5, %xmm10
  641. mulpd %xmm0, %xmm7
  642. addpd %xmm2, %xmm9
  643. movaps -13 * SIZE(BO), %xmm2
  644. pshufd $0x4e, %xmm3, %xmm5
  645. mulpd %xmm0, %xmm3
  646. addpd %xmm7, %xmm8
  647. mulpd %xmm0, %xmm5
  648. movaps -14 * SIZE(AO), %xmm0
  649. addq $2 * SIZE, AO
  650. addq $4 * SIZE, BO
  651. subq $1, %rax
  652. BRANCH
  653. jg .L26
  654. ALIGN_4
  655. .L28:
  656. addpd %xmm3, %xmm11
  657. addpd %xmm5, %xmm10
  658. movddup ALPHA, %xmm3
  659. movaps %xmm8, %xmm0
  660. movsd %xmm9, %xmm8
  661. mulpd %xmm3, %xmm8
  662. movsd %xmm0, %xmm9
  663. mulpd %xmm3, %xmm9
  664. movaps %xmm10, %xmm0
  665. movsd %xmm11, %xmm10
  666. mulpd %xmm3, %xmm10
  667. movsd %xmm0, %xmm11
  668. mulpd %xmm3, %xmm11
  669. #ifndef TRMMKERNEL
  670. movsd 0 * SIZE(CO1), %xmm0
  671. movhpd 1 * SIZE(CO1), %xmm0
  672. movsd 0 * SIZE(CO2), %xmm2
  673. movhpd 1 * SIZE(CO2), %xmm2
  674. movsd 0 * SIZE(CO1, LDC, 2), %xmm4
  675. movhpd 1 * SIZE(CO1, LDC, 2), %xmm4
  676. movsd 0 * SIZE(CO2, LDC, 2), %xmm6
  677. movhpd 1 * SIZE(CO2, LDC, 2), %xmm6
  678. addpd %xmm0, %xmm8
  679. addpd %xmm2, %xmm9
  680. addpd %xmm4, %xmm10
  681. addpd %xmm6, %xmm11
  682. #endif
  683. movsd %xmm8, 0 * SIZE(CO1)
  684. movhpd %xmm8, 1 * SIZE(CO1)
  685. movsd %xmm9, 0 * SIZE(CO2)
  686. movhpd %xmm9, 1 * SIZE(CO2)
  687. movsd %xmm10, 0 * SIZE(CO1, LDC, 2)
  688. movhpd %xmm10, 1 * SIZE(CO1, LDC, 2)
  689. movsd %xmm11, 0 * SIZE(CO2, LDC, 2)
  690. movhpd %xmm11, 1 * SIZE(CO2, LDC, 2)
  691. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  692. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  693. movq K, %rax
  694. subq KKK, %rax
  695. leaq (,%rax, SIZE), %rax
  696. leaq (AO, %rax, 2), AO
  697. leaq (BO, %rax, 4), BO
  698. #endif
  699. #if defined(TRMMKERNEL) && defined(LEFT)
  700. addq $2, KK
  701. #endif
  702. addq $2 * SIZE, CO1 # coffset += 4
  703. addq $2 * SIZE, CO2 # coffset += 4
  704. ALIGN_4
  705. .L30:
  706. testq $1, M
  707. BRANCH
  708. jle .L39
  709. ALIGN_4
  710. #if !defined(TRMMKERNEL) || \
  711. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  712. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  713. movq B, BO
  714. #else
  715. movq B, BO
  716. movq KK, %rax
  717. leaq (, %rax, SIZE), %rax
  718. addq %rax, AO
  719. leaq (BO, %rax, 4), BO
  720. #endif
  721. movsd -16 * SIZE(AO), %xmm0
  722. movaps -17 * SIZE(BO), %xmm2
  723. movaps -15 * SIZE(BO), %xmm3
  724. xorps %xmm8, %xmm8
  725. xorps %xmm9, %xmm9
  726. xorps %xmm10, %xmm10
  727. xorps %xmm11, %xmm11
  728. #ifndef TRMMKERNEL
  729. movq K, %rax
  730. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  731. movq K, %rax
  732. subq KK, %rax
  733. movq %rax, KKK
  734. #else
  735. movq KK, %rax
  736. #ifdef LEFT
  737. addq $1, %rax
  738. #else
  739. addq $4, %rax
  740. #endif
  741. movq %rax, KKK
  742. #endif
  743. sarq $2, %rax
  744. NOBRANCH
  745. jle .L35
  746. ALIGN_4
  747. .L32:
  748. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  749. shufps $0x44, %xmm0, %xmm0
  750. mulpd %xmm0, %xmm2
  751. mulpd %xmm0, %xmm3
  752. movsd -15 * SIZE(AO), %xmm0
  753. addpd %xmm2, %xmm8
  754. movaps -13 * SIZE(BO), %xmm2
  755. addpd %xmm3, %xmm9
  756. movaps -11 * SIZE(BO), %xmm3
  757. shufps $0x44, %xmm0, %xmm0
  758. mulpd %xmm0, %xmm2
  759. mulpd %xmm0, %xmm3
  760. movsd -14 * SIZE(AO), %xmm0
  761. addpd %xmm2, %xmm10
  762. movaps -9 * SIZE(BO), %xmm2
  763. addpd %xmm3, %xmm11
  764. movaps -7 * SIZE(BO), %xmm3
  765. shufps $0x44, %xmm0, %xmm0
  766. mulpd %xmm0, %xmm2
  767. mulpd %xmm0, %xmm3
  768. movsd -13 * SIZE(AO), %xmm0
  769. addpd %xmm2, %xmm8
  770. movaps -5 * SIZE(BO), %xmm2
  771. addpd %xmm3, %xmm9
  772. movaps -3 * SIZE(BO), %xmm3
  773. shufps $0x44, %xmm0, %xmm0
  774. mulpd %xmm0, %xmm2
  775. mulpd %xmm0, %xmm3
  776. movsd -12 * SIZE(AO), %xmm0
  777. addpd %xmm2, %xmm10
  778. movaps -1 * SIZE(BO), %xmm2
  779. addpd %xmm3, %xmm11
  780. movaps 1 * SIZE(BO), %xmm3
  781. subq $ -4 * SIZE, AO
  782. subq $-16 * SIZE, BO
  783. subq $1, %rax
  784. BRANCH
  785. jg .L32
  786. ALIGN_4
  787. .L35:
  788. #ifndef TRMMKERNEL
  789. movq K, %rax
  790. #else
  791. movq KKK, %rax
  792. #endif
  793. andq $3, %rax # if (k & 1)
  794. BRANCH
  795. je .L38
  796. ALIGN_4
  797. .L36:
  798. shufps $0x44, %xmm0, %xmm0
  799. mulpd %xmm0, %xmm2
  800. mulpd %xmm0, %xmm3
  801. movsd -15 * SIZE(AO), %xmm0
  802. addpd %xmm2, %xmm8
  803. movaps -13 * SIZE(BO), %xmm2
  804. addpd %xmm3, %xmm9
  805. movaps -11 * SIZE(BO), %xmm3
  806. addq $1 * SIZE, AO
  807. addq $4 * SIZE, BO
  808. subq $1, %rax
  809. BRANCH
  810. jg .L36
  811. ALIGN_4
  812. .L38:
  813. movddup ALPHA, %xmm3
  814. addpd %xmm10, %xmm8
  815. addpd %xmm11, %xmm9
  816. #ifndef TRMMKERNEL
  817. movsd 0 * SIZE(CO1), %xmm0
  818. movhpd 0 * SIZE(CO2), %xmm0
  819. movsd 0 * SIZE(CO1, LDC, 2), %xmm1
  820. movhpd 0 * SIZE(CO2, LDC, 2), %xmm1
  821. #endif
  822. mulpd %xmm3, %xmm8
  823. mulpd %xmm3, %xmm9
  824. #ifndef TRMMKERNEL
  825. addpd %xmm0, %xmm8
  826. addpd %xmm1, %xmm9
  827. #endif
  828. movlpd %xmm8, 0 * SIZE(CO1)
  829. movhpd %xmm8, 0 * SIZE(CO2)
  830. movlpd %xmm9, 0 * SIZE(CO1, LDC, 2)
  831. movhpd %xmm9, 0 * SIZE(CO2, LDC, 2)
  832. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  833. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  834. movq K, %rax
  835. subq KKK, %rax
  836. leaq (,%rax, SIZE), %rax
  837. addq %rax, AO
  838. leaq (BO, %rax, 4), BO
  839. #endif
  840. #if defined(TRMMKERNEL) && defined(LEFT)
  841. addq $1, KK
  842. #endif
  843. ALIGN_4
  844. .L39:
  845. #if defined(TRMMKERNEL) && !defined(LEFT)
  846. addq $4, KK
  847. #endif
  848. movq BO, B
  849. leaq (C, LDC, 4), C
  850. subq $1, J
  851. BRANCH
  852. jg .L01
  853. ALIGN_4
  /* .L40: N%4 has a 2-column remainder.  Set up CO1/CO2 for two columns
     of C, then run the 4x2 micro-kernel (.L51) over M/4 row blocks:
     4x-unrolled k-loop (.L52), k%4 remainder (.L56), and alpha-scale +
     store (.L58).  xmm8/xmm12 and xmm9/xmm13 hold the four 2-wide
     accumulators; pshufd $0x4e swaps the B pair so the two columns are
     covered, which is undone by the movsd swaps at .L58. */
  854. .L40:
  855. testq $2, N                         # is there a 2-column remainder?
  856. BRANCH
  857. jle .L80
  858. movq C, CO1                         # CO1 -> column j, CO2 -> column j+1
  859. leaq (C, LDC, 1), CO2
  860. movq A, AO
  861. #if defined(TRMMKERNEL) && defined(LEFT)
  862. movq OFFSET, %rax                   # reset KK for a fresh column block
  863. movq %rax, KK
  864. #endif
  865. movq K, %rax
  866. salq $BASE_SHIFT + 1, %rax          # K * 2 columns, scaled to bytes
  867. leaq (B, %rax), BB                  # BB: prefetch pointer into packed B
  868. movq M, I
  869. sarq $2, I # i = (m >> 2)
  870. NOBRANCH
  871. jle .L60
  872. ALIGN_4
  873. .L51:                               # per 4-row block of C
  874. #if !defined(TRMMKERNEL) || \
  875. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  876. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  877. movq B, BO
  878. #else
  879. movq B, BO                          # TRMM: skip KK rows/cols of packed data
  880. movq KK, %rax
  881. leaq (, %rax, SIZE), %rax
  882. leaq (AO, %rax, 4), AO              # 4 A values per k
  883. leaq (BO, %rax, 2), BO              # 2 B values per k
  884. #endif
  885. PREFETCHB -16 * SIZE(BB)            # stream the packed B panel
  886. subq $-4 * SIZE, BB
  887. movaps -16 * SIZE(AO), %xmm0        # preload a[0:1], a[2:3]
  888. movaps -14 * SIZE(AO), %xmm1
  889. movaps -17 * SIZE(BO), %xmm2        # preload b[0:1]
  890. PREFETCHW 3 * SIZE(CO1)             # prefetch C destinations for write
  891. xorps %xmm8, %xmm8                  # clear the four 2-wide accumulators
  892. xorps %xmm9, %xmm9
  893. PREFETCHW 3 * SIZE(CO2)
  894. xorps %xmm12, %xmm12
  895. xorps %xmm13, %xmm13
  896. #ifndef TRMMKERNEL
  897. movq K, %rax                        # GEMM: full K loop
  898. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  899. movq K, %rax                        # TRMM: K - KK iterations
  900. subq KK, %rax
  901. movq %rax, KKK
  902. #else
  903. movq KK, %rax                       # TRMM: KK (+ tile size) iterations
  904. #ifdef LEFT
  905. addq $4, %rax
  906. #else
  907. addq $2, %rax
  908. #endif
  909. movq %rax, KKK
  910. #endif
  911. sarq $2, %rax                       # unrolled count = k / 4
  912. NOBRANCH
  913. jle .L55
  914. ALIGN_4
  915. .L52:                               # k-loop unrolled 4x; k-step 1
  916. movaps %xmm2, %xmm4
  917. pshufd $0x4e, %xmm2, %xmm7          # swap b pair for the other column
  918. mulpd %xmm0, %xmm2
  919. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  920. mulpd %xmm1, %xmm4
  921. movaps %xmm7, %xmm6
  922. mulpd %xmm0, %xmm7
  923. movaps -12 * SIZE(AO), %xmm0        # preload A for next k-step
  924. mulpd %xmm1, %xmm6
  925. movaps -10 * SIZE(AO), %xmm1
  926. addpd %xmm2, %xmm9
  927. movaps -15 * SIZE(BO), %xmm2        # preload B for next k-step
  928. addpd %xmm4, %xmm13
  929. addpd %xmm7, %xmm8
  930. addpd %xmm6, %xmm12
  931. movaps %xmm2, %xmm4                 # k-step 2
  932. pshufd $0x4e, %xmm2, %xmm7
  933. mulpd %xmm0, %xmm2
  934. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  935. mulpd %xmm1, %xmm4
  936. movaps %xmm7, %xmm6
  937. mulpd %xmm0, %xmm7
  938. movaps -8 * SIZE(AO), %xmm0
  939. mulpd %xmm1, %xmm6
  940. movaps -6 * SIZE(AO), %xmm1
  941. addpd %xmm2, %xmm9
  942. movaps -13 * SIZE(BO), %xmm2
  943. addpd %xmm4, %xmm13
  944. addpd %xmm7, %xmm8
  945. addpd %xmm6, %xmm12
  946. movaps %xmm2, %xmm4                 # k-step 3
  947. pshufd $0x4e, %xmm2, %xmm7
  948. mulpd %xmm0, %xmm2
  949. mulpd %xmm1, %xmm4
  950. movaps %xmm7, %xmm6
  951. mulpd %xmm0, %xmm7
  952. movaps -4 * SIZE(AO), %xmm0
  953. mulpd %xmm1, %xmm6
  954. movaps -2 * SIZE(AO), %xmm1
  955. addpd %xmm2, %xmm9
  956. movaps -11 * SIZE(BO), %xmm2
  957. addpd %xmm4, %xmm13
  958. addpd %xmm7, %xmm8
  959. addpd %xmm6, %xmm12
  960. movaps %xmm2, %xmm4                 # k-step 4
  961. pshufd $0x4e, %xmm2, %xmm7
  962. mulpd %xmm0, %xmm2
  963. mulpd %xmm1, %xmm4
  964. movaps %xmm7, %xmm6
  965. mulpd %xmm0, %xmm7
  966. movaps 0 * SIZE(AO), %xmm0
  967. mulpd %xmm1, %xmm6
  968. movaps 2 * SIZE(AO), %xmm1
  969. addpd %xmm2, %xmm9
  970. movaps -9 * SIZE(BO), %xmm2
  971. addpd %xmm4, %xmm13
  972. addpd %xmm7, %xmm8
  973. addpd %xmm6, %xmm12
  974. subq $-16 * SIZE, AO                # AO += 16 doubles (4 k-steps, M=4)
  975. subq $ -8 * SIZE, BO                # BO += 8 doubles (4 k-steps, N=2)
  976. subq $1, %rax
  977. BRANCH
  978. jg .L52
  979. ALIGN_4
  980. .L55:
  981. #ifndef TRMMKERNEL
  982. movq K, %rax
  983. #else
  984. movq KKK, %rax
  985. #endif
  986. andq $3, %rax # if (k & 1)
  987. BRANCH
  988. je .L58
  989. ALIGN_4
  990. .L56:                               # k%4 remainder, same FMA pattern
  991. movaps %xmm2, %xmm4
  992. pshufd $0x4e, %xmm2, %xmm7
  993. mulpd %xmm0, %xmm2
  994. mulpd %xmm1, %xmm4
  995. movaps %xmm7, %xmm6
  996. mulpd %xmm0, %xmm7
  997. movaps -12 * SIZE(AO), %xmm0
  998. mulpd %xmm1, %xmm6
  999. movaps -10 * SIZE(AO), %xmm1
  1000. addpd %xmm2, %xmm9
  1001. movaps -15 * SIZE(BO), %xmm2
  1002. addpd %xmm4, %xmm13
  1003. addpd %xmm7, %xmm8
  1004. addpd %xmm6, %xmm12
  1005. addq $4 * SIZE, AO                 # 4 A values / 2 B values per k-step
  1006. addq $2 * SIZE, BO
  1007. subq $1, %rax
  1008. BRANCH
  1009. jg .L56
  1010. ALIGN_4
  1011. .L58:                              # write-back for this 4x2 tile
  1012. movddup ALPHA, %xmm3               # broadcast alpha
  1013. movaps %xmm8, %xmm0                # swap low qwords of xmm8/xmm9 to undo
  1014. movsd %xmm9, %xmm8                 # the pshufd column swap, then scale
  1015. mulpd %xmm3, %xmm8
  1016. movsd %xmm0, %xmm9
  1017. mulpd %xmm3, %xmm9
  1018. movaps %xmm12, %xmm0               # same fix-up for xmm12/xmm13
  1019. movsd %xmm13, %xmm12
  1020. mulpd %xmm3, %xmm12
  1021. movsd %xmm0, %xmm13
  1022. mulpd %xmm3, %xmm13
  1023. #ifndef TRMMKERNEL
  1024. movsd 0 * SIZE(CO1), %xmm0         # GEMM path: load current C values
  1025. movhpd 1 * SIZE(CO1), %xmm0
  1026. movsd 2 * SIZE(CO1), %xmm1
  1027. movhpd 3 * SIZE(CO1), %xmm1
  1028. movsd 0 * SIZE(CO2), %xmm2
  1029. movhpd 1 * SIZE(CO2), %xmm2
  1030. movsd 2 * SIZE(CO2), %xmm3
  1031. movhpd 3 * SIZE(CO2), %xmm3
  1032. addpd %xmm0, %xmm8                 # C += alpha*A*B
  1033. addpd %xmm1, %xmm12
  1034. addpd %xmm2, %xmm9
  1035. addpd %xmm3, %xmm13
  1036. #endif
  1037. movsd %xmm8, 0 * SIZE(CO1)         # store rows 0..3 of both columns
  1038. movhpd %xmm8, 1 * SIZE(CO1)
  1039. movsd %xmm12, 2 * SIZE(CO1)
  1040. movhpd %xmm12, 3 * SIZE(CO1)
  1041. movsd %xmm9, 0 * SIZE(CO2)
  1042. movhpd %xmm9, 1 * SIZE(CO2)
  1043. movsd %xmm13, 2 * SIZE(CO2)
  1044. movhpd %xmm13, 3 * SIZE(CO2)
  1045. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1046. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1047. movq K, %rax                       # TRMM: skip untraversed K range
  1048. subq KKK, %rax
  1049. leaq (,%rax, SIZE), %rax
  1050. leaq (AO, %rax, 4), AO
  1051. leaq (BO, %rax, 2), BO
  1052. #endif
  1053. #if defined(TRMMKERNEL) && defined(LEFT)
  1054. addq $4, KK                        # advance offset by this M-block (4)
  1055. #endif
  1056. addq $4 * SIZE, CO1                # next 4-row block of C
  1057. addq $4 * SIZE, CO2
  1058. decq I
  1059. BRANCH
  1060. jg .L51
  1061. ALIGN_4
  /* .L60: 2x2 micro-kernel for the M%4==2 remainder rows of the 2-column
     block.  xmm8/xmm9 (plus secondary xmm10/xmm11, folded at .L68) hold
     the 2x2 accumulators; the pshufd $0x4e column swap is undone by the
     movsd swap before the alpha-scale. */
  1062. .L60:
  1063. testq $2, M                       # 2-row remainder present?
  1064. BRANCH
  1065. jle .L70
  1066. ALIGN_4
  1067. #if !defined(TRMMKERNEL) || \
  1068. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1069. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1070. movq B, BO
  1071. #else
  1072. movq B, BO                        # TRMM: skip KK entries of packed data
  1073. movq KK, %rax
  1074. leaq (, %rax, SIZE), %rax
  1075. leaq (AO, %rax, 2), AO            # 2 A values per k
  1076. leaq (BO, %rax, 2), BO            # 2 B values per k
  1077. #endif
  1078. movaps -16 * SIZE(AO), %xmm0      # preload a[0:1]
  1079. xorps %xmm8, %xmm8                # clear accumulators
  1080. xorps %xmm9, %xmm9
  1081. movaps -17 * SIZE(BO), %xmm2      # preload b[0:1]
  1082. xorps %xmm10, %xmm10
  1083. xorps %xmm11, %xmm11
  1084. #ifndef TRMMKERNEL
  1085. movq K, %rax                      # GEMM: full K loop
  1086. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1087. movq K, %rax                      # TRMM: K - KK iterations
  1088. subq KK, %rax
  1089. movq %rax, KKK
  1090. #else
  1091. movq KK, %rax                     # TRMM: KK + 2 iterations either way
  1092. #ifdef LEFT
  1093. addq $2, %rax
  1094. #else
  1095. addq $2, %rax
  1096. #endif
  1097. movq %rax, KKK
  1098. #endif
  1099. sarq $2, %rax                     # unrolled count = k / 4
  1100. NOBRANCH
  1101. jle .L65
  1102. ALIGN_4
  1103. .L62:                             # k-loop unrolled 4x
  1104. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1105. pshufd $0x4e, %xmm2, %xmm7        # swapped b pair -> other column
  1106. mulpd %xmm0, %xmm2
  1107. mulpd %xmm0, %xmm7
  1108. movaps -14 * SIZE(AO), %xmm0      # preload A for next k-step
  1109. addpd %xmm2, %xmm9
  1110. addpd %xmm7, %xmm8
  1111. movaps -15 * SIZE(BO), %xmm2
  1112. pshufd $0x4e, %xmm2, %xmm7        # k-step 2 (secondary accumulators)
  1113. mulpd %xmm0, %xmm2
  1114. mulpd %xmm0, %xmm7
  1115. movaps -12 * SIZE(AO), %xmm0
  1116. addpd %xmm2, %xmm11
  1117. addpd %xmm7, %xmm10
  1118. movaps -13 * SIZE(BO), %xmm2
  1119. pshufd $0x4e, %xmm2, %xmm7        # k-step 3
  1120. mulpd %xmm0, %xmm2
  1121. mulpd %xmm0, %xmm7
  1122. movaps -10 * SIZE(AO), %xmm0
  1123. addpd %xmm2, %xmm9
  1124. addpd %xmm7, %xmm8
  1125. movaps -11 * SIZE(BO), %xmm2
  1126. pshufd $0x4e, %xmm2, %xmm7        # k-step 4
  1127. mulpd %xmm0, %xmm2
  1128. mulpd %xmm0, %xmm7
  1129. movaps -8 * SIZE(AO), %xmm0
  1130. addpd %xmm2, %xmm11
  1131. addpd %xmm7, %xmm10
  1132. movaps -9 * SIZE(BO), %xmm2
  1133. subq $-8 * SIZE, AO               # AO += 8 doubles (4 k-steps, M=2)
  1134. subq $-8 * SIZE, BO               # BO += 8 doubles (4 k-steps, N=2)
  1135. subq $1, %rax
  1136. BRANCH
  1137. jg .L62
  1138. ALIGN_4
  1139. .L65:
  1140. #ifndef TRMMKERNEL
  1141. movq K, %rax
  1142. #else
  1143. movq KKK, %rax
  1144. #endif
  1145. andq $3, %rax # if (k & 1)
  1146. BRANCH
  1147. je .L68
  1148. ALIGN_4
  1149. .L66:                             # k%4 remainder loop
  1150. pshufd $0x4e, %xmm2, %xmm7
  1151. mulpd %xmm0, %xmm2
  1152. mulpd %xmm0, %xmm7
  1153. movaps -14 * SIZE(AO), %xmm0
  1154. addpd %xmm2, %xmm9
  1155. addpd %xmm7, %xmm8
  1156. movaps -15 * SIZE(BO), %xmm2
  1157. addq $2 * SIZE, AO                # 2 A / 2 B values per k-step
  1158. addq $2 * SIZE, BO
  1159. subq $1, %rax
  1160. BRANCH
  1161. jg .L66
  1162. ALIGN_4
  1163. .L68:                             # write-back for this 2x2 tile
  1164. addpd %xmm10, %xmm8               # fold secondary accumulators
  1165. addpd %xmm11, %xmm9
  1166. movddup ALPHA, %xmm3              # broadcast alpha
  1167. movaps %xmm8, %xmm0               # swap low qwords to undo the pshufd
  1168. movsd %xmm9, %xmm8                # column swap, then scale
  1169. mulpd %xmm3, %xmm8
  1170. movsd %xmm0, %xmm9
  1171. mulpd %xmm3, %xmm9
  1172. #ifndef TRMMKERNEL
  1173. movsd 0 * SIZE(CO1), %xmm0        # GEMM path: C += alpha*A*B
  1174. movhpd 1 * SIZE(CO1), %xmm0
  1175. movsd 0 * SIZE(CO2), %xmm2
  1176. movhpd 1 * SIZE(CO2), %xmm2
  1177. addpd %xmm0, %xmm8
  1178. addpd %xmm2, %xmm9
  1179. #endif
  1180. movsd %xmm8, 0 * SIZE(CO1)        # store both rows of both columns
  1181. movhpd %xmm8, 1 * SIZE(CO1)
  1182. movsd %xmm9, 0 * SIZE(CO2)
  1183. movhpd %xmm9, 1 * SIZE(CO2)
  1184. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1185. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1186. movq K, %rax                      # TRMM: skip untraversed K range
  1187. subq KKK, %rax
  1188. leaq (,%rax, SIZE), %rax
  1189. leaq (AO, %rax, 2), AO
  1190. leaq (BO, %rax, 2), BO
  1191. #endif
  1192. #if defined(TRMMKERNEL) && defined(LEFT)
  1193. addq $2, KK                       # advance offset by this M-block (2)
  1194. #endif
  1195. addq $2 * SIZE, CO1               # advance C pointers past the 2 rows
  1196. addq $2 * SIZE, CO2
  1197. ALIGN_4
  /* .L70: 1x2 micro-kernel for the last M%2 row of the 2-column block.
     A single A value is broadcast (shufps $0x44) against a B pair each
     k-step; xmm8/xmm9 are folded at .L78.  .L79 then closes the whole
     N=2 section: advance C by two columns and commit B. */
  1198. .L70:
  1199. testq $1, M                       # final odd row present?
  1200. BRANCH
  1201. jle .L79
  1202. ALIGN_4
  1203. #if !defined(TRMMKERNEL) || \
  1204. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1205. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1206. movq B, BO
  1207. #else
  1208. movq B, BO                        # TRMM: skip KK entries of packed data
  1209. movq KK, %rax
  1210. leaq (, %rax, SIZE), %rax
  1211. addq %rax, AO                     # 1 A value per k
  1212. leaq (BO, %rax, 2), BO            # 2 B values per k
  1213. #endif
  1214. movsd -16 * SIZE(AO), %xmm0       # preload a[0]
  1215. movaps -17 * SIZE(BO), %xmm2      # preload b[0:1]
  1216. xorps %xmm8, %xmm8                # clear accumulators
  1217. xorps %xmm9, %xmm9
  1218. #ifndef TRMMKERNEL
  1219. movq K, %rax                      # GEMM: full K loop
  1220. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1221. movq K, %rax                      # TRMM: K - KK iterations
  1222. subq KK, %rax
  1223. movq %rax, KKK
  1224. #else
  1225. movq KK, %rax                     # TRMM: KK + tile size iterations
  1226. #ifdef LEFT
  1227. addq $1, %rax
  1228. #else
  1229. addq $2, %rax
  1230. #endif
  1231. movq %rax, KKK
  1232. #endif
  1233. sarq $2, %rax                     # unrolled count = k / 4
  1234. NOBRANCH
  1235. jle .L75
  1236. ALIGN_4
  1237. .L72:                             # k-loop unrolled 4x
  1238. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1239. shufps $0x44, %xmm0, %xmm0        # broadcast a[k] into both halves
  1240. mulpd %xmm0, %xmm2                # a[k] * b[k][0:1]
  1241. movsd -15 * SIZE(AO), %xmm0
  1242. addpd %xmm2, %xmm8
  1243. movaps -15 * SIZE(BO), %xmm2
  1244. shufps $0x44, %xmm0, %xmm0        # k-step 2 (secondary accumulator)
  1245. mulpd %xmm0, %xmm2
  1246. movsd -14 * SIZE(AO), %xmm0
  1247. addpd %xmm2, %xmm9
  1248. movaps -13 * SIZE(BO), %xmm2
  1249. shufps $0x44, %xmm0, %xmm0        # k-step 3
  1250. mulpd %xmm0, %xmm2
  1251. movsd -13 * SIZE(AO), %xmm0
  1252. addpd %xmm2, %xmm8
  1253. movaps -11 * SIZE(BO), %xmm2
  1254. shufps $0x44, %xmm0, %xmm0        # k-step 4
  1255. mulpd %xmm0, %xmm2
  1256. movsd -12 * SIZE(AO), %xmm0
  1257. addpd %xmm2, %xmm9
  1258. movaps -9 * SIZE(BO), %xmm2
  1259. subq $-4 * SIZE, AO               # AO += 4 doubles (4 k-steps, M=1)
  1260. subq $-8 * SIZE, BO               # BO += 8 doubles (4 k-steps, N=2)
  1261. subq $1, %rax
  1262. BRANCH
  1263. jg .L72
  1264. ALIGN_4
  1265. .L75:
  1266. #ifndef TRMMKERNEL
  1267. movq K, %rax
  1268. #else
  1269. movq KKK, %rax
  1270. #endif
  1271. andq $3, %rax # if (k & 1)
  1272. BRANCH
  1273. je .L78
  1274. ALIGN_4
  1275. .L76:                             # k%4 remainder loop
  1276. shufps $0x44, %xmm0, %xmm0
  1277. mulpd %xmm0, %xmm2
  1278. movsd -15 * SIZE(AO), %xmm0
  1279. addpd %xmm2, %xmm8
  1280. movaps -15 * SIZE(BO), %xmm2
  1281. addq $1 * SIZE, AO                # 1 A / 2 B values per k-step
  1282. addq $2 * SIZE, BO
  1283. subq $1, %rax
  1284. BRANCH
  1285. jg .L76
  1286. ALIGN_4
  1287. .L78:                             # write-back for this 1x2 tile
  1288. movddup ALPHA, %xmm3              # broadcast alpha
  1289. addpd %xmm9, %xmm8                # fold secondary accumulator
  1290. #ifndef TRMMKERNEL
  1291. movsd 0 * SIZE(CO1), %xmm0        # GEMM path: load current C values
  1292. movhpd 0 * SIZE(CO2), %xmm0
  1293. #endif
  1294. mulpd %xmm3, %xmm8                # alpha * (A*B)
  1295. #ifndef TRMMKERNEL
  1296. addpd %xmm0, %xmm8                # C += alpha*A*B
  1297. #endif
  1298. movlpd %xmm8, 0 * SIZE(CO1)       # one element in each of the 2 columns
  1299. movhpd %xmm8, 0 * SIZE(CO2)
  1300. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1301. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1302. movq K, %rax                      # TRMM: skip untraversed K range
  1303. subq KKK, %rax
  1304. leaq (,%rax, SIZE), %rax
  1305. addq %rax, AO
  1306. leaq (BO, %rax, 2), BO
  1307. #endif
  1308. #if defined(TRMMKERNEL) && defined(LEFT)
  1309. addq $1, KK                       # advance offset by this M-block (1)
  1310. #endif
  1311. ALIGN_4
  1312. .L79:                             # end of the N=2 column block
  1313. #if defined(TRMMKERNEL) && !defined(LEFT)
  1314. addq $2, KK                       # advance offset by this N-block (2)
  1315. #endif
  1316. leaq (C, LDC, 2), C               # C += 2 columns
  1317. movq BO, B                        # B <- end of the consumed packed panel
  1318. ALIGN_4
  /* .L80: final single-column remainder (N%2).  4x1 micro-kernel: each
     k-step broadcasts one B value (pshufd $0x44 duplicates the low
     qword) against two A pairs; xmm8 holds rows 0:1, xmm12 rows 2:3.
     xmm9/xmm13 are cleared but this variant accumulates only into
     xmm8/xmm12. */
  1319. .L80:
  1320. testq $1, N                      # single-column remainder present?
  1321. BRANCH
  1322. jle .L999
  1323. #if defined(TRMMKERNEL) && defined(LEFT)
  1324. movq OFFSET, %rax                # reset KK for a fresh column block
  1325. movq %rax, KK
  1326. #endif
  1327. movq C, CO1                      # only one C column this time
  1328. movq A, AO
  1329. movq M, I
  1330. sarq $2, I # i = (m >> 2)
  1331. NOBRANCH
  1332. jle .L100
  1333. ALIGN_4
  1334. .L91:                            # per 4-row block of C
  1335. #if !defined(TRMMKERNEL) || \
  1336. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1337. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1338. movq B, BO
  1339. #else
  1340. movq B, BO                       # TRMM: skip KK entries of packed data
  1341. movq KK, %rax
  1342. leaq (, %rax, SIZE), %rax
  1343. leaq (AO, %rax, 4), AO           # 4 A values per k
  1344. addq %rax, BO                    # 1 B value per k
  1345. #endif
  1346. movaps -16 * SIZE(AO), %xmm0     # preload a[0:1], a[2:3]
  1347. movaps -14 * SIZE(AO), %xmm1
  1348. movsd -17 * SIZE(BO), %xmm2      # preload b[0]
  1349. PREFETCHW 3 * SIZE(CO1)          # prefetch C destination for write
  1350. xorps %xmm8, %xmm8               # clear accumulators
  1351. xorps %xmm9, %xmm9
  1352. xorps %xmm12, %xmm12
  1353. xorps %xmm13, %xmm13
  1354. #ifndef TRMMKERNEL
  1355. movq K, %rax                     # GEMM: full K loop
  1356. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1357. movq K, %rax                     # TRMM: K - KK iterations
  1358. subq KK, %rax
  1359. movq %rax, KKK
  1360. #else
  1361. movq KK, %rax                    # TRMM: KK + tile size iterations
  1362. #ifdef LEFT
  1363. addq $4, %rax
  1364. #else
  1365. addq $1, %rax
  1366. #endif
  1367. movq %rax, KKK
  1368. #endif
  1369. sarq $2, %rax                    # unrolled count = k / 4
  1370. NOBRANCH
  1371. jle .L95
  1372. ALIGN_4
  1373. .L92:                            # k-loop unrolled 4x; k-step 1
  1374. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1375. pshufd $0x44, %xmm2, %xmm3       # broadcast b[k] to both lanes
  1376. pshufd $0x44, %xmm2, %xmm4
  1377. movsd -16 * SIZE(BO), %xmm2      # preload next b
  1378. mulpd %xmm0, %xmm3               # b[k] * a[0:1]
  1379. movaps -12 * SIZE(AO), %xmm0
  1380. mulpd %xmm1, %xmm4               # b[k] * a[2:3]
  1381. movaps -10 * SIZE(AO), %xmm1
  1382. addpd %xmm3, %xmm8
  1383. addpd %xmm4, %xmm12
  1384. pshufd $0x44, %xmm2, %xmm3       # k-step 2
  1385. pshufd $0x44, %xmm2, %xmm4
  1386. movsd -15 * SIZE(BO), %xmm2
  1387. mulpd %xmm0, %xmm3
  1388. movaps -8 * SIZE(AO), %xmm0
  1389. mulpd %xmm1, %xmm4
  1390. movaps -6 * SIZE(AO), %xmm1
  1391. addpd %xmm3, %xmm8
  1392. addpd %xmm4, %xmm12
  1393. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  1394. pshufd $0x44, %xmm2, %xmm3       # k-step 3
  1395. pshufd $0x44, %xmm2, %xmm4
  1396. movsd -14 * SIZE(BO), %xmm2
  1397. mulpd %xmm0, %xmm3
  1398. movaps -4 * SIZE(AO), %xmm0
  1399. mulpd %xmm1, %xmm4
  1400. movaps -2 * SIZE(AO), %xmm1
  1401. addpd %xmm3, %xmm8
  1402. addpd %xmm4, %xmm12
  1403. pshufd $0x44, %xmm2, %xmm3       # k-step 4
  1404. pshufd $0x44, %xmm2, %xmm4
  1405. movsd -13 * SIZE(BO), %xmm2
  1406. mulpd %xmm0, %xmm3
  1407. movaps 0 * SIZE(AO), %xmm0
  1408. mulpd %xmm1, %xmm4
  1409. movaps 2 * SIZE(AO), %xmm1
  1410. addpd %xmm3, %xmm8
  1411. addpd %xmm4, %xmm12
  1412. subq $-16 * SIZE, AO             # AO += 16 doubles (4 k-steps, M=4)
  1413. subq $ -4 * SIZE, BO             # BO += 4 doubles (4 k-steps, N=1)
  1414. subq $1, %rax
  1415. BRANCH
  1416. jg .L92
  1417. ALIGN_4
  1418. .L95:
  1419. #ifndef TRMMKERNEL
  1420. movq K, %rax
  1421. #else
  1422. movq KKK, %rax
  1423. #endif
  1424. andq $3, %rax # if (k & 1)
  1425. BRANCH
  1426. je .L98
  1427. ALIGN_4
  1428. .L96:                            # k%4 remainder loop
  1429. pshufd $0x44, %xmm2, %xmm3
  1430. pshufd $0x44, %xmm2, %xmm4
  1431. movsd -16 * SIZE(BO), %xmm2
  1432. mulpd %xmm0, %xmm3
  1433. movaps -12 * SIZE(AO), %xmm0
  1434. mulpd %xmm1, %xmm4
  1435. movaps -10 * SIZE(AO), %xmm1
  1436. addpd %xmm3, %xmm8
  1437. addpd %xmm4, %xmm12
  1438. addq $4 * SIZE, AO               # 4 A / 1 B values per k-step
  1439. addq $1 * SIZE, BO
  1440. subq $1, %rax
  1441. BRANCH
  1442. jg .L96
  1443. ALIGN_4
  1444. .L98:                            # write-back for this 4x1 tile
  1445. movddup ALPHA, %xmm3             # broadcast alpha
  1446. #ifndef TRMMKERNEL
  1447. movsd 0 * SIZE(CO1), %xmm0       # GEMM path: load current C values
  1448. movhpd 1 * SIZE(CO1), %xmm0
  1449. movsd 2 * SIZE(CO1), %xmm1
  1450. movhpd 3 * SIZE(CO1), %xmm1
  1451. #endif
  1452. mulpd %xmm3, %xmm8               # alpha * (A*B)
  1453. mulpd %xmm3, %xmm12
  1454. #ifndef TRMMKERNEL
  1455. addpd %xmm0, %xmm8               # C += alpha*A*B
  1456. addpd %xmm1, %xmm12
  1457. #endif
  1458. movsd %xmm8, 0 * SIZE(CO1)       # store the 4 rows of the column
  1459. movhpd %xmm8, 1 * SIZE(CO1)
  1460. movsd %xmm12, 2 * SIZE(CO1)
  1461. movhpd %xmm12, 3 * SIZE(CO1)
  1462. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1463. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1464. movq K, %rax                     # TRMM: skip untraversed K range
  1465. subq KKK, %rax
  1466. leaq (,%rax, SIZE), %rax
  1467. leaq (AO, %rax, 4), AO
  1468. addq %rax, BO
  1469. #endif
  1470. #if defined(TRMMKERNEL) && defined(LEFT)
  1471. addq $4, KK                      # advance offset by this M-block (4)
  1472. #endif
  1473. addq $4 * SIZE, CO1              # next 4-row block of C
  1474. decq I
  1475. BRANCH
  1476. jg .L91
  1477. ALIGN_4
  /* .L100: 2x1 micro-kernel (2-row remainder of the single column).
     Each k-step broadcasts one B value against one A pair; xmm8 and
     xmm9 alternate as accumulators and are folded at .L108. */
  1478. .L100:
  1479. testq $2, M                      # 2-row remainder present?
  1480. BRANCH
  1481. jle .L110
  1482. ALIGN_4
  1483. #if !defined(TRMMKERNEL) || \
  1484. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1485. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1486. movq B, BO
  1487. #else
  1488. movq B, BO                       # TRMM: skip KK entries of packed data
  1489. movq KK, %rax
  1490. leaq (, %rax, SIZE), %rax
  1491. leaq (AO, %rax, 2), AO           # 2 A values per k
  1492. addq %rax, BO                    # 1 B value per k
  1493. #endif
  1494. movaps -16 * SIZE(AO), %xmm0     # preload a[0:1]
  1495. xorps %xmm8, %xmm8               # clear accumulators
  1496. movaps -17 * SIZE(BO), %xmm2     # preload b
  1497. xorps %xmm9, %xmm9
  1498. #ifndef TRMMKERNEL
  1499. movq K, %rax                     # GEMM: full K loop
  1500. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1501. movq K, %rax                     # TRMM: K - KK iterations
  1502. subq KK, %rax
  1503. movq %rax, KKK
  1504. #else
  1505. movq KK, %rax                    # TRMM: KK + tile size iterations
  1506. #ifdef LEFT
  1507. addq $2, %rax
  1508. #else
  1509. addq $1, %rax
  1510. #endif
  1511. movq %rax, KKK
  1512. #endif
  1513. sarq $2, %rax                    # unrolled count = k / 4
  1514. NOBRANCH
  1515. jle .L105
  1516. ALIGN_4
  1517. .L102:                           # k-loop unrolled 4x
  1518. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1519. pshufd $0x44, %xmm2, %xmm3       # broadcast b[k] to both lanes
  1520. movsd -16 * SIZE(BO), %xmm2
  1521. mulpd %xmm0, %xmm3               # b[k] * a[0:1]
  1522. movaps -14 * SIZE(AO), %xmm0
  1523. addpd %xmm3, %xmm8
  1524. pshufd $0x44, %xmm2, %xmm3       # k-step 2 (secondary accumulator)
  1525. movsd -15 * SIZE(BO), %xmm2
  1526. mulpd %xmm0, %xmm3
  1527. movaps -12 * SIZE(AO), %xmm0
  1528. addpd %xmm3, %xmm9
  1529. pshufd $0x44, %xmm2, %xmm3       # k-step 3
  1530. movsd -14 * SIZE(BO), %xmm2
  1531. mulpd %xmm0, %xmm3
  1532. movaps -10 * SIZE(AO), %xmm0
  1533. addpd %xmm3, %xmm8
  1534. pshufd $0x44, %xmm2, %xmm3       # k-step 4
  1535. movsd -13 * SIZE(BO), %xmm2
  1536. mulpd %xmm0, %xmm3
  1537. movaps -8 * SIZE(AO), %xmm0
  1538. addpd %xmm3, %xmm9
  1539. subq $-8 * SIZE, AO              # AO += 8 doubles (4 k-steps, M=2)
  1540. subq $-4 * SIZE, BO              # BO += 4 doubles (4 k-steps, N=1)
  1541. subq $1, %rax
  1542. BRANCH
  1543. jg .L102
  1544. ALIGN_4
  1545. .L105:
  1546. #ifndef TRMMKERNEL
  1547. movq K, %rax
  1548. #else
  1549. movq KKK, %rax
  1550. #endif
  1551. andq $3, %rax # if (k & 1)
  1552. BRANCH
  1553. je .L108
  1554. ALIGN_4
  1555. .L106:                           # k%4 remainder loop
  1556. pshufd $0x44, %xmm2, %xmm3
  1557. movsd -16 * SIZE(BO), %xmm2
  1558. mulpd %xmm0, %xmm3
  1559. movaps -14 * SIZE(AO), %xmm0
  1560. addpd %xmm3, %xmm8
  1561. addq $2 * SIZE, AO               # 2 A / 1 B values per k-step
  1562. addq $1 * SIZE, BO
  1563. subq $1, %rax
  1564. BRANCH
  1565. jg .L106
  1566. ALIGN_4
  1567. .L108:                           # write-back for this 2x1 tile
  1568. addpd %xmm9, %xmm8               # fold secondary accumulator
  1569. movddup ALPHA, %xmm3             # broadcast alpha
  1570. #ifndef TRMMKERNEL
  1571. movsd 0 * SIZE(CO1), %xmm0       # GEMM path: load current C values
  1572. movhpd 1 * SIZE(CO1), %xmm0
  1573. #endif
  1574. mulpd %xmm3, %xmm8               # alpha * (A*B)
  1575. #ifndef TRMMKERNEL
  1576. addpd %xmm0, %xmm8               # C += alpha*A*B
  1577. #endif
  1578. movsd %xmm8, 0 * SIZE(CO1)       # store both rows
  1579. movhpd %xmm8, 1 * SIZE(CO1)
  1580. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1581. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1582. movq K, %rax                     # TRMM: skip untraversed K range
  1583. subq KKK, %rax
  1584. leaq (,%rax, SIZE), %rax
  1585. leaq (AO, %rax, 2), AO
  1586. addq %rax, BO
  1587. #endif
  1588. #if defined(TRMMKERNEL) && defined(LEFT)
  1589. addq $2, KK                      # advance offset by this M-block (2)
  1590. #endif
  1591. addq $2 * SIZE, CO1              # advance C pointer past the 2 rows
  1592. ALIGN_4
  /* .L110: final 1x1 element — a plain scalar dot product of one A row
     with one B column, unrolled 4x (.L112) with a scalar remainder
     (.L116), then alpha-scale and store the single C element (.L118). */
  1593. .L110:
  1594. testq $1, M                      # final odd row present?
  1595. BRANCH
  1596. jle .L999
  1597. #if !defined(TRMMKERNEL) || \
  1598. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1599. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1600. movq B, BO
  1601. #else
  1602. movq B, BO                       # TRMM: skip KK entries of packed data
  1603. movq KK, %rax
  1604. leaq (, %rax, SIZE), %rax
  1605. addq %rax, AO                    # 1 A value per k
  1606. addq %rax, BO                    # 1 B value per k
  1607. #endif
  1608. movsd -16 * SIZE(AO), %xmm0      # preload a[0]
  1609. movsd -17 * SIZE(BO), %xmm2      # preload b[0]
  1610. xorps %xmm8, %xmm8               # clear accumulator (xmm9 stays zero)
  1611. xorps %xmm9, %xmm9
  1612. #ifndef TRMMKERNEL
  1613. movq K, %rax                     # GEMM: full K loop
  1614. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1615. movq K, %rax                     # TRMM: K - KK iterations
  1616. subq KK, %rax
  1617. movq %rax, KKK
  1618. #else
  1619. movq KK, %rax                    # TRMM: KK + 1 iterations either way
  1620. #ifdef LEFT
  1621. addq $1, %rax
  1622. #else
  1623. addq $1, %rax
  1624. #endif
  1625. movq %rax, KKK
  1626. #endif
  1627. sarq $2, %rax                    # unrolled count = k / 4
  1628. NOBRANCH
  1629. jle .L115
  1630. ALIGN_4
  1631. .L112:                           # k-loop unrolled 4x, scalar FMA chain
  1632. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1633. mulsd %xmm0, %xmm2               # sum += a[k] * b[k]
  1634. addsd %xmm2, %xmm8
  1635. movsd -15 * SIZE(AO), %xmm0
  1636. movsd -16 * SIZE(BO), %xmm2
  1637. mulsd %xmm0, %xmm2               # k-step 2
  1638. addsd %xmm2, %xmm8
  1639. movsd -14 * SIZE(AO), %xmm0
  1640. movsd -15 * SIZE(BO), %xmm2
  1641. mulsd %xmm0, %xmm2               # k-step 3
  1642. addsd %xmm2, %xmm8
  1643. movsd -13 * SIZE(AO), %xmm0
  1644. movsd -14 * SIZE(BO), %xmm2
  1645. mulsd %xmm0, %xmm2               # k-step 4
  1646. addsd %xmm2, %xmm8
  1647. movsd -12 * SIZE(AO), %xmm0
  1648. movsd -13 * SIZE(BO), %xmm2
  1649. subq $-4 * SIZE, AO              # AO += 4 doubles
  1650. subq $-4 * SIZE, BO              # BO += 4 doubles
  1651. subq $1, %rax
  1652. BRANCH
  1653. jg .L112
  1654. ALIGN_4
  1655. .L115:
  1656. #ifndef TRMMKERNEL
  1657. movq K, %rax
  1658. #else
  1659. movq KKK, %rax
  1660. #endif
  1661. andq $3, %rax # if (k & 1)
  1662. BRANCH
  1663. je .L118
  1664. ALIGN_4
  1665. .L116:                           # k%4 remainder loop
  1666. mulsd %xmm0, %xmm2
  1667. addsd %xmm2, %xmm8
  1668. movsd -15 * SIZE(AO), %xmm0
  1669. movsd -16 * SIZE(BO), %xmm2
  1670. addq $1 * SIZE, AO
  1671. addq $1 * SIZE, BO
  1672. subq $1, %rax
  1673. BRANCH
  1674. jg .L116
  1675. ALIGN_4
  1676. .L118:                           # write-back for the single element
  1677. movddup ALPHA, %xmm3
  1678. addpd %xmm9, %xmm8               # xmm9 is still zero here; no-op fold
  1679. #ifndef TRMMKERNEL
  1680. movsd 0 * SIZE(CO1), %xmm0       # GEMM path: load current C value
  1681. #endif
  1682. mulsd %xmm3, %xmm8               # alpha * sum (low qword only)
  1683. #ifndef TRMMKERNEL
  1684. addpd %xmm0, %xmm8               # packed add is safe: only low qword stored
  1685. #endif
  1686. movlpd %xmm8, 0 * SIZE(CO1)
  1687. ALIGN_4
  /* .L999: function epilogue.  Restore the callee-saved registers that
     the prologue (outside this excerpt) spilled at the bottom of the
     STACKSIZE frame.  On Win64, rdi/rsi and xmm6-xmm15 are also
     callee-saved and are restored from the same frame. */
  1688. .L999:
  1689. movq 0(%rsp), %rbx               # restore SysV callee-saved GPRs
  1690. movq 8(%rsp), %rbp
  1691. movq 16(%rsp), %r12
  1692. movq 24(%rsp), %r13
  1693. movq 32(%rsp), %r14
  1694. movq 40(%rsp), %r15
  1695. #ifdef WINDOWS_ABI
  1696. movq 48(%rsp), %rdi              # rdi/rsi are callee-saved on Win64
  1697. movq 56(%rsp), %rsi
  1698. movups 64(%rsp), %xmm6           # xmm6-xmm15 are callee-saved on Win64
  1699. movups 80(%rsp), %xmm7
  1700. movups 96(%rsp), %xmm8
  1701. movups 112(%rsp), %xmm9
  1702. movups 128(%rsp), %xmm10
  1703. movups 144(%rsp), %xmm11
  1704. movups 160(%rsp), %xmm12
  1705. movups 176(%rsp), %xmm13
  1706. movups 192(%rsp), %xmm14
  1707. movups 208(%rsp), %xmm15
  1708. #endif
  1709. addq $STACKSIZE, %rsp            # release the frame set up in the prologue
  1710. ret
  1711. EPILOGUE