
trsm_kernel_RT_2x4_penryn.S 36 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
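/* The defines below read the caller's arguments relative to %esp as it stands
   after the prologue: 4 bytes of return address, STACK (16) bytes for the four
   registers saved below, and ARGS (16) bytes of locals (J, KK, KKK, AORIG)
   reserved by the prologue's subl $ARGS, %esp. */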
  40. #define STACK 16
  41. #define ARGS 16
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 24 + STACK + ARGS(%esp)
  47. #define ARG_B 28 + STACK + ARGS(%esp)
  48. #define C 32 + STACK + ARGS(%esp)
  49. #define ARG_LDC 36 + STACK + ARGS(%esp)
  50. #define OFFSET 40 + STACK + ARGS(%esp)
  51. #define J 0 + STACK(%esp)
  52. #define KK 4 + STACK(%esp)
  53. #define KKK 8 + STACK(%esp)
  54. #define AORIG 12 + STACK(%esp)
  55. #if defined(PENRYN) || defined(DUNNINGTON)
  56. #define PREFETCH prefetcht0
  57. #define PREFETCHSIZE (8 * 21 + 4)
  58. #endif
  59. #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
  60. #define PREFETCH prefetcht0
  61. #define PREFETCHSIZE (8 * 21 + 4)
  62. #endif
  63. #ifdef NANO
  64. #define PREFETCH prefetcht0
  65. #define PREFETCHSIZE (8 * 2)
  66. #endif
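/* LN / LT / RN / RT below presumably select which side/transpose variant of
   the triangular solve this source is assembled as; each variant walks A, B
   and C in a different direction, which is why most blocks are conditional. */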
  67. #define AA %edx
  68. #define BB %ecx
  69. #define LDC %ebp
  70. #define B %edi
  71. #define CO1 %esi
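/* Register roles: AA and BB walk the current A and B panels, B holds the base
   of the current B panel, LDC is the leading dimension of C (scaled to bytes
   at entry), and CO1 points at the current block of C. */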
  72. PROLOGUE
  73. subl $ARGS, %esp
  74. pushl %ebp
  75. pushl %edi
  76. pushl %esi
  77. pushl %ebx
  78. PROFCODE
  79. movl ARG_B, B
  80. movl ARG_LDC, LDC
  81. movl OFFSET, %eax
  82. #ifdef RN
  83. negl %eax
  84. #endif
  85. movl %eax, KK
  86. leal (, LDC, SIZE), LDC
  87. subl $-16 * SIZE, A
  88. subl $-16 * SIZE, B
  89. #ifdef LN
  90. movl M, %eax
  91. leal (, %eax, SIZE), %eax
  92. addl %eax, C
  93. imull K, %eax
  94. addl %eax, A
  95. #endif
  96. #ifdef RT
  97. movl N, %eax
  98. leal (, %eax, SIZE), %eax
  99. imull K, %eax
  100. addl %eax, B
  101. movl N, %eax
  102. imull LDC, %eax
  103. addl %eax, C
  104. #endif
  105. #ifdef RT
  106. movl N, %eax
  107. subl OFFSET, %eax
  108. movl %eax, KK
  109. #endif
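# Column loop: the N & 1 remainder column is handled first (up to .L30), then
# the N & 2 pair (up to .L60), then blocks of four columns starting at .L10.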
  110. testl $1, N
  111. je .L30
  112. #if defined(LT) || defined(RN)
  113. movl A, AA
  114. #else
  115. movl A, %eax
  116. movl %eax, AORIG
  117. #endif
  118. #ifdef RT
  119. movl K, %eax
  120. sall $BASE_SHIFT, %eax
  121. subl %eax, B
  122. #endif
  123. #ifdef RT
  124. subl LDC, C
  125. #endif
  126. movl C, CO1
  127. #ifndef RT
  128. addl LDC, C
  129. #endif
  130. #ifdef LN
  131. movl OFFSET, %eax
  132. addl M, %eax
  133. movl %eax, KK
  134. #endif
  135. #ifdef LT
  136. movl OFFSET, %eax
  137. movl %eax, KK
  138. #endif
  139. movl M, %ebx
  140. sarl $1, %ebx # i = (m >> 1)
  141. jle .L80
  142. ALIGN_4
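# .L71: 2 x 1 block of the remainder column; .L72 is the k-loop unrolled by 8,
# .L76 picks up the k & 7 leftover iterations.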
  143. .L71:
  144. #ifdef LN
  145. movl K, %eax
  146. sall $1 + BASE_SHIFT, %eax
  147. subl %eax, AORIG
  148. #endif
  149. #if defined(LN) || defined(RT)
  150. movl KK, %eax
  151. movl AORIG, AA
  152. leal (, %eax, SIZE), %eax
  153. leal (AA, %eax, 2), AA
  154. #endif
  155. movl B, BB
  156. #if defined(LN) || defined(RT)
  157. movl KK, %eax
  158. sall $BASE_SHIFT, %eax
  159. addl %eax, BB
  160. #endif
  161. movsd -16 * SIZE(AA), %xmm0
  162. movhps -15 * SIZE(AA), %xmm0
  163. pxor %xmm4, %xmm4
  164. movsd -16 * SIZE(BB), %xmm1
  165. movhps -15 * SIZE(BB), %xmm1
  166. pxor %xmm5, %xmm5
  167. #ifdef LN
  168. prefetcht0 -2 * SIZE(CO1)
  169. #else
  170. prefetcht0 1 * SIZE(CO1)
  171. #endif
  172. #if defined(LT) || defined(RN)
  173. movl KK, %eax
  174. #else
  175. movl K, %eax
  176. subl KK, %eax
  177. #endif
  178. sarl $3, %eax
  179. je .L75
  180. ALIGN_4
  181. .L72:
  182. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  183. pshufd $0x44, %xmm1, %xmm2
  184. mulpd %xmm0, %xmm2
  185. movaps -14 * SIZE(AA), %xmm0
  186. addpd %xmm2, %xmm4
  187. pshufd $0xee, %xmm1, %xmm2
  188. movaps -14 * SIZE(BB), %xmm1
  189. mulpd %xmm0, %xmm2
  190. movaps -12 * SIZE(AA), %xmm0
  191. addpd %xmm2, %xmm5
  192. pshufd $0x44, %xmm1, %xmm2
  193. mulpd %xmm0, %xmm2
  194. movaps -10 * SIZE(AA), %xmm0
  195. addpd %xmm2, %xmm4
  196. pshufd $0xee, %xmm1, %xmm2
  197. movaps -12 * SIZE(BB), %xmm1
  198. mulpd %xmm0, %xmm2
  199. movaps -8 * SIZE(AA), %xmm0
  200. addpd %xmm2, %xmm5
  201. PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
  202. pshufd $0x44, %xmm1, %xmm2
  203. mulpd %xmm0, %xmm2
  204. movaps -6 * SIZE(AA), %xmm0
  205. addpd %xmm2, %xmm4
  206. pshufd $0xee, %xmm1, %xmm2
  207. movaps -10 * SIZE(BB), %xmm1
  208. mulpd %xmm0, %xmm2
  209. movaps -4 * SIZE(AA), %xmm0
  210. addpd %xmm2, %xmm5
  211. pshufd $0x44, %xmm1, %xmm2
  212. mulpd %xmm0, %xmm2
  213. movaps -2 * SIZE(AA), %xmm0
  214. addpd %xmm2, %xmm4
  215. pshufd $0xee, %xmm1, %xmm2
  216. movaps -8 * SIZE(BB), %xmm1
  217. mulpd %xmm0, %xmm2
  218. movaps 0 * SIZE(AA), %xmm0
  219. addpd %xmm2, %xmm5
  220. subl $-16 * SIZE, AA
  221. subl $ -8 * SIZE, BB
  222. subl $1, %eax
  223. jne .L72
  224. ALIGN_4
  225. .L75:
  226. #if defined(LT) || defined(RN)
  227. movl KK, %eax
  228. #else
  229. movl K, %eax
  230. subl KK, %eax
  231. #endif
  232. andl $7, %eax # k & 7 remainder
  233. BRANCH
  234. je .L78
  235. ALIGN_3
  236. .L76:
  237. pshufd $0x44, %xmm1, %xmm2
  238. movsd -15 * SIZE(BB), %xmm1
  239. mulpd %xmm0, %xmm2
  240. movaps -14 * SIZE(AA), %xmm0
  241. addpd %xmm2, %xmm4
  242. addl $2 * SIZE, AA
  243. addl $1 * SIZE, BB
  244. decl %eax
  245. jg .L76
  246. ALIGN_4
  247. .L78:
  248. addpd %xmm5, %xmm4
  249. #if defined(LN) || defined(RT)
  250. movl KK, %eax
  251. #ifdef LN
  252. subl $2, %eax
  253. #else
  254. subl $1, %eax
  255. #endif
  256. movl AORIG, AA
  257. leal (, %eax, SIZE), %eax
  258. leal (AA, %eax, 2), AA
  259. leal (B, %eax, 1), BB
  260. #endif
  261. #if defined(LN) || defined(LT)
  262. movapd -16 * SIZE(BB), %xmm1
  263. subpd %xmm4, %xmm1
  264. movapd %xmm1, %xmm0
  265. unpckhpd %xmm1, %xmm1
  266. #else
  267. movapd -16 * SIZE(AA), %xmm0
  268. subpd %xmm4, %xmm0
  269. #endif
  270. #ifdef LN
  271. movsd -13 * SIZE(AA), %xmm4
  272. mulsd %xmm4, %xmm1
  273. movsd -14 * SIZE(AA), %xmm4
  274. mulsd %xmm1, %xmm4
  275. subsd %xmm4, %xmm0
  276. movsd -16 * SIZE(AA), %xmm4
  277. mulsd %xmm4, %xmm0
  278. #endif
  279. #ifdef LT
  280. movsd -16 * SIZE(AA), %xmm4
  281. mulsd %xmm4, %xmm0
  282. movsd -15 * SIZE(AA), %xmm4
  283. mulsd %xmm0, %xmm4
  284. subsd %xmm4, %xmm1
  285. movsd -13 * SIZE(AA), %xmm4
  286. mulsd %xmm4, %xmm1
  287. #endif
  288. #ifdef RN
  289. movddup -16 * SIZE(BB), %xmm4
  290. mulpd %xmm4, %xmm0
  291. #endif
  292. #ifdef RT
  293. movddup -16 * SIZE(BB), %xmm4
  294. mulpd %xmm4, %xmm0
  295. #endif
  296. #if defined(LN) || defined(LT)
  297. movsd %xmm0, -16 * SIZE(BB)
  298. movsd %xmm1, -15 * SIZE(BB)
  299. #else
  300. movapd %xmm0, -16 * SIZE(AA)
  301. #endif
  302. #ifdef LN
  303. subl $2 * SIZE, CO1
  304. #endif
  305. #if defined(LN) || defined(LT)
  306. movsd %xmm0, 0 * SIZE(CO1)
  307. movsd %xmm1, 1 * SIZE(CO1)
  308. #else
  309. movsd %xmm0, 0 * SIZE(CO1)
  310. movhps %xmm0, 1 * SIZE(CO1)
  311. #endif
  312. #ifndef LN
  313. addl $2 * SIZE, CO1
  314. #endif
  315. #if defined(LT) || defined(RN)
  316. movl K, %eax
  317. subl KK, %eax
  318. leal (,%eax, SIZE), %eax
  319. leal (AA, %eax, 2), AA
  320. addl %eax, BB
  321. #endif
  322. #ifdef LN
  323. subl $2, KK
  324. #endif
  325. #ifdef LT
  326. addl $2, KK
  327. #endif
  328. #ifdef RT
  329. movl K, %eax
  330. sall $1 + BASE_SHIFT, %eax
  331. addl %eax, AORIG
  332. #endif
  333. decl %ebx # i --
  334. jg .L71
  335. ALIGN_4
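# .L80: leftover single row (m & 1) against the remainder column (1 x 1).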
  336. .L80:
  337. movl M, %ebx
  338. testl $1, %ebx # if (m & 1)
  339. jle .L89
  340. #ifdef LN
  341. movl K, %eax
  342. sall $BASE_SHIFT, %eax
  343. subl %eax, AORIG
  344. #endif
  345. #if defined(LN) || defined(RT)
  346. movl KK, %eax
  347. movl AORIG, AA
  348. leal (AA, %eax, SIZE), AA
  349. #endif
  350. movl B, BB
  351. #if defined(LN) || defined(RT)
  352. movl KK, %eax
  353. sall $BASE_SHIFT, %eax
  354. addl %eax, BB
  355. #endif
  356. movsd -16 * SIZE(AA), %xmm0
  357. movhps -15 * SIZE(AA), %xmm0
  358. pxor %xmm4, %xmm4
  359. movsd -16 * SIZE(BB), %xmm2
  360. movhps -15 * SIZE(BB), %xmm2
  361. pxor %xmm5, %xmm5
  362. #if defined(LT) || defined(RN)
  363. movl KK, %eax
  364. #else
  365. movl K, %eax
  366. subl KK, %eax
  367. #endif
  368. sarl $3, %eax
  369. je .L85
  370. ALIGN_4
  371. .L82:
  372. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  373. mulpd %xmm0, %xmm2
  374. movaps -14 * SIZE(AA), %xmm0
  375. addpd %xmm2, %xmm4
  376. movaps -14 * SIZE(BB), %xmm2
  377. mulpd %xmm0, %xmm2
  378. movaps -12 * SIZE(AA), %xmm0
  379. addpd %xmm2, %xmm5
  380. movaps -12 * SIZE(BB), %xmm2
  381. mulpd %xmm0, %xmm2
  382. movaps -10 * SIZE(AA), %xmm0
  383. addpd %xmm2, %xmm4
  384. movaps -10 * SIZE(BB), %xmm2
  385. mulpd %xmm0, %xmm2
  386. movaps -8 * SIZE(AA), %xmm0
  387. addpd %xmm2, %xmm5
  388. movaps -8 * SIZE(BB), %xmm2
  389. subl $-8 * SIZE, AA
  390. subl $-8 * SIZE, BB
  391. decl %eax
  392. jne .L82
  393. ALIGN_4
  394. .L85:
  395. #if defined(LT) || defined(RN)
  396. movl KK, %eax
  397. #else
  398. movl K, %eax
  399. subl KK, %eax
  400. #endif
  401. andl $7, %eax
  402. BRANCH
  403. je .L88
  404. .L86:
  405. mulsd %xmm0, %xmm2
  406. movsd -15 * SIZE(AA), %xmm0
  407. addsd %xmm2, %xmm4
  408. movsd -15 * SIZE(BB), %xmm2
  409. addl $1 * SIZE, AA
  410. addl $1 * SIZE, BB
  411. decl %eax
  412. jg .L86
  413. ALIGN_4
  414. .L88:
  415. addpd %xmm5, %xmm4
  416. haddpd %xmm4, %xmm4
  417. #if defined(LN) || defined(RT)
  418. movl KK, %eax
  419. #ifdef LN
  420. subl $1, %eax
  421. #else
  422. subl $1, %eax
  423. #endif
  424. movl AORIG, AA
  425. leal (, %eax, SIZE), %eax
  426. addl %eax, AA
  427. leal (B, %eax, 1), BB
  428. #endif
  429. #if defined(LN) || defined(LT)
  430. movsd -16 * SIZE(BB), %xmm0
  431. subsd %xmm4, %xmm0
  432. #else
  433. movsd -16 * SIZE(AA), %xmm0
  434. subsd %xmm4, %xmm0
  435. #endif
  436. #ifdef LN
  437. movsd -16 * SIZE(AA), %xmm4
  438. mulsd %xmm4, %xmm0
  439. #endif
  440. #ifdef LT
  441. movsd -16 * SIZE(AA), %xmm4
  442. mulsd %xmm4, %xmm0
  443. #endif
  444. #ifdef RN
  445. movsd -16 * SIZE(BB), %xmm4
  446. mulsd %xmm4, %xmm0
  447. #endif
  448. #ifdef RT
  449. movsd -16 * SIZE(BB), %xmm4
  450. mulsd %xmm4, %xmm0
  451. #endif
  452. #if defined(LN) || defined(LT)
  453. movsd %xmm0, -16 * SIZE(BB)
  454. #else
  455. movsd %xmm0, -16 * SIZE(AA)
  456. #endif
  457. #ifdef LN
  458. subl $1 * SIZE, CO1
  459. #endif
  460. #if defined(LN) || defined(LT)
  461. movsd %xmm0, 0 * SIZE(CO1)
  462. #else
  463. movsd %xmm0, 0 * SIZE(CO1)
  464. #endif
  465. #ifndef LN
  466. addl $1 * SIZE, CO1
  467. #endif
  468. #if defined(LT) || defined(RN)
  469. movl K, %eax
  470. subl KK, %eax
  471. leal (,%eax, SIZE), %eax
  472. addl %eax, AA
  473. addl %eax, BB
  474. #endif
  475. #ifdef LN
  476. subl $1, KK
  477. #endif
  478. #ifdef LT
  479. addl $1, KK
  480. #endif
  481. #ifdef RT
  482. movl K, %eax
  483. sall $BASE_SHIFT, %eax
  484. addl %eax, AORIG
  485. #endif
  486. ALIGN_4
  487. .L89:
  488. #ifdef LN
  489. movl K, %eax
  490. leal (B, %eax, SIZE), B
  491. #endif
  492. #if defined(LT) || defined(RN)
  493. movl BB, B
  494. #endif
  495. #ifdef RN
  496. addl $1, KK
  497. #endif
  498. #ifdef RT
  499. subl $1, KK
  500. #endif
  501. ALIGN_4
  502. .L30:
  503. testl $2, N
  504. je .L60
  505. #if defined(LT) || defined(RN)
  506. movl A, AA
  507. #else
  508. movl A, %eax
  509. movl %eax, AORIG
  510. #endif
  511. #ifdef RT
  512. movl K, %eax
  513. sall $1 + BASE_SHIFT, %eax
  514. subl %eax, B
  515. #endif
  516. leal (, LDC, 2), %eax
  517. #ifdef RT
  518. subl %eax, C
  519. #endif
  520. movl C, CO1
  521. #ifndef RT
  522. addl %eax, C
  523. #endif
  524. #ifdef LN
  525. movl OFFSET, %eax
  526. addl M, %eax
  527. movl %eax, KK
  528. #endif
  529. #ifdef LT
  530. movl OFFSET, %eax
  531. movl %eax, KK
  532. #endif
  533. movl M, %ebx
  534. sarl $1, %ebx # i = (m >> 1)
  535. jle .L50
  536. ALIGN_4
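# .L41: 2 x 2 block for the N & 2 column pair; .L42 unrolls the k-loop by 8.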
  537. .L41:
  538. #ifdef LN
  539. movl K, %eax
  540. sall $1 + BASE_SHIFT, %eax
  541. subl %eax, AORIG
  542. #endif
  543. #if defined(LN) || defined(RT)
  544. movl KK, %eax
  545. movl AORIG, AA
  546. leal (, %eax, SIZE), %eax
  547. leal (AA, %eax, 2), AA
  548. #endif
  549. movl B, BB
  550. #if defined(LN) || defined(RT)
  551. movl KK, %eax
  552. sall $1 + BASE_SHIFT, %eax
  553. addl %eax, BB
  554. #endif
  555. movaps -16 * SIZE(AA), %xmm0
  556. pxor %xmm4, %xmm4
  557. movaps -16 * SIZE(BB), %xmm1
  558. pxor %xmm5, %xmm5
  559. #ifdef LN
  560. prefetcht0 -2 * SIZE(CO1)
  561. pxor %xmm6, %xmm6
  562. prefetcht0 -2 * SIZE(CO1, LDC)
  563. pxor %xmm7, %xmm7
  564. #else
  565. prefetcht0 1 * SIZE(CO1)
  566. pxor %xmm6, %xmm6
  567. prefetcht0 1 * SIZE(CO1, LDC)
  568. pxor %xmm7, %xmm7
  569. #endif
  570. #if defined(LT) || defined(RN)
  571. movl KK, %eax
  572. #else
  573. movl K, %eax
  574. subl KK, %eax
  575. #endif
  576. sarl $3, %eax
  577. je .L45
  578. ALIGN_4
  579. .L42:
  580. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  581. pshufd $0x4e, %xmm1, %xmm2
  582. mulpd %xmm0, %xmm1
  583. mulpd %xmm0, %xmm2
  584. movaps -14 * SIZE(AA), %xmm0
  585. addpd %xmm1, %xmm5
  586. movaps -14 * SIZE(BB), %xmm1
  587. addpd %xmm2, %xmm4
  588. pshufd $0x4e, %xmm1, %xmm2
  589. mulpd %xmm0, %xmm1
  590. mulpd %xmm0, %xmm2
  591. movaps -12 * SIZE(AA), %xmm0
  592. addpd %xmm1, %xmm7
  593. movaps -12 * SIZE(BB), %xmm1
  594. addpd %xmm2, %xmm6
  595. pshufd $0x4e, %xmm1, %xmm2
  596. mulpd %xmm0, %xmm1
  597. mulpd %xmm0, %xmm2
  598. movaps -10 * SIZE(AA), %xmm0
  599. addpd %xmm1, %xmm5
  600. movaps -10 * SIZE(BB), %xmm1
  601. addpd %xmm2, %xmm4
  602. pshufd $0x4e, %xmm1, %xmm2
  603. mulpd %xmm0, %xmm1
  604. mulpd %xmm0, %xmm2
  605. movaps -8 * SIZE(AA), %xmm0
  606. addpd %xmm1, %xmm7
  607. movaps -8 * SIZE(BB), %xmm1
  608. addpd %xmm2, %xmm6
  609. PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
  610. pshufd $0x4e, %xmm1, %xmm2
  611. mulpd %xmm0, %xmm1
  612. mulpd %xmm0, %xmm2
  613. movaps -6 * SIZE(AA), %xmm0
  614. addpd %xmm1, %xmm5
  615. movaps -6 * SIZE(BB), %xmm1
  616. addpd %xmm2, %xmm4
  617. pshufd $0x4e, %xmm1, %xmm2
  618. mulpd %xmm0, %xmm1
  619. mulpd %xmm0, %xmm2
  620. movaps -4 * SIZE(AA), %xmm0
  621. addpd %xmm1, %xmm7
  622. movaps -4 * SIZE(BB), %xmm1
  623. addpd %xmm2, %xmm6
  624. pshufd $0x4e, %xmm1, %xmm2
  625. mulpd %xmm0, %xmm1
  626. mulpd %xmm0, %xmm2
  627. movaps -2 * SIZE(AA), %xmm0
  628. addpd %xmm1, %xmm5
  629. movaps -2 * SIZE(BB), %xmm1
  630. addpd %xmm2, %xmm4
  631. pshufd $0x4e, %xmm1, %xmm2
  632. mulpd %xmm0, %xmm1
  633. mulpd %xmm0, %xmm2
  634. movaps 0 * SIZE(AA), %xmm0
  635. addpd %xmm1, %xmm7
  636. movaps 0 * SIZE(BB), %xmm1
  637. addpd %xmm2, %xmm6
  638. subl $-16 * SIZE, AA
  639. subl $-16 * SIZE, BB
  640. subl $1, %eax
  641. jne .L42
  642. ALIGN_4
  643. .L45:
  644. #if defined(LT) || defined(RN)
  645. movl KK, %eax
  646. #else
  647. movl K, %eax
  648. subl KK, %eax
  649. #endif
  650. andl $7, %eax # k & 7 remainder
  651. BRANCH
  652. je .L48
  653. ALIGN_3
  654. .L46:
  655. pshufd $0x4e, %xmm1, %xmm2
  656. mulpd %xmm0, %xmm1
  657. mulpd %xmm0, %xmm2
  658. movaps -14 * SIZE(AA), %xmm0
  659. addpd %xmm1, %xmm5
  660. movaps -14 * SIZE(BB), %xmm1
  661. addpd %xmm2, %xmm4
  662. addl $2 * SIZE, AA
  663. addl $2 * SIZE, BB
  664. decl %eax
  665. jg .L46
  666. ALIGN_4
  667. .L48:
  668. addpd %xmm6, %xmm4
  669. addpd %xmm7, %xmm5
  670. movaps %xmm4, %xmm0
  671. movsd %xmm5, %xmm4
  672. movsd %xmm0, %xmm5
  673. #if defined(LN) || defined(RT)
  674. movl KK, %eax
  675. #ifdef LN
  676. subl $2, %eax
  677. #else
  678. subl $2, %eax
  679. #endif
  680. movl AORIG, AA
  681. leal (, %eax, SIZE), %eax
  682. leal (AA, %eax, 2), AA
  683. leal (B, %eax, 2), BB
  684. #endif
  685. #if defined(LN) || defined(LT)
  686. movapd %xmm4, %xmm0
  687. unpcklpd %xmm5, %xmm4
  688. unpckhpd %xmm5, %xmm0
  689. movapd -16 * SIZE(BB), %xmm2
  690. movapd -14 * SIZE(BB), %xmm3
  691. subpd %xmm4, %xmm2
  692. subpd %xmm0, %xmm3
  693. #else
  694. movapd -16 * SIZE(AA), %xmm0
  695. movapd -14 * SIZE(AA), %xmm1
  696. subpd %xmm4, %xmm0
  697. subpd %xmm5, %xmm1
  698. #endif
  699. #ifdef LN
  700. movddup -13 * SIZE(AA), %xmm4
  701. mulpd %xmm4, %xmm3
  702. movddup -14 * SIZE(AA), %xmm4
  703. mulpd %xmm3, %xmm4
  704. subpd %xmm4, %xmm2
  705. movddup -16 * SIZE(AA), %xmm4
  706. mulpd %xmm4, %xmm2
  707. #endif
  708. #ifdef LT
  709. movddup -16 * SIZE(AA), %xmm4
  710. mulpd %xmm4, %xmm2
  711. movddup -15 * SIZE(AA), %xmm4
  712. mulpd %xmm2, %xmm4
  713. subpd %xmm4, %xmm3
  714. movddup -13 * SIZE(AA), %xmm4
  715. mulpd %xmm4, %xmm3
  716. #endif
  717. #ifdef RN
  718. movddup -16 * SIZE(BB), %xmm4
  719. mulpd %xmm4, %xmm0
  720. movddup -15 * SIZE(BB), %xmm4
  721. mulpd %xmm0, %xmm4
  722. subpd %xmm4, %xmm1
  723. movddup -13 * SIZE(BB), %xmm4
  724. mulpd %xmm4, %xmm1
  725. #endif
  726. #ifdef RT
  727. movddup -13 * SIZE(BB), %xmm4
  728. mulpd %xmm4, %xmm1
  729. movddup -14 * SIZE(BB), %xmm4
  730. mulpd %xmm1, %xmm4
  731. subpd %xmm4, %xmm0
  732. movddup -16 * SIZE(BB), %xmm4
  733. mulpd %xmm4, %xmm0
  734. #endif
  735. #if defined(LN) || defined(LT)
  736. movapd %xmm2, -16 * SIZE(BB)
  737. movapd %xmm3, -14 * SIZE(BB)
  738. #else
  739. movapd %xmm0, -16 * SIZE(AA)
  740. movapd %xmm1, -14 * SIZE(AA)
  741. #endif
  742. #ifdef LN
  743. subl $2 * SIZE, CO1
  744. #endif
  745. #if defined(LN) || defined(LT)
  746. movsd %xmm2, 0 * SIZE(CO1)
  747. movsd %xmm3, 1 * SIZE(CO1)
  748. movhps %xmm2, 0 * SIZE(CO1, LDC, 1)
  749. movhps %xmm3, 1 * SIZE(CO1, LDC, 1)
  750. #else
  751. movsd %xmm0, 0 * SIZE(CO1)
  752. movhps %xmm0, 1 * SIZE(CO1)
  753. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  754. movhps %xmm1, 1 * SIZE(CO1, LDC, 1)
  755. #endif
  756. #ifndef LN
  757. addl $2 * SIZE, CO1
  758. #endif
  759. #if defined(LT) || defined(RN)
  760. movl K, %eax
  761. subl KK, %eax
  762. leal (,%eax, SIZE), %eax
  763. leal (AA, %eax, 2), AA
  764. leal (BB, %eax, 2), BB
  765. #endif
  766. #ifdef LN
  767. subl $2, KK
  768. #endif
  769. #ifdef LT
  770. addl $2, KK
  771. #endif
  772. #ifdef RT
  773. movl K, %eax
  774. sall $1 + BASE_SHIFT, %eax
  775. addl %eax, AORIG
  776. #endif
  777. decl %ebx # i --
  778. jg .L41
  779. ALIGN_4
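# .L50: leftover single row against the column pair (1 x 2).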
  780. .L50:
  781. movl M, %ebx
  782. testl $1, %ebx # if (m & 1)
  783. jle .L59
  784. #ifdef LN
  785. movl K, %eax
  786. sall $BASE_SHIFT, %eax
  787. subl %eax, AORIG
  788. #endif
  789. #if defined(LN) || defined(RT)
  790. movl KK, %eax
  791. movl AORIG, AA
  792. leal (AA, %eax, SIZE), AA
  793. #endif
  794. movl B, BB
  795. #if defined(LN) || defined(RT)
  796. movl KK, %eax
  797. sall $1 + BASE_SHIFT, %eax
  798. addl %eax, BB
  799. #endif
  800. movaps -16 * SIZE(AA), %xmm0
  801. pxor %xmm4, %xmm4
  802. movaps -16 * SIZE(BB), %xmm2
  803. pxor %xmm5, %xmm5
  804. #if defined(LT) || defined(RN)
  805. movl KK, %eax
  806. #else
  807. movl K, %eax
  808. subl KK, %eax
  809. #endif
  810. sarl $3, %eax
  811. je .L55
  812. ALIGN_4
  813. .L52:
  814. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  815. pshufd $0x44, %xmm0, %xmm1
  816. mulpd %xmm1, %xmm2
  817. addpd %xmm2, %xmm4
  818. movaps -14 * SIZE(BB), %xmm2
  819. pshufd $0xee, %xmm0, %xmm1
  820. movaps -14 * SIZE(AA), %xmm0
  821. mulpd %xmm1, %xmm2
  822. addpd %xmm2, %xmm5
  823. movaps -12 * SIZE(BB), %xmm2
  824. pshufd $0x44, %xmm0, %xmm1
  825. mulpd %xmm1, %xmm2
  826. addpd %xmm2, %xmm4
  827. movaps -10 * SIZE(BB), %xmm2
  828. pshufd $0xee, %xmm0, %xmm1
  829. movaps -12 * SIZE(AA), %xmm0
  830. mulpd %xmm1, %xmm2
  831. addpd %xmm2, %xmm5
  832. movaps -8 * SIZE(BB), %xmm2
  833. pshufd $0x44, %xmm0, %xmm1
  834. mulpd %xmm1, %xmm2
  835. addpd %xmm2, %xmm4
  836. movaps -6 * SIZE(BB), %xmm2
  837. pshufd $0xee, %xmm0, %xmm1
  838. movaps -10 * SIZE(AA), %xmm0
  839. mulpd %xmm1, %xmm2
  840. addpd %xmm2, %xmm5
  841. movaps -4 * SIZE(BB), %xmm2
  842. pshufd $0x44, %xmm0, %xmm1
  843. mulpd %xmm1, %xmm2
  844. addpd %xmm2, %xmm4
  845. movaps -2 * SIZE(BB), %xmm2
  846. pshufd $0xee, %xmm0, %xmm1
  847. movaps -8 * SIZE(AA), %xmm0
  848. mulpd %xmm1, %xmm2
  849. addpd %xmm2, %xmm5
  850. movaps 0 * SIZE(BB), %xmm2
  851. subl $ -8 * SIZE, AA
  852. subl $-16 * SIZE, BB
  853. subl $1, %eax
  854. jne .L52
  855. ALIGN_4
  856. .L55:
  857. #if defined(LT) || defined(RN)
  858. movl KK, %eax
  859. #else
  860. movl K, %eax
  861. subl KK, %eax
  862. #endif
  863. andl $7, %eax
  864. BRANCH
  865. je .L58
  866. .L56:
  867. pshufd $0x44, %xmm0, %xmm1
  868. movsd -15 * SIZE(AA), %xmm0
  869. mulpd %xmm1, %xmm2
  870. addpd %xmm2, %xmm4
  871. movaps -14 * SIZE(BB), %xmm2
  872. addl $1 * SIZE, AA
  873. addl $2 * SIZE, BB
  874. decl %eax
  875. jg .L56
  876. ALIGN_4
  877. .L58:
  878. addpd %xmm5, %xmm4
  879. #if defined(LN) || defined(RT)
  880. movl KK, %eax
  881. #ifdef LN
  882. subl $1, %eax
  883. #else
  884. subl $2, %eax
  885. #endif
  886. movl AORIG, AA
  887. leal (, %eax, SIZE), %eax
  888. addl %eax, AA
  889. leal (B, %eax, 2), BB
  890. #endif
  891. #if defined(LN) || defined(LT)
  892. movapd -16 * SIZE(BB), %xmm0
  893. subpd %xmm4, %xmm0
  894. #else
  895. movapd -16 * SIZE(AA), %xmm1
  896. subpd %xmm4, %xmm1
  897. movapd %xmm1, %xmm0
  898. unpckhpd %xmm1, %xmm1
  899. #endif
  900. #ifdef LN
  901. movddup -16 * SIZE(AA), %xmm4
  902. mulpd %xmm4, %xmm0
  903. #endif
  904. #ifdef LT
  905. movddup -16 * SIZE(AA), %xmm4
  906. mulpd %xmm4, %xmm0
  907. #endif
  908. #ifdef RN
  909. movsd -16 * SIZE(BB), %xmm4
  910. mulsd %xmm4, %xmm0
  911. movsd -15 * SIZE(BB), %xmm4
  912. mulsd %xmm0, %xmm4
  913. subsd %xmm4, %xmm1
  914. movsd -13 * SIZE(BB), %xmm4
  915. mulsd %xmm4, %xmm1
  916. #endif
  917. #ifdef RT
  918. movsd -13 * SIZE(BB), %xmm4
  919. mulsd %xmm4, %xmm1
  920. movsd -14 * SIZE(BB), %xmm4
  921. mulsd %xmm1, %xmm4
  922. subsd %xmm4, %xmm0
  923. movsd -16 * SIZE(BB), %xmm4
  924. mulsd %xmm4, %xmm0
  925. #endif
  926. #if defined(LN) || defined(LT)
  927. movapd %xmm0, -16 * SIZE(BB)
  928. #else
  929. movsd %xmm0, -16 * SIZE(AA)
  930. movsd %xmm1, -15 * SIZE(AA)
  931. #endif
  932. #ifdef LN
  933. subl $1 * SIZE, CO1
  934. #endif
  935. #if defined(LN) || defined(LT)
  936. movsd %xmm0, 0 * SIZE(CO1)
  937. movhps %xmm0, 0 * SIZE(CO1, LDC, 1)
  938. #else
  939. movsd %xmm0, 0 * SIZE(CO1)
  940. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  941. #endif
  942. #ifndef LN
  943. addl $1 * SIZE, CO1
  944. #endif
  945. #if defined(LT) || defined(RN)
  946. movl K, %eax
  947. subl KK, %eax
  948. leal (,%eax, SIZE), %eax
  949. leal (AA, %eax, 1), AA
  950. leal (BB, %eax, 2), BB
  951. #endif
  952. #ifdef LN
  953. subl $1, KK
  954. #endif
  955. #ifdef LT
  956. addl $1, KK
  957. #endif
  958. #ifdef RT
  959. movl K, %eax
  960. sall $1 + BASE_SHIFT, %eax
  961. addl %eax, AORIG
  962. #endif
  963. ALIGN_4
  964. .L59:
  965. #ifdef LN
  966. movl K, %eax
  967. leal (, %eax, SIZE), %eax
  968. leal (B, %eax, 2), B
  969. #endif
  970. #if defined(LT) || defined(RN)
  971. movl BB, B
  972. #endif
  973. #ifdef RN
  974. addl $2, KK
  975. #endif
  976. #ifdef RT
  977. subl $2, KK
  978. #endif
  979. ALIGN_4
  980. .L60:
  981. movl N, %eax
  982. sarl $2, %eax
  983. movl %eax, J
  984. jle .L999
  985. ALIGN_4
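# .L10: main loop over blocks of four columns. .L11 is the 2 x 4 micro-kernel
# this file is named after; .L20 handles a leftover row.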
  986. .L10:
  987. #if defined(LT) || defined(RN)
  988. movl A, AA
  989. #else
  990. movl A, %eax
  991. movl %eax, AORIG
  992. #endif
  993. #ifdef RT
  994. movl K, %eax
  995. sall $2 + BASE_SHIFT, %eax
  996. subl %eax, B
  997. #endif
  998. leal (, LDC, 4), %eax
  999. #ifdef RT
  1000. subl %eax, C
  1001. #endif
  1002. movl C, CO1
  1003. #ifndef RT
  1004. addl %eax, C
  1005. #endif
  1006. #ifdef LN
  1007. movl OFFSET, %eax
  1008. addl M, %eax
  1009. movl %eax, KK
  1010. #endif
  1011. #ifdef LT
  1012. movl OFFSET, %eax
  1013. movl %eax, KK
  1014. #endif
  1015. movl M, %ebx
  1016. sarl $1, %ebx # i = (m >> 1)
  1017. jle .L20
  1018. ALIGN_4
  1019. .L11:
  1020. #ifdef LN
  1021. movl K, %eax
  1022. sall $1 + BASE_SHIFT, %eax
  1023. subl %eax, AORIG
  1024. #endif
  1025. #if defined(LN) || defined(RT)
  1026. movl KK, %eax
  1027. movl AORIG, AA
  1028. leal (, %eax, SIZE), %eax
  1029. leal (AA, %eax, 2), AA
  1030. #endif
  1031. movl B, BB
  1032. #if defined(LN) || defined(RT)
  1033. movl KK, %eax
  1034. sall $2 + BASE_SHIFT, %eax
  1035. addl %eax, BB
  1036. #endif
  1037. leal (CO1, LDC, 2), %eax
  1038. movaps -16 * SIZE(AA), %xmm0
  1039. pxor %xmm2, %xmm2
  1040. movaps -16 * SIZE(BB), %xmm1
  1041. pxor %xmm3, %xmm3
  1042. #ifdef LN
  1043. pxor %xmm4, %xmm4
  1044. prefetcht0 -2 * SIZE(CO1)
  1045. pxor %xmm5, %xmm5
  1046. prefetcht0 -2 * SIZE(CO1, LDC)
  1047. pxor %xmm6, %xmm6
  1048. prefetcht0 -2 * SIZE(%eax)
  1049. pxor %xmm7, %xmm7
  1050. prefetcht0 -2 * SIZE(%eax, LDC)
  1051. #else
  1052. pxor %xmm4, %xmm4
  1053. prefetcht0 1 * SIZE(CO1)
  1054. pxor %xmm5, %xmm5
  1055. prefetcht0 1 * SIZE(CO1, LDC)
  1056. pxor %xmm6, %xmm6
  1057. prefetcht0 1 * SIZE(%eax)
  1058. pxor %xmm7, %xmm7
  1059. prefetcht0 1 * SIZE(%eax, LDC)
  1060. #endif
  1061. #if defined(LT) || defined(RN)
  1062. movl KK, %eax
  1063. #else
  1064. movl K, %eax
  1065. subl KK, %eax
  1066. #endif
  1067. sarl $3, %eax
  1068. je .L15
  1069. ALIGN_4
  1070. .L12:
  1071. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1072. addpd %xmm3, %xmm7
  1073. movaps -14 * SIZE(BB), %xmm3
  1074. addpd %xmm2, %xmm6
  1075. pshufd $0x4e, %xmm1, %xmm2
  1076. mulpd %xmm0, %xmm1
  1077. mulpd %xmm0, %xmm2
  1078. addpd %xmm1, %xmm5
  1079. movaps -12 * SIZE(BB), %xmm1
  1080. addpd %xmm2, %xmm4
  1081. pshufd $0x4e, %xmm3, %xmm2
  1082. mulpd %xmm0, %xmm3
  1083. mulpd %xmm0, %xmm2
  1084. movaps -14 * SIZE(AA), %xmm0
  1085. addpd %xmm3, %xmm7
  1086. movaps -10 * SIZE(BB), %xmm3
  1087. addpd %xmm2, %xmm6
  1088. pshufd $0x4e, %xmm1, %xmm2
  1089. mulpd %xmm0, %xmm1
  1090. mulpd %xmm0, %xmm2
  1091. addpd %xmm1, %xmm5
  1092. movaps -8 * SIZE(BB), %xmm1
  1093. addpd %xmm2, %xmm4
  1094. pshufd $0x4e, %xmm3, %xmm2
  1095. mulpd %xmm0, %xmm3
  1096. mulpd %xmm0, %xmm2
  1097. movaps -12 * SIZE(AA), %xmm0
  1098. addpd %xmm3, %xmm7
  1099. movaps -6 * SIZE(BB), %xmm3
  1100. addpd %xmm2, %xmm6
  1101. pshufd $0x4e, %xmm1, %xmm2
  1102. mulpd %xmm0, %xmm1
  1103. mulpd %xmm0, %xmm2
  1104. addpd %xmm1, %xmm5
  1105. movaps -4 * SIZE(BB), %xmm1
  1106. addpd %xmm2, %xmm4
  1107. pshufd $0x4e, %xmm3, %xmm2
  1108. mulpd %xmm0, %xmm3
  1109. mulpd %xmm0, %xmm2
  1110. movaps -10 * SIZE(AA), %xmm0
  1111. addpd %xmm3, %xmm7
  1112. movaps -2 * SIZE(BB), %xmm3
  1113. addpd %xmm2, %xmm6
  1114. pshufd $0x4e, %xmm1, %xmm2
  1115. mulpd %xmm0, %xmm1
  1116. mulpd %xmm0, %xmm2
  1117. addpd %xmm1, %xmm5
  1118. movaps 0 * SIZE(BB), %xmm1
  1119. addpd %xmm2, %xmm4
  1120. pshufd $0x4e, %xmm3, %xmm2
  1121. mulpd %xmm0, %xmm3
  1122. mulpd %xmm0, %xmm2
  1123. movaps -8 * SIZE(AA), %xmm0
  1124. PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
  1125. addpd %xmm3, %xmm7
  1126. movaps 2 * SIZE(BB), %xmm3
  1127. addpd %xmm2, %xmm6
  1128. pshufd $0x4e, %xmm1, %xmm2
  1129. mulpd %xmm0, %xmm1
  1130. mulpd %xmm0, %xmm2
  1131. addpd %xmm1, %xmm5
  1132. movaps 4 * SIZE(BB), %xmm1
  1133. addpd %xmm2, %xmm4
  1134. pshufd $0x4e, %xmm3, %xmm2
  1135. mulpd %xmm0, %xmm3
  1136. mulpd %xmm0, %xmm2
  1137. movaps -6 * SIZE(AA), %xmm0
  1138. addpd %xmm3, %xmm7
  1139. movaps 6 * SIZE(BB), %xmm3
  1140. addpd %xmm2, %xmm6
  1141. pshufd $0x4e, %xmm1, %xmm2
  1142. mulpd %xmm0, %xmm1
  1143. mulpd %xmm0, %xmm2
  1144. addpd %xmm1, %xmm5
  1145. movaps 8 * SIZE(BB), %xmm1
  1146. addpd %xmm2, %xmm4
  1147. pshufd $0x4e, %xmm3, %xmm2
  1148. mulpd %xmm0, %xmm3
  1149. mulpd %xmm0, %xmm2
  1150. movaps -4 * SIZE(AA), %xmm0
  1151. addpd %xmm3, %xmm7
  1152. movaps 10 * SIZE(BB), %xmm3
  1153. addpd %xmm2, %xmm6
  1154. pshufd $0x4e, %xmm1, %xmm2
  1155. mulpd %xmm0, %xmm1
  1156. mulpd %xmm0, %xmm2
  1157. addpd %xmm1, %xmm5
  1158. movaps 12 * SIZE(BB), %xmm1
  1159. addpd %xmm2, %xmm4
  1160. pshufd $0x4e, %xmm3, %xmm2
  1161. mulpd %xmm0, %xmm3
  1162. mulpd %xmm0, %xmm2
  1163. movaps -2 * SIZE(AA), %xmm0
  1164. addpd %xmm3, %xmm7
  1165. movaps 14 * SIZE(BB), %xmm3
  1166. addpd %xmm2, %xmm6
  1167. pshufd $0x4e, %xmm1, %xmm2
  1168. mulpd %xmm0, %xmm1
  1169. mulpd %xmm0, %xmm2
  1170. addpd %xmm1, %xmm5
  1171. movaps 16 * SIZE(BB), %xmm1
  1172. addpd %xmm2, %xmm4
  1173. pshufd $0x4e, %xmm3, %xmm2
  1174. mulpd %xmm0, %xmm3
  1175. subl $-32 * SIZE, BB
  1176. mulpd %xmm0, %xmm2
  1177. movaps 0 * SIZE(AA), %xmm0
  1178. subl $-16 * SIZE, AA
  1179. subl $1, %eax
  1180. jne .L12
  1181. ALIGN_4
  1182. .L15:
  1183. #if defined(LT) || defined(RN)
  1184. movl KK, %eax
  1185. #else
  1186. movl K, %eax
  1187. subl KK, %eax
  1188. #endif
  1189. andl $7, %eax # k & 7 remainder
  1190. BRANCH
  1191. je .L18
  1192. ALIGN_3
  1193. .L16:
  1194. addpd %xmm3, %xmm7
  1195. movaps -14 * SIZE(BB), %xmm3
  1196. addpd %xmm2, %xmm6
  1197. pshufd $0x4e, %xmm1, %xmm2
  1198. mulpd %xmm0, %xmm1
  1199. mulpd %xmm0, %xmm2
  1200. addpd %xmm1, %xmm5
  1201. movaps -12 * SIZE(BB), %xmm1
  1202. addpd %xmm2, %xmm4
  1203. pshufd $0x4e, %xmm3, %xmm2
  1204. mulpd %xmm0, %xmm3
  1205. mulpd %xmm0, %xmm2
  1206. movaps -14 * SIZE(AA), %xmm0
  1207. addl $2 * SIZE, AA
  1208. addl $4 * SIZE, BB
  1209. decl %eax
  1210. jg .L16
  1211. ALIGN_4
  1212. .L18:
  1213. #if defined(LN) || defined(RT)
  1214. movl KK, %eax
  1215. #ifdef LN
  1216. subl $2, %eax
  1217. #else
  1218. subl $4, %eax
  1219. #endif
  1220. movl AORIG, AA
  1221. leal (, %eax, SIZE), %eax
  1222. leal (AA, %eax, 2), AA
  1223. leal (B, %eax, 4), BB
  1224. #endif
  1225. addpd %xmm2, %xmm6
  1226. addpd %xmm3, %xmm7
  1227. movaps %xmm4, %xmm0
  1228. movsd %xmm5, %xmm4
  1229. movsd %xmm0, %xmm5
  1230. movaps %xmm6, %xmm0
  1231. movsd %xmm7, %xmm6
  1232. movsd %xmm0, %xmm7
  1233. #if defined(LN) || defined(LT)
  1234. movapd %xmm4, %xmm0
  1235. unpcklpd %xmm5, %xmm4
  1236. unpckhpd %xmm5, %xmm0
  1237. movapd %xmm6, %xmm1
  1238. unpcklpd %xmm7, %xmm6
  1239. unpckhpd %xmm7, %xmm1
  1240. movapd -16 * SIZE(BB), %xmm2
  1241. movapd -14 * SIZE(BB), %xmm5
  1242. movapd -12 * SIZE(BB), %xmm3
  1243. movapd -10 * SIZE(BB), %xmm7
  1244. subpd %xmm4, %xmm2
  1245. subpd %xmm6, %xmm5
  1246. subpd %xmm0, %xmm3
  1247. subpd %xmm1, %xmm7
  1248. #else
  1249. movapd -16 * SIZE(AA), %xmm0
  1250. movapd -14 * SIZE(AA), %xmm1
  1251. movapd -12 * SIZE(AA), %xmm2
  1252. movapd -10 * SIZE(AA), %xmm3
  1253. subpd %xmm4, %xmm0
  1254. subpd %xmm5, %xmm1
  1255. subpd %xmm6, %xmm2
  1256. subpd %xmm7, %xmm3
  1257. #endif
  1258. #ifdef LN
  1259. movddup -13 * SIZE(AA), %xmm4
  1260. mulpd %xmm4, %xmm3
  1261. mulpd %xmm4, %xmm7
  1262. movddup -14 * SIZE(AA), %xmm4
  1263. movapd %xmm4, %xmm6
  1264. mulpd %xmm3, %xmm4
  1265. subpd %xmm4, %xmm2
  1266. mulpd %xmm7, %xmm6
  1267. subpd %xmm6, %xmm5
  1268. movddup -16 * SIZE(AA), %xmm4
  1269. mulpd %xmm4, %xmm2
  1270. mulpd %xmm4, %xmm5
  1271. #endif
  1272. #ifdef LT
  1273. movddup -16 * SIZE(AA), %xmm4
  1274. mulpd %xmm4, %xmm2
  1275. mulpd %xmm4, %xmm5
  1276. movddup -15 * SIZE(AA), %xmm4
  1277. movapd %xmm4, %xmm6
  1278. mulpd %xmm2, %xmm4
  1279. subpd %xmm4, %xmm3
  1280. mulpd %xmm5, %xmm6
  1281. subpd %xmm6, %xmm7
  1282. movddup -13 * SIZE(AA), %xmm4
  1283. mulpd %xmm4, %xmm3
  1284. mulpd %xmm4, %xmm7
  1285. #endif
  1286. #ifdef RN
  1287. movddup -16 * SIZE(BB), %xmm4
  1288. mulpd %xmm4, %xmm0
  1289. movddup -15 * SIZE(BB), %xmm4
  1290. mulpd %xmm0, %xmm4
  1291. subpd %xmm4, %xmm1
  1292. movddup -14 * SIZE(BB), %xmm4
  1293. mulpd %xmm0, %xmm4
  1294. subpd %xmm4, %xmm2
  1295. movddup -13 * SIZE(BB), %xmm4
  1296. mulpd %xmm0, %xmm4
  1297. subpd %xmm4, %xmm3
  1298. movddup -11 * SIZE(BB), %xmm4
  1299. mulpd %xmm4, %xmm1
  1300. movddup -10 * SIZE(BB), %xmm4
  1301. mulpd %xmm1, %xmm4
  1302. subpd %xmm4, %xmm2
  1303. movddup -9 * SIZE(BB), %xmm4
  1304. mulpd %xmm1, %xmm4
  1305. subpd %xmm4, %xmm3
  1306. movddup -6 * SIZE(BB), %xmm4
  1307. mulpd %xmm4, %xmm2
  1308. movddup -5 * SIZE(BB), %xmm4
  1309. mulpd %xmm2, %xmm4
  1310. subpd %xmm4, %xmm3
  1311. movddup -1 * SIZE(BB), %xmm4
  1312. mulpd %xmm4, %xmm3
  1313. #endif
  1314. #ifdef RT
  1315. movddup -1 * SIZE(BB), %xmm4
  1316. mulpd %xmm4, %xmm3
  1317. movddup -2 * SIZE(BB), %xmm4
  1318. mulpd %xmm3, %xmm4
  1319. subpd %xmm4, %xmm2
  1320. movddup -3 * SIZE(BB), %xmm4
  1321. mulpd %xmm3, %xmm4
  1322. subpd %xmm4, %xmm1
  1323. movddup -4 * SIZE(BB), %xmm4
  1324. mulpd %xmm3, %xmm4
  1325. subpd %xmm4, %xmm0
  1326. movddup -6 * SIZE(BB), %xmm4
  1327. mulpd %xmm4, %xmm2
  1328. movddup -7 * SIZE(BB), %xmm4
  1329. mulpd %xmm2, %xmm4
  1330. subpd %xmm4, %xmm1
  1331. movddup -8 * SIZE(BB), %xmm4
  1332. mulpd %xmm2, %xmm4
  1333. subpd %xmm4, %xmm0
  1334. movddup -11 * SIZE(BB), %xmm4
  1335. mulpd %xmm4, %xmm1
  1336. movddup -12 * SIZE(BB), %xmm4
  1337. mulpd %xmm1, %xmm4
  1338. subpd %xmm4, %xmm0
  1339. movddup -16 * SIZE(BB), %xmm4
  1340. mulpd %xmm4, %xmm0
  1341. #endif
  1342. #if defined(LN) || defined(LT)
  1343. movapd %xmm2, -16 * SIZE(BB)
  1344. movapd %xmm5, -14 * SIZE(BB)
  1345. movapd %xmm3, -12 * SIZE(BB)
  1346. movapd %xmm7, -10 * SIZE(BB)
  1347. #else
  1348. movapd %xmm0, -16 * SIZE(AA)
  1349. movapd %xmm1, -14 * SIZE(AA)
  1350. movapd %xmm2, -12 * SIZE(AA)
  1351. movapd %xmm3, -10 * SIZE(AA)
  1352. #endif
  1353. #ifdef LN
  1354. subl $2 * SIZE, CO1
  1355. #endif
  1356. leal (LDC, LDC, 2), %eax
  1357. #if defined(LN) || defined(LT)
  1358. movsd %xmm2, 0 * SIZE(CO1)
  1359. movsd %xmm3, 1 * SIZE(CO1)
  1360. movhps %xmm2, 0 * SIZE(CO1, LDC, 1)
  1361. movhps %xmm3, 1 * SIZE(CO1, LDC, 1)
  1362. movsd %xmm5, 0 * SIZE(CO1, LDC, 2)
  1363. movsd %xmm7, 1 * SIZE(CO1, LDC, 2)
  1364. movhps %xmm5, 0 * SIZE(CO1, %eax, 1)
  1365. movhps %xmm7, 1 * SIZE(CO1, %eax, 1)
  1366. #else
  1367. movsd %xmm0, 0 * SIZE(CO1)
  1368. movhps %xmm0, 1 * SIZE(CO1)
  1369. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1370. movhps %xmm1, 1 * SIZE(CO1, LDC, 1)
  1371. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  1372. movhps %xmm2, 1 * SIZE(CO1, LDC, 2)
  1373. movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
  1374. movhps %xmm3, 1 * SIZE(CO1, %eax, 1)
  1375. #endif
  1376. #ifndef LN
  1377. addl $2 * SIZE, CO1
  1378. #endif
  1379. #if defined(LT) || defined(RN)
  1380. movl K, %eax
  1381. subl KK, %eax
  1382. leal (,%eax, SIZE), %eax
  1383. leal (AA, %eax, 2), AA
  1384. leal (BB, %eax, 4), BB
  1385. #endif
  1386. #ifdef LN
  1387. subl $2, KK
  1388. #endif
  1389. #ifdef LT
  1390. addl $2, KK
  1391. #endif
  1392. #ifdef RT
  1393. movl K, %eax
  1394. sall $1 + BASE_SHIFT, %eax
  1395. addl %eax, AORIG
  1396. #endif
  1397. decl %ebx # i --
  1398. jg .L11
  1399. ALIGN_4
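# .L20: leftover single row against the four columns (1 x 4).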
  1400. .L20:
  1401. movl M, %ebx
  1402. testl $1, %ebx # if (m & 1)
  1403. jle .L29
  1404. #ifdef LN
  1405. movl K, %eax
  1406. sall $BASE_SHIFT, %eax
  1407. subl %eax, AORIG
  1408. #endif
  1409. #if defined(LN) || defined(RT)
  1410. movl KK, %eax
  1411. movl AORIG, AA
  1412. leal (AA, %eax, SIZE), AA
  1413. #endif
  1414. movl B, BB
  1415. #if defined(LN) || defined(RT)
  1416. movl KK, %eax
  1417. sall $2 + BASE_SHIFT, %eax
  1418. addl %eax, BB
  1419. #endif
  1420. movaps -16 * SIZE(AA), %xmm0
  1421. pxor %xmm4, %xmm4
  1422. movaps -16 * SIZE(BB), %xmm2
  1423. pxor %xmm5, %xmm5
  1424. movaps -14 * SIZE(BB), %xmm3
  1425. pxor %xmm6, %xmm6
  1426. pxor %xmm7, %xmm7
  1427. #if defined(LT) || defined(RN)
  1428. movl KK, %eax
  1429. #else
  1430. movl K, %eax
  1431. subl KK, %eax
  1432. #endif
  1433. sarl $3, %eax
  1434. je .L25
  1435. ALIGN_4
  1436. .L22:
  1437. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1438. pshufd $0x44, %xmm0, %xmm1
  1439. mulpd %xmm1, %xmm2
  1440. mulpd %xmm1, %xmm3
  1441. addpd %xmm2, %xmm4
  1442. movaps -12 * SIZE(BB), %xmm2
  1443. addpd %xmm3, %xmm5
  1444. movaps -10 * SIZE(BB), %xmm3
  1445. pshufd $0xee, %xmm0, %xmm1
  1446. movaps -14 * SIZE(AA), %xmm0
  1447. mulpd %xmm1, %xmm2
  1448. mulpd %xmm1, %xmm3
  1449. addpd %xmm2, %xmm6
  1450. movaps -8 * SIZE(BB), %xmm2
  1451. addpd %xmm3, %xmm7
  1452. movaps -6 * SIZE(BB), %xmm3
  1453. pshufd $0x44, %xmm0, %xmm1
  1454. mulpd %xmm1, %xmm2
  1455. mulpd %xmm1, %xmm3
  1456. addpd %xmm2, %xmm4
  1457. movaps -4 * SIZE(BB), %xmm2
  1458. addpd %xmm3, %xmm5
  1459. movaps -2 * SIZE(BB), %xmm3
  1460. pshufd $0xee, %xmm0, %xmm1
  1461. movaps -12 * SIZE(AA), %xmm0
  1462. mulpd %xmm1, %xmm2
  1463. mulpd %xmm1, %xmm3
  1464. addpd %xmm2, %xmm6
  1465. movaps 0 * SIZE(BB), %xmm2
  1466. addpd %xmm3, %xmm7
  1467. movaps 2 * SIZE(BB), %xmm3
  1468. pshufd $0x44, %xmm0, %xmm1
  1469. mulpd %xmm1, %xmm2
  1470. mulpd %xmm1, %xmm3
  1471. addpd %xmm2, %xmm4
  1472. movaps 4 * SIZE(BB), %xmm2
  1473. addpd %xmm3, %xmm5
  1474. movaps 6 * SIZE(BB), %xmm3
  1475. pshufd $0xee, %xmm0, %xmm1
  1476. movaps -10 * SIZE(AA), %xmm0
  1477. mulpd %xmm1, %xmm2
  1478. mulpd %xmm1, %xmm3
  1479. addpd %xmm2, %xmm6
  1480. movaps 8 * SIZE(BB), %xmm2
  1481. addpd %xmm3, %xmm7
  1482. movaps 10 * SIZE(BB), %xmm3
  1483. pshufd $0x44, %xmm0, %xmm1
  1484. mulpd %xmm1, %xmm2
  1485. mulpd %xmm1, %xmm3
  1486. addpd %xmm2, %xmm4
  1487. movaps 12 * SIZE(BB), %xmm2
  1488. addpd %xmm3, %xmm5
  1489. movaps 14 * SIZE(BB), %xmm3
  1490. pshufd $0xee, %xmm0, %xmm1
  1491. movaps -8 * SIZE(AA), %xmm0
  1492. mulpd %xmm1, %xmm2
  1493. mulpd %xmm1, %xmm3
  1494. addpd %xmm2, %xmm6
  1495. movaps 16 * SIZE(BB), %xmm2
  1496. addpd %xmm3, %xmm7
  1497. movaps 18 * SIZE(BB), %xmm3
  1498. subl $ -8 * SIZE, AA
  1499. subl $-32 * SIZE, BB
  1500. subl $1, %eax
  1501. jne .L22
  1502. ALIGN_4
  1503. .L25:
  1504. #if defined(LT) || defined(RN)
  1505. movl KK, %eax
  1506. #else
  1507. movl K, %eax
  1508. subl KK, %eax
  1509. #endif
  1510. andl $7, %eax
  1511. BRANCH
  1512. je .L28
  1513. .L26:
  1514. pshufd $0x44, %xmm0, %xmm1
  1515. movsd -15 * SIZE(AA), %xmm0
  1516. mulpd %xmm1, %xmm2
  1517. mulpd %xmm1, %xmm3
  1518. addpd %xmm2, %xmm4
  1519. movaps -12 * SIZE(BB), %xmm2
  1520. addpd %xmm3, %xmm5
  1521. movaps -10 * SIZE(BB), %xmm3
  1522. addl $1 * SIZE, AA
  1523. addl $4 * SIZE, BB
  1524. decl %eax
  1525. jg .L26
  1526. ALIGN_4
  1527. .L28:
  1528. addpd %xmm6, %xmm4
  1529. addpd %xmm7, %xmm5
  1530. #if defined(LN) || defined(RT)
  1531. movl KK, %eax
  1532. #ifdef LN
  1533. subl $1, %eax
  1534. #else
  1535. subl $4, %eax
  1536. #endif
  1537. movl AORIG, AA
  1538. leal (, %eax, SIZE), %eax
  1539. leal (AA, %eax, 1), AA
  1540. leal (B, %eax, 4), BB
  1541. #endif
  1542. #if defined(LN) || defined(LT)
  1543. movapd -16 * SIZE(BB), %xmm0
  1544. movapd -14 * SIZE(BB), %xmm1
  1545. subpd %xmm4, %xmm0
  1546. subpd %xmm5, %xmm1
  1547. #else
  1548. movapd -16 * SIZE(AA), %xmm1
  1549. movapd -14 * SIZE(AA), %xmm3
  1550. subpd %xmm4, %xmm1
  1551. subpd %xmm5, %xmm3
  1552. movapd %xmm1, %xmm0
  1553. unpckhpd %xmm1, %xmm1
  1554. movapd %xmm3, %xmm2
  1555. unpckhpd %xmm3, %xmm3
  1556. #endif
  1557. #ifdef LN
  1558. movddup -16 * SIZE(AA), %xmm4
  1559. mulpd %xmm4, %xmm0
  1560. mulpd %xmm4, %xmm1
  1561. #endif
  1562. #ifdef LT
  1563. movddup -16 * SIZE(AA), %xmm4
  1564. mulpd %xmm4, %xmm0
  1565. mulpd %xmm4, %xmm1
  1566. #endif
  1567. #ifdef RN
  1568. movsd -16 * SIZE(BB), %xmm4
  1569. mulsd %xmm4, %xmm0
  1570. movsd -15 * SIZE(BB), %xmm4
  1571. mulsd %xmm0, %xmm4
  1572. subsd %xmm4, %xmm1
  1573. movsd -14 * SIZE(BB), %xmm4
  1574. mulsd %xmm0, %xmm4
  1575. subsd %xmm4, %xmm2
  1576. movsd -13 * SIZE(BB), %xmm4
  1577. mulsd %xmm0, %xmm4
  1578. subsd %xmm4, %xmm3
  1579. movsd -11 * SIZE(BB), %xmm4
  1580. mulsd %xmm4, %xmm1
  1581. movsd -10 * SIZE(BB), %xmm4
  1582. mulsd %xmm1, %xmm4
  1583. subsd %xmm4, %xmm2
  1584. movsd -9 * SIZE(BB), %xmm4
  1585. mulsd %xmm1, %xmm4
  1586. subsd %xmm4, %xmm3
  1587. movsd -6 * SIZE(BB), %xmm4
  1588. mulsd %xmm4, %xmm2
  1589. movsd -5 * SIZE(BB), %xmm4
  1590. mulsd %xmm2, %xmm4
  1591. subsd %xmm4, %xmm3
  1592. movsd -1 * SIZE(BB), %xmm4
  1593. mulsd %xmm4, %xmm3
  1594. #endif
  1595. #ifdef RT
  1596. movsd -1 * SIZE(BB), %xmm4
  1597. mulsd %xmm4, %xmm3
  1598. movsd -2 * SIZE(BB), %xmm4
  1599. mulsd %xmm3, %xmm4
  1600. subsd %xmm4, %xmm2
  1601. movsd -3 * SIZE(BB), %xmm4
  1602. mulsd %xmm3, %xmm4
  1603. subsd %xmm4, %xmm1
  1604. movsd -4 * SIZE(BB), %xmm4
  1605. mulsd %xmm3, %xmm4
  1606. subsd %xmm4, %xmm0
  1607. movsd -6 * SIZE(BB), %xmm4
  1608. mulsd %xmm4, %xmm2
  1609. movsd -7 * SIZE(BB), %xmm4
  1610. mulsd %xmm2, %xmm4
  1611. subsd %xmm4, %xmm1
  1612. movsd -8 * SIZE(BB), %xmm4
  1613. mulsd %xmm2, %xmm4
  1614. subsd %xmm4, %xmm0
  1615. movsd -11 * SIZE(BB), %xmm4
  1616. mulsd %xmm4, %xmm1
  1617. movsd -12 * SIZE(BB), %xmm4
  1618. mulsd %xmm1, %xmm4
  1619. subsd %xmm4, %xmm0
  1620. movsd -16 * SIZE(BB), %xmm4
  1621. mulsd %xmm4, %xmm0
  1622. #endif
  1623. #if defined(LN) || defined(LT)
  1624. movapd %xmm0, -16 * SIZE(BB)
  1625. movapd %xmm1, -14 * SIZE(BB)
  1626. #else
  1627. movsd %xmm0, -16 * SIZE(AA)
  1628. movsd %xmm1, -15 * SIZE(AA)
  1629. movsd %xmm2, -14 * SIZE(AA)
  1630. movsd %xmm3, -13 * SIZE(AA)
  1631. #endif
  1632. #ifdef LN
  1633. subl $1 * SIZE, CO1
  1634. #endif
  1635. leal (LDC, LDC, 2), %eax
  1636. #if defined(LN) || defined(LT)
  1637. movsd %xmm0, 0 * SIZE(CO1)
  1638. movhps %xmm0, 0 * SIZE(CO1, LDC, 1)
  1639. movsd %xmm1, 0 * SIZE(CO1, LDC, 2)
  1640. movhps %xmm1, 0 * SIZE(CO1, %eax, 1)
  1641. #else
  1642. movsd %xmm0, 0 * SIZE(CO1)
  1643. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1644. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  1645. movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
  1646. #endif
  1647. #ifndef LN
  1648. addl $1 * SIZE, CO1
  1649. #endif
  1650. #if defined(LT) || defined(RN)
  1651. movl K, %eax
  1652. subl KK, %eax
  1653. leal (,%eax, SIZE), %eax
  1654. leal (AA, %eax, 1), AA
  1655. leal (BB, %eax, 4), BB
  1656. #endif
  1657. #ifdef LN
  1658. subl $1, KK
  1659. #endif
  1660. #ifdef LT
  1661. addl $1, KK
  1662. #endif
  1663. #ifdef RT
  1664. movl K, %eax
  1665. sall $BASE_SHIFT, %eax
  1666. addl %eax, AORIG
  1667. #endif
  1668. ALIGN_4
  1669. .L29:
  1670. #ifdef LN
  1671. movl K, %eax
  1672. leal (, %eax, SIZE), %eax
  1673. leal (B, %eax, 4), B
  1674. #endif
  1675. #if defined(LT) || defined(RN)
  1676. movl BB, B
  1677. #endif
  1678. #ifdef RN
  1679. addl $4, KK
  1680. #endif
  1681. #ifdef RT
  1682. subl $4, KK
  1683. #endif
  1684. decl J # j --
  1685. jg .L10
  1686. ALIGN_4
  1687. .L999:
  1688. popl %ebx
  1689. popl %esi
  1690. popl %edi
  1691. popl %ebp
  1692. addl $ARGS, %esp
  1693. ret
  1694. EPILOGUE