
trsm_kernel_LT_2x4_sse3.S 37 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
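/* trsm_kernel_LT_2x4_sse3.S: double-precision TRSM micro-kernel for 32-bit   */
/* x86 with SSE3, unrolled 2 rows by 4 columns.  The LN/LT/RN/RT conditional  */
/* blocks below select one of the four triangular-solve variants at assembly  */
/* time.  Each block iteration follows the usual two-step pattern (schematic  */
/* only; the exact packed-panel layout is defined elsewhere):                 */
/*   1. accumulate the partial product of the packed A and B panels over the  */
/*      already-processed part of K into xmm4..xmm7;                          */
/*   2. subtract that from the packed right-hand side and solve the small     */
/*      triangular system for this block, multiplying by diagonal entries     */
/*      that appear to be stored pre-inverted (no divide instructions occur). */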
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 16
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 24 + STACK + ARGS(%esp)
  47. #define ARG_B 28 + STACK + ARGS(%esp)
  48. #define C 32 + STACK + ARGS(%esp)
  49. #define ARG_LDC 36 + STACK + ARGS(%esp)
  50. #define OFFSET 40 + STACK + ARGS(%esp)
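/* Stack-resident scratch: J is the outer column-block counter, KK the        */
/* running offset that locates the current diagonal block, and AORIG the      */
/* saved base address of A for the current row block.  KKK is defined but     */
/* not referenced below.                                                      */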
  51. #define J 0 + STACK(%esp)
  52. #define KK 4 + STACK(%esp)
  53. #define KKK 8 + STACK(%esp)
  54. #define AORIG 12 + STACK(%esp)
  55. #ifdef PENTIUM4
  56. #define PREFETCH prefetcht1
  57. #define PREFETCHSIZE 84
  58. #endif
  59. #if defined(PENRYN) || defined(DUNNINGTON)
  60. #define PREFETCH prefetcht1
  61. #define PREFETCHSIZE 84
  62. #endif
  63. #ifdef PENTIUMM
  64. #define PREFETCH prefetcht1
  65. #define PREFETCHSIZE 84
  66. #endif
  67. #define AA %edx
  68. #define BB %ecx
  69. #define LDC %ebp
  70. #define B %edi
  71. #define CO1 %esi
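/* Register roles: AA and BB walk the packed A and B panels, B holds the      */
/* panel base for the current column block, CO1 points into C, and LDC is     */
/* the leading dimension of C (scaled to bytes right after the prologue).     */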
  72. PROLOGUE
  73. subl $ARGS, %esp
  74. pushl %ebp
  75. pushl %edi
  76. pushl %esi
  77. pushl %ebx
  78. PROFCODE
  79. movl ARG_B, B
  80. movl ARG_LDC, LDC
  81. movl OFFSET, %eax
  82. #ifdef RN
  83. negl %eax
  84. #endif
  85. movl %eax, KK
  86. leal (, LDC, SIZE), LDC
  87. #ifdef LN
  88. movl M, %eax
  89. leal (, %eax, SIZE), %eax
  90. addl %eax, C
  91. imull K, %eax
  92. addl %eax, A
  93. #endif
  94. #ifdef RT
  95. movl N, %eax
  96. leal (, %eax, SIZE), %eax
  97. imull K, %eax
  98. addl %eax, B
  99. movl N, %eax
  100. imull LDC, %eax
  101. addl %eax, C
  102. #endif
  103. #ifdef RT
  104. movl N, %eax
  105. subl OFFSET, %eax
  106. movl %eax, KK
  107. #endif
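# Outer loop: J = N >> 2 blocks of four columns handled at .L10; the N & 2
# and N & 1 leftovers are handled at .L30 and .L60.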
  108. movl N, %eax
  109. sarl $2, %eax
  110. movl %eax, J
  111. jle .L30
  112. ALIGN_2
  113. .L10:
  114. #if defined(LT) || defined(RN)
  115. movl A, AA
  116. #else
  117. movl A, %eax
  118. movl %eax, AORIG
  119. #endif
  120. #ifdef RT
  121. movl K, %eax
  122. sall $2 + BASE_SHIFT, %eax
  123. subl %eax, B
  124. #endif
  125. leal (, LDC, 4), %eax
  126. #ifdef RT
  127. subl %eax, C
  128. #endif
  129. movl C, CO1
  130. #ifndef RT
  131. addl %eax, C
  132. #endif
  133. #ifdef LN
  134. movl OFFSET, %eax
  135. addl M, %eax
  136. movl %eax, KK
  137. #endif
  138. #ifdef LT
  139. movl OFFSET, %eax
  140. movl %eax, KK
  141. #endif
  142. movl M, %ebx
  143. sarl $1, %ebx # i = (m >> 1)
  144. jle .L20
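# .L11: 2x4 micro-kernel.  xmm4..xmm7 each accumulate one two-element column
# of the 2x4 block of A*B; the block is then solved and written back to the
# packed panel and to C.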
  145. ALIGN_4
  146. .L11:
  147. #ifdef LN
  148. movl K, %eax
  149. sall $1 + BASE_SHIFT, %eax
  150. subl %eax, AORIG
  151. #endif
  152. #if defined(LN) || defined(RT)
  153. movl KK, %eax
  154. movl AORIG, AA
  155. leal (, %eax, SIZE), %eax
  156. leal (AA, %eax, 2), AA
  157. #endif
  158. movl B, BB
  159. #if defined(LN) || defined(RT)
  160. movl KK, %eax
  161. sall $2 + BASE_SHIFT, %eax
  162. addl %eax, BB
  163. #endif
  164. movapd 0 * SIZE(AA), %xmm0
  165. pxor %xmm4, %xmm4
  166. movapd 8 * SIZE(AA), %xmm1
  167. pxor %xmm5, %xmm5
  168. movddup 0 * SIZE(BB), %xmm2
  169. pxor %xmm6, %xmm6
  170. movddup 8 * SIZE(BB), %xmm3
  171. pxor %xmm7, %xmm7
  172. leal (LDC, LDC, 2), %eax
  173. #ifdef LN
  174. prefetchnta -2 * SIZE(CO1)
  175. prefetchnta -2 * SIZE(CO1, LDC, 1)
  176. prefetchnta -2 * SIZE(CO1, LDC, 2)
  177. prefetchnta -2 * SIZE(CO1, %eax, 1)
  178. #else
  179. prefetchnta 2 * SIZE(CO1)
  180. prefetchnta 2 * SIZE(CO1, LDC, 1)
  181. prefetchnta 2 * SIZE(CO1, LDC, 2)
  182. prefetchnta 2 * SIZE(CO1, %eax, 1)
  183. #endif
  184. #if defined(LT) || defined(RN)
  185. movl KK, %eax
  186. #else
  187. movl K, %eax
  188. subl KK, %eax
  189. #endif
  190. sarl $3, %eax
  191. je .L15
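# Main accumulation loop, unrolled eight K steps per iteration: each step
# broadcasts one B value (movddup), multiplies it into a pair of A values
# (movapd/mulpd), and software-prefetches ahead in the A stream.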
  192. ALIGN_4
  193. .L12:
  194. mulpd %xmm0, %xmm2
  195. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  196. addpd %xmm2, %xmm4
  197. movddup 1 * SIZE(BB), %xmm2
  198. mulpd %xmm0, %xmm2
  199. addpd %xmm2, %xmm5
  200. movddup 2 * SIZE(BB), %xmm2
  201. mulpd %xmm0, %xmm2
  202. addpd %xmm2, %xmm6
  203. movddup 3 * SIZE(BB), %xmm2
  204. mulpd %xmm0, %xmm2
  205. movapd 2 * SIZE(AA), %xmm0
  206. addpd %xmm2, %xmm7
  207. movddup 4 * SIZE(BB), %xmm2
  208. mulpd %xmm0, %xmm2
  209. addpd %xmm2, %xmm4
  210. movddup 5 * SIZE(BB), %xmm2
  211. mulpd %xmm0, %xmm2
  212. addpd %xmm2, %xmm5
  213. movddup 6 * SIZE(BB), %xmm2
  214. mulpd %xmm0, %xmm2
  215. addpd %xmm2, %xmm6
  216. movddup 7 * SIZE(BB), %xmm2
  217. mulpd %xmm0, %xmm2
  218. movapd 4 * SIZE(AA), %xmm0
  219. addpd %xmm2, %xmm7
  220. movddup 16 * SIZE(BB), %xmm2
  221. mulpd %xmm0, %xmm3
  222. addpd %xmm3, %xmm4
  223. movddup 9 * SIZE(BB), %xmm3
  224. mulpd %xmm0, %xmm3
  225. addpd %xmm3, %xmm5
  226. movddup 10 * SIZE(BB), %xmm3
  227. mulpd %xmm0, %xmm3
  228. addpd %xmm3, %xmm6
  229. movddup 11 * SIZE(BB), %xmm3
  230. mulpd %xmm0, %xmm3
  231. movapd 6 * SIZE(AA), %xmm0
  232. addpd %xmm3, %xmm7
  233. movddup 12 * SIZE(BB), %xmm3
  234. mulpd %xmm0, %xmm3
  235. addpd %xmm3, %xmm4
  236. movddup 13 * SIZE(BB), %xmm3
  237. mulpd %xmm0, %xmm3
  238. addpd %xmm3, %xmm5
  239. movddup 14 * SIZE(BB), %xmm3
  240. mulpd %xmm0, %xmm3
  241. addpd %xmm3, %xmm6
  242. movddup 15 * SIZE(BB), %xmm3
  243. mulpd %xmm0, %xmm3
  244. movapd 16 * SIZE(AA), %xmm0
  245. addpd %xmm3, %xmm7
  246. movddup 24 * SIZE(BB), %xmm3
  247. mulpd %xmm1, %xmm2
  248. addpd %xmm2, %xmm4
  249. movddup 17 * SIZE(BB), %xmm2
  250. mulpd %xmm1, %xmm2
  251. addpd %xmm2, %xmm5
  252. movddup 18 * SIZE(BB), %xmm2
  253. mulpd %xmm1, %xmm2
  254. addpd %xmm2, %xmm6
  255. movddup 19 * SIZE(BB), %xmm2
  256. mulpd %xmm1, %xmm2
  257. movapd 10 * SIZE(AA), %xmm1
  258. addpd %xmm2, %xmm7
  259. movddup 20 * SIZE(BB), %xmm2
  260. mulpd %xmm1, %xmm2
  261. addpd %xmm2, %xmm4
  262. movddup 21 * SIZE(BB), %xmm2
  263. mulpd %xmm1, %xmm2
  264. addpd %xmm2, %xmm5
  265. movddup 22 * SIZE(BB), %xmm2
  266. mulpd %xmm1, %xmm2
  267. addpd %xmm2, %xmm6
  268. movddup 23 * SIZE(BB), %xmm2
  269. mulpd %xmm1, %xmm2
  270. movapd 12 * SIZE(AA), %xmm1
  271. addpd %xmm2, %xmm7
  272. movddup 32 * SIZE(BB), %xmm2
  273. mulpd %xmm1, %xmm3
  274. addpd %xmm3, %xmm4
  275. movddup 25 * SIZE(BB), %xmm3
  276. mulpd %xmm1, %xmm3
  277. addpd %xmm3, %xmm5
  278. movddup 26 * SIZE(BB), %xmm3
  279. mulpd %xmm1, %xmm3
  280. addpd %xmm3, %xmm6
  281. movddup 27 * SIZE(BB), %xmm3
  282. mulpd %xmm1, %xmm3
  283. movapd 14 * SIZE(AA), %xmm1
  284. addpd %xmm3, %xmm7
  285. movddup 28 * SIZE(BB), %xmm3
  286. mulpd %xmm1, %xmm3
  287. addpd %xmm3, %xmm4
  288. movddup 29 * SIZE(BB), %xmm3
  289. mulpd %xmm1, %xmm3
  290. addpd %xmm3, %xmm5
  291. movddup 30 * SIZE(BB), %xmm3
  292. mulpd %xmm1, %xmm3
  293. addpd %xmm3, %xmm6
  294. movddup 31 * SIZE(BB), %xmm3
  295. mulpd %xmm1, %xmm3
  296. movapd 24 * SIZE(AA), %xmm1
  297. addpd %xmm3, %xmm7
  298. movddup 40 * SIZE(BB), %xmm3
  299. addl $32 * SIZE, BB
  300. addl $16 * SIZE, AA
  301. decl %eax
  302. jne .L12
  303. ALIGN_4
  304. .L15:
  305. #if defined(LT) || defined(RN)
  306. movl KK, %eax
  307. #else
  308. movl K, %eax
  309. subl KK, %eax
  310. #endif
  311. andl $7, %eax # k & 7
  312. BRANCH
  313. je .L18
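# Tail loop for the remaining K & 7 updates, one K step per iteration.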
  314. ALIGN_3
  315. .L16:
  316. mulpd %xmm0, %xmm2
  317. addpd %xmm2, %xmm4
  318. movddup 1 * SIZE(BB), %xmm2
  319. mulpd %xmm0, %xmm2
  320. addpd %xmm2, %xmm5
  321. movddup 2 * SIZE(BB), %xmm2
  322. mulpd %xmm0, %xmm2
  323. addpd %xmm2, %xmm6
  324. movddup 3 * SIZE(BB), %xmm2
  325. mulpd %xmm0, %xmm2
  326. movapd 2 * SIZE(AA), %xmm0
  327. addpd %xmm2, %xmm7
  328. movddup 4 * SIZE(BB), %xmm2
  329. addl $2 * SIZE, AA
  330. addl $4 * SIZE, BB
  331. decl %eax
  332. jg .L16
  333. ALIGN_4
  334. .L18:
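# Accumulation done: rewind AA/BB to this diagonal block, subtract the
# accumulated product from the packed values, then do the small triangular
# solve.  Diagonal entries appear to be stored pre-inverted, so the solve
# uses only multiplies.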
  335. #if defined(LN) || defined(RT)
  336. movl KK, %eax
  337. #ifdef LN
  338. subl $2, %eax
  339. #else
  340. subl $4, %eax
  341. #endif
  342. movl AORIG, AA
  343. leal (, %eax, SIZE), %eax
  344. leal (AA, %eax, 2), AA
  345. leal (B, %eax, 4), BB
  346. #endif
  347. #if defined(LN) || defined(LT)
  348. movapd %xmm4, %xmm0
  349. unpcklpd %xmm5, %xmm4
  350. unpckhpd %xmm5, %xmm0
  351. movapd %xmm6, %xmm1
  352. unpcklpd %xmm7, %xmm6
  353. unpckhpd %xmm7, %xmm1
  354. movapd 0 * SIZE(BB), %xmm2
  355. movapd 2 * SIZE(BB), %xmm5
  356. movapd 4 * SIZE(BB), %xmm3
  357. movapd 6 * SIZE(BB), %xmm7
  358. subpd %xmm4, %xmm2
  359. subpd %xmm6, %xmm5
  360. subpd %xmm0, %xmm3
  361. subpd %xmm1, %xmm7
  362. #else
  363. movapd 0 * SIZE(AA), %xmm0
  364. movapd 2 * SIZE(AA), %xmm1
  365. movapd 4 * SIZE(AA), %xmm2
  366. movapd 6 * SIZE(AA), %xmm3
  367. subpd %xmm4, %xmm0
  368. subpd %xmm5, %xmm1
  369. subpd %xmm6, %xmm2
  370. subpd %xmm7, %xmm3
  371. #endif
  372. #ifdef LN
  373. movddup 3 * SIZE(AA), %xmm4
  374. mulpd %xmm4, %xmm3
  375. mulpd %xmm4, %xmm7
  376. movddup 2 * SIZE(AA), %xmm4
  377. movapd %xmm4, %xmm6
  378. mulpd %xmm3, %xmm4
  379. subpd %xmm4, %xmm2
  380. mulpd %xmm7, %xmm6
  381. subpd %xmm6, %xmm5
  382. movddup 0 * SIZE(AA), %xmm4
  383. mulpd %xmm4, %xmm2
  384. mulpd %xmm4, %xmm5
  385. #endif
  386. #ifdef LT
  387. movddup 0 * SIZE(AA), %xmm4
  388. mulpd %xmm4, %xmm2
  389. mulpd %xmm4, %xmm5
  390. movddup 1 * SIZE(AA), %xmm4
  391. movapd %xmm4, %xmm6
  392. mulpd %xmm2, %xmm4
  393. subpd %xmm4, %xmm3
  394. mulpd %xmm5, %xmm6
  395. subpd %xmm6, %xmm7
  396. movddup 3 * SIZE(AA), %xmm4
  397. mulpd %xmm4, %xmm3
  398. mulpd %xmm4, %xmm7
  399. #endif
  400. #ifdef RN
  401. movddup 0 * SIZE(BB), %xmm4
  402. mulpd %xmm4, %xmm0
  403. movddup 1 * SIZE(BB), %xmm4
  404. mulpd %xmm0, %xmm4
  405. subpd %xmm4, %xmm1
  406. movddup 2 * SIZE(BB), %xmm4
  407. mulpd %xmm0, %xmm4
  408. subpd %xmm4, %xmm2
  409. movddup 3 * SIZE(BB), %xmm4
  410. mulpd %xmm0, %xmm4
  411. subpd %xmm4, %xmm3
  412. movddup 5 * SIZE(BB), %xmm4
  413. mulpd %xmm4, %xmm1
  414. movddup 6 * SIZE(BB), %xmm4
  415. mulpd %xmm1, %xmm4
  416. subpd %xmm4, %xmm2
  417. movddup 7 * SIZE(BB), %xmm4
  418. mulpd %xmm1, %xmm4
  419. subpd %xmm4, %xmm3
  420. movddup 10 * SIZE(BB), %xmm4
  421. mulpd %xmm4, %xmm2
  422. movddup 11 * SIZE(BB), %xmm4
  423. mulpd %xmm2, %xmm4
  424. subpd %xmm4, %xmm3
  425. movddup 15 * SIZE(BB), %xmm4
  426. mulpd %xmm4, %xmm3
  427. #endif
  428. #ifdef RT
  429. movddup 15 * SIZE(BB), %xmm4
  430. mulpd %xmm4, %xmm3
  431. movddup 14 * SIZE(BB), %xmm4
  432. mulpd %xmm3, %xmm4
  433. subpd %xmm4, %xmm2
  434. movddup 13 * SIZE(BB), %xmm4
  435. mulpd %xmm3, %xmm4
  436. subpd %xmm4, %xmm1
  437. movddup 12 * SIZE(BB), %xmm4
  438. mulpd %xmm3, %xmm4
  439. subpd %xmm4, %xmm0
  440. movddup 10 * SIZE(BB), %xmm4
  441. mulpd %xmm4, %xmm2
  442. movddup 9 * SIZE(BB), %xmm4
  443. mulpd %xmm2, %xmm4
  444. subpd %xmm4, %xmm1
  445. movddup 8 * SIZE(BB), %xmm4
  446. mulpd %xmm2, %xmm4
  447. subpd %xmm4, %xmm0
  448. movddup 5 * SIZE(BB), %xmm4
  449. mulpd %xmm4, %xmm1
  450. movddup 4 * SIZE(BB), %xmm4
  451. mulpd %xmm1, %xmm4
  452. subpd %xmm4, %xmm0
  453. movddup 0 * SIZE(BB), %xmm4
  454. mulpd %xmm4, %xmm0
  455. #endif
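# Write the solved 2x4 block back into the packed panel (so later blocks see
# the updated values) and scatter it into the four columns of C.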
  456. #if defined(LN) || defined(LT)
  457. movapd %xmm2, 0 * SIZE(BB)
  458. movapd %xmm5, 2 * SIZE(BB)
  459. movapd %xmm3, 4 * SIZE(BB)
  460. movapd %xmm7, 6 * SIZE(BB)
  461. #else
  462. movapd %xmm0, 0 * SIZE(AA)
  463. movapd %xmm1, 2 * SIZE(AA)
  464. movapd %xmm2, 4 * SIZE(AA)
  465. movapd %xmm3, 6 * SIZE(AA)
  466. #endif
  467. #ifdef LN
  468. subl $2 * SIZE, CO1
  469. #endif
  470. leal (LDC, LDC, 2), %eax
  471. #if defined(LN) || defined(LT)
  472. movsd %xmm2, 0 * SIZE(CO1)
  473. movsd %xmm3, 1 * SIZE(CO1)
  474. movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
  475. movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
  476. movsd %xmm5, 0 * SIZE(CO1, LDC, 2)
  477. movsd %xmm7, 1 * SIZE(CO1, LDC, 2)
  478. movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
  479. movhpd %xmm7, 1 * SIZE(CO1, %eax, 1)
  480. #else
  481. movsd %xmm0, 0 * SIZE(CO1)
  482. movhpd %xmm0, 1 * SIZE(CO1)
  483. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  484. movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
  485. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  486. movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
  487. movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
  488. movhpd %xmm3, 1 * SIZE(CO1, %eax, 1)
  489. #endif
  490. #ifndef LN
  491. addl $2 * SIZE, CO1
  492. #endif
  493. #if defined(LT) || defined(RN)
  494. movl K, %eax
  495. subl KK, %eax
  496. leal (,%eax, SIZE), %eax
  497. leal (AA, %eax, 2), AA
  498. leal (BB, %eax, 4), BB
  499. #endif
  500. #ifdef LN
  501. subl $2, KK
  502. #endif
  503. #ifdef LT
  504. addl $2, KK
  505. #endif
  506. #ifdef RT
  507. movl K, %eax
  508. sall $1 + BASE_SHIFT, %eax
  509. addl %eax, AORIG
  510. #endif
  511. decl %ebx # i --
  512. jg .L11
  513. ALIGN_4
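# .L20: leftover single row (M odd) against the current four columns; same
# accumulate / solve / store pattern as .L11, but 1x4.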
  514. .L20:
  515. movl M, %ebx
  516. testl $1, %ebx # if (m & 1)
  517. jle .L29
  518. #ifdef LN
  519. movl K, %eax
  520. sall $BASE_SHIFT, %eax
  521. subl %eax, AORIG
  522. #endif
  523. #if defined(LN) || defined(RT)
  524. movl KK, %eax
  525. movl AORIG, AA
  526. leal (AA, %eax, SIZE), AA
  527. #endif
  528. movl B, BB
  529. #if defined(LN) || defined(RT)
  530. movl KK, %eax
  531. sall $2 + BASE_SHIFT, %eax
  532. addl %eax, BB
  533. #endif
  534. movddup 0 * SIZE(AA), %xmm0
  535. pxor %xmm4, %xmm4
  536. movddup 8 * SIZE(AA), %xmm1
  537. pxor %xmm5, %xmm5
  538. movapd 0 * SIZE(BB), %xmm2
  539. pxor %xmm6, %xmm6
  540. movapd 8 * SIZE(BB), %xmm3
  541. pxor %xmm7, %xmm7
  542. #if defined(LT) || defined(RN)
  543. movl KK, %eax
  544. #else
  545. movl K, %eax
  546. subl KK, %eax
  547. #endif
  548. sarl $4, %eax
  549. je .L25
  550. ALIGN_4
  551. .L22:
  552. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  553. mulpd %xmm0, %xmm2
  554. mulpd 2 * SIZE(BB), %xmm0
  555. addpd %xmm2, %xmm4
  556. movapd 4 * SIZE(BB), %xmm2
  557. addpd %xmm0, %xmm5
  558. movddup 1 * SIZE(AA), %xmm0
  559. mulpd %xmm0, %xmm2
  560. mulpd 6 * SIZE(BB), %xmm0
  561. addpd %xmm2, %xmm6
  562. movapd 16 * SIZE(BB), %xmm2
  563. addpd %xmm0, %xmm7
  564. movddup 2 * SIZE(AA), %xmm0
  565. mulpd %xmm0, %xmm3
  566. mulpd 10 * SIZE(BB), %xmm0
  567. addpd %xmm3, %xmm4
  568. movapd 12 * SIZE(BB), %xmm3
  569. addpd %xmm0, %xmm5
  570. movddup 3 * SIZE(AA), %xmm0
  571. mulpd %xmm0, %xmm3
  572. mulpd 14 * SIZE(BB), %xmm0
  573. addpd %xmm3, %xmm6
  574. movapd 24 * SIZE(BB), %xmm3
  575. addpd %xmm0, %xmm7
  576. movddup 4 * SIZE(AA), %xmm0
  577. mulpd %xmm0, %xmm2
  578. mulpd 18 * SIZE(BB), %xmm0
  579. addpd %xmm2, %xmm4
  580. movapd 20 * SIZE(BB), %xmm2
  581. addpd %xmm0, %xmm5
  582. movddup 5 * SIZE(AA), %xmm0
  583. mulpd %xmm0, %xmm2
  584. mulpd 22 * SIZE(BB), %xmm0
  585. addpd %xmm2, %xmm6
  586. movapd 32 * SIZE(BB), %xmm2
  587. addpd %xmm0, %xmm7
  588. movddup 6 * SIZE(AA), %xmm0
  589. mulpd %xmm0, %xmm3
  590. mulpd 26 * SIZE(BB), %xmm0
  591. addpd %xmm3, %xmm4
  592. movapd 28 * SIZE(BB), %xmm3
  593. addpd %xmm0, %xmm5
  594. movddup 7 * SIZE(AA), %xmm0
  595. mulpd %xmm0, %xmm3
  596. mulpd 30 * SIZE(BB), %xmm0
  597. addpd %xmm3, %xmm6
  598. movapd 40 * SIZE(BB), %xmm3
  599. addpd %xmm0, %xmm7
  600. movddup 16 * SIZE(AA), %xmm0
  601. mulpd %xmm1, %xmm2
  602. mulpd 34 * SIZE(BB), %xmm1
  603. addpd %xmm2, %xmm4
  604. movapd 36 * SIZE(BB), %xmm2
  605. addpd %xmm1, %xmm5
  606. movddup 9 * SIZE(AA), %xmm1
  607. mulpd %xmm1, %xmm2
  608. mulpd 38 * SIZE(BB), %xmm1
  609. addpd %xmm2, %xmm6
  610. movapd 48 * SIZE(BB), %xmm2
  611. addpd %xmm1, %xmm7
  612. movddup 10 * SIZE(AA), %xmm1
  613. mulpd %xmm1, %xmm3
  614. mulpd 42 * SIZE(BB), %xmm1
  615. addpd %xmm3, %xmm4
  616. movapd 44 * SIZE(BB), %xmm3
  617. addpd %xmm1, %xmm5
  618. movddup 11 * SIZE(AA), %xmm1
  619. mulpd %xmm1, %xmm3
  620. mulpd 46 * SIZE(BB), %xmm1
  621. addpd %xmm3, %xmm6
  622. movapd 56 * SIZE(BB), %xmm3
  623. addpd %xmm1, %xmm7
  624. movddup 12 * SIZE(AA), %xmm1
  625. mulpd %xmm1, %xmm2
  626. mulpd 50 * SIZE(BB), %xmm1
  627. addpd %xmm2, %xmm4
  628. movapd 52 * SIZE(BB), %xmm2
  629. addpd %xmm1, %xmm5
  630. movddup 13 * SIZE(AA), %xmm1
  631. mulpd %xmm1, %xmm2
  632. mulpd 54 * SIZE(BB), %xmm1
  633. addpd %xmm2, %xmm6
  634. movapd 64 * SIZE(BB), %xmm2
  635. addpd %xmm1, %xmm7
  636. movddup 14 * SIZE(AA), %xmm1
  637. mulpd %xmm1, %xmm3
  638. mulpd 58 * SIZE(BB), %xmm1
  639. addpd %xmm3, %xmm4
  640. movapd 60 * SIZE(BB), %xmm3
  641. addpd %xmm1, %xmm5
  642. movddup 15 * SIZE(AA), %xmm1
  643. mulpd %xmm1, %xmm3
  644. mulpd 62 * SIZE(BB), %xmm1
  645. addpd %xmm3, %xmm6
  646. movapd 72 * SIZE(BB), %xmm3
  647. addpd %xmm1, %xmm7
  648. movddup 24 * SIZE(AA), %xmm1
  649. addl $16 * SIZE, AA
  650. addl $64 * SIZE, BB
  651. decl %eax
  652. jne .L22
  653. ALIGN_4
  654. .L25:
  655. #if defined(LT) || defined(RN)
  656. movl KK, %eax
  657. #else
  658. movl K, %eax
  659. subl KK, %eax
  660. #endif
  661. andl $15, %eax # k & 15
  662. BRANCH
  663. je .L28
  664. .L26:
  665. mulpd %xmm0, %xmm2
  666. mulpd 2 * SIZE(BB), %xmm0
  667. addpd %xmm2, %xmm4
  668. movapd 4 * SIZE(BB), %xmm2
  669. addpd %xmm0, %xmm5
  670. movddup 1 * SIZE(AA), %xmm0
  671. addl $1 * SIZE, AA
  672. addl $4 * SIZE, BB
  673. decl %eax
  674. jg .L26
  675. ALIGN_4
  676. .L28:
  677. addpd %xmm6, %xmm4
  678. addpd %xmm7, %xmm5
  679. #if defined(LN) || defined(RT)
  680. movl KK, %eax
  681. #ifdef LN
  682. subl $1, %eax
  683. #else
  684. subl $4, %eax
  685. #endif
  686. movl AORIG, AA
  687. leal (, %eax, SIZE), %eax
  688. leal (AA, %eax, 1), AA
  689. leal (B, %eax, 4), BB
  690. #endif
  691. #if defined(LN) || defined(LT)
  692. movapd 0 * SIZE(BB), %xmm0
  693. movapd 2 * SIZE(BB), %xmm1
  694. subpd %xmm4, %xmm0
  695. subpd %xmm5, %xmm1
  696. #else
  697. movapd 0 * SIZE(AA), %xmm1
  698. movapd 2 * SIZE(AA), %xmm3
  699. subpd %xmm4, %xmm1
  700. subpd %xmm5, %xmm3
  701. movapd %xmm1, %xmm0
  702. unpckhpd %xmm1, %xmm1
  703. movapd %xmm3, %xmm2
  704. unpckhpd %xmm3, %xmm3
  705. #endif
  706. #ifdef LN
  707. movddup 0 * SIZE(AA), %xmm4
  708. mulpd %xmm4, %xmm0
  709. mulpd %xmm4, %xmm1
  710. #endif
  711. #ifdef LT
  712. movddup 0 * SIZE(AA), %xmm4
  713. mulpd %xmm4, %xmm0
  714. mulpd %xmm4, %xmm1
  715. #endif
  716. #ifdef RN
  717. movsd 0 * SIZE(BB), %xmm4
  718. mulsd %xmm4, %xmm0
  719. movsd 1 * SIZE(BB), %xmm4
  720. mulsd %xmm0, %xmm4
  721. subsd %xmm4, %xmm1
  722. movsd 2 * SIZE(BB), %xmm4
  723. mulsd %xmm0, %xmm4
  724. subsd %xmm4, %xmm2
  725. movsd 3 * SIZE(BB), %xmm4
  726. mulsd %xmm0, %xmm4
  727. subsd %xmm4, %xmm3
  728. movsd 5 * SIZE(BB), %xmm4
  729. mulsd %xmm4, %xmm1
  730. movsd 6 * SIZE(BB), %xmm4
  731. mulsd %xmm1, %xmm4
  732. subsd %xmm4, %xmm2
  733. movsd 7 * SIZE(BB), %xmm4
  734. mulsd %xmm1, %xmm4
  735. subsd %xmm4, %xmm3
  736. movsd 10 * SIZE(BB), %xmm4
  737. mulsd %xmm4, %xmm2
  738. movsd 11 * SIZE(BB), %xmm4
  739. mulsd %xmm2, %xmm4
  740. subsd %xmm4, %xmm3
  741. movsd 15 * SIZE(BB), %xmm4
  742. mulsd %xmm4, %xmm3
  743. #endif
  744. #ifdef RT
  745. movsd 15 * SIZE(BB), %xmm4
  746. mulsd %xmm4, %xmm3
  747. movsd 14 * SIZE(BB), %xmm4
  748. mulsd %xmm3, %xmm4
  749. subsd %xmm4, %xmm2
  750. movsd 13 * SIZE(BB), %xmm4
  751. mulsd %xmm3, %xmm4
  752. subsd %xmm4, %xmm1
  753. movsd 12 * SIZE(BB), %xmm4
  754. mulsd %xmm3, %xmm4
  755. subsd %xmm4, %xmm0
  756. movsd 10 * SIZE(BB), %xmm4
  757. mulsd %xmm4, %xmm2
  758. movsd 9 * SIZE(BB), %xmm4
  759. mulsd %xmm2, %xmm4
  760. subsd %xmm4, %xmm1
  761. movsd 8 * SIZE(BB), %xmm4
  762. mulsd %xmm2, %xmm4
  763. subsd %xmm4, %xmm0
  764. movsd 5 * SIZE(BB), %xmm4
  765. mulsd %xmm4, %xmm1
  766. movsd 4 * SIZE(BB), %xmm4
  767. mulsd %xmm1, %xmm4
  768. subsd %xmm4, %xmm0
  769. movsd 0 * SIZE(BB), %xmm4
  770. mulsd %xmm4, %xmm0
  771. #endif
  772. #if defined(LN) || defined(LT)
  773. movapd %xmm0, 0 * SIZE(BB)
  774. movapd %xmm1, 2 * SIZE(BB)
  775. #else
  776. movsd %xmm0, 0 * SIZE(AA)
  777. movsd %xmm1, 1 * SIZE(AA)
  778. movsd %xmm2, 2 * SIZE(AA)
  779. movsd %xmm3, 3 * SIZE(AA)
  780. #endif
  781. #ifdef LN
  782. subl $1 * SIZE, CO1
  783. #endif
  784. leal (LDC, LDC, 2), %eax
  785. #if defined(LN) || defined(LT)
  786. movsd %xmm0, 0 * SIZE(CO1)
  787. movhpd %xmm0, 0 * SIZE(CO1, LDC, 1)
  788. movsd %xmm1, 0 * SIZE(CO1, LDC, 2)
  789. movhpd %xmm1, 0 * SIZE(CO1, %eax, 1)
  790. #else
  791. movsd %xmm0, 0 * SIZE(CO1)
  792. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  793. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  794. movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
  795. #endif
  796. #ifndef LN
  797. addl $1 * SIZE, CO1
  798. #endif
  799. #if defined(LT) || defined(RN)
  800. movl K, %eax
  801. subl KK, %eax
  802. leal (,%eax, SIZE), %eax
  803. leal (AA, %eax, 1), AA
  804. leal (BB, %eax, 4), BB
  805. #endif
  806. #ifdef LN
  807. subl $1, KK
  808. #endif
  809. #ifdef LT
  810. addl $1, KK
  811. #endif
  812. #ifdef RT
  813. movl K, %eax
  814. sall $BASE_SHIFT, %eax
  815. addl %eax, AORIG
  816. #endif
  817. ALIGN_4
  818. .L29:
  819. #ifdef LN
  820. movl K, %eax
  821. leal (, %eax, SIZE), %eax
  822. leal (B, %eax, 4), B
  823. #endif
  824. #if defined(LT) || defined(RN)
  825. movl BB, B
  826. #endif
  827. #ifdef RN
  828. addl $4, KK
  829. #endif
  830. #ifdef RT
  831. subl $4, KK
  832. #endif
  833. decl J # j --
  834. jg .L10
  835. ALIGN_4
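# .L30: two-column case (N & 2).  .L41 is the 2x2 micro-kernel and .L50
# handles the leftover 1x2 row.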
  836. .L30:
  837. testl $2, N
  838. je .L60
  839. #if defined(LT) || defined(RN)
  840. movl A, AA
  841. #else
  842. movl A, %eax
  843. movl %eax, AORIG
  844. #endif
  845. #ifdef RT
  846. movl K, %eax
  847. sall $1 + BASE_SHIFT, %eax
  848. subl %eax, B
  849. #endif
  850. leal (, LDC, 2), %eax
  851. #ifdef RT
  852. subl %eax, C
  853. #endif
  854. movl C, CO1
  855. #ifndef RT
  856. addl %eax, C
  857. #endif
  858. #ifdef LN
  859. movl OFFSET, %eax
  860. addl M, %eax
  861. movl %eax, KK
  862. #endif
  863. #ifdef LT
  864. movl OFFSET, %eax
  865. movl %eax, KK
  866. #endif
  867. movl M, %ebx
  868. sarl $1, %ebx # i = (m >> 1)
  869. jle .L50
  870. ALIGN_4
  871. .L41:
  872. #ifdef LN
  873. movl K, %eax
  874. sall $1 + BASE_SHIFT, %eax
  875. subl %eax, AORIG
  876. #endif
  877. #if defined(LN) || defined(RT)
  878. movl KK, %eax
  879. movl AORIG, AA
  880. leal (, %eax, SIZE), %eax
  881. leal (AA, %eax, 2), AA
  882. #endif
  883. movl B, BB
  884. #if defined(LN) || defined(RT)
  885. movl KK, %eax
  886. sall $1 + BASE_SHIFT, %eax
  887. addl %eax, BB
  888. #endif
  889. movapd 0 * SIZE(AA), %xmm0
  890. pxor %xmm4, %xmm4
  891. movapd 8 * SIZE(AA), %xmm1
  892. pxor %xmm5, %xmm5
  893. movddup 0 * SIZE(BB), %xmm2
  894. pxor %xmm6, %xmm6
  895. movddup 8 * SIZE(BB), %xmm3
  896. pxor %xmm7, %xmm7
  897. #ifdef LN
  898. prefetchnta -2 * SIZE(CO1)
  899. prefetchnta -2 * SIZE(CO1, LDC, 1)
  900. #else
  901. prefetchnta 2 * SIZE(CO1)
  902. prefetchnta 2 * SIZE(CO1, LDC, 1)
  903. #endif
  904. #if defined(LT) || defined(RN)
  905. movl KK, %eax
  906. #else
  907. movl K, %eax
  908. subl KK, %eax
  909. #endif
  910. sarl $3, %eax
  911. je .L45
  912. ALIGN_4
  913. .L42:
  914. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  915. mulpd %xmm0, %xmm2
  916. addpd %xmm2, %xmm4
  917. movddup 1 * SIZE(BB), %xmm2
  918. mulpd %xmm0, %xmm2
  919. movapd 2 * SIZE(AA), %xmm0
  920. addpd %xmm2, %xmm5
  921. movddup 2 * SIZE(BB), %xmm2
  922. mulpd %xmm0, %xmm2
  923. addpd %xmm2, %xmm6
  924. movddup 3 * SIZE(BB), %xmm2
  925. mulpd %xmm0, %xmm2
  926. movapd 4 * SIZE(AA), %xmm0
  927. addpd %xmm2, %xmm7
  928. movddup 4 * SIZE(BB), %xmm2
  929. mulpd %xmm0, %xmm2
  930. addpd %xmm2, %xmm4
  931. movddup 5 * SIZE(BB), %xmm2
  932. mulpd %xmm0, %xmm2
  933. movapd 6 * SIZE(AA), %xmm0
  934. addpd %xmm2, %xmm5
  935. movddup 6 * SIZE(BB), %xmm2
  936. mulpd %xmm0, %xmm2
  937. addpd %xmm2, %xmm6
  938. movddup 7 * SIZE(BB), %xmm2
  939. mulpd %xmm0, %xmm2
  940. movapd 16 * SIZE(AA), %xmm0
  941. addpd %xmm2, %xmm7
  942. movddup 16 * SIZE(BB), %xmm2
  943. mulpd %xmm1, %xmm3
  944. addpd %xmm3, %xmm4
  945. movddup 9 * SIZE(BB), %xmm3
  946. mulpd %xmm1, %xmm3
  947. movapd 10 * SIZE(AA), %xmm1
  948. addpd %xmm3, %xmm5
  949. movddup 10 * SIZE(BB), %xmm3
  950. mulpd %xmm1, %xmm3
  951. addpd %xmm3, %xmm6
  952. movddup 11 * SIZE(BB), %xmm3
  953. mulpd %xmm1, %xmm3
  954. movapd 12 * SIZE(AA), %xmm1
  955. addpd %xmm3, %xmm7
  956. movddup 12 * SIZE(BB), %xmm3
  957. mulpd %xmm1, %xmm3
  958. addpd %xmm3, %xmm4
  959. movddup 13 * SIZE(BB), %xmm3
  960. mulpd %xmm1, %xmm3
  961. movapd 14 * SIZE(AA), %xmm1
  962. addpd %xmm3, %xmm5
  963. movddup 14 * SIZE(BB), %xmm3
  964. mulpd %xmm1, %xmm3
  965. addpd %xmm3, %xmm6
  966. movddup 15 * SIZE(BB), %xmm3
  967. mulpd %xmm1, %xmm3
  968. movapd 24 * SIZE(AA), %xmm1
  969. addpd %xmm3, %xmm7
  970. movddup 24 * SIZE(BB), %xmm3
  971. addl $16 * SIZE, AA
  972. addl $16 * SIZE, BB
  973. decl %eax
  974. jne .L42
  975. ALIGN_4
  976. .L45:
  977. #if defined(LT) || defined(RN)
  978. movl KK, %eax
  979. #else
  980. movl K, %eax
  981. subl KK, %eax
  982. #endif
  983. andl $7, %eax # k & 7
  984. BRANCH
  985. je .L48
  986. ALIGN_3
  987. .L46:
  988. mulpd %xmm0, %xmm2
  989. addpd %xmm2, %xmm4
  990. movddup 1 * SIZE(BB), %xmm2
  991. mulpd %xmm0, %xmm2
  992. movapd 2 * SIZE(AA), %xmm0
  993. addpd %xmm2, %xmm5
  994. movddup 2 * SIZE(BB), %xmm2
  995. addl $2 * SIZE, AA
  996. addl $2 * SIZE, BB
  997. decl %eax
  998. jg .L46
  999. ALIGN_4
  1000. .L48:
  1001. addpd %xmm6, %xmm4
  1002. addpd %xmm7, %xmm5
  1003. #if defined(LN) || defined(RT)
  1004. movl KK, %eax
  1005. #ifdef LN
  1006. subl $2, %eax
  1007. #else
  1008. subl $2, %eax
  1009. #endif
  1010. movl AORIG, AA
  1011. leal (, %eax, SIZE), %eax
  1012. leal (AA, %eax, 2), AA
  1013. leal (B, %eax, 2), BB
  1014. #endif
  1015. #if defined(LN) || defined(LT)
  1016. movapd %xmm4, %xmm0
  1017. unpcklpd %xmm5, %xmm4
  1018. unpckhpd %xmm5, %xmm0
  1019. movapd 0 * SIZE(BB), %xmm2
  1020. movapd 2 * SIZE(BB), %xmm3
  1021. subpd %xmm4, %xmm2
  1022. subpd %xmm0, %xmm3
  1023. #else
  1024. movapd 0 * SIZE(AA), %xmm0
  1025. movapd 2 * SIZE(AA), %xmm1
  1026. subpd %xmm4, %xmm0
  1027. subpd %xmm5, %xmm1
  1028. #endif
  1029. #ifdef LN
  1030. movddup 3 * SIZE(AA), %xmm4
  1031. mulpd %xmm4, %xmm3
  1032. movddup 2 * SIZE(AA), %xmm4
  1033. mulpd %xmm3, %xmm4
  1034. subpd %xmm4, %xmm2
  1035. movddup 0 * SIZE(AA), %xmm4
  1036. mulpd %xmm4, %xmm2
  1037. #endif
  1038. #ifdef LT
  1039. movddup 0 * SIZE(AA), %xmm4
  1040. mulpd %xmm4, %xmm2
  1041. movddup 1 * SIZE(AA), %xmm4
  1042. mulpd %xmm2, %xmm4
  1043. subpd %xmm4, %xmm3
  1044. movddup 3 * SIZE(AA), %xmm4
  1045. mulpd %xmm4, %xmm3
  1046. #endif
  1047. #ifdef RN
  1048. movddup 0 * SIZE(BB), %xmm4
  1049. mulpd %xmm4, %xmm0
  1050. movddup 1 * SIZE(BB), %xmm4
  1051. mulpd %xmm0, %xmm4
  1052. subpd %xmm4, %xmm1
  1053. movddup 3 * SIZE(BB), %xmm4
  1054. mulpd %xmm4, %xmm1
  1055. #endif
  1056. #ifdef RT
  1057. movddup 3 * SIZE(BB), %xmm4
  1058. mulpd %xmm4, %xmm1
  1059. movddup 2 * SIZE(BB), %xmm4
  1060. mulpd %xmm1, %xmm4
  1061. subpd %xmm4, %xmm0
  1062. movddup 0 * SIZE(BB), %xmm4
  1063. mulpd %xmm4, %xmm0
  1064. #endif
  1065. #if defined(LN) || defined(LT)
  1066. movapd %xmm2, 0 * SIZE(BB)
  1067. movapd %xmm3, 2 * SIZE(BB)
  1068. #else
  1069. movapd %xmm0, 0 * SIZE(AA)
  1070. movapd %xmm1, 2 * SIZE(AA)
  1071. #endif
  1072. #ifdef LN
  1073. subl $2 * SIZE, CO1
  1074. #endif
  1075. #if defined(LN) || defined(LT)
  1076. movsd %xmm2, 0 * SIZE(CO1)
  1077. movsd %xmm3, 1 * SIZE(CO1)
  1078. movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
  1079. movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
  1080. #else
  1081. movsd %xmm0, 0 * SIZE(CO1)
  1082. movhpd %xmm0, 1 * SIZE(CO1)
  1083. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1084. movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
  1085. #endif
  1086. #ifndef LN
  1087. addl $2 * SIZE, CO1
  1088. #endif
  1089. #if defined(LT) || defined(RN)
  1090. movl K, %eax
  1091. subl KK, %eax
  1092. leal (,%eax, SIZE), %eax
  1093. leal (AA, %eax, 2), AA
  1094. leal (BB, %eax, 2), BB
  1095. #endif
  1096. #ifdef LN
  1097. subl $2, KK
  1098. #endif
  1099. #ifdef LT
  1100. addl $2, KK
  1101. #endif
  1102. #ifdef RT
  1103. movl K, %eax
  1104. sall $1 + BASE_SHIFT, %eax
  1105. addl %eax, AORIG
  1106. #endif
  1107. decl %ebx # i --
  1108. jg .L41
  1109. ALIGN_4
  1110. .L50:
  1111. movl M, %ebx
  1112. testl $1, %ebx # if (m & 1)
  1113. jle .L59
  1114. #ifdef LN
  1115. movl K, %eax
  1116. sall $BASE_SHIFT, %eax
  1117. subl %eax, AORIG
  1118. #endif
  1119. #if defined(LN) || defined(RT)
  1120. movl KK, %eax
  1121. movl AORIG, AA
  1122. leal (AA, %eax, SIZE), AA
  1123. #endif
  1124. movl B, BB
  1125. #if defined(LN) || defined(RT)
  1126. movl KK, %eax
  1127. sall $1 + BASE_SHIFT, %eax
  1128. addl %eax, BB
  1129. #endif
  1130. movddup 0 * SIZE(AA), %xmm0
  1131. pxor %xmm4, %xmm4
  1132. movddup 8 * SIZE(AA), %xmm1
  1133. pxor %xmm5, %xmm5
  1134. movapd 0 * SIZE(BB), %xmm2
  1135. pxor %xmm6, %xmm6
  1136. movapd 8 * SIZE(BB), %xmm3
  1137. pxor %xmm7, %xmm7
  1138. #if defined(LT) || defined(RN)
  1139. movl KK, %eax
  1140. #else
  1141. movl K, %eax
  1142. subl KK, %eax
  1143. #endif
  1144. sarl $4, %eax
  1145. je .L55
  1146. ALIGN_4
  1147. .L52:
  1148. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1149. mulpd %xmm0, %xmm2
  1150. movddup 1 * SIZE(AA), %xmm0
  1151. addpd %xmm2, %xmm4
  1152. mulpd 2 * SIZE(BB), %xmm0
  1153. movapd 16 * SIZE(BB), %xmm2
  1154. addpd %xmm0, %xmm5
  1155. movddup 2 * SIZE(AA), %xmm0
  1156. mulpd 4 * SIZE(BB), %xmm0
  1157. addpd %xmm0, %xmm6
  1158. movddup 3 * SIZE(AA), %xmm0
  1159. mulpd 6 * SIZE(BB), %xmm0
  1160. addpd %xmm0, %xmm7
  1161. movddup 4 * SIZE(AA), %xmm0
  1162. mulpd %xmm0, %xmm3
  1163. movddup 5 * SIZE(AA), %xmm0
  1164. addpd %xmm3, %xmm4
  1165. mulpd 10 * SIZE(BB), %xmm0
  1166. movapd 24 * SIZE(BB), %xmm3
  1167. addpd %xmm0, %xmm5
  1168. movddup 6 * SIZE(AA), %xmm0
  1169. mulpd 12 * SIZE(BB), %xmm0
  1170. addpd %xmm0, %xmm6
  1171. movddup 7 * SIZE(AA), %xmm0
  1172. mulpd 14 * SIZE(BB), %xmm0
  1173. addpd %xmm0, %xmm7
  1174. movddup 16 * SIZE(AA), %xmm0
  1175. mulpd %xmm1, %xmm2
  1176. movddup 9 * SIZE(AA), %xmm1
  1177. addpd %xmm2, %xmm4
  1178. mulpd 18 * SIZE(BB), %xmm1
  1179. movapd 32 * SIZE(BB), %xmm2
  1180. addpd %xmm1, %xmm5
  1181. movddup 10 * SIZE(AA), %xmm1
  1182. mulpd 20 * SIZE(BB), %xmm1
  1183. addpd %xmm1, %xmm6
  1184. movddup 11 * SIZE(AA), %xmm1
  1185. mulpd 22 * SIZE(BB), %xmm1
  1186. addpd %xmm1, %xmm7
  1187. movddup 12 * SIZE(AA), %xmm1
  1188. mulpd %xmm1, %xmm3
  1189. movddup 13 * SIZE(AA), %xmm1
  1190. addpd %xmm3, %xmm4
  1191. mulpd 26 * SIZE(BB), %xmm1
  1192. movapd 40 * SIZE(BB), %xmm3
  1193. addpd %xmm1, %xmm5
  1194. movddup 14 * SIZE(AA), %xmm1
  1195. mulpd 28 * SIZE(BB), %xmm1
  1196. addpd %xmm1, %xmm6
  1197. movddup 15 * SIZE(AA), %xmm1
  1198. mulpd 30 * SIZE(BB), %xmm1
  1199. addpd %xmm1, %xmm7
  1200. movddup 24 * SIZE(AA), %xmm1
  1201. addl $16 * SIZE, AA
  1202. addl $32 * SIZE, BB
  1203. decl %eax
  1204. jne .L52
  1205. ALIGN_4
  1206. .L55:
  1207. #if defined(LT) || defined(RN)
  1208. movl KK, %eax
  1209. #else
  1210. movl K, %eax
  1211. subl KK, %eax
  1212. #endif
  1213. andl $15, %eax # k & 15
  1214. BRANCH
  1215. je .L58
  1216. .L56:
  1217. mulpd %xmm0, %xmm2
  1218. movddup 1 * SIZE(AA), %xmm0
  1219. addpd %xmm2, %xmm4
  1220. movapd 2 * SIZE(BB), %xmm2
  1221. addl $1 * SIZE, AA
  1222. addl $2 * SIZE, BB
  1223. decl %eax
  1224. jg .L56
  1225. ALIGN_4
  1226. .L58:
  1227. addpd %xmm5, %xmm4
  1228. addpd %xmm7, %xmm6
  1229. addpd %xmm6, %xmm4
  1230. #if defined(LN) || defined(RT)
  1231. movl KK, %eax
  1232. #ifdef LN
  1233. subl $1, %eax
  1234. #else
  1235. subl $2, %eax
  1236. #endif
  1237. movl AORIG, AA
  1238. leal (, %eax, SIZE), %eax
  1239. addl %eax, AA
  1240. leal (B, %eax, 2), BB
  1241. #endif
  1242. #if defined(LN) || defined(LT)
  1243. movapd 0 * SIZE(BB), %xmm0
  1244. subpd %xmm4, %xmm0
  1245. #else
  1246. movapd 0 * SIZE(AA), %xmm1
  1247. subpd %xmm4, %xmm1
  1248. movapd %xmm1, %xmm0
  1249. unpckhpd %xmm1, %xmm1
  1250. #endif
  1251. #ifdef LN
  1252. movddup 0 * SIZE(AA), %xmm4
  1253. mulpd %xmm4, %xmm0
  1254. #endif
  1255. #ifdef LT
  1256. movddup 0 * SIZE(AA), %xmm4
  1257. mulpd %xmm4, %xmm0
  1258. #endif
  1259. #ifdef RN
  1260. movsd 0 * SIZE(BB), %xmm4
  1261. mulsd %xmm4, %xmm0
  1262. movsd 1 * SIZE(BB), %xmm4
  1263. mulsd %xmm0, %xmm4
  1264. subsd %xmm4, %xmm1
  1265. movsd 3 * SIZE(BB), %xmm4
  1266. mulsd %xmm4, %xmm1
  1267. #endif
  1268. #ifdef RT
  1269. movsd 3 * SIZE(BB), %xmm4
  1270. mulsd %xmm4, %xmm1
  1271. movsd 2 * SIZE(BB), %xmm4
  1272. mulsd %xmm1, %xmm4
  1273. subsd %xmm4, %xmm0
  1274. movsd 0 * SIZE(BB), %xmm4
  1275. mulsd %xmm4, %xmm0
  1276. #endif
  1277. #if defined(LN) || defined(LT)
  1278. movapd %xmm0, 0 * SIZE(BB)
  1279. #else
  1280. movsd %xmm0, 0 * SIZE(AA)
  1281. movsd %xmm1, 1 * SIZE(AA)
  1282. #endif
  1283. #ifdef LN
  1284. subl $1 * SIZE, CO1
  1285. #endif
  1286. #if defined(LN) || defined(LT)
  1287. movsd %xmm0, 0 * SIZE(CO1)
  1288. movhpd %xmm0, 0 * SIZE(CO1, LDC, 1)
  1289. #else
  1290. movsd %xmm0, 0 * SIZE(CO1)
  1291. movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1292. #endif
  1293. #ifndef LN
  1294. addl $1 * SIZE, CO1
  1295. #endif
  1296. #if defined(LT) || defined(RN)
  1297. movl K, %eax
  1298. subl KK, %eax
  1299. leal (,%eax, SIZE), %eax
  1300. leal (AA, %eax, 1), AA
  1301. leal (BB, %eax, 2), BB
  1302. #endif
  1303. #ifdef LN
  1304. subl $1, KK
  1305. #endif
  1306. #ifdef LT
  1307. addl $1, KK
  1308. #endif
  1309. #ifdef RT
  1310. movl K, %eax
  1311. sall $1 + BASE_SHIFT, %eax
  1312. addl %eax, AORIG
  1313. #endif
  1314. ALIGN_4
  1315. .L59:
  1316. #ifdef LN
  1317. movl K, %eax
  1318. leal (, %eax, SIZE), %eax
  1319. leal (B, %eax, 2), B
  1320. #endif
  1321. #if defined(LT) || defined(RN)
  1322. movl BB, B
  1323. #endif
  1324. #ifdef RN
  1325. addl $2, KK
  1326. #endif
  1327. #ifdef RT
  1328. subl $2, KK
  1329. #endif
  1330. ALIGN_4
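# .L60: single-column case (N & 1).  .L71 is the 2x1 micro-kernel and .L80
# handles the final 1x1 element.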
  1331. .L60:
  1332. testl $1, N
  1333. je .L999
  1334. #if defined(LT) || defined(RN)
  1335. movl A, AA
  1336. #else
  1337. movl A, %eax
  1338. movl %eax, AORIG
  1339. #endif
  1340. #ifdef RT
  1341. movl K, %eax
  1342. sall $BASE_SHIFT, %eax
  1343. subl %eax, B
  1344. #endif
  1345. #ifdef RT
  1346. subl LDC, C
  1347. #endif
  1348. movl C, CO1
  1349. #ifndef RT
  1350. addl LDC, C
  1351. #endif
  1352. #ifdef LN
  1353. movl OFFSET, %eax
  1354. addl M, %eax
  1355. movl %eax, KK
  1356. #endif
  1357. #ifdef LT
  1358. movl OFFSET, %eax
  1359. movl %eax, KK
  1360. #endif
  1361. movl M, %ebx
  1362. sarl $1, %ebx # i = (m >> 1)
  1363. jle .L80
  1364. ALIGN_4
  1365. .L71:
  1366. #ifdef LN
  1367. movl K, %eax
  1368. sall $1 + BASE_SHIFT, %eax
  1369. subl %eax, AORIG
  1370. #endif
  1371. #if defined(LN) || defined(RT)
  1372. movl KK, %eax
  1373. movl AORIG, AA
  1374. leal (, %eax, SIZE), %eax
  1375. leal (AA, %eax, 2), AA
  1376. #endif
  1377. movl B, BB
  1378. #if defined(LN) || defined(RT)
  1379. movl KK, %eax
  1380. sall $BASE_SHIFT, %eax
  1381. addl %eax, BB
  1382. #endif
  1383. movapd 0 * SIZE(AA), %xmm0
  1384. pxor %xmm4, %xmm4
  1385. movapd 8 * SIZE(AA), %xmm1
  1386. pxor %xmm5, %xmm5
  1387. movddup 0 * SIZE(BB), %xmm2
  1388. pxor %xmm6, %xmm6
  1389. movddup 4 * SIZE(BB), %xmm3
  1390. pxor %xmm7, %xmm7
  1391. #ifdef LN
  1392. prefetchnta -2 * SIZE(CO1)
  1393. #else
  1394. prefetchnta 2 * SIZE(CO1)
  1395. #endif
  1396. #if defined(LT) || defined(RN)
  1397. movl KK, %eax
  1398. #else
  1399. movl K, %eax
  1400. subl KK, %eax
  1401. #endif
  1402. sarl $3, %eax
  1403. je .L75
  1404. ALIGN_4
  1405. .L72:
  1406. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1407. mulpd %xmm2, %xmm0
  1408. movddup 1 * SIZE(BB), %xmm2
  1409. addpd %xmm0, %xmm4
  1410. movapd 16 * SIZE(AA), %xmm0
  1411. mulpd 2 * SIZE(AA), %xmm2
  1412. addpd %xmm2, %xmm5
  1413. movddup 2 * SIZE(BB), %xmm2
  1414. mulpd 4 * SIZE(AA), %xmm2
  1415. addpd %xmm2, %xmm6
  1416. movddup 3 * SIZE(BB), %xmm2
  1417. mulpd 6 * SIZE(AA), %xmm2
  1418. addpd %xmm2, %xmm7
  1419. movddup 8 * SIZE(BB), %xmm2
  1420. mulpd %xmm3, %xmm1
  1421. movddup 5 * SIZE(BB), %xmm3
  1422. addpd %xmm1, %xmm4
  1423. movapd 24 * SIZE(AA), %xmm1
  1424. mulpd 10 * SIZE(AA), %xmm3
  1425. addpd %xmm3, %xmm5
  1426. movddup 6 * SIZE(BB), %xmm3
  1427. mulpd 12 * SIZE(AA), %xmm3
  1428. addpd %xmm3, %xmm6
  1429. movddup 7 * SIZE(BB), %xmm3
  1430. mulpd 14 * SIZE(AA), %xmm3
  1431. addpd %xmm3, %xmm7
  1432. movddup 12 * SIZE(BB), %xmm3
  1433. addl $16 * SIZE, AA
  1434. addl $ 8 * SIZE, BB
  1435. decl %eax
  1436. jne .L72
  1437. ALIGN_4
  1438. .L75:
  1439. #if defined(LT) || defined(RN)
  1440. movl KK, %eax
  1441. #else
  1442. movl K, %eax
  1443. subl KK, %eax
  1444. #endif
  1445. andl $7, %eax # k & 7
  1446. BRANCH
  1447. je .L78
  1448. ALIGN_3
  1449. .L76:
  1450. mulpd %xmm2, %xmm0
  1451. movddup 1 * SIZE(BB), %xmm2
  1452. addpd %xmm0, %xmm4
  1453. movapd 2 * SIZE(AA), %xmm0
  1454. addl $2 * SIZE, AA
  1455. addl $1 * SIZE, BB
  1456. decl %eax
  1457. jg .L76
  1458. ALIGN_4
  1459. .L78:
  1460. addpd %xmm5, %xmm4
  1461. addpd %xmm7, %xmm6
  1462. addpd %xmm6, %xmm4
  1463. #if defined(LN) || defined(RT)
  1464. movl KK, %eax
  1465. #ifdef LN
  1466. subl $2, %eax
  1467. #else
  1468. subl $1, %eax
  1469. #endif
  1470. movl AORIG, AA
  1471. leal (, %eax, SIZE), %eax
  1472. leal (AA, %eax, 2), AA
  1473. leal (B, %eax, 1), BB
  1474. #endif
  1475. #if defined(LN) || defined(LT)
  1476. movapd 0 * SIZE(BB), %xmm1
  1477. subpd %xmm4, %xmm1
  1478. movapd %xmm1, %xmm0
  1479. unpckhpd %xmm1, %xmm1
  1480. #else
  1481. movapd 0 * SIZE(AA), %xmm0
  1482. subpd %xmm4, %xmm0
  1483. #endif
  1484. #ifdef LN
  1485. movsd 3 * SIZE(AA), %xmm4
  1486. mulsd %xmm4, %xmm1
  1487. movsd 2 * SIZE(AA), %xmm4
  1488. mulsd %xmm1, %xmm4
  1489. subsd %xmm4, %xmm0
  1490. movsd 0 * SIZE(AA), %xmm4
  1491. mulsd %xmm4, %xmm0
  1492. #endif
  1493. #ifdef LT
  1494. movsd 0 * SIZE(AA), %xmm4
  1495. mulsd %xmm4, %xmm0
  1496. movsd 1 * SIZE(AA), %xmm4
  1497. mulsd %xmm0, %xmm4
  1498. subsd %xmm4, %xmm1
  1499. movsd 3 * SIZE(AA), %xmm4
  1500. mulsd %xmm4, %xmm1
  1501. #endif
  1502. #ifdef RN
  1503. movddup 0 * SIZE(BB), %xmm4
  1504. mulpd %xmm4, %xmm0
  1505. #endif
  1506. #ifdef RT
  1507. movddup 0 * SIZE(BB), %xmm4
  1508. mulpd %xmm4, %xmm0
  1509. #endif
  1510. #if defined(LN) || defined(LT)
  1511. movsd %xmm0, 0 * SIZE(BB)
  1512. movsd %xmm1, 1 * SIZE(BB)
  1513. #else
  1514. movapd %xmm0, 0 * SIZE(AA)
  1515. #endif
  1516. #ifdef LN
  1517. subl $2 * SIZE, CO1
  1518. #endif
  1519. #if defined(LN) || defined(LT)
  1520. movsd %xmm0, 0 * SIZE(CO1)
  1521. movsd %xmm1, 1 * SIZE(CO1)
  1522. #else
  1523. movsd %xmm0, 0 * SIZE(CO1)
  1524. movhpd %xmm0, 1 * SIZE(CO1)
  1525. #endif
  1526. #ifndef LN
  1527. addl $2 * SIZE, CO1
  1528. #endif
  1529. #if defined(LT) || defined(RN)
  1530. movl K, %eax
  1531. subl KK, %eax
  1532. leal (,%eax, SIZE), %eax
  1533. leal (AA, %eax, 2), AA
  1534. addl %eax, BB
  1535. #endif
  1536. #ifdef LN
  1537. subl $2, KK
  1538. #endif
  1539. #ifdef LT
  1540. addl $2, KK
  1541. #endif
  1542. #ifdef RT
  1543. movl K, %eax
  1544. sall $1 + BASE_SHIFT, %eax
  1545. addl %eax, AORIG
  1546. #endif
  1547. decl %ebx # i --
  1548. jg .L71
  1549. ALIGN_4
  1550. .L80:
  1551. movl M, %ebx
  1552. testl $1, %ebx # if (m & 1)
  1553. jle .L89
  1554. #ifdef LN
  1555. movl K, %eax
  1556. sall $BASE_SHIFT, %eax
  1557. subl %eax, AORIG
  1558. #endif
  1559. #if defined(LN) || defined(RT)
  1560. movl KK, %eax
  1561. movl AORIG, AA
  1562. leal (AA, %eax, SIZE), AA
  1563. #endif
  1564. movl B, BB
  1565. #if defined(LN) || defined(RT)
  1566. movl KK, %eax
  1567. sall $BASE_SHIFT, %eax
  1568. addl %eax, BB
  1569. #endif
  1570. movsd 0 * SIZE(AA), %xmm0
  1571. movhpd 1 * SIZE(AA), %xmm0
  1572. pxor %xmm4, %xmm4
  1573. movsd 8 * SIZE(AA), %xmm1
  1574. movhpd 9 * SIZE(AA), %xmm1
  1575. pxor %xmm5, %xmm5
  1576. movsd 0 * SIZE(BB), %xmm2
  1577. movhpd 1 * SIZE(BB), %xmm2
  1578. pxor %xmm6, %xmm6
  1579. movsd 8 * SIZE(BB), %xmm3
  1580. movhpd 9 * SIZE(BB), %xmm3
  1581. pxor %xmm7, %xmm7
  1582. #if defined(LT) || defined(RN)
  1583. movl KK, %eax
  1584. #else
  1585. movl K, %eax
  1586. subl KK, %eax
  1587. #endif
  1588. sarl $4, %eax
  1589. je .L85
  1590. ALIGN_4
  1591. .L82:
  1592. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1593. mulpd %xmm0, %xmm2
  1594. movapd 2 * SIZE(AA), %xmm0
  1595. addpd %xmm2, %xmm4
  1596. mulpd 2 * SIZE(BB), %xmm0
  1597. movapd 16 * SIZE(BB), %xmm2
  1598. addpd %xmm0, %xmm5
  1599. movapd 4 * SIZE(AA), %xmm0
  1600. mulpd 4 * SIZE(BB), %xmm0
  1601. addpd %xmm0, %xmm6
  1602. movapd 6 * SIZE(AA), %xmm0
  1603. mulpd 6 * SIZE(BB), %xmm0
  1604. addpd %xmm0, %xmm7
  1605. movapd 16 * SIZE(AA), %xmm0
  1606. mulpd %xmm1, %xmm3
  1607. movapd 10 * SIZE(AA), %xmm1
  1608. addpd %xmm3, %xmm4
  1609. mulpd 10 * SIZE(BB), %xmm1
  1610. movapd 24 * SIZE(BB), %xmm3
  1611. addpd %xmm1, %xmm5
  1612. movapd 12 * SIZE(AA), %xmm1
  1613. mulpd 12 * SIZE(BB), %xmm1
  1614. addpd %xmm1, %xmm6
  1615. movapd 14 * SIZE(AA), %xmm1
  1616. mulpd 14 * SIZE(BB), %xmm1
  1617. addpd %xmm1, %xmm7
  1618. movapd 24 * SIZE(AA), %xmm1
  1619. addl $16 * SIZE, AA
  1620. addl $16 * SIZE, BB
  1621. decl %eax
  1622. jne .L82
  1623. ALIGN_4
  1624. .L85:
  1625. #if defined(LT) || defined(RN)
  1626. movl KK, %eax
  1627. #else
  1628. movl K, %eax
  1629. subl KK, %eax
  1630. #endif
  1631. andl $15, %eax # k & 15
  1632. BRANCH
  1633. je .L88
  1634. .L86:
  1635. mulsd %xmm0, %xmm2
  1636. movsd 1 * SIZE(AA), %xmm0
  1637. addsd %xmm2, %xmm4
  1638. movsd 1 * SIZE(BB), %xmm2
  1639. addl $1 * SIZE, AA
  1640. addl $1 * SIZE, BB
  1641. decl %eax
  1642. jg .L86
  1643. ALIGN_4
  1644. .L88:
  1645. addpd %xmm5, %xmm4
  1646. addpd %xmm7, %xmm6
  1647. addpd %xmm6, %xmm4
  1648. haddpd %xmm4, %xmm4
  1649. #if defined(LN) || defined(RT)
  1650. movl KK, %eax
  1651. #ifdef LN
  1652. subl $1, %eax
  1653. #else
  1654. subl $1, %eax
  1655. #endif
  1656. movl AORIG, AA
  1657. leal (, %eax, SIZE), %eax
  1658. addl %eax, AA
  1659. leal (B, %eax, 1), BB
  1660. #endif
  1661. #if defined(LN) || defined(LT)
  1662. movsd 0 * SIZE(BB), %xmm0
  1663. subsd %xmm4, %xmm0
  1664. #else
  1665. movsd 0 * SIZE(AA), %xmm0
  1666. subsd %xmm4, %xmm0
  1667. #endif
  1668. #ifdef LN
  1669. movsd 0 * SIZE(AA), %xmm4
  1670. mulsd %xmm4, %xmm0
  1671. #endif
  1672. #ifdef LT
  1673. movsd 0 * SIZE(AA), %xmm4
  1674. mulsd %xmm4, %xmm0
  1675. #endif
  1676. #ifdef RN
  1677. movsd 0 * SIZE(BB), %xmm4
  1678. mulsd %xmm4, %xmm0
  1679. #endif
  1680. #ifdef RT
  1681. movsd 0 * SIZE(BB), %xmm4
  1682. mulsd %xmm4, %xmm0
  1683. #endif
  1684. #if defined(LN) || defined(LT)
  1685. movsd %xmm0, 0 * SIZE(BB)
  1686. #else
  1687. movsd %xmm0, 0 * SIZE(AA)
  1688. #endif
  1689. #ifdef LN
  1690. subl $1 * SIZE, CO1
  1691. #endif
  1692. #if defined(LN) || defined(LT)
  1693. movsd %xmm0, 0 * SIZE(CO1)
  1694. #else
  1695. movsd %xmm0, 0 * SIZE(CO1)
  1696. #endif
  1697. #ifndef LN
  1698. addl $1 * SIZE, CO1
  1699. #endif
  1700. #if defined(LT) || defined(RN)
  1701. movl K, %eax
  1702. subl KK, %eax
  1703. leal (,%eax, SIZE), %eax
  1704. addl %eax, AA
  1705. addl %eax, BB
  1706. #endif
  1707. #ifdef LN
  1708. subl $1, KK
  1709. #endif
  1710. #ifdef LT
  1711. addl $1, KK
  1712. #endif
  1713. #ifdef RT
  1714. movl K, %eax
  1715. sall $BASE_SHIFT, %eax
  1716. addl %eax, AORIG
  1717. #endif
  1718. ALIGN_4
  1719. .L89:
  1720. #ifdef LN
  1721. movl K, %eax
  1722. leal (B, %eax, SIZE), B
  1723. #endif
  1724. #if defined(LT) || defined(RN)
  1725. movl BB, B
  1726. #endif
  1727. #ifdef RN
  1728. addl $1, KK
  1729. #endif
  1730. #ifdef RT
  1731. subl $1, KK
  1732. #endif
  1733. ALIGN_4
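# Epilogue: restore callee-saved registers, drop the argument area, return.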
  1734. .L999:
  1735. popl %ebx
  1736. popl %esi
  1737. popl %edi
  1738. popl %ebp
  1739. addl $ARGS, %esp
  1740. ret
  1741. EPILOGUE