
trsm_kernel_RT_4x2_sse2.S 43 kB
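This is the double-precision SSE2 TRSM kernel for the RT variant, register-blocked 4x2. The KERNEL1 through KERNEL8 macros unroll eight k-iterations of the rank-1 updates that accumulate a 4x2 tile in %xmm4-%xmm7; the #ifdef LN/LT/RN/RT paths then subtract that tile from the packed panel and back-substitute it against the triangular diagonal block. As a rough orientation only, the accumulation that the macros unroll corresponds to the C sketch below (the names are illustrative and do not appear in the source):

    /* Sketch of the 4x2 accumulation unrolled by KERNEL1..KERNEL8.
     * a: packed panel of A, 4 doubles per k-step; b: packed panel of B,
     * 2 doubles per k-step (the assembly pre-duplicates each b value into
     * BUFFER so a single movapd yields {b, b} for a pairwise mulpd).
     * The real kernel subtracts this sum from the stored panel and then
     * performs the small triangular solve selected by LN/LT/RN/RT. */
    static void trsm_update_4x2(long k, const double *a, const double *b,
                                double c[4][2])
    {
        for (long l = 0; l < k; l++)
            for (int i = 0; i < 4; i++)
                for (int j = 0; j < 2; j++)
                    c[i][j] += a[4 * l + i] * b[2 * l + j];
    }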

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #if !defined(HAVE_SSE2) || !defined(HAVE_MMX)
  41. #error You have to check your configuration.
  42. #endif
  43. #define STACK 16
  44. #define ARGS 0
  45. #define STACK_M 4 + STACK + ARGS(%esi)
  46. #define STACK_N 8 + STACK + ARGS(%esi)
  47. #define STACK_K 12 + STACK + ARGS(%esi)
  48. #define STACK_ALPHA 16 + STACK + ARGS(%esi)
  49. #define STACK_A 24 + STACK + ARGS(%esi)
  50. #define STACK_B 28 + STACK + ARGS(%esi)
  51. #define STACK_C 32 + STACK + ARGS(%esi)
  52. #define STACK_LDC 36 + STACK + ARGS(%esi)
  53. #define STACK_OFFT 40 + STACK + ARGS(%esi)
  54. #define ALPHA 0(%esp)
  55. #define K 16(%esp)
  56. #define N 20(%esp)
  57. #define M 24(%esp)
  58. #define A 28(%esp)
  59. #define C 32(%esp)
  60. #define J 36(%esp)
  61. #define OLD_STACK 40(%esp)
  62. #define OFFSET 44(%esp)
  63. #define KK 48(%esp)
  64. #define KKK 52(%esp)
  65. #define AORIG 56(%esp)
  66. #define BORIG 60(%esp)
  67. #define BUFFER 128(%esp)
  68. #define B %edi
  69. #define LDC %ebp
  70. #define STACK_ALIGN 4096
  71. #define STACK_OFFSET 1024
  72. #define AA %edx
  73. #define BB %ecx
  74. #define PREFETCHSIZE (8 * 4)
  75. #define KERNEL1(address) \
  76. movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \
  77. mulpd %xmm0, %xmm2; \
  78. mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
  79. addpd %xmm2, %xmm4; \
  80. movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \
  81. addpd %xmm0, %xmm5; \
  82. movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \
  83. mulpd %xmm0, %xmm2; \
  84. mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
  85. addpd %xmm2, %xmm6; \
  86. movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
  87. addpd %xmm0, %xmm7; \
  88. movapd 4 * SIZE + (address) * SIZE(AA), %xmm0
  89. #define KERNEL2(address) \
  90. mulpd %xmm0, %xmm2; \
  91. mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
  92. addpd %xmm2, %xmm4; \
  93. movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
  94. addpd %xmm0, %xmm5; \
  95. movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \
  96. mulpd %xmm0, %xmm2; \
  97. mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
  98. addpd %xmm2, %xmm6; \
  99. movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
  100. addpd %xmm0, %xmm7; \
  101. movapd 16 * SIZE + (address) * SIZE(AA), %xmm0
  102. #define KERNEL3(address) \
  103. movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \
  104. mulpd %xmm1, %xmm3; \
  105. mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
  106. addpd %xmm3, %xmm4; \
  107. movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \
  108. addpd %xmm1, %xmm5; \
  109. movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \
  110. mulpd %xmm1, %xmm3; \
  111. mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
  112. addpd %xmm3, %xmm6; \
  113. movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
  114. addpd %xmm1, %xmm7; \
  115. movapd 12 * SIZE + (address) * SIZE(AA), %xmm1
  116. #define KERNEL4(address) \
  117. mulpd %xmm1, %xmm3; \
  118. mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
  119. addpd %xmm3, %xmm4; \
  120. movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
  121. addpd %xmm1, %xmm5; \
  122. movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \
  123. mulpd %xmm1, %xmm3; \
  124. mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
  125. addpd %xmm3, %xmm6; \
  126. movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
  127. addpd %xmm1, %xmm7; \
  128. movapd 24 * SIZE + (address) * SIZE(AA), %xmm1
  129. #define KERNEL5(address) \
  130. movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \
  131. mulpd %xmm0, %xmm2; \
  132. mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
  133. addpd %xmm2, %xmm4; \
  134. movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
  135. addpd %xmm0, %xmm5; \
  136. movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \
  137. mulpd %xmm0, %xmm2; \
  138. mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
  139. addpd %xmm2, %xmm6; \
  140. movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
  141. addpd %xmm0, %xmm7; \
  142. movapd 20 * SIZE + (address) * SIZE(AA), %xmm0
  143. #define KERNEL6(address) \
  144. mulpd %xmm0, %xmm2; \
  145. mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
  146. addpd %xmm2, %xmm4; \
  147. movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
  148. addpd %xmm0, %xmm5; \
  149. movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \
  150. mulpd %xmm0, %xmm2; \
  151. mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
  152. addpd %xmm2, %xmm6; \
  153. movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \
  154. addpd %xmm0, %xmm7; \
  155. movapd 32 * SIZE + (address) * SIZE(AA), %xmm0
  156. #define KERNEL7(address) \
  157. movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \
  158. mulpd %xmm1, %xmm3; \
  159. mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
  160. addpd %xmm3, %xmm4; \
  161. movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
  162. addpd %xmm1, %xmm5; \
  163. movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \
  164. mulpd %xmm1, %xmm3; \
  165. mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
  166. addpd %xmm3, %xmm6; \
  167. movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
  168. addpd %xmm1, %xmm7; \
  169. movapd 28 * SIZE + (address) * SIZE(AA), %xmm1
  170. #define KERNEL8(address) \
  171. mulpd %xmm1, %xmm3; \
  172. mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
  173. addpd %xmm3, %xmm4; \
  174. movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
  175. addpd %xmm1, %xmm5; \
  176. movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \
  177. mulpd %xmm1, %xmm3; \
  178. mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
  179. addpd %xmm3, %xmm6; \
  180. movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \
  181. addpd %xmm1, %xmm7; \
  182. movapd 40 * SIZE + (address) * SIZE(AA), %xmm1
  183. PROLOGUE
  184. pushl %ebp
  185. pushl %edi
  186. pushl %esi
  187. pushl %ebx
  188. PROFCODE
  189. EMMS
  190. movl %esp, %esi # save old stack
  191. subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
  192. andl $-STACK_ALIGN, %esp
  193. addl $STACK_OFFSET, %esp
  194. STACK_TOUCHING
  195. movd STACK_M, %mm0
  196. movl STACK_N, %eax
  197. movd STACK_K, %mm1
  198. movd STACK_A, %mm2
  199. movl STACK_B, B
  200. movd STACK_C, %mm3
  201. movl STACK_LDC, LDC
  202. movd STACK_OFFT, %mm4
  203. movd %mm1, K
  204. movl %eax, N
  205. movd %mm0, M
  206. movd %mm2, A
  207. movd %mm3, C
  208. movl %esi, OLD_STACK
  209. movd %mm4, OFFSET
  210. movd %mm4, KK
  211. sall $BASE_SHIFT, LDC
  212. #ifdef LN
  213. movl M, %eax
  214. leal (, %eax, SIZE), %eax
  215. addl %eax, C
  216. imull K, %eax
  217. addl %eax, A
  218. #endif
  219. #ifdef RT
  220. movl N, %eax
  221. leal (, %eax, SIZE), %eax
  222. imull K, %eax
  223. addl %eax, B
  224. movl N, %eax
  225. imull LDC, %eax
  226. addl %eax, C
  227. #endif
  228. #ifdef RN
  229. negl KK
  230. #endif
  231. #ifdef RT
  232. movl N, %eax
  233. subl OFFSET, %eax
  234. movl %eax, KK
  235. #endif
  236. movl N, %eax
  237. testl $1, %eax
  238. jle .L100
  239. ALIGN_2
  240. .L101:
  241. /* Copying to Sub Buffer */
  242. #ifdef LN
  243. movl OFFSET, %eax
  244. addl M, %eax
  245. movl %eax, KK
  246. #endif
  247. leal BUFFER, %ecx
  248. #ifdef RT
  249. movl K, %eax
  250. sall $0 + BASE_SHIFT, %eax
  251. subl %eax, B
  252. #endif
  253. #if defined(LN) || defined(RT)
  254. movl KK, %eax
  255. movl B, BORIG
  256. leal (, %eax, SIZE), %eax
  257. leal (B, %eax, 1), B
  258. leal (BB, %eax, 2), BB
  259. #endif
  260. #ifdef LT
  261. movl OFFSET, %eax
  262. movl %eax, KK
  263. #endif
  264. #if defined(LT) || defined(RN)
  265. movl KK, %eax
  266. #else
  267. movl K, %eax
  268. subl KK, %eax
  269. #endif
  270. sarl $3, %eax
  271. jle .L103
  272. ALIGN_4
  273. .L102:
  274. movsd 0 * SIZE(B), %xmm0
  275. movsd 1 * SIZE(B), %xmm1
  276. movsd 2 * SIZE(B), %xmm2
  277. movsd 3 * SIZE(B), %xmm3
  278. movsd 4 * SIZE(B), %xmm4
  279. movsd 5 * SIZE(B), %xmm5
  280. movsd 6 * SIZE(B), %xmm6
  281. movsd 7 * SIZE(B), %xmm7
  282. unpcklpd %xmm0, %xmm0
  283. unpcklpd %xmm1, %xmm1
  284. unpcklpd %xmm2, %xmm2
  285. unpcklpd %xmm3, %xmm3
  286. unpcklpd %xmm4, %xmm4
  287. unpcklpd %xmm5, %xmm5
  288. unpcklpd %xmm6, %xmm6
  289. unpcklpd %xmm7, %xmm7
  290. movapd %xmm0, 0 * SIZE(%ecx)
  291. movapd %xmm1, 2 * SIZE(%ecx)
  292. movapd %xmm2, 4 * SIZE(%ecx)
  293. movapd %xmm3, 6 * SIZE(%ecx)
  294. movapd %xmm4, 8 * SIZE(%ecx)
  295. movapd %xmm5, 10 * SIZE(%ecx)
  296. movapd %xmm6, 12 * SIZE(%ecx)
  297. movapd %xmm7, 14 * SIZE(%ecx)
  298. prefetcht0 104 * SIZE(B)
  299. addl $ 8 * SIZE, B
  300. addl $16 * SIZE, %ecx
  301. decl %eax
  302. BRANCH
  303. jne .L102
  304. ALIGN_2
  305. .L103:
  306. #if defined(LT) || defined(RN)
  307. movl KK, %eax
  308. #else
  309. movl K, %eax
  310. subl KK, %eax
  311. #endif
  312. andl $7, %eax
  313. BRANCH
  314. jle .L105
  315. ALIGN_2
  316. .L104:
  317. movsd 0 * SIZE(B), %xmm0
  318. unpcklpd %xmm0, %xmm0
  319. movapd %xmm0, 0 * SIZE(%ecx)
  320. addl $1 * SIZE, B
  321. addl $2 * SIZE, %ecx
  322. decl %eax
  323. jne .L104
  324. ALIGN_4
  325. .L105:
  326. #if defined(LT) || defined(RN)
  327. movl A, AA
  328. #else
  329. movl A, %eax
  330. movl %eax, AORIG
  331. #endif
  332. #ifdef RT
  333. subl LDC, C
  334. #endif
  335. movl C, %esi # coffset = c
  336. #ifndef RT
  337. addl LDC, C
  338. #endif
  339. movl M, %ebx
  340. sarl $2, %ebx # i = (m >> 2)
  341. jle .L130
  342. ALIGN_4
  343. .L110:
  344. #ifdef LN
  345. movl K, %eax
  346. sall $2 + BASE_SHIFT, %eax
  347. subl %eax, AORIG
  348. #endif
  349. #if defined(LN) || defined(RT)
  350. movl KK, %eax
  351. movl AORIG, AA
  352. leal (, %eax, SIZE), %eax
  353. leal (AA, %eax, 4), AA
  354. #endif
  355. leal BUFFER, BB
  356. #if defined(LN) || defined(RT)
  357. movl KK, %eax
  358. sall $0 + BASE_SHIFT, %eax
  359. leal (BB, %eax, 2), BB
  360. #endif
  361. movapd 0 * SIZE(BB), %xmm2
  362. pxor %xmm4, %xmm4
  363. movapd 0 * SIZE(AA), %xmm0
  364. pxor %xmm5, %xmm5
  365. movapd 8 * SIZE(BB), %xmm3
  366. pxor %xmm6, %xmm6
  367. movapd 8 * SIZE(AA), %xmm1
  368. pxor %xmm7, %xmm7
  369. #if defined(LT) || defined(RN)
  370. movl KK, %eax
  371. #else
  372. movl K, %eax
  373. subl KK, %eax
  374. #endif
  375. sarl $3, %eax
  376. je .L112
  377. .L111:
  378. mulpd %xmm2, %xmm0
  379. mulpd 2 * SIZE(AA), %xmm2
  380. addpd %xmm0, %xmm4
  381. movapd 4 * SIZE(AA), %xmm0
  382. addpd %xmm2, %xmm6
  383. movapd 2 * SIZE(BB), %xmm2
  384. mulpd %xmm2, %xmm0
  385. mulpd 6 * SIZE(AA), %xmm2
  386. addpd %xmm0, %xmm5
  387. movapd 16 * SIZE(AA), %xmm0
  388. addpd %xmm2, %xmm7
  389. movapd 4 * SIZE(BB), %xmm2
  390. mulpd %xmm2, %xmm1
  391. mulpd 10 * SIZE(AA), %xmm2
  392. addpd %xmm1, %xmm4
  393. movapd 12 * SIZE(AA), %xmm1
  394. addpd %xmm2, %xmm6
  395. movapd 6 * SIZE(BB), %xmm2
  396. mulpd %xmm2, %xmm1
  397. mulpd 14 * SIZE(AA), %xmm2
  398. addpd %xmm1, %xmm5
  399. movapd 24 * SIZE(AA), %xmm1
  400. addpd %xmm2, %xmm7
  401. movapd 16 * SIZE(BB), %xmm2
  402. mulpd %xmm3, %xmm0
  403. mulpd 18 * SIZE(AA), %xmm3
  404. addpd %xmm0, %xmm4
  405. movapd 20 * SIZE(AA), %xmm0
  406. addpd %xmm3, %xmm6
  407. movapd 10 * SIZE(BB), %xmm3
  408. mulpd %xmm3, %xmm0
  409. mulpd 22 * SIZE(AA), %xmm3
  410. addpd %xmm0, %xmm5
  411. movapd 32 * SIZE(AA), %xmm0
  412. addpd %xmm3, %xmm7
  413. movapd 12 * SIZE(BB), %xmm3
  414. mulpd %xmm3, %xmm1
  415. mulpd 26 * SIZE(AA), %xmm3
  416. addpd %xmm1, %xmm4
  417. movapd 28 * SIZE(AA), %xmm1
  418. addpd %xmm3, %xmm6
  419. movapd 14 * SIZE(BB), %xmm3
  420. mulpd %xmm3, %xmm1
  421. mulpd 30 * SIZE(AA), %xmm3
  422. addpd %xmm1, %xmm5
  423. movapd 40 * SIZE(AA), %xmm1
  424. addpd %xmm3, %xmm7
  425. movapd 24 * SIZE(BB), %xmm3
  426. addl $32 * SIZE, AA
  427. addl $16 * SIZE, BB
  428. decl %eax
  429. jne .L111
  430. .L112:
  431. #if defined(LT) || defined(RN)
  432. movl KK, %eax
  433. #else
  434. movl K, %eax
  435. subl KK, %eax
  436. #endif
  437. andl $7, %eax # if (k & 1)
  438. BRANCH
  439. je .L114
  440. .L113:
  441. mulpd %xmm2, %xmm0
  442. mulpd 2 * SIZE(AA), %xmm2
  443. addpd %xmm0, %xmm4
  444. movapd 4 * SIZE(AA), %xmm0
  445. addpd %xmm2, %xmm6
  446. movapd 2 * SIZE(BB), %xmm2
  447. addl $4 * SIZE, AA # aoffset += 8
  448. addl $2 * SIZE, BB # boffset1 += 8
  449. subl $1, %eax
  450. jg .L113
  451. ALIGN_4
  452. .L114:
  453. addpd %xmm5, %xmm4
  454. addpd %xmm7, %xmm6
  455. #if defined(LN) || defined(RT)
  456. movl KK, %eax
  457. #ifdef LN
  458. subl $4, %eax
  459. #else
  460. subl $1, %eax
  461. #endif
  462. movl AORIG, AA
  463. movl BORIG, B
  464. leal BUFFER, BB
  465. leal (, %eax, SIZE), %eax
  466. leal (AA, %eax, 4), AA
  467. leal (B, %eax, 1), B
  468. leal (BB, %eax, 2), BB
  469. #endif
  470. #if defined(LN) || defined(LT)
  471. movapd 0 * SIZE(B), %xmm0
  472. movapd 2 * SIZE(B), %xmm1
  473. #else
  474. movapd 0 * SIZE(AA), %xmm0
  475. movapd 2 * SIZE(AA), %xmm1
  476. #endif
  477. subpd %xmm4, %xmm0
  478. subpd %xmm6, %xmm1
  479. #ifdef LN
  480. movapd %xmm0, %xmm2
  481. unpckhpd %xmm2, %xmm2
  482. movapd %xmm1, %xmm3
  483. unpckhpd %xmm3, %xmm3
  484. movsd 15 * SIZE(AA), %xmm4
  485. mulsd %xmm4, %xmm3
  486. movsd 14 * SIZE(AA), %xmm5
  487. mulsd %xmm3, %xmm5
  488. subsd %xmm5, %xmm1
  489. movsd 13 * SIZE(AA), %xmm6
  490. mulsd %xmm3, %xmm6
  491. subsd %xmm6, %xmm2
  492. movsd 12 * SIZE(AA), %xmm7
  493. mulsd %xmm3, %xmm7
  494. subsd %xmm7, %xmm0
  495. movsd 10 * SIZE(AA), %xmm4
  496. mulsd %xmm4, %xmm1
  497. movsd 9 * SIZE(AA), %xmm5
  498. mulsd %xmm1, %xmm5
  499. subsd %xmm5, %xmm2
  500. movsd 8 * SIZE(AA), %xmm6
  501. mulsd %xmm1, %xmm6
  502. subsd %xmm6, %xmm0
  503. movsd 5 * SIZE(AA), %xmm4
  504. mulsd %xmm4, %xmm2
  505. movsd 4 * SIZE(AA), %xmm5
  506. mulsd %xmm2, %xmm5
  507. subsd %xmm5, %xmm0
  508. movsd 0 * SIZE(AA), %xmm4
  509. mulsd %xmm4, %xmm0
  510. unpcklpd %xmm2, %xmm0
  511. unpcklpd %xmm3, %xmm1
  512. #endif
  513. #ifdef LT
  514. movapd %xmm0, %xmm2
  515. unpckhpd %xmm2, %xmm2
  516. movapd %xmm1, %xmm3
  517. unpckhpd %xmm3, %xmm3
  518. movsd 0 * SIZE(AA), %xmm4
  519. mulsd %xmm4, %xmm0
  520. movsd 1 * SIZE(AA), %xmm5
  521. mulsd %xmm0, %xmm5
  522. subsd %xmm5, %xmm2
  523. movsd 2 * SIZE(AA), %xmm6
  524. mulsd %xmm0, %xmm6
  525. subsd %xmm6, %xmm1
  526. movsd 3 * SIZE(AA), %xmm7
  527. mulsd %xmm0, %xmm7
  528. subsd %xmm7, %xmm3
  529. movsd 5 * SIZE(AA), %xmm4
  530. mulsd %xmm4, %xmm2
  531. movsd 6 * SIZE(AA), %xmm5
  532. mulsd %xmm2, %xmm5
  533. subsd %xmm5, %xmm1
  534. movsd 7 * SIZE(AA), %xmm6
  535. mulsd %xmm2, %xmm6
  536. subsd %xmm6, %xmm3
  537. movsd 10 * SIZE(AA), %xmm4
  538. mulsd %xmm4, %xmm1
  539. movsd 11 * SIZE(AA), %xmm5
  540. mulsd %xmm1, %xmm5
  541. subsd %xmm5, %xmm3
  542. movsd 15 * SIZE(AA), %xmm4
  543. mulsd %xmm4, %xmm3
  544. unpcklpd %xmm2, %xmm0
  545. unpcklpd %xmm3, %xmm1
  546. #endif
  547. #if defined(RN) || defined(RT)
  548. movsd 0 * SIZE(B), %xmm4
  549. movhpd 0 * SIZE(B), %xmm4
  550. mulpd %xmm4, %xmm0
  551. mulpd %xmm4, %xmm1
  552. #endif
  553. #if defined(LN) || defined(LT)
  554. movapd %xmm0, 0 * SIZE(B)
  555. movapd %xmm1, 2 * SIZE(B)
  556. movsd %xmm0, 0 * SIZE(BB)
  557. movsd %xmm0, 1 * SIZE(BB)
  558. movhpd %xmm0, 2 * SIZE(BB)
  559. movhpd %xmm0, 3 * SIZE(BB)
  560. movsd %xmm1, 4 * SIZE(BB)
  561. movsd %xmm1, 5 * SIZE(BB)
  562. movhpd %xmm1, 6 * SIZE(BB)
  563. movhpd %xmm1, 7 * SIZE(BB)
  564. #else
  565. movapd %xmm0, 0 * SIZE(AA)
  566. movapd %xmm1, 2 * SIZE(AA)
  567. #endif
  568. #ifdef LN
  569. subl $4 * SIZE, %esi
  570. #endif
  571. movsd %xmm0, 0 * SIZE(%esi)
  572. movhpd %xmm0, 1 * SIZE(%esi)
  573. movsd %xmm1, 2 * SIZE(%esi)
  574. movhpd %xmm1, 3 * SIZE(%esi)
  575. #ifndef LN
  576. addl $4 * SIZE, %esi
  577. #endif
  578. #if defined(LT) || defined(RN)
  579. movl K, %eax
  580. subl KK, %eax
  581. leal (,%eax, SIZE), %eax
  582. leal (AA, %eax, 4), AA
  583. #ifdef LT
  584. addl $4 * SIZE, B
  585. #endif
  586. #endif
  587. #ifdef LN
  588. subl $4, KK
  589. movl BORIG, B
  590. #endif
  591. #ifdef LT
  592. addl $4, KK
  593. #endif
  594. #ifdef RT
  595. movl K, %eax
  596. movl BORIG, B
  597. sall $2 + BASE_SHIFT, %eax
  598. addl %eax, AORIG
  599. #endif
  600. BRANCH
  601. decl %ebx # i --
  602. jg .L110
  603. ALIGN_2
  604. .L130:
  605. movl M, %ebx
  606. testl $2, %ebx
  607. jle .L150
  608. #ifdef LN
  609. movl K, %eax
  610. sall $1 + BASE_SHIFT, %eax
  611. subl %eax, AORIG
  612. #endif
  613. #if defined(LN) || defined(RT)
  614. movl KK, %eax
  615. movl AORIG, AA
  616. leal (, %eax, SIZE), %eax
  617. leal (AA, %eax, 2), AA
  618. #endif
  619. leal BUFFER, BB
  620. movapd 0 * SIZE(BB), %xmm2
  621. pxor %xmm4, %xmm4
  622. movapd 0 * SIZE(AA), %xmm0
  623. pxor %xmm5, %xmm5
  624. movapd 8 * SIZE(BB), %xmm3
  625. pxor %xmm6, %xmm6
  626. movapd 8 * SIZE(AA), %xmm1
  627. pxor %xmm7, %xmm7
  628. #if defined(LN) || defined(RT)
  629. movl KK, %eax
  630. sall $0 + BASE_SHIFT, %eax
  631. leal (BB, %eax, 2), BB
  632. #endif
  633. #if defined(LT) || defined(RN)
  634. movl KK, %eax
  635. #else
  636. movl K, %eax
  637. subl KK, %eax
  638. #endif
  639. sarl $3, %eax
  640. je .L132
  641. .L131:
  642. mulpd %xmm0, %xmm2
  643. movapd 2 * SIZE(AA), %xmm0
  644. addpd %xmm2, %xmm4
  645. mulpd 2 * SIZE(BB), %xmm0
  646. movapd 16 * SIZE(BB), %xmm2
  647. addpd %xmm0, %xmm5
  648. movapd 4 * SIZE(AA), %xmm0
  649. mulpd 4 * SIZE(BB), %xmm0
  650. addpd %xmm0, %xmm6
  651. movapd 6 * SIZE(AA), %xmm0
  652. mulpd 6 * SIZE(BB), %xmm0
  653. addpd %xmm0, %xmm7
  654. movapd 16 * SIZE(AA), %xmm0
  655. mulpd %xmm1, %xmm3
  656. movapd 10 * SIZE(AA), %xmm1
  657. addpd %xmm3, %xmm4
  658. mulpd 10 * SIZE(BB), %xmm1
  659. movapd 24 * SIZE(BB), %xmm3
  660. addpd %xmm1, %xmm5
  661. movapd 12 * SIZE(AA), %xmm1
  662. mulpd 12 * SIZE(BB), %xmm1
  663. addpd %xmm1, %xmm6
  664. movapd 14 * SIZE(AA), %xmm1
  665. mulpd 14 * SIZE(BB), %xmm1
  666. addpd %xmm1, %xmm7
  667. movapd 24 * SIZE(AA), %xmm1
  668. addl $16 * SIZE, AA
  669. addl $16 * SIZE, BB
  670. BRANCH
  671. decl %eax
  672. jne .L131
  673. .L132:
  674. #if defined(LT) || defined(RN)
  675. movl KK, %eax
  676. #else
  677. movl K, %eax
  678. subl KK, %eax
  679. #endif
  680. andl $7, %eax # if (k & 1)
  681. BRANCH
  682. je .L134
  683. .L133:
  684. movapd 0 * SIZE(AA), %xmm0
  685. mulpd 0 * SIZE(BB), %xmm0
  686. addpd %xmm0, %xmm4
  687. addl $2 * SIZE, AA # aoffset += 8
  688. addl $2 * SIZE, BB # boffset1 += 8
  689. decl %eax
  690. BRANCH
  691. jg .L133
  692. ALIGN_4
  693. .L134:
  694. addpd %xmm5, %xmm4
  695. addpd %xmm7, %xmm6
  696. addpd %xmm6, %xmm4
  697. #if defined(LN) || defined(RT)
  698. movl KK, %eax
  699. #ifdef LN
  700. subl $2, %eax
  701. #else
  702. subl $1, %eax
  703. #endif
  704. movl AORIG, AA
  705. movl BORIG, B
  706. leal BUFFER, BB
  707. leal (, %eax, SIZE), %eax
  708. leal (AA, %eax, 2), AA
  709. leal (B, %eax, 1), B
  710. leal (BB, %eax, 2), BB
  711. #endif
  712. #if defined(LN) || defined(LT)
  713. movapd 0 * SIZE(B), %xmm0
  714. #else
  715. movapd 0 * SIZE(AA), %xmm0
  716. #endif
  717. subpd %xmm4, %xmm0
  718. #ifdef LN
  719. movapd %xmm0, %xmm2
  720. unpckhpd %xmm2, %xmm2
  721. movsd 3 * SIZE(AA), %xmm4
  722. mulsd %xmm4, %xmm2
  723. movsd 2 * SIZE(AA), %xmm5
  724. mulsd %xmm2, %xmm5
  725. subsd %xmm5, %xmm0
  726. movsd 0 * SIZE(AA), %xmm4
  727. mulsd %xmm4, %xmm0
  728. unpcklpd %xmm2, %xmm0
  729. #endif
  730. #ifdef LT
  731. movapd %xmm0, %xmm2
  732. unpckhpd %xmm2, %xmm2
  733. movsd 0 * SIZE(AA), %xmm4
  734. mulsd %xmm4, %xmm0
  735. movsd 1 * SIZE(AA), %xmm5
  736. mulsd %xmm0, %xmm5
  737. subsd %xmm5, %xmm2
  738. movsd 3 * SIZE(AA), %xmm4
  739. mulsd %xmm4, %xmm2
  740. unpcklpd %xmm2, %xmm0
  741. #endif
  742. #if defined(RN) || defined(RT)
  743. movsd 0 * SIZE(B), %xmm4
  744. movhpd 0 * SIZE(B), %xmm4
  745. mulpd %xmm4, %xmm0
  746. #endif
  747. #if defined(LN) || defined(LT)
  748. movapd %xmm0, 0 * SIZE(B)
  749. movsd %xmm0, 0 * SIZE(BB)
  750. movsd %xmm0, 1 * SIZE(BB)
  751. movhpd %xmm0, 2 * SIZE(BB)
  752. movhpd %xmm0, 3 * SIZE(BB)
  753. #else
  754. movapd %xmm0, 0 * SIZE(AA)
  755. #endif
  756. #ifdef LN
  757. subl $2 * SIZE, %esi
  758. #endif
  759. movsd %xmm0, 0 * SIZE(%esi)
  760. movhpd %xmm0, 1 * SIZE(%esi)
  761. #ifndef LN
  762. addl $2 * SIZE, %esi
  763. #endif
  764. #if defined(LT) || defined(RN)
  765. movl K, %eax
  766. subl KK, %eax
  767. leal (,%eax, SIZE), %eax
  768. leal (AA, %eax, 2), AA
  769. #ifdef LT
  770. addl $2 * SIZE, B
  771. #endif
  772. #endif
  773. #ifdef LN
  774. subl $2, KK
  775. movl BORIG, B
  776. #endif
  777. #ifdef LT
  778. addl $2, KK
  779. #endif
  780. #ifdef RT
  781. movl K, %eax
  782. movl BORIG, B
  783. sall $1 + BASE_SHIFT, %eax
  784. addl %eax, AORIG
  785. #endif
  786. ALIGN_2
  787. .L150:
  788. movl M, %ebx
  789. testl $1, %ebx
  790. jle .L159
  791. #ifdef LN
  792. movl K, %eax
  793. sall $0 + BASE_SHIFT, %eax
  794. subl %eax, AORIG
  795. #endif
  796. #if defined(LN) || defined(RT)
  797. movl KK, %eax
  798. movl AORIG, AA
  799. leal (, %eax, SIZE), %eax
  800. leal (AA, %eax, 1), AA
  801. #endif
  802. leal BUFFER, BB
  803. movsd 0 * SIZE(BB), %xmm2
  804. pxor %xmm4, %xmm4
  805. movsd 0 * SIZE(AA), %xmm0
  806. pxor %xmm5, %xmm5
  807. movsd 8 * SIZE(BB), %xmm3
  808. pxor %xmm6, %xmm6
  809. movsd 4 * SIZE(AA), %xmm1
  810. pxor %xmm7, %xmm7
  811. #if defined(LN) || defined(RT)
  812. movl KK, %eax
  813. sall $0 + BASE_SHIFT, %eax
  814. leal (BB, %eax, 2), BB
  815. #endif
  816. #if defined(LT) || defined(RN)
  817. movl KK, %eax
  818. #else
  819. movl K, %eax
  820. subl KK, %eax
  821. #endif
  822. sarl $3, %eax
  823. je .L152
  824. .L151:
  825. mulsd %xmm0, %xmm2
  826. movsd 1 * SIZE(AA), %xmm0
  827. addsd %xmm2, %xmm4
  828. mulsd 2 * SIZE(BB), %xmm0
  829. movsd 16 * SIZE(BB), %xmm2
  830. addsd %xmm0, %xmm4
  831. movsd 2 * SIZE(AA), %xmm0
  832. mulsd 4 * SIZE(BB), %xmm0
  833. addsd %xmm0, %xmm4
  834. movsd 3 * SIZE(AA), %xmm0
  835. mulsd 6 * SIZE(BB), %xmm0
  836. addsd %xmm0, %xmm4
  837. movsd 8 * SIZE(AA), %xmm0
  838. mulsd %xmm1, %xmm3
  839. movsd 5 * SIZE(AA), %xmm1
  840. addsd %xmm3, %xmm4
  841. mulsd 10 * SIZE(BB), %xmm1
  842. movsd 24 * SIZE(BB), %xmm3
  843. addsd %xmm1, %xmm4
  844. movsd 6 * SIZE(AA), %xmm1
  845. mulsd 12 * SIZE(BB), %xmm1
  846. addsd %xmm1, %xmm4
  847. movsd 7 * SIZE(AA), %xmm1
  848. mulsd 14 * SIZE(BB), %xmm1
  849. addsd %xmm1, %xmm4
  850. movsd 12 * SIZE(AA), %xmm1
  851. addl $ 8 * SIZE, AA
  852. addl $16 * SIZE, BB
  853. BRANCH
  854. decl %eax
  855. jne .L151
  856. .L152:
  857. #if defined(LT) || defined(RN)
  858. movl KK, %eax
  859. #else
  860. movl K, %eax
  861. subl KK, %eax
  862. #endif
  863. andl $7, %eax # if (k & 1)
  864. BRANCH
  865. je .L154
  866. .L153:
  867. movsd 0 * SIZE(AA), %xmm0
  868. mulsd 0 * SIZE(BB), %xmm0
  869. addsd %xmm0, %xmm4
  870. addl $1 * SIZE, AA # aoffset += 8
  871. addl $2 * SIZE, BB # boffset1 += 8
  872. decl %eax
  873. BRANCH
  874. jg .L153
  875. ALIGN_4
  876. .L154:
  877. addsd %xmm6, %xmm4
  878. addsd %xmm7, %xmm5
  879. #if defined(LN) || defined(RT)
  880. movl KK, %eax
  881. subl $1, %eax
  882. movl AORIG, AA
  883. movl BORIG, B
  884. leal BUFFER, BB
  885. leal (, %eax, SIZE), %eax
  886. leal (AA, %eax, 1), AA
  887. leal (B, %eax, 1), B
  888. leal (BB, %eax, 2), BB
  889. #endif
  890. #if defined(LN) || defined(LT)
  891. movsd 0 * SIZE(B), %xmm0
  892. #else
  893. movsd 0 * SIZE(AA), %xmm0
  894. #endif
  895. subsd %xmm4, %xmm0
  896. #if defined(LN) || defined(LT)
  897. mulsd 0 * SIZE(AA), %xmm0
  898. #endif
  899. #if defined(RN) || defined(RT)
  900. mulsd 0 * SIZE(B), %xmm0
  901. #endif
  902. #if defined(LN) || defined(LT)
  903. movsd %xmm0, 0 * SIZE(B)
  904. movsd %xmm0, 0 * SIZE(BB)
  905. movsd %xmm0, 1 * SIZE(BB)
  906. #else
  907. movsd %xmm0, 0 * SIZE(AA)
  908. #endif
  909. #ifdef LN
  910. subl $1 * SIZE, %esi
  911. #endif
  912. movsd %xmm0, 0 * SIZE(%esi)
  913. #ifndef LN
  914. addl $1 * SIZE, %esi
  915. #endif
  916. #if defined(LT) || defined(RN)
  917. movl K, %eax
  918. subl KK, %eax
  919. leal (,%eax, SIZE), %eax
  920. leal (AA, %eax, 1), AA
  921. #ifdef LT
  922. addl $1 * SIZE, B
  923. #endif
  924. #endif
  925. #ifdef LN
  926. subl $1, KK
  927. movl BORIG, B
  928. #endif
  929. #ifdef LT
  930. addl $1, KK
  931. #endif
  932. #ifdef RT
  933. movl K, %eax
  934. movl BORIG, B
  935. sall $0 + BASE_SHIFT, %eax
  936. addl %eax, AORIG
  937. #endif
  938. ALIGN_2
  939. .L159:
  940. #ifdef LN
  941. movl K, %eax
  942. leal (, %eax, SIZE), %eax
  943. leal (B, %eax, 1), B
  944. #endif
  945. #if defined(LT) || defined(RN)
  946. movl K, %eax
  947. subl KK, %eax
  948. leal (,%eax, SIZE), %eax
  949. leal (B, %eax, 1), B
  950. #endif
  951. #ifdef RN
  952. addl $1, KK
  953. #endif
  954. #ifdef RT
  955. subl $1, KK
  956. #endif
  957. ALIGN_2
  958. .L100:
  959. movl N, %eax
  960. sarl $1, %eax # j = (n >> 1)
  961. movl %eax, J
  962. jle .L999
  963. ALIGN_2
  964. .L01:
  965. /* Copying to Sub Buffer */
  966. #ifdef LN
  967. movl OFFSET, %eax
  968. addl M, %eax
  969. movl %eax, KK
  970. #endif
  971. leal BUFFER, %ecx
  972. #ifdef RT
  973. movl K, %eax
  974. sall $1 + BASE_SHIFT, %eax
  975. subl %eax, B
  976. #endif
  977. #if defined(LN) || defined(RT)
  978. movl KK, %eax
  979. movl B, BORIG
  980. leal (, %eax, SIZE), %eax
  981. leal (B, %eax, 2), B
  982. leal (BB, %eax, 4), BB
  983. #endif
  984. #ifdef LT
  985. movl OFFSET, %eax
  986. movl %eax, KK
  987. #endif
  988. #if defined(LT) || defined(RN)
  989. movl KK, %eax
  990. #else
  991. movl K, %eax
  992. subl KK, %eax
  993. #endif
  994. sarl $2, %eax
  995. jle .L03
  996. ALIGN_2
  997. .L02:
  998. movsd 0 * SIZE(B), %xmm0
  999. movsd 1 * SIZE(B), %xmm1
  1000. movsd 2 * SIZE(B), %xmm2
  1001. movsd 3 * SIZE(B), %xmm3
  1002. movsd 4 * SIZE(B), %xmm4
  1003. movsd 5 * SIZE(B), %xmm5
  1004. movsd 6 * SIZE(B), %xmm6
  1005. movsd 7 * SIZE(B), %xmm7
  1006. unpcklpd %xmm0, %xmm0
  1007. unpcklpd %xmm1, %xmm1
  1008. unpcklpd %xmm2, %xmm2
  1009. unpcklpd %xmm3, %xmm3
  1010. unpcklpd %xmm4, %xmm4
  1011. unpcklpd %xmm5, %xmm5
  1012. unpcklpd %xmm6, %xmm6
  1013. unpcklpd %xmm7, %xmm7
  1014. movapd %xmm0, 0 * SIZE(%ecx)
  1015. movapd %xmm1, 2 * SIZE(%ecx)
  1016. movapd %xmm2, 4 * SIZE(%ecx)
  1017. movapd %xmm3, 6 * SIZE(%ecx)
  1018. movapd %xmm4, 8 * SIZE(%ecx)
  1019. movapd %xmm5, 10 * SIZE(%ecx)
  1020. movapd %xmm6, 12 * SIZE(%ecx)
  1021. movapd %xmm7, 14 * SIZE(%ecx)
  1022. prefetcht0 104 * SIZE(B)
  1023. addl $ 8 * SIZE, B
  1024. addl $16 * SIZE, %ecx
  1025. decl %eax
  1026. jne .L02
  1027. ALIGN_2
  1028. .L03:
  1029. #if defined(LT) || defined(RN)
  1030. movl KK, %eax
  1031. #else
  1032. movl K, %eax
  1033. subl KK, %eax
  1034. #endif
  1035. andl $3, %eax
  1036. BRANCH
  1037. jle .L05
  1038. ALIGN_4
  1039. .L04:
  1040. movsd 0 * SIZE(B), %xmm0
  1041. movsd 1 * SIZE(B), %xmm1
  1042. unpcklpd %xmm0, %xmm0
  1043. unpcklpd %xmm1, %xmm1
  1044. movapd %xmm0, 0 * SIZE(%ecx)
  1045. movapd %xmm1, 2 * SIZE(%ecx)
  1046. addl $2 * SIZE, B
  1047. addl $4 * SIZE, %ecx
  1048. decl %eax
  1049. jne .L04
  1050. ALIGN_4
  1051. .L05:
  1052. #if defined(LT) || defined(RN)
  1053. movl A, AA
  1054. #else
  1055. movl A, %eax
  1056. movl %eax, AORIG
  1057. #endif
  1058. leal (, LDC, 2), %eax
  1059. #ifdef RT
  1060. subl %eax, C
  1061. #endif
  1062. movl C, %esi # coffset = c
  1063. #ifndef RT
  1064. addl %eax, C
  1065. #endif
  1066. movl M, %ebx
  1067. sarl $2, %ebx # i = (m >> 2)
  1068. jle .L30
  1069. ALIGN_4
  1070. .L10:
  1071. #ifdef LN
  1072. movl K, %eax
  1073. sall $2 + BASE_SHIFT, %eax
  1074. subl %eax, AORIG
  1075. #endif
  1076. #if defined(LN) || defined(RT)
  1077. movl KK, %eax
  1078. movl AORIG, AA
  1079. leal (, %eax, SIZE), %eax
  1080. leal (AA, %eax, 4), AA
  1081. #endif
  1082. leal BUFFER, BB
  1083. #if defined(LN) || defined(RT)
  1084. movl KK, %eax
  1085. sall $1 + BASE_SHIFT, %eax
  1086. leal (BB, %eax, 2), BB
  1087. #endif
  1088. movapd 0 * SIZE(BB), %xmm2
  1089. pxor %xmm4, %xmm4
  1090. movapd 0 * SIZE(AA), %xmm0
  1091. pxor %xmm5, %xmm5
  1092. movapd 8 * SIZE(BB), %xmm3
  1093. pxor %xmm6, %xmm6
  1094. movapd 8 * SIZE(AA), %xmm1
  1095. pxor %xmm7, %xmm7
  1096. prefetcht2 4 * SIZE(%esi)
  1097. prefetcht2 4 * SIZE(%esi, LDC)
  1098. #if defined(LT) || defined(RN)
  1099. movl KK, %eax
  1100. #else
  1101. movl K, %eax
  1102. subl KK, %eax
  1103. #endif
  1104. #ifdef PENTIUM4
  1105. andl $-8, %eax
  1106. NOBRANCH
  1107. je .L12
  1108. sall $3, %eax
  1109. .L1X:
  1110. KERNEL1(32 * 0)
  1111. KERNEL2(32 * 0)
  1112. KERNEL3(32 * 0)
  1113. KERNEL4(32 * 0)
  1114. KERNEL5(32 * 0)
  1115. KERNEL6(32 * 0)
  1116. KERNEL7(32 * 0)
  1117. KERNEL8(32 * 0)
  1118. cmpl $64 * 1, %eax
  1119. NOBRANCH
  1120. jle .L11
  1121. KERNEL1(32 * 1)
  1122. KERNEL2(32 * 1)
  1123. KERNEL3(32 * 1)
  1124. KERNEL4(32 * 1)
  1125. KERNEL5(32 * 1)
  1126. KERNEL6(32 * 1)
  1127. KERNEL7(32 * 1)
  1128. KERNEL8(32 * 1)
  1129. cmpl $64 * 2, %eax
  1130. NOBRANCH
  1131. jle .L11
  1132. KERNEL1(32 * 2)
  1133. KERNEL2(32 * 2)
  1134. KERNEL3(32 * 2)
  1135. KERNEL4(32 * 2)
  1136. KERNEL5(32 * 2)
  1137. KERNEL6(32 * 2)
  1138. KERNEL7(32 * 2)
  1139. KERNEL8(32 * 2)
  1140. cmpl $64 * 3, %eax
  1141. NOBRANCH
  1142. jle .L11
  1143. KERNEL1(32 * 3)
  1144. KERNEL2(32 * 3)
  1145. KERNEL3(32 * 3)
  1146. KERNEL4(32 * 3)
  1147. KERNEL5(32 * 3)
  1148. KERNEL6(32 * 3)
  1149. KERNEL7(32 * 3)
  1150. KERNEL8(32 * 3)
  1151. cmpl $64 * 4, %eax
  1152. NOBRANCH
  1153. jle .L11
  1154. KERNEL1(32 * 4)
  1155. KERNEL2(32 * 4)
  1156. KERNEL3(32 * 4)
  1157. KERNEL4(32 * 4)
  1158. KERNEL5(32 * 4)
  1159. KERNEL6(32 * 4)
  1160. KERNEL7(32 * 4)
  1161. KERNEL8(32 * 4)
  1162. cmpl $64 * 5, %eax
  1163. NOBRANCH
  1164. jle .L11
  1165. KERNEL1(32 * 5)
  1166. KERNEL2(32 * 5)
  1167. KERNEL3(32 * 5)
  1168. KERNEL4(32 * 5)
  1169. KERNEL5(32 * 5)
  1170. KERNEL6(32 * 5)
  1171. KERNEL7(32 * 5)
  1172. KERNEL8(32 * 5)
  1173. cmpl $64 * 6, %eax
  1174. NOBRANCH
  1175. jle .L11
  1176. KERNEL1(32 * 6)
  1177. KERNEL2(32 * 6)
  1178. KERNEL3(32 * 6)
  1179. KERNEL4(32 * 6)
  1180. KERNEL5(32 * 6)
  1181. KERNEL6(32 * 6)
  1182. KERNEL7(32 * 6)
  1183. KERNEL8(32 * 6)
  1184. cmpl $64 * 7, %eax
  1185. NOBRANCH
  1186. jle .L11
  1187. KERNEL1(32 * 7)
  1188. KERNEL2(32 * 7)
  1189. KERNEL3(32 * 7)
  1190. KERNEL4(32 * 7)
  1191. KERNEL5(32 * 7)
  1192. KERNEL6(32 * 7)
  1193. KERNEL7(32 * 7)
  1194. KERNEL8(32 * 7)
  1195. addl $64 * 4 * SIZE, AA
  1196. addl $64 * 4 * SIZE, BB
  1197. subl $64 * 8, %eax
  1198. BRANCH
  1199. jg .L1X
  1200. .L11:
  1201. leal (AA, %eax, 4), AA
  1202. leal (BB, %eax, 4), BB
  1203. #else
  1204. sarl $3, %eax
  1205. je .L12
  1206. .L11:
  1207. KERNEL1(32 * 0)
  1208. KERNEL2(32 * 0)
  1209. KERNEL3(32 * 0)
  1210. KERNEL4(32 * 0)
  1211. KERNEL5(32 * 0)
  1212. KERNEL6(32 * 0)
  1213. KERNEL7(32 * 0)
  1214. KERNEL8(32 * 0)
  1215. addl $32 * SIZE, %ecx
  1216. addl $32 * SIZE, %edx
  1217. decl %eax
  1218. jne .L11
  1219. #endif
  1220. .L12:
  1221. #if defined(LT) || defined(RN)
  1222. movl KK, %eax
  1223. #else
  1224. movl K, %eax
  1225. subl KK, %eax
  1226. #endif
  1227. andl $7, %eax # if (k & 1)
  1228. BRANCH
  1229. je .L14
  1230. .L13:
  1231. mulpd %xmm0, %xmm2
  1232. mulpd 2 * SIZE(BB), %xmm0
  1233. addpd %xmm2, %xmm4
  1234. movapd 0 * SIZE(BB), %xmm2
  1235. addpd %xmm0, %xmm5
  1236. movapd 2 * SIZE(AA), %xmm0
  1237. mulpd %xmm0, %xmm2
  1238. mulpd 2 * SIZE(BB), %xmm0
  1239. addpd %xmm2, %xmm6
  1240. movapd 4 * SIZE(BB), %xmm2
  1241. addpd %xmm0, %xmm7
  1242. movapd 4 * SIZE(AA), %xmm0
  1243. addl $4 * SIZE, AA # aoffset += 8
  1244. addl $4 * SIZE, BB # boffset1 += 8
  1245. subl $1, %eax
  1246. jg .L13
  1247. ALIGN_4
  1248. .L14:
  1249. #if defined(LN) || defined(RT)
  1250. movl KK, %eax
  1251. #ifdef LN
  1252. subl $4, %eax
  1253. #else
  1254. subl $2, %eax
  1255. #endif
  1256. movl AORIG, AA
  1257. movl BORIG, B
  1258. leal BUFFER, BB
  1259. leal (, %eax, SIZE), %eax
  1260. leal (AA, %eax, 4), AA
  1261. leal (B, %eax, 2), B
  1262. leal (BB, %eax, 4), BB
  1263. #endif
  1264. #if defined(LN) || defined(LT)
  1265. movapd %xmm4, %xmm0
  1266. unpcklpd %xmm5, %xmm4
  1267. unpckhpd %xmm5, %xmm0
  1268. movapd %xmm6, %xmm1
  1269. unpcklpd %xmm7, %xmm6
  1270. unpckhpd %xmm7, %xmm1
  1271. movapd 0 * SIZE(B), %xmm2
  1272. movapd 2 * SIZE(B), %xmm3
  1273. movapd 4 * SIZE(B), %xmm5
  1274. movapd 6 * SIZE(B), %xmm7
  1275. subpd %xmm4, %xmm2
  1276. subpd %xmm0, %xmm3
  1277. subpd %xmm6, %xmm5
  1278. subpd %xmm1, %xmm7
  1279. #else
  1280. movapd 0 * SIZE(AA), %xmm0
  1281. movapd 2 * SIZE(AA), %xmm1
  1282. movapd 4 * SIZE(AA), %xmm2
  1283. movapd 6 * SIZE(AA), %xmm3
  1284. subpd %xmm4, %xmm0
  1285. subpd %xmm6, %xmm1
  1286. subpd %xmm5, %xmm2
  1287. subpd %xmm7, %xmm3
  1288. #endif
  1289. #ifdef LN
  1290. movsd 15 * SIZE(AA), %xmm0
  1291. movhpd 15 * SIZE(AA), %xmm0
  1292. mulpd %xmm0, %xmm7
  1293. movsd 14 * SIZE(AA), %xmm0
  1294. movhpd 14 * SIZE(AA), %xmm0
  1295. mulpd %xmm7, %xmm0
  1296. subpd %xmm0, %xmm5
  1297. movsd 13 * SIZE(AA), %xmm0
  1298. movhpd 13 * SIZE(AA), %xmm0
  1299. mulpd %xmm7, %xmm0
  1300. subpd %xmm0, %xmm3
  1301. movsd 12 * SIZE(AA), %xmm0
  1302. movhpd 12 * SIZE(AA), %xmm0
  1303. mulpd %xmm7, %xmm0
  1304. subpd %xmm0, %xmm2
  1305. movsd 10 * SIZE(AA), %xmm0
  1306. movhpd 10 * SIZE(AA), %xmm0
  1307. mulpd %xmm0, %xmm5
  1308. movsd 9 * SIZE(AA), %xmm0
  1309. movhpd 9 * SIZE(AA), %xmm0
  1310. mulpd %xmm5, %xmm0
  1311. subpd %xmm0, %xmm3
  1312. movsd 8 * SIZE(AA), %xmm0
  1313. movhpd 8 * SIZE(AA), %xmm0
  1314. mulpd %xmm5, %xmm0
  1315. subpd %xmm0, %xmm2
  1316. movsd 5 * SIZE(AA), %xmm0
  1317. movhpd 5 * SIZE(AA), %xmm0
  1318. mulpd %xmm0, %xmm3
  1319. movsd 4 * SIZE(AA), %xmm0
  1320. movhpd 4 * SIZE(AA), %xmm0
  1321. mulpd %xmm3, %xmm0
  1322. subpd %xmm0, %xmm2
  1323. movsd 0 * SIZE(AA), %xmm0
  1324. movhpd 0 * SIZE(AA), %xmm0
  1325. mulpd %xmm0, %xmm2
  1326. #endif
  1327. #ifdef LT
  1328. movsd 0 * SIZE(AA), %xmm0
  1329. movhpd 0 * SIZE(AA), %xmm0
  1330. mulpd %xmm0, %xmm2
  1331. movsd 1 * SIZE(AA), %xmm0
  1332. movhpd 1 * SIZE(AA), %xmm0
  1333. mulpd %xmm2, %xmm0
  1334. subpd %xmm0, %xmm3
  1335. movsd 2 * SIZE(AA), %xmm0
  1336. movhpd 2 * SIZE(AA), %xmm0
  1337. mulpd %xmm2, %xmm0
  1338. subpd %xmm0, %xmm5
  1339. movsd 3 * SIZE(AA), %xmm0
  1340. movhpd 3 * SIZE(AA), %xmm0
  1341. mulpd %xmm2, %xmm0
  1342. subpd %xmm0, %xmm7
  1343. movsd 5 * SIZE(AA), %xmm0
  1344. movhpd 5 * SIZE(AA), %xmm0
  1345. mulpd %xmm0, %xmm3
  1346. movsd 6 * SIZE(AA), %xmm0
  1347. movhpd 6 * SIZE(AA), %xmm0
  1348. mulpd %xmm3, %xmm0
  1349. subpd %xmm0, %xmm5
  1350. movsd 7 * SIZE(AA), %xmm0
  1351. movhpd 7 * SIZE(AA), %xmm0
  1352. mulpd %xmm3, %xmm0
  1353. subpd %xmm0, %xmm7
  1354. movsd 10 * SIZE(AA), %xmm0
  1355. movhpd 10 * SIZE(AA), %xmm0
  1356. mulpd %xmm0, %xmm5
  1357. movsd 11 * SIZE(AA), %xmm0
  1358. movhpd 11 * SIZE(AA), %xmm0
  1359. mulpd %xmm5, %xmm0
  1360. subpd %xmm0, %xmm7
  1361. movsd 15 * SIZE(AA), %xmm0
  1362. movhpd 15 * SIZE(AA), %xmm0
  1363. mulpd %xmm0, %xmm7
  1364. #endif
  1365. #ifdef RN
  1366. movsd 0 * SIZE(B), %xmm4
  1367. movhpd 0 * SIZE(B), %xmm4
  1368. mulpd %xmm4, %xmm0
  1369. mulpd %xmm4, %xmm1
  1370. movsd 1 * SIZE(B), %xmm4
  1371. movhpd 1 * SIZE(B), %xmm4
  1372. mulpd %xmm0, %xmm4
  1373. subpd %xmm4, %xmm2
  1374. movsd 1 * SIZE(B), %xmm4
  1375. movhpd 1 * SIZE(B), %xmm4
  1376. mulpd %xmm1, %xmm4
  1377. subpd %xmm4, %xmm3
  1378. movsd 3 * SIZE(B), %xmm4
  1379. movhpd 3 * SIZE(B), %xmm4
  1380. mulpd %xmm4, %xmm2
  1381. mulpd %xmm4, %xmm3
  1382. #endif
  1383. #ifdef RT
  1384. movsd 3 * SIZE(B), %xmm4
  1385. movhpd 3 * SIZE(B), %xmm4
  1386. mulpd %xmm4, %xmm2
  1387. mulpd %xmm4, %xmm3
  1388. movsd 2 * SIZE(B), %xmm4
  1389. movhpd 2 * SIZE(B), %xmm4
  1390. mulpd %xmm2, %xmm4
  1391. subpd %xmm4, %xmm0
  1392. movsd 2 * SIZE(B), %xmm4
  1393. movhpd 2 * SIZE(B), %xmm4
  1394. mulpd %xmm3, %xmm4
  1395. subpd %xmm4, %xmm1
  1396. movsd 0 * SIZE(B), %xmm4
  1397. movhpd 0 * SIZE(B), %xmm4
  1398. mulpd %xmm4, %xmm0
  1399. mulpd %xmm4, %xmm1
  1400. #endif
  1401. #if defined(LN) || defined(LT)
  1402. movapd %xmm2, 0 * SIZE(B)
  1403. movapd %xmm3, 2 * SIZE(B)
  1404. movapd %xmm5, 4 * SIZE(B)
  1405. movapd %xmm7, 6 * SIZE(B)
  1406. movsd %xmm2, 0 * SIZE(BB)
  1407. movsd %xmm2, 1 * SIZE(BB)
  1408. movhpd %xmm2, 2 * SIZE(BB)
  1409. movhpd %xmm2, 3 * SIZE(BB)
  1410. movsd %xmm3, 4 * SIZE(BB)
  1411. movsd %xmm3, 5 * SIZE(BB)
  1412. movhpd %xmm3, 6 * SIZE(BB)
  1413. movhpd %xmm3, 7 * SIZE(BB)
  1414. movsd %xmm5, 8 * SIZE(BB)
  1415. movsd %xmm5, 9 * SIZE(BB)
  1416. movhpd %xmm5, 10 * SIZE(BB)
  1417. movhpd %xmm5, 11 * SIZE(BB)
  1418. movsd %xmm7, 12 * SIZE(BB)
  1419. movsd %xmm7, 13 * SIZE(BB)
  1420. movhpd %xmm7, 14 * SIZE(BB)
  1421. movhpd %xmm7, 15 * SIZE(BB)
  1422. #else
  1423. movapd %xmm0, 0 * SIZE(AA)
  1424. movapd %xmm1, 2 * SIZE(AA)
  1425. movapd %xmm2, 4 * SIZE(AA)
  1426. movapd %xmm3, 6 * SIZE(AA)
  1427. #endif
  1428. #ifdef LN
  1429. subl $4 * SIZE, %esi
  1430. #endif
  1431. #if defined(LN) || defined(LT)
  1432. movsd %xmm2, 0 * SIZE(%esi)
  1433. movsd %xmm3, 1 * SIZE(%esi)
  1434. movsd %xmm5, 2 * SIZE(%esi)
  1435. movsd %xmm7, 3 * SIZE(%esi)
  1436. movhpd %xmm2, 0 * SIZE(%esi, LDC)
  1437. movhpd %xmm3, 1 * SIZE(%esi, LDC)
  1438. movhpd %xmm5, 2 * SIZE(%esi, LDC)
  1439. movhpd %xmm7, 3 * SIZE(%esi, LDC)
  1440. #else
  1441. movsd %xmm0, 0 * SIZE(%esi)
  1442. movhpd %xmm0, 1 * SIZE(%esi)
  1443. movsd %xmm1, 2 * SIZE(%esi)
  1444. movhpd %xmm1, 3 * SIZE(%esi)
  1445. movsd %xmm2, 0 * SIZE(%esi, LDC)
  1446. movhpd %xmm2, 1 * SIZE(%esi, LDC)
  1447. movsd %xmm3, 2 * SIZE(%esi, LDC)
  1448. movhpd %xmm3, 3 * SIZE(%esi, LDC)
  1449. #endif
  1450. #ifndef LN
  1451. addl $4 * SIZE, %esi
  1452. #endif
  1453. #if defined(LT) || defined(RN)
  1454. movl K, %eax
  1455. subl KK, %eax
  1456. leal (,%eax, SIZE), %eax
  1457. leal (AA, %eax, 4), AA
  1458. #ifdef LT
  1459. addl $8 * SIZE, B
  1460. #endif
  1461. #endif
  1462. #ifdef LN
  1463. subl $4, KK
  1464. movl BORIG, B
  1465. #endif
  1466. #ifdef LT
  1467. addl $4, KK
  1468. #endif
  1469. #ifdef RT
  1470. movl K, %eax
  1471. movl BORIG, B
  1472. sall $2 + BASE_SHIFT, %eax
  1473. addl %eax, AORIG
  1474. #endif
  1475. decl %ebx # i --
  1476. jg .L10
  1477. ALIGN_2
  1478. .L30:
  1479. movl M, %ebx
  1480. testl $2, %ebx
  1481. jle .L50
  1482. #ifdef LN
  1483. movl K, %eax
  1484. sall $1 + BASE_SHIFT, %eax
  1485. subl %eax, AORIG
  1486. #endif
  1487. #if defined(LN) || defined(RT)
  1488. movl KK, %eax
  1489. movl AORIG, AA
  1490. leal (, %eax, SIZE), %eax
  1491. leal (AA, %eax, 2), AA
  1492. #endif
  1493. leal BUFFER, BB
  1494. #if defined(LN) || defined(RT)
  1495. movl KK, %eax
  1496. sall $1 + BASE_SHIFT, %eax
  1497. leal (BB, %eax, 2), BB
  1498. #endif
  1499. movapd 0 * SIZE(BB), %xmm2
  1500. pxor %xmm4, %xmm4
  1501. movapd 0 * SIZE(AA), %xmm0
  1502. pxor %xmm5, %xmm5
  1503. movapd 8 * SIZE(BB), %xmm3
  1504. pxor %xmm6, %xmm6
  1505. movapd 8 * SIZE(AA), %xmm1
  1506. pxor %xmm7, %xmm7
  1507. #if defined(LT) || defined(RN)
  1508. movl KK, %eax
  1509. #else
  1510. movl K, %eax
  1511. subl KK, %eax
  1512. #endif
  1513. sarl $3, %eax
  1514. je .L32
  1515. .L31:
  1516. mulpd %xmm0, %xmm2
  1517. mulpd 2 * SIZE(BB), %xmm0
  1518. addpd %xmm2, %xmm4
  1519. movapd 4 * SIZE(BB), %xmm2
  1520. addpd %xmm0, %xmm5
  1521. movapd 2 * SIZE(AA), %xmm0
  1522. mulpd %xmm0, %xmm2
  1523. mulpd 6 * SIZE(BB), %xmm0
  1524. addpd %xmm2, %xmm6
  1525. movapd 16 * SIZE(BB), %xmm2
  1526. addpd %xmm0, %xmm7
  1527. movapd 4 * SIZE(AA), %xmm0
  1528. mulpd %xmm0, %xmm3
  1529. mulpd 10 * SIZE(BB), %xmm0
  1530. addpd %xmm3, %xmm4
  1531. movapd 12 * SIZE(BB), %xmm3
  1532. addpd %xmm0, %xmm5
  1533. movapd 6 * SIZE(AA), %xmm0
  1534. mulpd %xmm0, %xmm3
  1535. mulpd 14 * SIZE(BB), %xmm0
  1536. addpd %xmm3, %xmm6
  1537. movapd 24 * SIZE(BB), %xmm3
  1538. addpd %xmm0, %xmm7
  1539. movapd 16 * SIZE(AA), %xmm0
  1540. mulpd %xmm1, %xmm2
  1541. mulpd 18 * SIZE(BB), %xmm1
  1542. addpd %xmm2, %xmm4
  1543. movapd 20 * SIZE(BB), %xmm2
  1544. addpd %xmm1, %xmm5
  1545. movapd 10 * SIZE(AA), %xmm1
  1546. mulpd %xmm1, %xmm2
  1547. mulpd 22 * SIZE(BB), %xmm1
  1548. addpd %xmm2, %xmm6
  1549. movapd 32 * SIZE(BB), %xmm2
  1550. addpd %xmm1, %xmm7
  1551. movapd 12 * SIZE(AA), %xmm1
  1552. mulpd %xmm1, %xmm3
  1553. mulpd 26 * SIZE(BB), %xmm1
  1554. addpd %xmm3, %xmm4
  1555. movapd 28 * SIZE(BB), %xmm3
  1556. addpd %xmm1, %xmm5
  1557. movapd 14 * SIZE(AA), %xmm1
  1558. mulpd %xmm1, %xmm3
  1559. mulpd 30 * SIZE(BB), %xmm1
  1560. addpd %xmm3, %xmm6
  1561. movapd 40 * SIZE(BB), %xmm3
  1562. addpd %xmm1, %xmm7
  1563. movapd 24 * SIZE(AA), %xmm1
  1564. addl $16 * SIZE, AA
  1565. addl $32 * SIZE, BB
  1566. BRANCH
  1567. decl %eax
  1568. jne .L31
  1569. .L32:
  1570. #if defined(LT) || defined(RN)
  1571. movl KK, %eax
  1572. #else
  1573. movl K, %eax
  1574. subl KK, %eax
  1575. #endif
  1576. andl $7, %eax # if (k & 1)
  1577. BRANCH
  1578. je .L34
  1579. .L33:
  1580. mulpd %xmm0, %xmm2
  1581. mulpd 2 * SIZE(BB), %xmm0
  1582. addpd %xmm2, %xmm4
  1583. movapd 4 * SIZE(BB), %xmm2
  1584. addpd %xmm0, %xmm5
  1585. movapd 2 * SIZE(AA), %xmm0
  1586. addl $2 * SIZE, AA # aoffset += 8
  1587. addl $4 * SIZE, BB # boffset1 += 8
  1588. decl %eax
  1589. BRANCH
  1590. jg .L33
  1591. ALIGN_4
  1592. .L34:
  1593. addpd %xmm6, %xmm4
  1594. addpd %xmm7, %xmm5
  1595. #if defined(LN) || defined(RT)
  1596. movl KK, %eax
  1597. #ifdef LN
  1598. subl $2, %eax
  1599. #else
  1600. subl $2, %eax
  1601. #endif
  1602. movl AORIG, AA
  1603. movl BORIG, B
  1604. leal BUFFER, BB
  1605. leal (, %eax, SIZE), %eax
  1606. leal (AA, %eax, 2), AA
  1607. leal (B, %eax, 2), B
  1608. leal (BB, %eax, 4), BB
  1609. #endif
  1610. #if defined(LN) || defined(LT)
  1611. movapd %xmm4, %xmm0
  1612. unpcklpd %xmm5, %xmm4
  1613. unpckhpd %xmm5, %xmm0
  1614. movapd 0 * SIZE(B), %xmm2
  1615. movapd 2 * SIZE(B), %xmm3
  1616. subpd %xmm4, %xmm2
  1617. subpd %xmm0, %xmm3
  1618. #else
  1619. movapd 0 * SIZE(AA), %xmm0
  1620. movapd 2 * SIZE(AA), %xmm1
  1621. subpd %xmm4, %xmm0
  1622. subpd %xmm5, %xmm1
  1623. #endif
  1624. #ifdef LN
  1625. movsd 3 * SIZE(AA), %xmm0
  1626. movhpd 3 * SIZE(AA), %xmm0
  1627. mulpd %xmm0, %xmm3
  1628. movsd 2 * SIZE(AA), %xmm0
  1629. movhpd 2 * SIZE(AA), %xmm0
  1630. mulpd %xmm3, %xmm0
  1631. subpd %xmm0, %xmm2
  1632. movsd 0 * SIZE(AA), %xmm0
  1633. movhpd 0 * SIZE(AA), %xmm0
  1634. mulpd %xmm0, %xmm2
  1635. #endif
  1636. #ifdef LT
  1637. movsd 0 * SIZE(AA), %xmm0
  1638. movhpd 0 * SIZE(AA), %xmm0
  1639. mulpd %xmm0, %xmm2
  1640. movsd 1 * SIZE(AA), %xmm0
  1641. movhpd 1 * SIZE(AA), %xmm0
  1642. mulpd %xmm2, %xmm0
  1643. subpd %xmm0, %xmm3
  1644. movsd 3 * SIZE(AA), %xmm0
  1645. movhpd 3 * SIZE(AA), %xmm0
  1646. mulpd %xmm0, %xmm3
  1647. #endif
  1648. #ifdef RN
  1649. movsd 0 * SIZE(B), %xmm4
  1650. movhpd 0 * SIZE(B), %xmm4
  1651. mulpd %xmm4, %xmm0
  1652. movsd 1 * SIZE(B), %xmm4
  1653. movhpd 1 * SIZE(B), %xmm4
  1654. mulpd %xmm0, %xmm4
  1655. subpd %xmm4, %xmm1
  1656. movsd 3 * SIZE(B), %xmm4
  1657. movhpd 3 * SIZE(B), %xmm4
  1658. mulpd %xmm4, %xmm1
  1659. #endif
  1660. #ifdef RT
  1661. movsd 3 * SIZE(B), %xmm4
  1662. movhpd 3 * SIZE(B), %xmm4
  1663. mulpd %xmm4, %xmm1
  1664. movsd 2 * SIZE(B), %xmm4
  1665. movhpd 2 * SIZE(B), %xmm4
  1666. mulpd %xmm1, %xmm4
  1667. subpd %xmm4, %xmm0
  1668. movsd 0 * SIZE(B), %xmm4
  1669. movhpd 0 * SIZE(B), %xmm4
  1670. mulpd %xmm4, %xmm0
  1671. #endif
  1672. #if defined(LN) || defined(LT)
  1673. movapd %xmm2, 0 * SIZE(B)
  1674. movapd %xmm3, 2 * SIZE(B)
  1675. movsd %xmm2, 0 * SIZE(BB)
  1676. movsd %xmm2, 1 * SIZE(BB)
  1677. movhpd %xmm2, 2 * SIZE(BB)
  1678. movhpd %xmm2, 3 * SIZE(BB)
  1679. movsd %xmm3, 4 * SIZE(BB)
  1680. movsd %xmm3, 5 * SIZE(BB)
  1681. movhpd %xmm3, 6 * SIZE(BB)
  1682. movhpd %xmm3, 7 * SIZE(BB)
  1683. #else
  1684. movapd %xmm0, 0 * SIZE(AA)
  1685. movapd %xmm1, 2 * SIZE(AA)
  1686. #endif
  1687. #ifdef LN
  1688. subl $2 * SIZE, %esi
  1689. #endif
  1690. #if defined(LN) || defined(LT)
  1691. movsd %xmm2, 0 * SIZE(%esi)
  1692. movsd %xmm3, 1 * SIZE(%esi)
  1693. movhpd %xmm2, 0 * SIZE(%esi, LDC)
  1694. movhpd %xmm3, 1 * SIZE(%esi, LDC)
  1695. #else
  1696. movsd %xmm0, 0 * SIZE(%esi)
  1697. movhpd %xmm0, 1 * SIZE(%esi)
  1698. movsd %xmm1, 0 * SIZE(%esi, LDC)
  1699. movhpd %xmm1, 1 * SIZE(%esi, LDC)
  1700. #endif
  1701. #ifndef LN
  1702. addl $2 * SIZE, %esi
  1703. #endif
  1704. #if defined(LT) || defined(RN)
  1705. movl K, %eax
  1706. subl KK, %eax
  1707. leal (,%eax, SIZE), %eax
  1708. leal (AA, %eax, 2), AA
  1709. #ifdef LT
  1710. addl $4 * SIZE, B
  1711. #endif
  1712. #endif
  1713. #ifdef LN
  1714. subl $2, KK
  1715. movl BORIG, B
  1716. #endif
  1717. #ifdef LT
  1718. addl $2, KK
  1719. #endif
  1720. #ifdef RT
  1721. movl K, %eax
  1722. movl BORIG, B
  1723. sall $1 + BASE_SHIFT, %eax
  1724. addl %eax, AORIG
  1725. #endif
  1726. ALIGN_2
  1727. .L50:
  1728. movl M, %ebx
  1729. testl $1, %ebx
  1730. jle .L99
  1731. #ifdef LN
  1732. movl K, %eax
  1733. sall $0 + BASE_SHIFT, %eax
  1734. subl %eax, AORIG
  1735. #endif
  1736. #if defined(LN) || defined(RT)
  1737. movl KK, %eax
  1738. movl AORIG, AA
  1739. leal (, %eax, SIZE), %eax
  1740. leal (AA, %eax, 1), AA
  1741. #endif
  1742. leal BUFFER, %ecx
  1743. #if defined(LN) || defined(RT)
  1744. movl KK, %eax
  1745. sall $1 + BASE_SHIFT, %eax
  1746. leal (BB, %eax, 2), BB
  1747. #endif
  1748. movsd 0 * SIZE(BB), %xmm2
  1749. pxor %xmm4, %xmm4
  1750. movsd 0 * SIZE(AA), %xmm0
  1751. pxor %xmm5, %xmm5
  1752. movsd 8 * SIZE(BB), %xmm3
  1753. pxor %xmm6, %xmm6
  1754. movsd 4 * SIZE(AA), %xmm1
  1755. pxor %xmm7, %xmm7
  1756. #if defined(LT) || defined(RN)
  1757. movl KK, %eax
  1758. #else
  1759. movl K, %eax
  1760. subl KK, %eax
  1761. #endif
  1762. sarl $3, %eax
  1763. je .L52
  1764. .L51:
  1765. mulsd %xmm0, %xmm2
  1766. mulsd 2 * SIZE(BB), %xmm0
  1767. addsd %xmm2, %xmm4
  1768. movsd 4 * SIZE(BB), %xmm2
  1769. addsd %xmm0, %xmm5
  1770. movsd 1 * SIZE(AA), %xmm0
  1771. mulsd %xmm0, %xmm2
  1772. mulsd 6 * SIZE(BB), %xmm0
  1773. addsd %xmm2, %xmm4
  1774. movsd 16 * SIZE(BB), %xmm2
  1775. addsd %xmm0, %xmm5
  1776. movsd 2 * SIZE(AA), %xmm0
  1777. mulsd %xmm0, %xmm3
  1778. mulsd 10 * SIZE(BB), %xmm0
  1779. addsd %xmm3, %xmm4
  1780. movsd 12 * SIZE(BB), %xmm3
  1781. addsd %xmm0, %xmm5
  1782. movsd 3 * SIZE(AA), %xmm0
  1783. mulsd %xmm0, %xmm3
  1784. mulsd 14 * SIZE(BB), %xmm0
  1785. addsd %xmm3, %xmm4
  1786. movsd 24 * SIZE(BB), %xmm3
  1787. addsd %xmm0, %xmm5
  1788. movsd 8 * SIZE(AA), %xmm0
  1789. mulsd %xmm1, %xmm2
  1790. mulsd 18 * SIZE(BB), %xmm1
  1791. addsd %xmm2, %xmm4
  1792. movsd 20 * SIZE(BB), %xmm2
  1793. addsd %xmm1, %xmm5
  1794. movsd 5 * SIZE(AA), %xmm1
  1795. mulsd %xmm1, %xmm2
  1796. mulsd 22 * SIZE(BB), %xmm1
  1797. addsd %xmm2, %xmm4
  1798. movsd 32 * SIZE(BB), %xmm2
  1799. addsd %xmm1, %xmm5
  1800. movsd 6 * SIZE(AA), %xmm1
  1801. mulsd %xmm1, %xmm3
  1802. mulsd 26 * SIZE(BB), %xmm1
  1803. addsd %xmm3, %xmm4
  1804. movsd 28 * SIZE(BB), %xmm3
  1805. addsd %xmm1, %xmm5
  1806. movsd 7 * SIZE(AA), %xmm1
  1807. mulsd %xmm1, %xmm3
  1808. mulsd 30 * SIZE(BB), %xmm1
  1809. addsd %xmm3, %xmm4
  1810. movsd 40 * SIZE(BB), %xmm3
  1811. addsd %xmm1, %xmm5
  1812. movsd 12 * SIZE(AA), %xmm1
  1813. addl $ 8 * SIZE, AA
  1814. addl $32 * SIZE, BB
  1815. BRANCH
  1816. decl %eax
  1817. jne .L51
  1818. .L52:
  1819. #if defined(LT) || defined(RN)
  1820. movl KK, %eax
  1821. #else
  1822. movl K, %eax
  1823. subl KK, %eax
  1824. #endif
  1825. andl $7, %eax # if (k & 1)
  1826. BRANCH
  1827. je .L54
  1828. .L53:
  1829. mulsd %xmm0, %xmm2
  1830. mulsd 2 * SIZE(BB), %xmm0
  1831. addsd %xmm2, %xmm4
  1832. movsd 4 * SIZE(BB), %xmm2
  1833. addsd %xmm0, %xmm5
  1834. movsd 1 * SIZE(AA), %xmm0
  1835. addl $1 * SIZE, AA # aoffset += 8
  1836. addl $4 * SIZE, BB # boffset1 += 8
  1837. decl %eax
  1838. BRANCH
  1839. jg .L53
  1840. ALIGN_4
  1841. .L54:
  1842. addsd %xmm6, %xmm4
  1843. addsd %xmm7, %xmm5
  1844. #if defined(LN) || defined(RT)
  1845. movl KK, %eax
  1846. #ifdef LN
  1847. subl $1, %eax
  1848. #else
  1849. subl $2, %eax
  1850. #endif
  1851. movl AORIG, AA
  1852. movl BORIG, B
  1853. leal BUFFER, BB
  1854. leal (, %eax, SIZE), %eax
  1855. leal (AA, %eax, 1), AA
  1856. leal (B, %eax, 2), B
  1857. leal (BB, %eax, 4), BB
  1858. #endif
  1859. #if defined(LN) || defined(LT)
  1860. movsd 0 * SIZE(B), %xmm0
  1861. movsd 1 * SIZE(B), %xmm1
  1862. #else
  1863. movsd 0 * SIZE(AA), %xmm0
  1864. movsd 1 * SIZE(AA), %xmm1
  1865. #endif
  1866. subsd %xmm4, %xmm0
  1867. subsd %xmm5, %xmm1
  1868. #if defined(LN) || defined(LT)
  1869. movsd 0 * SIZE(AA), %xmm2
  1870. mulsd %xmm2, %xmm0
  1871. mulsd %xmm2, %xmm1
  1872. #endif
  1873. #ifdef RN
  1874. mulsd 0 * SIZE(B), %xmm0
  1875. movsd 1 * SIZE(B), %xmm4
  1876. mulsd %xmm0, %xmm4
  1877. subsd %xmm4, %xmm1
  1878. mulsd 3 * SIZE(B), %xmm1
  1879. #endif
  1880. #ifdef RT
  1881. mulsd 3 * SIZE(B), %xmm1
  1882. movsd 2 * SIZE(B), %xmm4
  1883. mulsd %xmm1, %xmm4
  1884. subsd %xmm4, %xmm0
  1885. mulsd 0 * SIZE(B), %xmm0
  1886. #endif
  1887. #if defined(LN) || defined(LT)
  1888. movsd %xmm0, 0 * SIZE(B)
  1889. movsd %xmm1, 1 * SIZE(B)
  1890. movsd %xmm0, 0 * SIZE(BB)
  1891. movsd %xmm0, 1 * SIZE(BB)
  1892. movsd %xmm1, 2 * SIZE(BB)
  1893. movsd %xmm1, 3 * SIZE(BB)
  1894. #else
  1895. movsd %xmm0, 0 * SIZE(AA)
  1896. movsd %xmm1, 1 * SIZE(AA)
  1897. #endif
  1898. #ifdef LN
  1899. subl $1 * SIZE, %esi
  1900. #endif
  1901. movsd %xmm0, 0 * SIZE(%esi)
  1902. movsd %xmm1, 0 * SIZE(%esi, LDC)
  1903. #ifndef LN
  1904. addl $1 * SIZE, %esi
  1905. #endif
  1906. #if defined(LT) || defined(RN)
  1907. movl K, %eax
  1908. subl KK, %eax
  1909. leal (,%eax, SIZE), %eax
  1910. leal (AA, %eax, 1), AA
  1911. #ifdef LT
  1912. addl $2 * SIZE, B
  1913. #endif
  1914. #endif
  1915. #ifdef LN
  1916. subl $1, KK
  1917. movl BORIG, B
  1918. #endif
  1919. #ifdef LT
  1920. addl $1, KK
  1921. #endif
  1922. #ifdef RT
  1923. movl K, %eax
  1924. movl BORIG, B
  1925. sall $0 + BASE_SHIFT, %eax
  1926. addl %eax, AORIG
  1927. #endif
  1928. ALIGN_2
  1929. .L99:
  1930. #ifdef LN
  1931. movl K, %eax
  1932. leal (, %eax, SIZE), %eax
  1933. leal (B, %eax, 2), B
  1934. #endif
  1935. #if defined(LT) || defined(RN)
  1936. movl K, %eax
  1937. subl KK, %eax
  1938. leal (,%eax, SIZE), %eax
  1939. leal (B, %eax, 2), B
  1940. #endif
  1941. #ifdef RN
  1942. addl $2, KK
  1943. #endif
  1944. #ifdef RT
  1945. subl $2, KK
  1946. #endif
  1947. decl J # j --
  1948. jg .L01
  1949. ALIGN_2
  1950. .L999:
  1951. movl OLD_STACK, %esp
  1952. EMMS
  1953. popl %ebx
  1954. popl %esi
  1955. popl %edi
  1956. popl %ebp
  1957. ret
  1958. ALIGN_2
  1959. EPILOGUE